Diffstat (limited to 'kernel')
90 files changed, 4508 insertions, 2307 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
 # config_data.h contains the same information as ikconfig.h but gzipped.
 # Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG   $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	if (err < 0) {
 		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-		audit_log_lost("auditd dissapeared\n");
+		audit_log_lost("auditd disappeared\n");
 		audit_pid = 0;
 		/* we might get lucky and get this in the next auditd */
 		audit_hold_skb(skb);
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (security_capable(current_cred(), cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
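Note: the credentials are now threaded explicitly only at the security_capable() boundary; capable() itself keeps its one-argument signature, so call sites are unaffected. A minimal, hypothetical call site for orientation:

	/* Hypothetical caller: gate a privileged operation on CAP_SYS_ADMIN.
	 * The cred is picked up internally via current_cred(). */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/* privileged work follows; current->flags now carries PF_SUPERPRIV */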
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
+static int cgroup_delete(const struct dentry *d)
+{
+	return 1;
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
 	struct list_head *node;
 
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
 		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 		list_del_init(node);
 		if (d->d_inode) {
 			/* This should never be called on a cgroup
 			 * directory with child cgroups */
 			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			d = dget_locked(d);
-			spin_unlock(&dcache_lock);
+			dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			spin_unlock(&dentry->d_lock);
 			d_delete(d);
 			simple_unlink(dentry->d_inode, d);
 			dput(d);
-			spin_lock(&dcache_lock);
-		}
+			spin_lock(&dentry->d_lock);
+		} else
+			spin_unlock(&d->d_lock);
 		node = dentry->d_subdirs.next;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
  */
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
+	struct dentry *parent;
+
 	cgroup_clear_directory(dentry);
 
-	spin_lock(&dcache_lock);
+	parent = dentry->d_parent;
+	spin_lock(&parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	list_del_init(&dentry->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 	remove_dir(dentry);
 }
 
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+	static const struct dentry_operations cgroup_dops = {
+		.d_iput = cgroup_diput,
+		.d_delete = cgroup_delete,
+	};
+
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 	struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
 		return -ENOMEM;
 	}
 	sb->s_root = dentry;
+	/* for everything else we want ->d_op set */
+	sb->s_d_op = &cgroup_dops;
 	return 0;
 }
 
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = simple_lookup,
+	.lookup = cgroup_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
 };
 
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
 /*
  * Check if a file is a control file
  */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
			       struct super_block *sb)
 {
-	static const struct dentry_operations cgroup_dops = {
-		.d_iput = cgroup_diput,
-	};
-
 	struct inode *inode;
 
 	if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
 	}
-	dentry->d_op = &cgroup_dops;
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 	return 0;
@@ -3638,9 +3663,7 @@ again:
 	list_del(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
-	spin_lock(&cgrp->dentry->d_lock);
 	d = dget(cgrp->dentry);
-	spin_unlock(&d->d_lock);
 
 	cgroup_d_remove_dir(d);
 	dput(d);
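Note: the cgroup.c hunks above replace the global dcache_lock with per-dentry ->d_lock, taking the parent's lock first and nesting each child's lock under it with spin_lock_nested() so lockdep can distinguish the two levels. A condensed, illustrative sketch of the same idiom (function name invented; the locking calls and the d_u.d_child linkage are the ones used in the diff):

	#include <linux/dcache.h>
	#include <linux/spinlock.h>

	/* Illustrative only: walk a dentry's children under the new
	 * per-dentry locking scheme instead of dcache_lock. */
	static void walk_children(struct dentry *parent)
	{
		struct dentry *child;

		spin_lock(&parent->d_lock);
		list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
			/* child's lock nests under the parent's */
			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
			/* ... inspect the child here ... */
			spin_unlock(&child->d_lock);
		}
		spin_unlock(&parent->d_lock);
	}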
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
-	unsigned int cpu = (unsigned long)param->hcpu;
 	int err;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-	if (task_cpu(param->caller) == cpu)
-		move_task_off_dead_cpu(cpu, param->caller);
-	/* Force idle task to run as soon as we yield: it should
-	   immediately notice cpu is offline and die quickly. */
-	sched_idle_next();
 	return 0;
 }
 
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
-		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 	BUG_ON(cpu_online(cpu));
 
-	/* Wait for it to sleep (leaving idle task). */
+	/*
+	 * The migration_call() CPU_DYING callback will have removed all
+	 * runnable tasks from the cpu, there's only the idle task left now
+	 * that the migration thread is done doing the stop_machine thing.
+	 *
+	 * Wait for the stop thread to go away.
+	 */
 	while (!idle_cpu(cpu))
-		yield();
+		cpu_relax();
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
 int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
 	cpumask_clear(frozen_cpus);
+	arch_disable_nonboot_cpus_begin();
 
 	printk("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
 		}
 	}
 
+	arch_disable_nonboot_cpus_end();
+
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
 		/* Make sure the CPUs won't be enabled by someone else */
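Note: the two __weak stubs give architectures an optional hook pair bracketing the hot-unplug loop; a non-weak definition with the same signature in arch code overrides the empty stubs at link time. A hypothetical arch override (hook names come from the diff; the bodies are invented for illustration):

	#include <linux/kernel.h>

	/* Hypothetical arch-side override of the __weak hooks added above. */
	void arch_disable_nonboot_cpus_begin(void)
	{
		/* e.g. switch clock sources before secondary CPUs go away */
		pr_info("preparing to offline non-boot CPUs\n");
	}

	void arch_disable_nonboot_cpus_end(void)
	{
		pr_info("non-boot CPUs are offline\n");
	}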
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..3a9d6dd53a6c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
 #endif
 
 	atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	new->magic = CRED_MAGIC;
+#endif
 
 	if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
 		goto error;
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	new->magic = CRED_MAGIC;
-#endif
 	return new;
 
 error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	validate_creds(old);
 
 	*new = *old;
+	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_group_info(new->group_info);
 
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 		goto error;
 
-	atomic_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	put_cred(old);
 	validate_creds(new);
 	return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
 	if (cred->magic != CRED_MAGIC)
 		return true;
 #ifdef CONFIG_SECURITY_SELINUX
-	if (selinux_is_enabled()) {
+	/*
+	 * cred->security == NULL if security_cred_alloc_blank() or
+	 * security_prepare_creds() returned an error.
+	 */
+	if (selinux_is_enabled() && cred->security) {
 		if ((unsigned long) cred->security < PAGE_SIZE)
 			return true;
 		if ((*(u32 *)cred->security & 0xffffff00) ==
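Note: both cred.c fixes follow one rule: fully stamp the object (magic, refcount, subscriber count) before any fallible callout, so error paths and validators like creds_are_invalid() always see a self-consistent object. A generic sketch of the pattern, with invented names (not from this commit):

	#include <linux/slab.h>

	struct obj {
		unsigned int magic;
	#define OBJ_MAGIC 0x4f424a31
		int usage;
		void *security;	/* may stay NULL if setup fails */
	};

	/* Illustrative only: stamp debug fields BEFORE the fallible step. */
	static struct obj *obj_alloc(int (*sec_init)(struct obj *))
	{
		struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

		if (!o)
			return NULL;
		o->magic = OBJ_MAGIC;	/* before sec_init(), not after */
		o->usage = 1;
		if (sec_init(o) < 0) {
			/* error path sees a fully-formed object */
			kfree(o);
			return NULL;
		}
		return o;
	}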
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
 	}
 }
 
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
 void __init kdb_init(int lvl)
 {
 	static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
-		__get_cpu_var(process_counts)--;
+		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
 }
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
 	cgroup_exit(tsk, 1);
 
 	if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_event_exit_task(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -169,6 +170,7 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
+	sched_autogroup_exit(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -329,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	retval = ksm_fork(mm, oldmm);
 	if (retval)
 		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
 
 	prev = NULL;
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -528,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -542,6 +550,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		ksm_exit(mm);
+		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
@@ -668,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mm->pmd_huge_pte = NULL;
+#endif
+
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -905,9 +918,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
+	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
 
@@ -1283,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		attach_pid(p, PIDTYPE_SID, task_session(current));
 		list_add_tail(&p->sibling, &p->real_parent->children);
 		list_add_tail_rcu(&p->tasks, &init_task.tasks);
-		__get_cpu_var(process_counts)++;
+		__this_cpu_inc(process_counts);
 	}
 	attach_pid(p, PIDTYPE_PID, pid);
 	nr_threads++;
@@ -1408,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	/*
-	 * We hope to recycle these flags after 2.6.26
-	 */
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		static int __read_mostly count = 100;
-
-		if (count > 0 && printk_ratelimit()) {
-			char comm[TASK_COMM_LEN];
-
-			count--;
-			printk(KERN_INFO "fork(): process `%s' used deprecated "
-					"clone flags 0x%lx\n",
-				get_task_comm(comm, current),
-				clone_flags & CLONE_STOPPED);
-		}
-	}
-
-	/*
 	 * When called from kernel_thread, don't do user tracing stuff.
 	 */
 	if (likely(user_mode(regs)))
@@ -1462,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
 	 */
 	p->flags &= ~PF_STARTING;
 
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		/*
-		 * We'll start up with an immediate SIGSTOP.
-		 */
-		sigaddset(&p->pending.signal, SIGSTOP);
-		set_tsk_thread_flag(p, TIF_SIGPENDING);
-		__set_task_state(p, TASK_STOPPED);
-	} else {
-		wake_up_new_task(p, clone_flags);
-	}
+	wake_up_new_task(p, clone_flags);
 
 	tracehook_report_clone_complete(trace, regs,
 					clone_flags, nr, p);
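Note: the `__get_cpu_var(process_counts)++` to `__this_cpu_inc(process_counts)` conversions in fork.c and exit.c swap an open-coded per-CPU read-modify-write for the dedicated this_cpu accessor, which the architecture can compile to a single per-CPU increment instruction. A self-contained sketch of the idiom with a hypothetical counter name:

	#include <linux/percpu.h>
	#include <linux/cpumask.h>

	/* Hypothetical per-CPU event counter, mirroring process_counts. */
	static DEFINE_PER_CPU(unsigned long, example_events);

	static void note_event(void)
	{
		/* no explicit address computation, no preemption games */
		__this_cpu_inc(example_events);
	}

	static unsigned long total_events(void)
	{
		unsigned long sum = 0;
		int cpu;

		/* an approximate sum across all CPUs' private counters */
		for_each_possible_cpu(cpu)
			sum += per_cpu(example_events, cpu);
		return sum;
	}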
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 	}
 
 	if (should_send_signal(p)) {
-		if (!signal_pending(p))
-			fake_signal_wake_up(p);
+		fake_signal_wake_up(p);
+		/*
+		 * fake_signal_wake_up() goes through p's scheduler
+		 * lock and guarantees that TASK_STOPPED/TRACED ->
+		 * TASK_RUNNING transition can't race with task state
+		 * testing in try_to_freeze_tasks().
+		 */
 	} else if (sig_only) {
 		return false;
 	} else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d0..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED		0x01
+#define FLAGS_CLOCKRT		0x02
+#define FLAGS_HAS_TIMEOUT	0x04
+
+/*
  * Priority Inheritance state:
  */
 struct futex_pi_state {
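Note: hoisting the FLAGS_* bits to the top of the file lets every futex operation carry its options in a single word instead of separate fshared/clockrt arguments. A plausible sketch of how an entry point might derive that word from the userspace op code (FUTEX_PRIVATE_FLAG and FUTEX_CLOCK_REALTIME are the real uapi bits; the helper name is invented):

	#include <linux/futex.h>

	/* Illustrative helper only: map the syscall op word onto the
	 * kernel-internal flags defined above. */
	static unsigned int futex_op_to_flags(int op)
	{
		unsigned int flags = 0;

		if (!(op & FUTEX_PRIVATE_FLAG))
			flags |= FLAGS_SHARED;	/* key may span processes */
		if (op & FUTEX_CLOCK_REALTIME)
			flags |= FLAGS_CLOCKRT;	/* time against CLOCK_REALTIME */
		return flags;
	}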
@@ -123,6 +131,12 @@ struct futex_q {
 	u32 bitset;
 };
 
+static const struct futex_q futex_q_init = {
+	/* list gets initialized in queue_me()*/
+	.key = FUTEX_KEY_INIT,
+	.bitset = FUTEX_BITSET_MATCH_ANY
+};
+
 /*
  * Hash buckets are shared by all the futex_keys that hash to the same
 * location. Each key may have multiple futex_q structures, one for each task
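Note: futex_q_init is used later in this diff as `struct futex_q q = futex_q_init;`, deleting the scattered `q.pi_state = NULL; q.rt_waiter = NULL; ...` sequences. A minimal standalone sketch of the template-initializer idiom, with invented names:

	/* Invented example type mirroring the futex_q_init pattern. */
	struct request {
		void *state;	/* no state attached yet */
		int retries;	/* default retry budget */
	};

	static const struct request request_init = {
		.state = NULL,
		.retries = 3,
	};

	static void handle_one(void)
	{
		/*
		 * One struct copy applies every documented default; a new
		 * field added to struct request cannot be left accidentally
		 * uninitialized at any call site.
		 */
		struct request req = request_init;
		(void)req;
	}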
@@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *page_head;
 	int err;
 
 	/*
@@ -251,11 +265,46 @@ again:
 	if (err < 0)
 		return err;
 
-	page = compound_head(page);
-	lock_page(page);
-	if (!page->mapping) {
-		unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	page_head = page;
+	if (unlikely(PageTail(page))) {
 		put_page(page);
+		/* serialize against __split_huge_page_splitting() */
+		local_irq_disable();
+		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+			page_head = compound_head(page);
+			/*
+			 * page_head is valid pointer but we must pin
+			 * it before taking the PG_lock and/or
+			 * PG_compound_lock. The moment we re-enable
+			 * irqs __split_huge_page_splitting() can
+			 * return and the head page can be freed from
+			 * under us. We can't take the PG_lock and/or
+			 * PG_compound_lock on a page that could be
+			 * freed from under us.
+			 */
+			if (page != page_head) {
+				get_page(page_head);
+				put_page(page);
+			}
+			local_irq_enable();
+		} else {
+			local_irq_enable();
+			goto again;
+		}
+	}
+#else
+	page_head = compound_head(page);
+	if (page != page_head) {
+		get_page(page_head);
+		put_page(page);
+	}
+#endif
+
+	lock_page(page_head);
+	if (!page_head->mapping) {
+		unlock_page(page_head);
+		put_page(page_head);
 		goto again;
 	}
 
@@ -266,25 +315,24 @@ again:
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
 	 */
-	if (PageAnon(page)) {
+	if (PageAnon(page_head)) {
 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 	} else {
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-		key->shared.inode = page->mapping->host;
-		key->shared.pgoff = page->index;
+		key->shared.inode = page_head->mapping->host;
+		key->shared.pgoff = page_head->index;
 	}
 
 	get_futex_key_refs(key);
 
-	unlock_page(page);
-	put_page(page);
+	unlock_page(page_head);
+	put_page(page_head);
 	return 0;
 }
 
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
 {
 	drop_futex_key_refs(key);
 }
@@ -778,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
 	/*
-	 * This happens when we have stolen the lock and the original
-	 * pending owner did not enqueue itself back on the rt_mutex.
-	 * Thats not a tragedy. We know that way, that a lock waiter
-	 * is on the fly. We make the futex_q waiter the pending owner.
+	 * It is possible that the next waiter (the one that brought
+	 * this owner to the kernel) timed out and is no longer
+	 * waiting on the lock.
 	 */
 	if (!new_owner)
 		new_owner = this->task;
@@ -870,7 +917,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 /*
  * Wake up waiters matching bitset queued on this futex (uaddr).
  */
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -881,7 +929,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -907,7 +955,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 out:
 	return ret;
 }
@@ -917,7 +965,7 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +975,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	int ret, op_ret;
 
 retry:
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -962,11 +1010,11 @@ retry_private:
 		if (ret)
 			goto out_put_keys;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &key2);
-		put_futex_key(fshared, &key1);
+		put_futex_key(&key2);
+		put_futex_key(&key1);
 		goto retry;
 	}
 
@@ -996,9 +1044,9 @@ retry_private:
 
 	double_unlock_hb(hb1, hb2);
 out_put_keys:
-	put_futex_key(fshared, &key2);
+	put_futex_key(&key2);
 out_put_key1:
-	put_futex_key(fshared, &key1);
+	put_futex_key(&key1);
 out:
 	return ret;
 }
@@ -1133,13 +1181,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 /**
  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
  * @uaddr1:	source futex user address
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
  * @uaddr2:	target futex user address
  * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
  * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
  * @cmpval:	@uaddr1 expected value (or %NULL)
  * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
  *		pi futex (pi to pi requeue is not supported)
  *
  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
  * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1196,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 * >=0 - on success, the number of tasks requeued or woken
 * <0 - on error
 */
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-			 int nr_wake, int nr_requeue, u32 *cmpval,
-			 int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
+			 u32 *cmpval, int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1239,10 @@ retry:
 		pi_state = NULL;
 	}
 
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -1216,11 +1264,11 @@ retry_private:
 			if (ret)
 				goto out_put_keys;
 
-			if (!fshared)
+			if (!(flags & FLAGS_SHARED))
 				goto retry_private;
 
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			goto retry;
 		}
 		if (curval != *cmpval) {
@@ -1260,8 +1308,8 @@ retry_private:
 			break;
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
@@ -1269,8 +1317,8 @@ retry_private:
 		case -EAGAIN:
 			/* The owner was exiting, try again. */
 			double_unlock_hb(hb1, hb2);
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			cond_resched();
 			goto retry;
 		default:
@@ -1352,9 +1400,9 @@ out_unlock:
 	drop_futex_key_refs(&key1);
 
 out_put_keys:
-	put_futex_key(fshared, &key2);
+	put_futex_key(&key2);
 out_put_key1:
-	put_futex_key(fshared, &key1);
+	put_futex_key(&key1);
 out:
 	if (pi_state != NULL)
 		free_pi_state(pi_state);
@@ -1494,7 +1542,7 @@ static void unqueue_me_pi(struct futex_q *q)
  * private futexes.
  */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner, int fshared)
+				struct task_struct *newowner)
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1635,11 @@ handle_fault:
 	goto retry;
 }
 
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED		0x01
-#define FLAGS_CLOCKRT		0x02
-#define FLAGS_HAS_TIMEOUT	0x04
-
 static long futex_wait_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
  * @uaddr:	user address of the futex
- * @fshared:	whether the futex is shared (1) or not (0)
  * @q:		futex_q (contains pi_state and access to the rt_mutex)
  * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
  *
@@ -1613,8 +1652,7 @@ static long futex_wait_restart(struct restart_block *restart);
 * 0 - success, lock not taken
 * <0 - on error (-EFAULT)
 */
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
-		       int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
 	struct task_struct *owner;
 	int ret = 0;
@@ -1625,7 +1663,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
 		 * did a lock-steal - fix up the PI-state in that case:
 		 */
 		if (q->pi_state->owner != current)
-			ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+			ret = fixup_pi_state_owner(uaddr, q, current);
 		goto out;
 	}
 
@@ -1652,7 +1690,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
 		 * lock. Fix the state up.
 		 */
 		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-		ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+		ret = fixup_pi_state_owner(uaddr, q, owner);
 		goto out;
 	}
 
@@ -1715,7 +1753,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:	the futex userspace address
 * @val:	the expected value
- * @fshared:	whether the futex is shared (1) or not (0)
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @q:		the associated futex_q
 * @hb:		storage for hash_bucket pointer to be returned to caller
 *
@@ -1728,7 +1766,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 * 0 - uaddr contains val and hb has been locked
 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
 */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			     struct futex_q *q, struct futex_hash_bucket **hb)
 {
 	u32 uval;
@@ -1752,8 +1790,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
 	 * rare, but normal.
 	 */
 retry:
-	q->key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q->key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -1769,10 +1806,10 @@ retry_private:
 		if (ret)
 			goto out;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &q->key);
+		put_futex_key(&q->key);
 		goto retry;
 	}
 
@@ -1783,32 +1820,29 @@ retry_private:
 
 out:
 	if (ret)
-		put_futex_key(fshared, &q->key);
+		put_futex_key(&q->key);
 	return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+		      ktime_t *abs_time, u32 bitset)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
-	struct futex_q q;
+	struct futex_q q = futex_q_init;
 	int ret;
 
 	if (!bitset)
 		return -EINVAL;
-
-	q.pi_state = NULL;
 	q.bitset = bitset;
-	q.rt_waiter = NULL;
-	q.requeue_pi_key = NULL;
 
 	if (abs_time) {
 		to = &timeout;
 
-		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+				      CLOCK_REALTIME : CLOCK_MONOTONIC,
+				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
@@ -1819,7 +1853,7 @@ retry:
 	 * Prepare to wait on uaddr. On success, holds hb lock and increments
 	 * q.key refs.
 	 */
-	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
 		goto out;
 
@@ -1852,12 +1886,7 @@ retry:
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
-	if (fshared)
-		restart->futex.flags |= FLAGS_SHARED;
-	if (clockrt)
-		restart->futex.flags |= FLAGS_CLOCKRT;
+	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
 	ret = -ERESTART_RESTARTBLOCK;
 
@@ -1873,7 +1902,6 @@ out: | |||
1873 | static long futex_wait_restart(struct restart_block *restart) | 1902 | static long futex_wait_restart(struct restart_block *restart) |
1874 | { | 1903 | { |
1875 | u32 __user *uaddr = restart->futex.uaddr; | 1904 | u32 __user *uaddr = restart->futex.uaddr; |
1876 | int fshared = 0; | ||
1877 | ktime_t t, *tp = NULL; | 1905 | ktime_t t, *tp = NULL; |
1878 | 1906 | ||
1879 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | 1907 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
@@ -1881,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1881 | tp = &t; | 1909 | tp = &t; |
1882 | } | 1910 | } |
1883 | restart->fn = do_no_restart_syscall; | 1911 | restart->fn = do_no_restart_syscall; |
1884 | if (restart->futex.flags & FLAGS_SHARED) | 1912 | |
1885 | fshared = 1; | 1913 | return (long)futex_wait(uaddr, restart->futex.flags, |
1886 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, | 1914 | restart->futex.val, tp, restart->futex.bitset); |
1887 | restart->futex.bitset, | ||
1888 | restart->futex.flags & FLAGS_CLOCKRT); | ||
1889 | } | 1915 | } |
1890 | 1916 | ||
1891 | 1917 | ||
@@ -1895,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1895 | * if there are waiters then it will block, it does PI, etc. (Due to | 1921 | * if there are waiters then it will block, it does PI, etc. (Due to |
1896 | * races the kernel might see a 0 value of the futex too.) | 1922 | * races the kernel might see a 0 value of the futex too.) |
1897 | */ | 1923 | */ |
1898 | static int futex_lock_pi(u32 __user *uaddr, int fshared, | 1924 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, |
1899 | int detect, ktime_t *time, int trylock) | 1925 | ktime_t *time, int trylock) |
1900 | { | 1926 | { |
1901 | struct hrtimer_sleeper timeout, *to = NULL; | 1927 | struct hrtimer_sleeper timeout, *to = NULL; |
1902 | struct futex_hash_bucket *hb; | 1928 | struct futex_hash_bucket *hb; |
1903 | struct futex_q q; | 1929 | struct futex_q q = futex_q_init; |
1904 | int res, ret; | 1930 | int res, ret; |
1905 | 1931 | ||
1906 | if (refill_pi_state_cache()) | 1932 | if (refill_pi_state_cache()) |
@@ -1914,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1914 | hrtimer_set_expires(&to->timer, *time); | 1940 | hrtimer_set_expires(&to->timer, *time); |
1915 | } | 1941 | } |
1916 | 1942 | ||
1917 | q.pi_state = NULL; | ||
1918 | q.rt_waiter = NULL; | ||
1919 | q.requeue_pi_key = NULL; | ||
1920 | retry: | 1943 | retry: |
1921 | q.key = FUTEX_KEY_INIT; | 1944 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); |
1922 | ret = get_futex_key(uaddr, fshared, &q.key); | ||
1923 | if (unlikely(ret != 0)) | 1945 | if (unlikely(ret != 0)) |
1924 | goto out; | 1946 | goto out; |
1925 | 1947 | ||
@@ -1941,7 +1963,7 @@ retry_private: | |||
1941 | * exit to complete. | 1963 | * exit to complete. |
1942 | */ | 1964 | */ |
1943 | queue_unlock(&q, hb); | 1965 | queue_unlock(&q, hb); |
1944 | put_futex_key(fshared, &q.key); | 1966 | put_futex_key(&q.key); |
1945 | cond_resched(); | 1967 | cond_resched(); |
1946 | goto retry; | 1968 | goto retry; |
1947 | default: | 1969 | default: |
@@ -1971,7 +1993,7 @@ retry_private: | |||
1971 | * Fixup the pi_state owner and possibly acquire the lock if we | 1993 | * Fixup the pi_state owner and possibly acquire the lock if we |
1972 | * haven't already. | 1994 | * haven't already. |
1973 | */ | 1995 | */ |
1974 | res = fixup_owner(uaddr, fshared, &q, !ret); | 1996 | res = fixup_owner(uaddr, &q, !ret); |
1975 | /* | 1997 | /* |
1976 | * If fixup_owner() returned an error, proprogate that. If it acquired | 1998 | * If fixup_owner() returned an error, proprogate that. If it acquired |
1977 | * the lock, clear our -ETIMEDOUT or -EINTR. | 1999 | * the lock, clear our -ETIMEDOUT or -EINTR. |
@@ -1995,7 +2017,7 @@ out_unlock_put_key: | |||
1995 | queue_unlock(&q, hb); | 2017 | queue_unlock(&q, hb); |
1996 | 2018 | ||
1997 | out_put_key: | 2019 | out_put_key: |
1998 | put_futex_key(fshared, &q.key); | 2020 | put_futex_key(&q.key); |
1999 | out: | 2021 | out: |
2000 | if (to) | 2022 | if (to) |
2001 | destroy_hrtimer_on_stack(&to->timer); | 2023 | destroy_hrtimer_on_stack(&to->timer); |
@@ -2008,10 +2030,10 @@ uaddr_faulted: | |||
2008 | if (ret) | 2030 | if (ret) |
2009 | goto out_put_key; | 2031 | goto out_put_key; |
2010 | 2032 | ||
2011 | if (!fshared) | 2033 | if (!(flags & FLAGS_SHARED)) |
2012 | goto retry_private; | 2034 | goto retry_private; |
2013 | 2035 | ||
2014 | put_futex_key(fshared, &q.key); | 2036 | put_futex_key(&q.key); |
2015 | goto retry; | 2037 | goto retry; |
2016 | } | 2038 | } |
2017 | 2039 | ||
@@ -2020,7 +2042,7 @@ uaddr_faulted: | |||
2020 | * This is the in-kernel slowpath: we look up the PI state (if any), | 2042 | * This is the in-kernel slowpath: we look up the PI state (if any), |
2021 | * and do the rt-mutex unlock. | 2043 | * and do the rt-mutex unlock. |
2022 | */ | 2044 | */ |
2023 | static int futex_unlock_pi(u32 __user *uaddr, int fshared) | 2045 | static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) |
2024 | { | 2046 | { |
2025 | struct futex_hash_bucket *hb; | 2047 | struct futex_hash_bucket *hb; |
2026 | struct futex_q *this, *next; | 2048 | struct futex_q *this, *next; |
@@ -2038,7 +2060,7 @@ retry: | |||
2038 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) | 2060 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) |
2039 | return -EPERM; | 2061 | return -EPERM; |
2040 | 2062 | ||
2041 | ret = get_futex_key(uaddr, fshared, &key); | 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
2042 | if (unlikely(ret != 0)) | 2064 | if (unlikely(ret != 0)) |
2043 | goto out; | 2065 | goto out; |
2044 | 2066 | ||
@@ -2093,14 +2115,14 @@ retry: | |||
2093 | 2115 | ||
2094 | out_unlock: | 2116 | out_unlock: |
2095 | spin_unlock(&hb->lock); | 2117 | spin_unlock(&hb->lock); |
2096 | put_futex_key(fshared, &key); | 2118 | put_futex_key(&key); |
2097 | 2119 | ||
2098 | out: | 2120 | out: |
2099 | return ret; | 2121 | return ret; |
2100 | 2122 | ||
2101 | pi_faulted: | 2123 | pi_faulted: |
2102 | spin_unlock(&hb->lock); | 2124 | spin_unlock(&hb->lock); |
2103 | put_futex_key(fshared, &key); | 2125 | put_futex_key(&key); |
2104 | 2126 | ||
2105 | ret = fault_in_user_writeable(uaddr); | 2127 | ret = fault_in_user_writeable(uaddr); |
2106 | if (!ret) | 2128 | if (!ret) |
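The pi_faulted path above is the standard futex fault-recovery idiom: drop the hash-bucket lock and the key reference, make the user page writable, then restart the whole operation. A rough sketch of what fault_in_user_writeable() is assumed to do in this kernel generation (details approximate):

    /* Rough sketch: force the page at uaddr to be resident and writable
     * so the atomic user-space access can succeed on the retry. */
    static int fault_in_user_writeable(u32 __user *uaddr)
    {
            struct mm_struct *mm = current->mm;
            int ret;

            down_read(&mm->mmap_sem);
            ret = get_user_pages(current, mm, (unsigned long)uaddr,
                                 1, 1, 0, NULL, NULL);  /* 1 page, write, !force */
            up_read(&mm->mmap_sem);

            return ret < 0 ? ret : 0;
    }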
@@ -2160,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2160 | /** | 2182 | /** |
2161 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | 2183 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 |
2162 | * @uaddr: the futex we initially wait on (non-pi) | 2184 | * @uaddr: the futex we initially wait on (non-pi) |
2163 | * @fshared: whether the futexes are shared (1) or not (0). They must be | 2185 | * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be |
2164 | * the same type, no requeueing from private to shared, etc. | 2186 | * the same type, no requeueing from private to shared, etc. |
2165 | * @val: the expected value of uaddr | 2187 | * @val: the expected value of uaddr |
2166 | * @abs_time: absolute timeout | 2188 | * @abs_time: absolute timeout |
@@ -2198,16 +2220,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2198 | * 0 - On success | 2220 | * 0 - On success |
2199 | * <0 - On error | 2221 | * <0 - On error |
2200 | */ | 2222 | */ |
2201 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | 2223 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
2202 | u32 val, ktime_t *abs_time, u32 bitset, | 2224 | u32 val, ktime_t *abs_time, u32 bitset, |
2203 | int clockrt, u32 __user *uaddr2) | 2225 | u32 __user *uaddr2) |
2204 | { | 2226 | { |
2205 | struct hrtimer_sleeper timeout, *to = NULL; | 2227 | struct hrtimer_sleeper timeout, *to = NULL; |
2206 | struct rt_mutex_waiter rt_waiter; | 2228 | struct rt_mutex_waiter rt_waiter; |
2207 | struct rt_mutex *pi_mutex = NULL; | 2229 | struct rt_mutex *pi_mutex = NULL; |
2208 | struct futex_hash_bucket *hb; | 2230 | struct futex_hash_bucket *hb; |
2209 | union futex_key key2; | 2231 | union futex_key key2 = FUTEX_KEY_INIT; |
2210 | struct futex_q q; | 2232 | struct futex_q q = futex_q_init; |
2211 | int res, ret; | 2233 | int res, ret; |
2212 | 2234 | ||
2213 | if (!bitset) | 2235 | if (!bitset) |
@@ -2215,8 +2237,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2215 | 2237 | ||
2216 | if (abs_time) { | 2238 | if (abs_time) { |
2217 | to = &timeout; | 2239 | to = &timeout; |
2218 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 2240 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
2219 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 2241 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
2242 | HRTIMER_MODE_ABS); | ||
2220 | hrtimer_init_sleeper(to, current); | 2243 | hrtimer_init_sleeper(to, current); |
2221 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 2244 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
2222 | current->timer_slack_ns); | 2245 | current->timer_slack_ns); |
@@ -2229,12 +2252,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2229 | debug_rt_mutex_init_waiter(&rt_waiter); | 2252 | debug_rt_mutex_init_waiter(&rt_waiter); |
2230 | rt_waiter.task = NULL; | 2253 | rt_waiter.task = NULL; |
2231 | 2254 | ||
2232 | key2 = FUTEX_KEY_INIT; | 2255 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
2233 | ret = get_futex_key(uaddr2, fshared, &key2); | ||
2234 | if (unlikely(ret != 0)) | 2256 | if (unlikely(ret != 0)) |
2235 | goto out; | 2257 | goto out; |
2236 | 2258 | ||
2237 | q.pi_state = NULL; | ||
2238 | q.bitset = bitset; | 2259 | q.bitset = bitset; |
2239 | q.rt_waiter = &rt_waiter; | 2260 | q.rt_waiter = &rt_waiter; |
2240 | q.requeue_pi_key = &key2; | 2261 | q.requeue_pi_key = &key2; |
@@ -2243,7 +2264,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2243 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | 2264 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref |
2244 | * count. | 2265 | * count. |
2245 | */ | 2266 | */ |
2246 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2267 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); |
2247 | if (ret) | 2268 | if (ret) |
2248 | goto out_key2; | 2269 | goto out_key2; |
2249 | 2270 | ||
@@ -2273,8 +2294,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2273 | */ | 2294 | */ |
2274 | if (q.pi_state && (q.pi_state->owner != current)) { | 2295 | if (q.pi_state && (q.pi_state->owner != current)) { |
2275 | spin_lock(q.lock_ptr); | 2296 | spin_lock(q.lock_ptr); |
2276 | ret = fixup_pi_state_owner(uaddr2, &q, current, | 2297 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
2277 | fshared); | ||
2278 | spin_unlock(q.lock_ptr); | 2298 | spin_unlock(q.lock_ptr); |
2279 | } | 2299 | } |
2280 | } else { | 2300 | } else { |
@@ -2293,7 +2313,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2293 | * Fixup the pi_state owner and possibly acquire the lock if we | 2313 | * Fixup the pi_state owner and possibly acquire the lock if we |
2294 | * haven't already. | 2314 | * haven't already. |
2295 | */ | 2315 | */ |
2296 | res = fixup_owner(uaddr2, fshared, &q, !ret); | 2316 | res = fixup_owner(uaddr2, &q, !ret); |
2297 | /* | 2317 | /* |
2298 | * If fixup_owner() returned an error, propagate that. If it | 2318 | * If fixup_owner() returned an error, propagate that. If it |
2299 | * acquired the lock, clear -ETIMEDOUT or -EINTR. | 2319 | * acquired the lock, clear -ETIMEDOUT or -EINTR. |
@@ -2324,9 +2344,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2324 | } | 2344 | } |
2325 | 2345 | ||
2326 | out_put_keys: | 2346 | out_put_keys: |
2327 | put_futex_key(fshared, &q.key); | 2347 | put_futex_key(&q.key); |
2328 | out_key2: | 2348 | out_key2: |
2329 | put_futex_key(fshared, &key2); | 2349 | put_futex_key(&key2); |
2330 | 2350 | ||
2331 | out: | 2351 | out: |
2332 | if (to) { | 2352 | if (to) { |
@@ -2551,58 +2571,57 @@ void exit_robust_list(struct task_struct *curr) | |||
2551 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2571 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
2552 | u32 __user *uaddr2, u32 val2, u32 val3) | 2572 | u32 __user *uaddr2, u32 val2, u32 val3) |
2553 | { | 2573 | { |
2554 | int clockrt, ret = -ENOSYS; | 2574 | int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; |
2555 | int cmd = op & FUTEX_CMD_MASK; | 2575 | unsigned int flags = 0; |
2556 | int fshared = 0; | ||
2557 | 2576 | ||
2558 | if (!(op & FUTEX_PRIVATE_FLAG)) | 2577 | if (!(op & FUTEX_PRIVATE_FLAG)) |
2559 | fshared = 1; | 2578 | flags |= FLAGS_SHARED; |
2560 | 2579 | ||
2561 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2580 | if (op & FUTEX_CLOCK_REALTIME) { |
2562 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) | 2581 | flags |= FLAGS_CLOCKRT; |
2563 | return -ENOSYS; | 2582 | if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
2583 | return -ENOSYS; | ||
2584 | } | ||
2564 | 2585 | ||
2565 | switch (cmd) { | 2586 | switch (cmd) { |
2566 | case FUTEX_WAIT: | 2587 | case FUTEX_WAIT: |
2567 | val3 = FUTEX_BITSET_MATCH_ANY; | 2588 | val3 = FUTEX_BITSET_MATCH_ANY; |
2568 | case FUTEX_WAIT_BITSET: | 2589 | case FUTEX_WAIT_BITSET: |
2569 | ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); | 2590 | ret = futex_wait(uaddr, flags, val, timeout, val3); |
2570 | break; | 2591 | break; |
2571 | case FUTEX_WAKE: | 2592 | case FUTEX_WAKE: |
2572 | val3 = FUTEX_BITSET_MATCH_ANY; | 2593 | val3 = FUTEX_BITSET_MATCH_ANY; |
2573 | case FUTEX_WAKE_BITSET: | 2594 | case FUTEX_WAKE_BITSET: |
2574 | ret = futex_wake(uaddr, fshared, val, val3); | 2595 | ret = futex_wake(uaddr, flags, val, val3); |
2575 | break; | 2596 | break; |
2576 | case FUTEX_REQUEUE: | 2597 | case FUTEX_REQUEUE: |
2577 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); | 2598 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); |
2578 | break; | 2599 | break; |
2579 | case FUTEX_CMP_REQUEUE: | 2600 | case FUTEX_CMP_REQUEUE: |
2580 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2601 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); |
2581 | 0); | ||
2582 | break; | 2602 | break; |
2583 | case FUTEX_WAKE_OP: | 2603 | case FUTEX_WAKE_OP: |
2584 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2604 | ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
2585 | break; | 2605 | break; |
2586 | case FUTEX_LOCK_PI: | 2606 | case FUTEX_LOCK_PI: |
2587 | if (futex_cmpxchg_enabled) | 2607 | if (futex_cmpxchg_enabled) |
2588 | ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); | 2608 | ret = futex_lock_pi(uaddr, flags, val, timeout, 0); |
2589 | break; | 2609 | break; |
2590 | case FUTEX_UNLOCK_PI: | 2610 | case FUTEX_UNLOCK_PI: |
2591 | if (futex_cmpxchg_enabled) | 2611 | if (futex_cmpxchg_enabled) |
2592 | ret = futex_unlock_pi(uaddr, fshared); | 2612 | ret = futex_unlock_pi(uaddr, flags); |
2593 | break; | 2613 | break; |
2594 | case FUTEX_TRYLOCK_PI: | 2614 | case FUTEX_TRYLOCK_PI: |
2595 | if (futex_cmpxchg_enabled) | 2615 | if (futex_cmpxchg_enabled) |
2596 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2616 | ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); |
2597 | break; | 2617 | break; |
2598 | case FUTEX_WAIT_REQUEUE_PI: | 2618 | case FUTEX_WAIT_REQUEUE_PI: |
2599 | val3 = FUTEX_BITSET_MATCH_ANY; | 2619 | val3 = FUTEX_BITSET_MATCH_ANY; |
2600 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | 2620 | ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
2601 | clockrt, uaddr2); | 2621 | uaddr2); |
2602 | break; | 2622 | break; |
2603 | case FUTEX_CMP_REQUEUE_PI: | 2623 | case FUTEX_CMP_REQUEUE_PI: |
2604 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2624 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
2605 | 1); | ||
2606 | break; | 2625 | break; |
2607 | default: | 2626 | default: |
2608 | ret = -ENOSYS; | 2627 | ret = -ENOSYS; |
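This hunk is the heart of the futex series: the separate fshared and clockrt ints collapse into a single unsigned int bitmask that do_futex() threads through every helper above. The flag values are assumed to be plain bits defined near the top of futex.c, roughly:

    /* Assumed flag definitions (not shown in this hunk): */
    #define FLAGS_SHARED            0x01    /* futex may be shared across processes */
    #define FLAGS_CLOCKRT           0x02    /* timeout measured on CLOCK_REALTIME */
    #define FLAGS_HAS_TIMEOUT       0x04

Besides shaving a word off each call signature, the bitmask means a future per-call option only touches do_futex() and the helper that consumes it, not every prototype in between.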
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6cf..0c8d7c048615 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void) | |||
497 | */ | 497 | */ |
498 | static inline int hrtimer_hres_active(void) | 498 | static inline int hrtimer_hres_active(void) |
499 | { | 499 | { |
500 | return __get_cpu_var(hrtimer_bases).hres_active; | 500 | return __this_cpu_read(hrtimer_bases.hres_active); |
501 | } | 501 | } |
502 | 502 | ||
503 | /* | 503 | /* |
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
516 | 516 | ||
517 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 517 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
518 | struct hrtimer *timer; | 518 | struct hrtimer *timer; |
519 | struct timerqueue_node *next; | ||
519 | 520 | ||
520 | if (!base->first) | 521 | next = timerqueue_getnext(&base->active); |
522 | if (!next) | ||
521 | continue; | 523 | continue; |
522 | timer = rb_entry(base->first, struct hrtimer, node); | 524 | timer = container_of(next, struct hrtimer, node); |
525 | |||
523 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 526 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
524 | /* | 527 | /* |
525 | * clock_was_set() has changed base->offset so the | 528 | * clock_was_set() has changed base->offset so the |
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); | |||
840 | static int enqueue_hrtimer(struct hrtimer *timer, | 843 | static int enqueue_hrtimer(struct hrtimer *timer, |
841 | struct hrtimer_clock_base *base) | 844 | struct hrtimer_clock_base *base) |
842 | { | 845 | { |
843 | struct rb_node **link = &base->active.rb_node; | ||
844 | struct rb_node *parent = NULL; | ||
845 | struct hrtimer *entry; | ||
846 | int leftmost = 1; | ||
847 | |||
848 | debug_activate(timer); | 846 | debug_activate(timer); |
849 | 847 | ||
850 | /* | 848 | timerqueue_add(&base->active, &timer->node); |
851 | * Find the right place in the rbtree: | ||
852 | */ | ||
853 | while (*link) { | ||
854 | parent = *link; | ||
855 | entry = rb_entry(parent, struct hrtimer, node); | ||
856 | /* | ||
857 | * We don't care about collisions. Nodes with | ||
858 | * the same expiry time stay together. | ||
859 | */ | ||
860 | if (hrtimer_get_expires_tv64(timer) < | ||
861 | hrtimer_get_expires_tv64(entry)) { | ||
862 | link = &(*link)->rb_left; | ||
863 | } else { | ||
864 | link = &(*link)->rb_right; | ||
865 | leftmost = 0; | ||
866 | } | ||
867 | } | ||
868 | |||
869 | /* | ||
870 | * Insert the timer to the rbtree and check whether it | ||
871 | * replaces the first pending timer | ||
872 | */ | ||
873 | if (leftmost) | ||
874 | base->first = &timer->node; | ||
875 | 849 | ||
876 | rb_link_node(&timer->node, parent, link); | ||
877 | rb_insert_color(&timer->node, &base->active); | ||
878 | /* | 850 | /* |
879 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | 851 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the |
880 | * state of a possibly running callback. | 852 | * state of a possibly running callback. |
881 | */ | 853 | */ |
882 | timer->state |= HRTIMER_STATE_ENQUEUED; | 854 | timer->state |= HRTIMER_STATE_ENQUEUED; |
883 | 855 | ||
884 | return leftmost; | 856 | return (&timer->node == base->active.next); |
885 | } | 857 | } |
886 | 858 | ||
887 | /* | 859 | /* |
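The forty-odd lines of open-coded rbtree insertion go away in favor of the shared timerqueue library, which caches the leftmost (earliest-expiring) node. A sketch of the contract enqueue_hrtimer() now relies on (the real definitions live in linux/timerqueue.h and lib/timerqueue.c):

    struct timerqueue_head {
            struct rb_root head;
            struct timerqueue_node *next;   /* cached earliest-expiring node */
    };

    static inline struct timerqueue_node *
    timerqueue_getnext(struct timerqueue_head *head)
    {
            return head->next;              /* O(1), no rb_first() walk */
    }

The new return value, (&timer->node == base->active.next), is exactly the old leftmost flag: true when the fresh timer became the next expiry, i.e. when the clock event device may need reprogramming.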
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
901 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 873 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
902 | goto out; | 874 | goto out; |
903 | 875 | ||
904 | /* | 876 | if (&timer->node == timerqueue_getnext(&base->active)) { |
905 | * Remove the timer from the rbtree and replace the first | ||
906 | * entry pointer if necessary. | ||
907 | */ | ||
908 | if (base->first == &timer->node) { | ||
909 | base->first = rb_next(&timer->node); | ||
910 | #ifdef CONFIG_HIGH_RES_TIMERS | 877 | #ifdef CONFIG_HIGH_RES_TIMERS |
911 | /* Reprogram the clock event device, if enabled */ | 878 | /* Reprogram the clock event device, if enabled */ |
912 | if (reprogram && hrtimer_hres_active()) { | 879 | if (reprogram && hrtimer_hres_active()) { |
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
919 | } | 886 | } |
920 | #endif | 887 | #endif |
921 | } | 888 | } |
922 | rb_erase(&timer->node, &base->active); | 889 | timerqueue_del(&base->active, &timer->node); |
923 | out: | 890 | out: |
924 | timer->state = newstate; | 891 | timer->state = newstate; |
925 | } | 892 | } |
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void) | |||
1128 | if (!hrtimer_hres_active()) { | 1095 | if (!hrtimer_hres_active()) { |
1129 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 1096 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
1130 | struct hrtimer *timer; | 1097 | struct hrtimer *timer; |
1098 | struct timerqueue_node *next; | ||
1131 | 1099 | ||
1132 | if (!base->first) | 1100 | next = timerqueue_getnext(&base->active); |
1101 | if (!next) | ||
1133 | continue; | 1102 | continue; |
1134 | 1103 | ||
1135 | timer = rb_entry(base->first, struct hrtimer, node); | 1104 | timer = container_of(next, struct hrtimer, node); |
1136 | delta.tv64 = hrtimer_get_expires_tv64(timer); | 1105 | delta.tv64 = hrtimer_get_expires_tv64(timer); |
1137 | delta = ktime_sub(delta, base->get_time()); | 1106 | delta = ktime_sub(delta, base->get_time()); |
1138 | if (delta.tv64 < mindelta.tv64) | 1107 | if (delta.tv64 < mindelta.tv64) |
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1162 | 1131 | ||
1163 | timer->base = &cpu_base->clock_base[clock_id]; | 1132 | timer->base = &cpu_base->clock_base[clock_id]; |
1164 | hrtimer_init_timer_hres(timer); | 1133 | hrtimer_init_timer_hres(timer); |
1134 | timerqueue_init(&timer->node); | ||
1165 | 1135 | ||
1166 | #ifdef CONFIG_TIMER_STATS | 1136 | #ifdef CONFIG_TIMER_STATS |
1167 | timer->start_site = NULL; | 1137 | timer->start_site = NULL; |
@@ -1278,14 +1248,14 @@ retry: | |||
1278 | 1248 | ||
1279 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1249 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1280 | ktime_t basenow; | 1250 | ktime_t basenow; |
1281 | struct rb_node *node; | 1251 | struct timerqueue_node *node; |
1282 | 1252 | ||
1283 | basenow = ktime_add(now, base->offset); | 1253 | basenow = ktime_add(now, base->offset); |
1284 | 1254 | ||
1285 | while ((node = base->first)) { | 1255 | while ((node = timerqueue_getnext(&base->active))) { |
1286 | struct hrtimer *timer; | 1256 | struct hrtimer *timer; |
1287 | 1257 | ||
1288 | timer = rb_entry(node, struct hrtimer, node); | 1258 | timer = container_of(node, struct hrtimer, node); |
1289 | 1259 | ||
1290 | /* | 1260 | /* |
1291 | * The immediate goal for using the softexpires is | 1261 | * The immediate goal for using the softexpires is |
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void) | |||
1441 | */ | 1411 | */ |
1442 | void hrtimer_run_queues(void) | 1412 | void hrtimer_run_queues(void) |
1443 | { | 1413 | { |
1444 | struct rb_node *node; | 1414 | struct timerqueue_node *node; |
1445 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1415 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1446 | struct hrtimer_clock_base *base; | 1416 | struct hrtimer_clock_base *base; |
1447 | int index, gettime = 1; | 1417 | int index, gettime = 1; |
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void) | |||
1451 | 1421 | ||
1452 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { | 1422 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
1453 | base = &cpu_base->clock_base[index]; | 1423 | base = &cpu_base->clock_base[index]; |
1454 | 1424 | if (!timerqueue_getnext(&base->active)) | |
1455 | if (!base->first) | ||
1456 | continue; | 1425 | continue; |
1457 | 1426 | ||
1458 | if (gettime) { | 1427 | if (gettime) { |
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void) | |||
1462 | 1431 | ||
1463 | raw_spin_lock(&cpu_base->lock); | 1432 | raw_spin_lock(&cpu_base->lock); |
1464 | 1433 | ||
1465 | while ((node = base->first)) { | 1434 | while ((node = timerqueue_getnext(&base->active))) { |
1466 | struct hrtimer *timer; | 1435 | struct hrtimer *timer; |
1467 | 1436 | ||
1468 | timer = rb_entry(node, struct hrtimer, node); | 1437 | timer = container_of(node, struct hrtimer, node); |
1469 | if (base->softirq_time.tv64 <= | 1438 | if (base->softirq_time.tv64 <= |
1470 | hrtimer_get_expires_tv64(timer)) | 1439 | hrtimer_get_expires_tv64(timer)) |
1471 | break; | 1440 | break; |
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1630 | 1599 | ||
1631 | raw_spin_lock_init(&cpu_base->lock); | 1600 | raw_spin_lock_init(&cpu_base->lock); |
1632 | 1601 | ||
1633 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1602 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1634 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1603 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1604 | timerqueue_init_head(&cpu_base->clock_base[i].active); | ||
1605 | } | ||
1635 | 1606 | ||
1636 | hrtimer_init_hres(cpu_base); | 1607 | hrtimer_init_hres(cpu_base); |
1637 | } | 1608 | } |
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
1642 | struct hrtimer_clock_base *new_base) | 1613 | struct hrtimer_clock_base *new_base) |
1643 | { | 1614 | { |
1644 | struct hrtimer *timer; | 1615 | struct hrtimer *timer; |
1645 | struct rb_node *node; | 1616 | struct timerqueue_node *node; |
1646 | 1617 | ||
1647 | while ((node = rb_first(&old_base->active))) { | 1618 | while ((node = timerqueue_getnext(&old_base->active))) { |
1648 | timer = rb_entry(node, struct hrtimer, node); | 1619 | timer = container_of(node, struct hrtimer, node); |
1649 | BUG_ON(hrtimer_callback_running(timer)); | 1620 | BUG_ON(hrtimer_callback_running(timer)); |
1650 | debug_deactivate(timer); | 1621 | debug_deactivate(timer); |
1651 | 1622 | ||
@@ -1774,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
1774 | } | 1745 | } |
1775 | 1746 | ||
1776 | /* | 1747 | /* |
1777 | * A NULL parameter means "inifinte" | 1748 | * A NULL parameter means "infinite" |
1778 | */ | 1749 | */ |
1779 | if (!expires) { | 1750 | if (!expires) { |
1780 | schedule(); | 1751 | schedule(); |
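With the conversion complete, every "find and run expired timers" site in hrtimer.c reduces to the same loop shape; a condensed sketch of the pattern as it appears in hrtimer_run_queues() above:

    struct timerqueue_node *node;

    while ((node = timerqueue_getnext(&base->active))) {
            struct hrtimer *timer = container_of(node, struct hrtimer, node);

            /* The queue is expiry-ordered, so the first unexpired
             * timer ends the scan. */
            if (base->softirq_time.tv64 <= hrtimer_get_expires_tv64(timer))
                    break;

            __run_hrtimer(timer, &base->softirq_time); /* dequeues the timer */
    }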
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index e5325825aeb6..086adf25a55e 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void) | |||
641 | 641 | ||
642 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
643 | 643 | ||
644 | perf_pmu_register(&perf_breakpoint); | 644 | perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); |
645 | 645 | ||
646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
647 | 647 | ||
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2e..8e42fec7686d 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -9,9 +9,6 @@ menu "IRQ subsystem" | |||
9 | config GENERIC_HARDIRQS | 9 | config GENERIC_HARDIRQS |
10 | def_bool y | 10 | def_bool y |
11 | 11 | ||
12 | config GENERIC_HARDIRQS_NO__DO_IRQ | ||
13 | def_bool y | ||
14 | |||
15 | # Select this to disable the deprecated stuff | 12 | # Select this to disable the deprecated stuff |
16 | config GENERIC_HARDIRQS_NO_DEPRECATED | 13 | config GENERIC_HARDIRQS_NO_DEPRECATED |
17 | def_bool n | 14 | def_bool n |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb63306..3540a7190122 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | |||
118 | 118 | ||
119 | return retval; | 119 | return retval; |
120 | } | 120 | } |
121 | |||
122 | #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ | ||
123 | |||
124 | #ifdef CONFIG_ENABLE_WARN_DEPRECATED | ||
125 | # warning __do_IRQ is deprecated. Please convert to proper flow handlers | ||
126 | #endif | ||
127 | |||
128 | /** | ||
129 | * __do_IRQ - original all in one highlevel IRQ handler | ||
130 | * @irq: the interrupt number | ||
131 | * | ||
132 | * __do_IRQ handles all normal device IRQ's (the special | ||
133 | * SMP cross-CPU interrupts have their own specific | ||
134 | * handlers). | ||
135 | * | ||
136 | * This is the original x86 implementation which is used for every | ||
137 | * interrupt type. | ||
138 | */ | ||
139 | unsigned int __do_IRQ(unsigned int irq) | ||
140 | { | ||
141 | struct irq_desc *desc = irq_to_desc(irq); | ||
142 | struct irqaction *action; | ||
143 | unsigned int status; | ||
144 | |||
145 | kstat_incr_irqs_this_cpu(irq, desc); | ||
146 | |||
147 | if (CHECK_IRQ_PER_CPU(desc->status)) { | ||
148 | irqreturn_t action_ret; | ||
149 | |||
150 | /* | ||
151 | * No locking required for CPU-local interrupts: | ||
152 | */ | ||
153 | if (desc->irq_data.chip->ack) | ||
154 | desc->irq_data.chip->ack(irq); | ||
155 | if (likely(!(desc->status & IRQ_DISABLED))) { | ||
156 | action_ret = handle_IRQ_event(irq, desc->action); | ||
157 | if (!noirqdebug) | ||
158 | note_interrupt(irq, desc, action_ret); | ||
159 | } | ||
160 | desc->irq_data.chip->end(irq); | ||
161 | return 1; | ||
162 | } | ||
163 | |||
164 | raw_spin_lock(&desc->lock); | ||
165 | if (desc->irq_data.chip->ack) | ||
166 | desc->irq_data.chip->ack(irq); | ||
167 | /* | ||
168 | * REPLAY is when Linux resends an IRQ that was dropped earlier | ||
169 | * WAITING is used by probe to mark irqs that are being tested | ||
170 | */ | ||
171 | status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); | ||
172 | status |= IRQ_PENDING; /* we _want_ to handle it */ | ||
173 | |||
174 | /* | ||
175 | * If the IRQ is disabled for whatever reason, we cannot | ||
176 | * use the action we have. | ||
177 | */ | ||
178 | action = NULL; | ||
179 | if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { | ||
180 | action = desc->action; | ||
181 | status &= ~IRQ_PENDING; /* we commit to handling */ | ||
182 | status |= IRQ_INPROGRESS; /* we are handling it */ | ||
183 | } | ||
184 | desc->status = status; | ||
185 | |||
186 | /* | ||
187 | * If there is no IRQ handler or it was disabled, exit early. | ||
188 | * Since we set PENDING, if another processor is handling | ||
189 | * a different instance of this same irq, the other processor | ||
190 | * will take care of it. | ||
191 | */ | ||
192 | if (unlikely(!action)) | ||
193 | goto out; | ||
194 | |||
195 | /* | ||
196 | * Edge triggered interrupts need to remember | ||
197 | * pending events. | ||
198 | * This applies to any hw interrupts that allow a second | ||
199 | * instance of the same irq to arrive while we are in do_IRQ | ||
200 | * or in the handler. But the code here only handles the _second_ | ||
201 | * instance of the irq, not the third or fourth. So it is mostly | ||
202 | * useful for irq hardware that does not mask cleanly in an | ||
203 | * SMP environment. | ||
204 | */ | ||
205 | for (;;) { | ||
206 | irqreturn_t action_ret; | ||
207 | |||
208 | raw_spin_unlock(&desc->lock); | ||
209 | |||
210 | action_ret = handle_IRQ_event(irq, action); | ||
211 | if (!noirqdebug) | ||
212 | note_interrupt(irq, desc, action_ret); | ||
213 | |||
214 | raw_spin_lock(&desc->lock); | ||
215 | if (likely(!(desc->status & IRQ_PENDING))) | ||
216 | break; | ||
217 | desc->status &= ~IRQ_PENDING; | ||
218 | } | ||
219 | desc->status &= ~IRQ_INPROGRESS; | ||
220 | |||
221 | out: | ||
222 | /* | ||
223 | * The ->end() handler has to deal with interrupts which got | ||
224 | * disabled while the handler was running. | ||
225 | */ | ||
226 | desc->irq_data.chip->end(irq); | ||
227 | raw_spin_unlock(&desc->lock); | ||
228 | |||
229 | return 1; | ||
230 | } | ||
231 | #endif | ||
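With no architecture left selecting it, the deprecated __do_IRQ() catch-all (and the CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ opt-out dropped from irq/Kconfig above) can finally go. A minimal sketch of the replacement path using the generic flow-handler API of this era; the chip name is hypothetical:

    /* Platform setup picks a flow handler matching the line's trigger
     * type instead of funnelling everything through __do_IRQ(). */
    set_irq_chip_and_handler(irq, &my_pic_chip, handle_level_irq);

    /* The arch interrupt entry then simply dispatches: */
    generic_handle_irq(irq);        /* runs handle_level_irq() for this irq */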
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..282f20230e67 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } | |||
72 | 72 | ||
73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | 73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) |
74 | { | 74 | { |
75 | int cpu; | ||
76 | |||
75 | desc->irq_data.irq = irq; | 77 | desc->irq_data.irq = irq; |
76 | desc->irq_data.chip = &no_irq_chip; | 78 | desc->irq_data.chip = &no_irq_chip; |
77 | desc->irq_data.chip_data = NULL; | 79 | desc->irq_data.chip_data = NULL; |
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | |||
83 | desc->irq_count = 0; | 85 | desc->irq_count = 0; |
84 | desc->irqs_unhandled = 0; | 86 | desc->irqs_unhandled = 0; |
85 | desc->name = NULL; | 87 | desc->name = NULL; |
86 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | 88 | for_each_possible_cpu(cpu) |
89 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | ||
87 | desc_smp_init(desc, node); | 90 | desc_smp_init(desc, node); |
88 | } | 91 | } |
89 | 92 | ||
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
133 | if (!desc) | 136 | if (!desc) |
134 | return NULL; | 137 | return NULL; |
135 | /* allocate based on nr_cpu_ids */ | 138 | /* allocate based on nr_cpu_ids */ |
136 | desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), | 139 | desc->kstat_irqs = alloc_percpu(unsigned int); |
137 | gfp, node); | ||
138 | if (!desc->kstat_irqs) | 140 | if (!desc->kstat_irqs) |
139 | goto err_desc; | 141 | goto err_desc; |
140 | 142 | ||
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
149 | return desc; | 151 | return desc; |
150 | 152 | ||
151 | err_kstat: | 153 | err_kstat: |
152 | kfree(desc->kstat_irqs); | 154 | free_percpu(desc->kstat_irqs); |
153 | err_desc: | 155 | err_desc: |
154 | kfree(desc); | 156 | kfree(desc); |
155 | return NULL; | 157 | return NULL; |
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) | |||
166 | mutex_unlock(&sparse_irq_lock); | 168 | mutex_unlock(&sparse_irq_lock); |
167 | 169 | ||
168 | free_masks(desc); | 170 | free_masks(desc); |
169 | kfree(desc->kstat_irqs); | 171 | free_percpu(desc->kstat_irqs); |
170 | kfree(desc); | 172 | kfree(desc); |
171 | } | 173 | } |
172 | 174 | ||
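kstat_irqs becomes genuine percpu data: each CPU's counter lives in its own percpu area instead of one kzalloc'ed nr_cpu_ids array, and the static NR_IRQS x NR_CPUS table removed below goes with it. A self-contained sketch of the allocate/update/fold pattern, assuming updates happen where the CPU cannot change under us:

    static int percpu_counter_sketch(void)
    {
            unsigned int __percpu *stats;
            unsigned int sum = 0;
            int cpu;

            stats = alloc_percpu(unsigned int);     /* zero-initialized */
            if (!stats)
                    return -ENOMEM;

            this_cpu_inc(*stats);   /* fast path: local copy, no shared cacheline */

            for_each_possible_cpu(cpu)              /* slow path: fold all copies */
                    sum += *per_cpu_ptr(stats, cpu);

            free_percpu(stats);
            return sum;
    }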
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | |||
234 | } | 236 | } |
235 | }; | 237 | }; |
236 | 238 | ||
237 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
238 | int __init early_irq_init(void) | 239 | int __init early_irq_init(void) |
239 | { | 240 | { |
240 | int count, i, node = first_online_node; | 241 | int count, i, node = first_online_node; |
@@ -250,7 +251,8 @@ int __init early_irq_init(void) | |||
250 | for (i = 0; i < count; i++) { | 251 | for (i = 0; i < count; i++) { |
251 | desc[i].irq_data.irq = i; | 252 | desc[i].irq_data.irq = i; |
252 | desc[i].irq_data.chip = &no_irq_chip; | 253 | desc[i].irq_data.chip = &no_irq_chip; |
253 | desc[i].kstat_irqs = kstat_irqs_all[i]; | 254 | /* TODO : do this allocation on-demand ... */ |
255 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | ||
254 | alloc_masks(desc + i, GFP_KERNEL, node); | 256 | alloc_masks(desc + i, GFP_KERNEL, node); |
255 | desc_smp_init(desc + i, node); | 257 | desc_smp_init(desc + i, node); |
256 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 258 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) | |||
275 | 277 | ||
276 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | 278 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) |
277 | { | 279 | { |
280 | #if defined(CONFIG_KSTAT_IRQS_ONDEMAND) | ||
281 | struct irq_desc *desc; | ||
282 | unsigned int i; | ||
283 | |||
284 | for (i = 0; i < cnt; i++) { | ||
285 | desc = irq_to_desc(start + i); | ||
286 | if (desc && !desc->kstat_irqs) { | ||
287 | unsigned int __percpu *stats = alloc_percpu(unsigned int); | ||
288 | |||
289 | if (!stats) | ||
290 | return -1; | ||
291 | if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) | ||
292 | free_percpu(stats); | ||
293 | } | ||
294 | } | ||
295 | #endif | ||
278 | return start; | 296 | return start; |
279 | } | 297 | } |
280 | #endif /* !CONFIG_SPARSE_IRQ */ | 298 | #endif /* !CONFIG_SPARSE_IRQ */ |
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
391 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | 409 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) |
392 | { | 410 | { |
393 | struct irq_desc *desc = irq_to_desc(irq); | 411 | struct irq_desc *desc = irq_to_desc(irq); |
394 | return desc ? desc->kstat_irqs[cpu] : 0; | 412 | |
413 | return desc && desc->kstat_irqs ? | ||
414 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | ||
395 | } | 415 | } |
396 | 416 | ||
397 | #ifdef CONFIG_GENERIC_HARDIRQS | 417 | #ifdef CONFIG_GENERIC_HARDIRQS |
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) | |||
401 | int cpu; | 421 | int cpu; |
402 | int sum = 0; | 422 | int sum = 0; |
403 | 423 | ||
404 | if (!desc) | 424 | if (!desc || !desc->kstat_irqs) |
405 | return 0; | 425 | return 0; |
406 | for_each_possible_cpu(cpu) | 426 | for_each_possible_cpu(cpu) |
407 | sum += desc->kstat_irqs[cpu]; | 427 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
408 | return sum; | 428 | return sum; |
409 | } | 429 | } |
410 | #endif /* CONFIG_GENERIC_HARDIRQS */ | 430 | #endif /* CONFIG_GENERIC_HARDIRQS */ |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f92acc5f952..0caa59f747dd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
577 | */ | 577 | */ |
578 | static int irq_thread(void *data) | 578 | static int irq_thread(void *data) |
579 | { | 579 | { |
580 | struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; | 580 | static const struct sched_param param = { |
581 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
582 | }; | ||
581 | struct irqaction *action = data; | 583 | struct irqaction *action = data; |
582 | struct irq_desc *desc = irq_to_desc(action->irq); | 584 | struct irq_desc *desc = irq_to_desc(action->irq); |
583 | int wake, oneshot = desc->status & IRQ_ONESHOT; | 585 | int wake, oneshot = desc->status & IRQ_ONESHOT; |
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 1d2541940480..441fd629ff04 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -56,6 +56,7 @@ void move_masked_irq(int irq) | |||
56 | void move_native_irq(int irq) | 56 | void move_native_irq(int irq) |
57 | { | 57 | { |
58 | struct irq_desc *desc = irq_to_desc(irq); | 58 | struct irq_desc *desc = irq_to_desc(irq); |
59 | bool masked; | ||
59 | 60 | ||
60 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 61 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) |
61 | return; | 62 | return; |
@@ -63,8 +64,15 @@ void move_native_irq(int irq) | |||
63 | if (unlikely(desc->status & IRQ_DISABLED)) | 64 | if (unlikely(desc->status & IRQ_DISABLED)) |
64 | return; | 65 | return; |
65 | 66 | ||
66 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 67 | /* |
68 | * Be careful vs. already masked interrupts. If this is a | ||
69 | * threaded interrupt with ONESHOT set, we can end up with an | ||
70 | * interrupt storm. | ||
71 | */ | ||
72 | masked = desc->status & IRQ_MASKED; | ||
73 | if (!masked) | ||
74 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
67 | move_masked_irq(irq); | 75 | move_masked_irq(irq); |
68 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 76 | if (!masked) |
77 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
69 | } | 78 | } |
70 | |||
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 90f881904bb1..c58fa7da8aef 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void) | |||
77 | */ | 77 | */ |
78 | static void __irq_work_queue(struct irq_work *entry) | 78 | static void __irq_work_queue(struct irq_work *entry) |
79 | { | 79 | { |
80 | struct irq_work **head, *next; | 80 | struct irq_work *next; |
81 | 81 | ||
82 | head = &get_cpu_var(irq_work_list); | 82 | preempt_disable(); |
83 | 83 | ||
84 | do { | 84 | do { |
85 | next = *head; | 85 | next = __this_cpu_read(irq_work_list); |
86 | /* Can assign non-atomic because we keep the flags set. */ | 86 | /* Can assign non-atomic because we keep the flags set. */ |
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | 87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); |
88 | } while (cmpxchg(head, next, entry) != next); | 88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); |
89 | 89 | ||
90 | /* The list was empty, raise self-interrupt to start processing. */ | 90 | /* The list was empty, raise self-interrupt to start processing. */ |
91 | if (!irq_work_next(entry)) | 91 | if (!irq_work_next(entry)) |
92 | arch_irq_work_raise(); | 92 | arch_irq_work_raise(); |
93 | 93 | ||
94 | put_cpu_var(irq_work_list); | 94 | preempt_enable(); |
95 | } | 95 | } |
96 | 96 | ||
97 | /* | 97 | /* |
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
120 | */ | 120 | */ |
121 | void irq_work_run(void) | 121 | void irq_work_run(void) |
122 | { | 122 | { |
123 | struct irq_work *list, **head; | 123 | struct irq_work *list; |
124 | 124 | ||
125 | head = &__get_cpu_var(irq_work_list); | 125 | if (this_cpu_read(irq_work_list) == NULL) |
126 | if (*head == NULL) | ||
127 | return; | 126 | return; |
128 | 127 | ||
129 | BUG_ON(!in_irq()); | 128 | BUG_ON(!in_irq()); |
130 | BUG_ON(!irqs_disabled()); | 129 | BUG_ON(!irqs_disabled()); |
131 | 130 | ||
132 | list = xchg(head, NULL); | 131 | list = this_cpu_xchg(irq_work_list, NULL); |
132 | |||
133 | while (list != NULL) { | 133 | while (list != NULL) { |
134 | struct irq_work *entry = list; | 134 | struct irq_work *entry = list; |
135 | 135 | ||
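Queueing now goes through this_cpu_read()/this_cpu_cmpxchg(), so the per-CPU list head is never materialized as a pointer and the push remains NMI-safe; the IRQ_WORK_* flag bits ride in the low bits of the next pointer. A sketch of how a caller is assumed to use this interface:

    static void my_work_func(struct irq_work *work)
    {
            /* Runs shortly afterwards from the self-interrupt,
             * in hardirq context with interrupts disabled. */
    }

    static struct irq_work my_work = {
            .func = my_work_func,   /* .next starts NULL, i.e. not queued */
    };

    /* From NMI or another context where almost nothing else is safe: */
    irq_work_queue(&my_work);       /* lock-free push + arch_irq_work_raise() */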
diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045bc7563..ec19b92c7ebd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
163 | * just verifies it is an address we can use. | 163 | * just verifies it is an address we can use. |
164 | * | 164 | * |
165 | * Since the kernel does everything in page size chunks ensure | 165 | * Since the kernel does everything in page size chunks ensure |
166 | * the destination addreses are page aligned. Too many | 166 | * the destination addresses are page aligned. Too many |
167 | * special cases crop up when we don't do this. The most | 167 | * special cases crop up when we don't do this. The most |
168 | * insidious is getting overlapping destination addresses | 168 | * insidious is getting overlapping destination addresses |
169 | * simply because addresses are changed to page size | 169 | * simply because addresses are changed to page size |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9737a76e106f..77981813a1e7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | |||
317 | /* We have preemption disabled.. so it is safe to use __ versions */ | 317 | /* We have preemption disabled.. so it is safe to use __ versions */ |
318 | static inline void set_kprobe_instance(struct kprobe *kp) | 318 | static inline void set_kprobe_instance(struct kprobe *kp) |
319 | { | 319 | { |
320 | __get_cpu_var(kprobe_instance) = kp; | 320 | __this_cpu_write(kprobe_instance, kp); |
321 | } | 321 | } |
322 | 322 | ||
323 | static inline void reset_kprobe_instance(void) | 323 | static inline void reset_kprobe_instance(void) |
324 | { | 324 | { |
325 | __get_cpu_var(kprobe_instance) = NULL; | 325 | __this_cpu_write(kprobe_instance, NULL); |
326 | } | 326 | } |
327 | 327 | ||
328 | /* | 328 | /* |
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) | |||
354 | return p->pre_handler == aggr_pre_handler; | 354 | return p->pre_handler == aggr_pre_handler; |
355 | } | 355 | } |
356 | 356 | ||
357 | /* Return true(!0) if the kprobe is unused */ | ||
358 | static inline int kprobe_unused(struct kprobe *p) | ||
359 | { | ||
360 | return kprobe_aggrprobe(p) && kprobe_disabled(p) && | ||
361 | list_empty(&p->list); | ||
362 | } | ||
363 | |||
357 | /* | 364 | /* |
358 | * Keep all fields in the kprobe consistent | 365 | * Keep all fields in the kprobe consistent |
359 | */ | 366 | */ |
360 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | 367 | static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) |
361 | { | 368 | { |
362 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | 369 | memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); |
363 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | 370 | memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); |
364 | } | 371 | } |
365 | 372 | ||
366 | #ifdef CONFIG_OPTPROBES | 373 | #ifdef CONFIG_OPTPROBES |
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
384 | } | 391 | } |
385 | } | 392 | } |
386 | 393 | ||
394 | /* Free optimized instructions and optimized_kprobe */ | ||
395 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
396 | { | ||
397 | struct optimized_kprobe *op; | ||
398 | |||
399 | op = container_of(p, struct optimized_kprobe, kp); | ||
400 | arch_remove_optimized_kprobe(op); | ||
401 | arch_remove_kprobe(p); | ||
402 | kfree(op); | ||
403 | } | ||
404 | |||
387 | /* Return true(!0) if the kprobe is ready for optimization. */ | 405 | /* Return true(!0) if the kprobe is ready for optimization. */ |
388 | static inline int kprobe_optready(struct kprobe *p) | 406 | static inline int kprobe_optready(struct kprobe *p) |
389 | { | 407 | { |
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p) | |||
397 | return 0; | 415 | return 0; |
398 | } | 416 | } |
399 | 417 | ||
418 | /* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ | ||
419 | static inline int kprobe_disarmed(struct kprobe *p) | ||
420 | { | ||
421 | struct optimized_kprobe *op; | ||
422 | |||
423 | /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ | ||
424 | if (!kprobe_aggrprobe(p)) | ||
425 | return kprobe_disabled(p); | ||
426 | |||
427 | op = container_of(p, struct optimized_kprobe, kp); | ||
428 | |||
429 | return kprobe_disabled(p) && list_empty(&op->list); | ||
430 | } | ||
431 | |||
432 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ | ||
433 | static int __kprobes kprobe_queued(struct kprobe *p) | ||
434 | { | ||
435 | struct optimized_kprobe *op; | ||
436 | |||
437 | if (kprobe_aggrprobe(p)) { | ||
438 | op = container_of(p, struct optimized_kprobe, kp); | ||
439 | if (!list_empty(&op->list)) | ||
440 | return 1; | ||
441 | } | ||
442 | return 0; | ||
443 | } | ||
444 | |||
400 | /* | 445 | /* |
401 | * Return an optimized kprobe whose optimizing code replaces | 446 | * Return an optimized kprobe whose optimizing code replaces |
402 | * instructions including addr (exclude breakpoint). | 447 | * instructions including addr (exclude breakpoint). |
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | |||
422 | 467 | ||
423 | /* Optimization staging list, protected by kprobe_mutex */ | 468 | /* Optimization staging list, protected by kprobe_mutex */ |
424 | static LIST_HEAD(optimizing_list); | 469 | static LIST_HEAD(optimizing_list); |
470 | static LIST_HEAD(unoptimizing_list); | ||
425 | 471 | ||
426 | static void kprobe_optimizer(struct work_struct *work); | 472 | static void kprobe_optimizer(struct work_struct *work); |
427 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
474 | static DECLARE_COMPLETION(optimizer_comp); | ||
428 | #define OPTIMIZE_DELAY 5 | 475 | #define OPTIMIZE_DELAY 5 |
429 | 476 | ||
430 | /* Kprobe jump optimizer */ | 477 | /* |
431 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 478 | * Optimize (replace a breakpoint with a jump) kprobes listed on |
479 | * optimizing_list. | ||
480 | */ | ||
481 | static __kprobes void do_optimize_kprobes(void) | ||
432 | { | 482 | { |
433 | struct optimized_kprobe *op, *tmp; | 483 | /* Optimization never be done when disarmed */ |
434 | 484 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | |
435 | /* Lock modules while optimizing kprobes */ | 485 | list_empty(&optimizing_list)) |
436 | mutex_lock(&module_mutex); | 486 | return; |
437 | mutex_lock(&kprobe_mutex); | ||
438 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
439 | goto end; | ||
440 | |||
441 | /* | ||
442 | * Wait for quiescence period to ensure all running interrupts | ||
443 | * are done. Because optprobe may modify multiple instructions | ||
444 | * there is a chance that Nth instruction is interrupted. In that | ||
445 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
446 | * instruction. This wait is for avoiding it. | ||
447 | */ | ||
448 | synchronize_sched(); | ||
449 | 487 | ||
450 | /* | 488 | /* |
451 | * The optimization/unoptimization refers online_cpus via | 489 | * The optimization/unoptimization refers online_cpus via |
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
459 | */ | 497 | */ |
460 | get_online_cpus(); | 498 | get_online_cpus(); |
461 | mutex_lock(&text_mutex); | 499 | mutex_lock(&text_mutex); |
462 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | 500 | arch_optimize_kprobes(&optimizing_list); |
463 | WARN_ON(kprobe_disabled(&op->kp)); | 501 | mutex_unlock(&text_mutex); |
464 | if (arch_optimize_kprobe(op) < 0) | 502 | put_online_cpus(); |
465 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 503 | } |
466 | list_del_init(&op->list); | 504 | |
505 | /* | ||
506 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | ||
507 | * if need) kprobes listed on unoptimizing_list. | ||
508 | */ | ||
509 | static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | ||
510 | { | ||
511 | struct optimized_kprobe *op, *tmp; | ||
512 | |||
513 | /* Unoptimization must be done anytime */ | ||
514 | if (list_empty(&unoptimizing_list)) | ||
515 | return; | ||
516 | |||
517 | /* Ditto to do_optimize_kprobes */ | ||
518 | get_online_cpus(); | ||
519 | mutex_lock(&text_mutex); | ||
520 | arch_unoptimize_kprobes(&unoptimizing_list, free_list); | ||
521 | /* Loop free_list for disarming */ | ||
522 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
523 | /* Disarm probes if marked disabled */ | ||
524 | if (kprobe_disabled(&op->kp)) | ||
525 | arch_disarm_kprobe(&op->kp); | ||
526 | if (kprobe_unused(&op->kp)) { | ||
527 | /* | ||
528 | * Remove unused probes from hash list. After waiting | ||
529 | * for synchronization, these probes are reclaimed. | ||
530 | * (reclaiming is done by do_free_cleaned_kprobes.) | ||
531 | */ | ||
532 | hlist_del_rcu(&op->kp.hlist); | ||
533 | } else | ||
534 | list_del_init(&op->list); | ||
467 | } | 535 | } |
468 | mutex_unlock(&text_mutex); | 536 | mutex_unlock(&text_mutex); |
469 | put_online_cpus(); | 537 | put_online_cpus(); |
470 | end: | 538 | } |
539 | |||
540 | /* Reclaim all kprobes on the free_list */ | ||
541 | static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | ||
542 | { | ||
543 | struct optimized_kprobe *op, *tmp; | ||
544 | |||
545 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
546 | BUG_ON(!kprobe_unused(&op->kp)); | ||
547 | list_del_init(&op->list); | ||
548 | free_aggr_kprobe(&op->kp); | ||
549 | } | ||
550 | } | ||
551 | |||
552 | /* Start optimizer after OPTIMIZE_DELAY passed */ | ||
553 | static __kprobes void kick_kprobe_optimizer(void) | ||
554 | { | ||
555 | if (!delayed_work_pending(&optimizing_work)) | ||
556 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
557 | } | ||
558 | |||
559 | /* Kprobe jump optimizer */ | ||
560 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
561 | { | ||
562 | LIST_HEAD(free_list); | ||
563 | |||
564 | /* Lock modules while optimizing kprobes */ | ||
565 | mutex_lock(&module_mutex); | ||
566 | mutex_lock(&kprobe_mutex); | ||
567 | |||
568 | /* | ||
569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | ||
570 | * kprobes before waiting for quiescence period. | ||
571 | */ | ||
572 | do_unoptimize_kprobes(&free_list); | ||
573 | |||
574 | /* | ||
575 | * Step 2: Wait for quiescence period to ensure all running interrupts | ||
576 | * are done. Because optprobe may modify multiple instructions | ||
577 | * there is a chance that Nth instruction is interrupted. In that | ||
578 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
579 | * instruction. This wait is for avoiding it. | ||
580 | */ | ||
581 | synchronize_sched(); | ||
582 | |||
583 | /* Step 3: Optimize kprobes after quiescence period */ | ||
584 | do_optimize_kprobes(); | ||
585 | |||
586 | /* Step 4: Free cleaned kprobes after quiescence period */ | ||
587 | do_free_cleaned_kprobes(&free_list); | ||
588 | |||
471 | mutex_unlock(&kprobe_mutex); | 589 | mutex_unlock(&kprobe_mutex); |
472 | mutex_unlock(&module_mutex); | 590 | mutex_unlock(&module_mutex); |
591 | |||
592 | /* Step 5: Kick optimizer again if needed */ | ||
593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | ||
594 | kick_kprobe_optimizer(); | ||
595 | else | ||
596 | /* Wake up all waiters */ | ||
597 | complete_all(&optimizer_comp); | ||
598 | } | ||
599 | |||
600 | /* Wait for completing optimization and unoptimization */ | ||
601 | static __kprobes void wait_for_kprobe_optimizer(void) | ||
602 | { | ||
603 | if (delayed_work_pending(&optimizing_work)) | ||
604 | wait_for_completion(&optimizer_comp); | ||
473 | } | 605 | } |
474 | 606 | ||
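The optimizer is now a batched state machine: unoptimize, sit out the quiescence period, optimize, reclaim, then possibly re-kick itself. The kick/wait pair is a generic coalescing delayed-work pattern, sketched here in isolation; more_work_queued() is a hypothetical stand-in for the two list_empty() checks:

    static void batch_worker(struct work_struct *work);
    static DECLARE_DELAYED_WORK(batch_work, batch_worker);
    static DECLARE_COMPLETION(batch_done);

    static void kick_batch(void)
    {
            if (!delayed_work_pending(&batch_work)) /* coalesce repeated kicks */
                    schedule_delayed_work(&batch_work, OPTIMIZE_DELAY);
    }

    static void batch_worker(struct work_struct *work)
    {
            /* ... process everything queued so far ... */
            if (more_work_queued())                 /* hypothetical predicate */
                    kick_batch();                   /* run another pass later */
            else
                    complete_all(&batch_done);      /* release any waiters */
    }

wait_for_kprobe_optimizer() then only blocks while a pass is actually pending, which is what lets unoptimize_all_kprobes() above shed its text_mutex/CPU-hotplug juggling.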
475 | /* Optimize kprobe if p is ready to be optimized */ | 607 | /* Optimize kprobe if p is ready to be optimized */ |
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) | |||
495 | /* Check if it is already optimized. */ | 627 | /* Check if it is already optimized. */ |
496 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | 628 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) |
497 | return; | 629 | return; |
498 | |||
499 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | 630 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; |
500 | list_add(&op->list, &optimizing_list); | 631 | |
501 | if (!delayed_work_pending(&optimizing_work)) | 632 | if (!list_empty(&op->list)) |
502 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | 633 | /* This is under unoptimizing. Just dequeue the probe */ |
634 | list_del_init(&op->list); | ||
635 | else { | ||
636 | list_add(&op->list, &optimizing_list); | ||
637 | kick_kprobe_optimizer(); | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /* Short cut to direct unoptimizing */ | ||
642 | static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | ||
643 | { | ||
644 | get_online_cpus(); | ||
645 | arch_unoptimize_kprobe(op); | ||
646 | put_online_cpus(); | ||
647 | if (kprobe_disabled(&op->kp)) | ||
648 | arch_disarm_kprobe(&op->kp); | ||
503 | } | 649 | } |
504 | 650 | ||
505 | /* Unoptimize a kprobe if p is optimized */ | 651 | /* Unoptimize a kprobe if p is optimized */ |
506 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | 652 | static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) |
507 | { | 653 | { |
508 | struct optimized_kprobe *op; | 654 | struct optimized_kprobe *op; |
509 | 655 | ||
510 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | 656 | if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) |
511 | op = container_of(p, struct optimized_kprobe, kp); | 657 | return; /* This is not an optprobe nor optimized */ |
512 | if (!list_empty(&op->list)) | 658 | |
513 | /* Dequeue from the optimization queue */ | 659 | op = container_of(p, struct optimized_kprobe, kp); |
660 | if (!kprobe_optimized(p)) { | ||
661 | /* Unoptimized or unoptimizing case */ | ||
662 | if (force && !list_empty(&op->list)) { | ||
663 | /* | ||
664 | * Only if this is unoptimizing kprobe and forced, | ||
665 | * forcibly unoptimize it. (No need to unoptimize | ||
666 | * unoptimized kprobe again :) | ||
667 | */ | ||
514 | list_del_init(&op->list); | 668 | list_del_init(&op->list); |
515 | else | 669 | force_unoptimize_kprobe(op); |
516 | /* Replace jump with break */ | 670 | } |
517 | arch_unoptimize_kprobe(op); | 671 | return; |
518 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 672 | } |
673 | |||
674 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
675 | if (!list_empty(&op->list)) { | ||
676 | /* Dequeue from the optimization queue */ | ||
677 | list_del_init(&op->list); | ||
678 | return; | ||
679 | } | ||
680 | /* Optimized kprobe case */ | ||
681 | if (force) | ||
682 | /* Forcibly update the code: this is a special case */ | ||
683 | force_unoptimize_kprobe(op); | ||
684 | else { | ||
685 | list_add(&op->list, &unoptimizing_list); | ||
686 | kick_kprobe_optimizer(); | ||
519 | } | 687 | } |
520 | } | 688 | } |
521 | 689 | ||
690 | /* Cancel unoptimizing for reusing */ | ||
691 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
692 | { | ||
693 | struct optimized_kprobe *op; | ||
694 | |||
695 | BUG_ON(!kprobe_unused(ap)); | ||
696 | /* | ||
697 | * Unused kprobe MUST be on the way of delayed unoptimizing (means | ||
698 | * there is still a relative jump) and disabled. | ||
699 | */ | ||
700 | op = container_of(ap, struct optimized_kprobe, kp); | ||
701 | if (unlikely(list_empty(&op->list))) | ||
702 | printk(KERN_WARNING "Warning: found a stray unused " | ||
703 | "aggrprobe@%p\n", ap->addr); | ||
704 | /* Enable the probe again */ | ||
705 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
706 | /* Optimize it again (remove from op->list) */ | ||
707 | BUG_ON(!kprobe_optready(ap)); | ||
708 | optimize_kprobe(ap); | ||
709 | } | ||
710 | |||
522 | /* Remove optimized instructions */ | 711 | /* Remove optimized instructions */ |
523 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | 712 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) |
524 | { | 713 | { |
525 | struct optimized_kprobe *op; | 714 | struct optimized_kprobe *op; |
526 | 715 | ||
527 | op = container_of(p, struct optimized_kprobe, kp); | 716 | op = container_of(p, struct optimized_kprobe, kp); |
528 | if (!list_empty(&op->list)) { | 717 | if (!list_empty(&op->list)) |
529 | /* Dequeue from the optimization queue */ | 718 | /* Dequeue from the (un)optimization queue */ |
530 | list_del_init(&op->list); | 719 | list_del_init(&op->list); |
531 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 720 | |
532 | } | 721 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; |
533 | /* Don't unoptimize, because the target code will be freed. */ | 722 | /* Don't touch the code, because it is already freed. */ |
534 | arch_remove_optimized_kprobe(op); | 723 | arch_remove_optimized_kprobe(op); |
535 | } | 724 | } |
536 | 725 | ||
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | |||
543 | arch_prepare_optimized_kprobe(op); | 732 | arch_prepare_optimized_kprobe(op); |
544 | } | 733 | } |
545 | 734 | ||
546 | /* Free optimized instructions and optimized_kprobe */ | ||
547 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
548 | { | ||
549 | struct optimized_kprobe *op; | ||
550 | |||
551 | op = container_of(p, struct optimized_kprobe, kp); | ||
552 | arch_remove_optimized_kprobe(op); | ||
553 | kfree(op); | ||
554 | } | ||
555 | |||
556 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 735 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
557 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 736 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
558 | { | 737 | { |
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
587 | op = container_of(ap, struct optimized_kprobe, kp); | 766 | op = container_of(ap, struct optimized_kprobe, kp); |
588 | if (!arch_prepared_optinsn(&op->optinsn)) { | 767 | if (!arch_prepared_optinsn(&op->optinsn)) { |
589 | /* If failed to setup optimizing, fallback to kprobe */ | 768 | /* If failed to setup optimizing, fallback to kprobe */ |
590 | free_aggr_kprobe(ap); | 769 | arch_remove_optimized_kprobe(op); |
770 | kfree(op); | ||
591 | return; | 771 | return; |
592 | } | 772 | } |
593 | 773 | ||
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
631 | return; | 811 | return; |
632 | 812 | ||
633 | kprobes_allow_optimization = false; | 813 | kprobes_allow_optimization = false; |
634 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
635 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
636 | mutex_lock(&text_mutex); | ||
637 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 814 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
638 | head = &kprobe_table[i]; | 815 | head = &kprobe_table[i]; |
639 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 816 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
640 | if (!kprobe_disabled(p)) | 817 | if (!kprobe_disabled(p)) |
641 | unoptimize_kprobe(p); | 818 | unoptimize_kprobe(p, false); |
642 | } | 819 | } |
643 | } | 820 | } |
644 | 821 | /* Wait for unoptimizing completion */ | |
645 | mutex_unlock(&text_mutex); | 822 | wait_for_kprobe_optimizer(); |
646 | put_online_cpus(); | 823 | printk(KERN_INFO "Kprobes globally unoptimized\n"); |
647 | /* Allow all currently running kprobes to complete */ | ||
648 | synchronize_sched(); | ||
649 | } | 824 | } |
650 | 825 | ||
651 | int sysctl_kprobes_optimization; | 826 | int sysctl_kprobes_optimization; |
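
unoptimize_all_kprobes() above walks every bucket of kprobe_table and visits each probe chained in that bucket. Stripped of RCU and locking, the traversal has this shape (a sketch with hypothetical types, not the kernel API):

	#include <stdio.h>

	#define TABLE_SIZE 4

	struct probe {
		unsigned long addr;
		struct probe *next;		/* hlist analogue */
	};

	static struct probe *table[TABLE_SIZE];

	/* Visit every entry in every bucket -- the shape of the loop above */
	static void for_each_probe(void (*fn)(struct probe *))
	{
		for (int i = 0; i < TABLE_SIZE; i++)
			for (struct probe *p = table[i]; p; p = p->next)
				fn(p);
	}

	static void show(struct probe *p)
	{
		printf("probe at %#lx\n", p->addr);
	}

	int main(void)
	{
		struct probe b = { 0x2000, NULL };
		struct probe a = { 0x1000, &b };	/* two probes chained in bucket 0 */

		table[0] = &a;
		for_each_probe(show);
		return 0;
	}
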
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
669 | } | 844 | } |
670 | #endif /* CONFIG_SYSCTL */ | 845 | #endif /* CONFIG_SYSCTL */ |
671 | 846 | ||
847 | /* Install a breakpoint for a probe. Must be called with text_mutex locked */ | ||
672 | static void __kprobes __arm_kprobe(struct kprobe *p) | 848 | static void __kprobes __arm_kprobe(struct kprobe *p) |
673 | { | 849 | { |
674 | struct kprobe *old_p; | 850 | struct kprobe *_p; |
675 | 851 | ||
676 | /* Check collision with other optimized kprobes */ | 852 | /* Check collision with other optimized kprobes */ |
677 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 853 | _p = get_optimized_kprobe((unsigned long)p->addr); |
678 | if (unlikely(old_p)) | 854 | if (unlikely(_p)) |
679 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | 855 | /* Fallback to unoptimized kprobe */ |
856 | unoptimize_kprobe(_p, true); | ||
680 | 857 | ||
681 | arch_arm_kprobe(p); | 858 | arch_arm_kprobe(p); |
682 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | 859 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ |
683 | } | 860 | } |
684 | 861 | ||
685 | static void __kprobes __disarm_kprobe(struct kprobe *p) | 862 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ |
863 | static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) | ||
686 | { | 864 | { |
687 | struct kprobe *old_p; | 865 | struct kprobe *_p; |
688 | 866 | ||
689 | unoptimize_kprobe(p); /* Try to unoptimize */ | 867 | unoptimize_kprobe(p, false); /* Try to unoptimize */ |
690 | arch_disarm_kprobe(p); | ||
691 | 868 | ||
692 | /* If another kprobe was blocked, optimize it. */ | 869 | if (!kprobe_queued(p)) { |
693 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 870 | arch_disarm_kprobe(p); |
694 | if (unlikely(old_p)) | 871 | /* If another kprobe was blocked, optimize it. */ |
695 | optimize_kprobe(old_p); | 872 | _p = get_optimized_kprobe((unsigned long)p->addr); |
873 | if (unlikely(_p) && reopt) | ||
874 | optimize_kprobe(_p); | ||
875 | } | ||
875 | /* TODO: reoptimize other probes after unoptimizing this one */ | ||
696 | } | 877 | } |
697 | 878 | ||
698 | #else /* !CONFIG_OPTPROBES */ | 879 | #else /* !CONFIG_OPTPROBES */ |
699 | 880 | ||
700 | #define optimize_kprobe(p) do {} while (0) | 881 | #define optimize_kprobe(p) do {} while (0) |
701 | #define unoptimize_kprobe(p) do {} while (0) | 882 | #define unoptimize_kprobe(p, f) do {} while (0) |
702 | #define kill_optimized_kprobe(p) do {} while (0) | 883 | #define kill_optimized_kprobe(p) do {} while (0) |
703 | #define prepare_optimized_kprobe(p) do {} while (0) | 884 | #define prepare_optimized_kprobe(p) do {} while (0) |
704 | #define try_to_optimize_kprobe(p) do {} while (0) | 885 | #define try_to_optimize_kprobe(p) do {} while (0) |
705 | #define __arm_kprobe(p) arch_arm_kprobe(p) | 886 | #define __arm_kprobe(p) arch_arm_kprobe(p) |
706 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | 887 | #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) |
888 | #define kprobe_disarmed(p) kprobe_disabled(p) | ||
889 | #define wait_for_kprobe_optimizer() do {} while (0) | ||
890 | |||
891 | /* Without optimization, there should be no unused kprobes to reuse */ | ||
892 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
893 | { | ||
894 | printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); | ||
895 | BUG_ON(kprobe_unused(ap)); | ||
896 | } | ||
707 | 897 | ||
708 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 898 | static __kprobes void free_aggr_kprobe(struct kprobe *p) |
709 | { | 899 | { |
900 | arch_remove_kprobe(p); | ||
710 | kfree(p); | 901 | kfree(p); |
711 | } | 902 | } |
712 | 903 | ||
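
A note on the !CONFIG_OPTPROBES stubs above: each is defined as do { } while (0) so the macro expands to a single statement that demands a trailing semicolon, letting call sites compile unchanged whether the feature is built in or not. A small illustration:

	#include <stdio.h>

	#define optimize_kprobe(p) do { } while (0)	/* stubbed out, as above */

	int main(void)
	{
		int armed = 1;

		if (armed)
			optimize_kprobe(NULL);	/* one statement; the ';' terminates it */
		else
			puts("not armed");
		return 0;
	}
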
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
732 | /* Disarm a kprobe with text_mutex */ | 923 | /* Disarm a kprobe with text_mutex */ |
733 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 924 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
734 | { | 925 | { |
735 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | 926 | /* Ditto */ |
736 | mutex_lock(&text_mutex); | 927 | mutex_lock(&text_mutex); |
737 | __disarm_kprobe(kp); | 928 | __disarm_kprobe(kp, true); |
738 | mutex_unlock(&text_mutex); | 929 | mutex_unlock(&text_mutex); |
739 | put_online_cpus(); | ||
740 | } | 930 | } |
741 | 931 | ||
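
disarm_kprobe() is entered with kprobe_mutex held and then takes text_mutex, and every path in this file nests the two locks in that same order; dropping get_online_cpus() is safe precisely because the unoptimization work that used to race on text_mutex is now queued instead. A pthread sketch of the fixed-order nesting (names hypothetical):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t kprobe_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t text_mutex = PTHREAD_MUTEX_INITIALIZER;

	/* Both locks are always taken in the same order: kprobe_mutex first,
	 * then text_mutex -- one global order keeps the nesting deadlock-free */
	static void disarm(void)
	{
		pthread_mutex_lock(&kprobe_mutex);
		pthread_mutex_lock(&text_mutex);
		puts("patching text");		/* __disarm_kprobe() stand-in */
		pthread_mutex_unlock(&text_mutex);
		pthread_mutex_unlock(&kprobe_mutex);
	}

	int main(void)
	{
		disarm();
		return 0;
	}
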
742 | /* | 932 | /* |
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
775 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 965 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
776 | int trapnr) | 966 | int trapnr) |
777 | { | 967 | { |
778 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 968 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
779 | 969 | ||
780 | /* | 970 | /* |
781 | * if we faulted "during" the execution of a user specified | 971 | * if we faulted "during" the execution of a user specified |
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
790 | 980 | ||
791 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 981 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
792 | { | 982 | { |
793 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 983 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
794 | int ret = 0; | 984 | int ret = 0; |
795 | 985 | ||
796 | if (cur && cur->break_handler) { | 986 | if (cur && cur->break_handler) { |
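
The __get_cpu_var() to __this_cpu_read() switch above is a plain read of per-CPU state. Userspace has no per-CPU variables, but C11 thread-local storage is the closest analogue (a sketch, not the kernel API):

	#include <stdio.h>

	/* One instance per thread, as kprobe_instance is one per CPU */
	static _Thread_local const char *kprobe_instance;

	int main(void)
	{
		kprobe_instance = "cur";
		printf("%s\n", kprobe_instance);	/* plain read of this thread's copy */
		return 0;
	}
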
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
942 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 1132 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
943 | 1133 | ||
944 | if (p->break_handler || p->post_handler) | 1134 | if (p->break_handler || p->post_handler) |
945 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | 1135 | unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ |
946 | 1136 | ||
947 | if (p->break_handler) { | 1137 | if (p->break_handler) { |
948 | if (ap->break_handler) | 1138 | if (ap->break_handler) |
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
993 | * This is the second or subsequent kprobe at the address - handle | 1183 | * This is the second or subsequent kprobe at the address - handle |
994 | * the intricacies | 1184 | * the intricacies |
995 | */ | 1185 | */ |
996 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 1186 | static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, |
997 | struct kprobe *p) | 1187 | struct kprobe *p) |
998 | { | 1188 | { |
999 | int ret = 0; | 1189 | int ret = 0; |
1000 | struct kprobe *ap = old_p; | 1190 | struct kprobe *ap = orig_p; |
1001 | 1191 | ||
1002 | if (!kprobe_aggrprobe(old_p)) { | 1192 | if (!kprobe_aggrprobe(orig_p)) { |
1003 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
1004 | ap = alloc_aggr_kprobe(old_p); | 1194 | ap = alloc_aggr_kprobe(orig_p); |
1005 | if (!ap) | 1195 | if (!ap) |
1006 | return -ENOMEM; | 1196 | return -ENOMEM; |
1007 | init_aggr_kprobe(ap, old_p); | 1197 | init_aggr_kprobe(ap, orig_p); |
1008 | } | 1198 | } else if (kprobe_unused(ap)) |
1199 | /* This probe is going to die. Rescue it */ | ||
1200 | reuse_unused_kprobe(ap); | ||
1009 | 1201 | ||
1010 | if (kprobe_gone(ap)) { | 1202 | if (kprobe_gone(ap)) { |
1011 | /* | 1203 | /* |
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
1039 | return add_new_kprobe(ap, p); | 1231 | return add_new_kprobe(ap, p); |
1040 | } | 1232 | } |
1041 | 1233 | ||
1042 | /* Try to disable aggr_kprobe, and return 1 if succeeded.*/ | ||
1043 | static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) | ||
1044 | { | ||
1045 | struct kprobe *kp; | ||
1046 | |||
1047 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
1048 | if (!kprobe_disabled(kp)) | ||
1049 | /* | ||
1050 | * There is an active probe on the list. | ||
1051 | * We can't disable aggr_kprobe. | ||
1052 | */ | ||
1053 | return 0; | ||
1054 | } | ||
1055 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1056 | return 1; | ||
1057 | } | ||
1058 | |||
1059 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1234 | static int __kprobes in_kprobes_functions(unsigned long addr) |
1060 | { | 1235 | { |
1061 | struct kprobe_blackpoint *kb; | 1236 | struct kprobe_blackpoint *kb; |
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | |||
1098 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
1099 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) | 1274 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) |
1100 | { | 1275 | { |
1101 | struct kprobe *old_p, *list_p; | 1276 | struct kprobe *ap, *list_p; |
1102 | 1277 | ||
1103 | old_p = get_kprobe(p->addr); | 1278 | ap = get_kprobe(p->addr); |
1104 | if (unlikely(!old_p)) | 1279 | if (unlikely(!ap)) |
1105 | return NULL; | 1280 | return NULL; |
1106 | 1281 | ||
1107 | if (p != old_p) { | 1282 | if (p != ap) { |
1108 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 1283 | list_for_each_entry_rcu(list_p, &ap->list, list) |
1109 | if (list_p == p) | 1284 | if (list_p == p) |
1110 | /* kprobe p is a valid probe */ | 1285 | /* kprobe p is a valid probe */ |
1111 | goto valid; | 1286 | goto valid; |
1112 | return NULL; | 1287 | return NULL; |
1113 | } | 1288 | } |
1114 | valid: | 1289 | valid: |
1115 | return old_p; | 1290 | return ap; |
1116 | } | 1291 | } |
1117 | 1292 | ||
1118 | /* Return error if the kprobe is being re-registered */ | 1293 | /* Return error if the kprobe is being re-registered */ |
1119 | static inline int check_kprobe_rereg(struct kprobe *p) | 1294 | static inline int check_kprobe_rereg(struct kprobe *p) |
1120 | { | 1295 | { |
1121 | int ret = 0; | 1296 | int ret = 0; |
1122 | struct kprobe *old_p; | ||
1123 | 1297 | ||
1124 | mutex_lock(&kprobe_mutex); | 1298 | mutex_lock(&kprobe_mutex); |
1125 | old_p = __get_valid_kprobe(p); | 1299 | if (__get_valid_kprobe(p)) |
1126 | if (old_p) | ||
1127 | ret = -EINVAL; | 1300 | ret = -EINVAL; |
1128 | mutex_unlock(&kprobe_mutex); | 1301 | mutex_unlock(&kprobe_mutex); |
1302 | |||
1129 | return ret; | 1303 | return ret; |
1130 | } | 1304 | } |
1131 | 1305 | ||
@@ -1229,67 +1403,121 @@ fail_with_jump_label: | |||
1229 | } | 1403 | } |
1230 | EXPORT_SYMBOL_GPL(register_kprobe); | 1404 | EXPORT_SYMBOL_GPL(register_kprobe); |
1231 | 1405 | ||
1406 | /* Check if all probes on the aggrprobe are disabled */ | ||
1407 | static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | ||
1408 | { | ||
1409 | struct kprobe *kp; | ||
1410 | |||
1411 | list_for_each_entry_rcu(kp, &ap->list, list) | ||
1412 | if (!kprobe_disabled(kp)) | ||
1413 | /* | ||
1414 | * There is an active probe on the list. | ||
1415 | * We can't disable this ap. | ||
1416 | */ | ||
1417 | return 0; | ||
1418 | |||
1419 | return 1; | ||
1420 | } | ||
1421 | |||
1422 | /* Disable one kprobe: must be called with kprobe_mutex held */ | ||
1423 | static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | ||
1424 | { | ||
1425 | struct kprobe *orig_p; | ||
1426 | |||
1427 | /* Get an original kprobe for return */ | ||
1428 | orig_p = __get_valid_kprobe(p); | ||
1429 | if (unlikely(orig_p == NULL)) | ||
1430 | return NULL; | ||
1431 | |||
1432 | if (!kprobe_disabled(p)) { | ||
1433 | /* Disable probe if it is a child probe */ | ||
1434 | if (p != orig_p) | ||
1435 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1436 | |||
1437 | /* Try to disarm and disable this/parent probe */ | ||
1438 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | ||
1439 | disarm_kprobe(orig_p); | ||
1440 | orig_p->flags |= KPROBE_FLAG_DISABLED; | ||
1441 | } | ||
1442 | } | ||
1443 | |||
1444 | return orig_p; | ||
1445 | } | ||
1446 | |||
1232 | /* | 1447 | /* |
1233 | * Unregister a kprobe without a scheduler synchronization. | 1448 | * Unregister a kprobe without a scheduler synchronization. |
1234 | */ | 1449 | */ |
1235 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) | 1450 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
1236 | { | 1451 | { |
1237 | struct kprobe *old_p, *list_p; | 1452 | struct kprobe *ap, *list_p; |
1238 | 1453 | ||
1239 | old_p = __get_valid_kprobe(p); | 1454 | /* Disable kprobe. This will disarm it if needed. */ |
1240 | if (old_p == NULL) | 1455 | ap = __disable_kprobe(p); |
1456 | if (ap == NULL) | ||
1241 | return -EINVAL; | 1457 | return -EINVAL; |
1242 | 1458 | ||
1243 | if (old_p == p || | 1459 | if (ap == p) |
1244 | (kprobe_aggrprobe(old_p) && | ||
1245 | list_is_singular(&old_p->list))) { | ||
1246 | /* | 1460 | /* |
1247 | * Only probe on the hash list. Disarm only if kprobes are | 1461 | * This probe is an independent (and non-optimized) kprobe |
1248 | * enabled and not gone - otherwise, the breakpoint would | 1462 | * (not an aggrprobe). Remove from the hash list. |
1249 | * already have been removed. We save on flushing icache. | ||
1250 | */ | 1463 | */ |
1251 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1464 | goto disarmed; |
1252 | disarm_kprobe(old_p); | 1465 | |
1253 | hlist_del_rcu(&old_p->hlist); | 1466 | /* Following process expects this probe is an aggrprobe */ |
1254 | } else { | 1467 | /* The following code expects this probe to be an aggrprobe */ |
1468 | |||
1469 | if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) | ||
1470 | /* | ||
1471 | * !disarmed can happen if the probe is under delayed | ||
1472 | * unoptimizing. | ||
1473 | */ | ||
1474 | goto disarmed; | ||
1475 | else { | ||
1476 | /* If the probe being removed has special handlers, update the aggrprobe */ | ||
1255 | if (p->break_handler && !kprobe_gone(p)) | 1477 | if (p->break_handler && !kprobe_gone(p)) |
1256 | old_p->break_handler = NULL; | 1478 | ap->break_handler = NULL; |
1257 | if (p->post_handler && !kprobe_gone(p)) { | 1479 | if (p->post_handler && !kprobe_gone(p)) { |
1258 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | 1480 | list_for_each_entry_rcu(list_p, &ap->list, list) { |
1259 | if ((list_p != p) && (list_p->post_handler)) | 1481 | if ((list_p != p) && (list_p->post_handler)) |
1260 | goto noclean; | 1482 | goto noclean; |
1261 | } | 1483 | } |
1262 | old_p->post_handler = NULL; | 1484 | ap->post_handler = NULL; |
1263 | } | 1485 | } |
1264 | noclean: | 1486 | noclean: |
1487 | /* | ||
1488 | * Remove from the aggrprobe: this path will do nothing in | ||
1489 | * __unregister_kprobe_bottom(). | ||
1490 | */ | ||
1265 | list_del_rcu(&p->list); | 1491 | list_del_rcu(&p->list); |
1266 | if (!kprobe_disabled(old_p)) { | 1492 | if (!kprobe_disabled(ap) && !kprobes_all_disarmed) |
1267 | try_to_disable_aggr_kprobe(old_p); | 1493 | /* |
1268 | if (!kprobes_all_disarmed) { | 1494 | * Try to optimize this probe again, because post |
1269 | if (kprobe_disabled(old_p)) | 1495 | * handler may have been changed. |
1270 | disarm_kprobe(old_p); | 1496 | */ |
1271 | else | 1497 | optimize_kprobe(ap); |
1272 | /* Try to optimize this probe again */ | ||
1273 | optimize_kprobe(old_p); | ||
1274 | } | ||
1275 | } | ||
1276 | } | 1498 | } |
1277 | return 0; | 1499 | return 0; |
1500 | |||
1501 | disarmed: | ||
1502 | BUG_ON(!kprobe_disarmed(ap)); | ||
1503 | hlist_del_rcu(&ap->hlist); | ||
1504 | return 0; | ||
1278 | } | 1505 | } |
1279 | 1506 | ||
1280 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 1507 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
1281 | { | 1508 | { |
1282 | struct kprobe *old_p; | 1509 | struct kprobe *ap; |
1283 | 1510 | ||
1284 | if (list_empty(&p->list)) | 1511 | if (list_empty(&p->list)) |
1512 | /* This is an independent kprobe */ | ||
1285 | arch_remove_kprobe(p); | 1513 | arch_remove_kprobe(p); |
1286 | else if (list_is_singular(&p->list)) { | 1514 | else if (list_is_singular(&p->list)) { |
1287 | /* "p" is the last child of an aggr_kprobe */ | 1515 | /* This is the last child of an aggrprobe */ |
1288 | old_p = list_entry(p->list.next, struct kprobe, list); | 1516 | ap = list_entry(p->list.next, struct kprobe, list); |
1289 | list_del(&p->list); | 1517 | list_del(&p->list); |
1290 | arch_remove_kprobe(old_p); | 1518 | free_aggr_kprobe(ap); |
1291 | free_aggr_kprobe(old_p); | ||
1292 | } | 1519 | } |
1520 | /* Otherwise, do nothing. */ | ||
1293 | } | 1521 | } |
1294 | 1522 | ||
1295 | int __kprobes register_kprobes(struct kprobe **kps, int num) | 1523 | int __kprobes register_kprobes(struct kprobe **kps, int num) |
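
The new __disable_kprobe()/aggr_kprobe_disabled() pair above marks a child disabled first and disarms the parent only once every child on its list is disabled. The all-children check, reduced to plain C (hypothetical types):

	#include <stdbool.h>
	#include <stdio.h>

	struct kp { bool disabled; };

	/* aggr_kprobe_disabled() analogue: the parent may be disarmed only
	 * when every attached child is already disabled */
	static bool all_disabled(struct kp *kids, int n)
	{
		for (int i = 0; i < n; i++)
			if (!kids[i].disabled)
				return false;
		return true;
	}

	int main(void)
	{
		struct kp kids[2] = { { true }, { false } };

		printf("%d\n", all_disabled(kids, 2));	/* 0: keep parent armed */
		kids[1].disabled = true;
		printf("%d\n", all_disabled(kids, 2));	/* 1: safe to disarm */
		return 0;
	}
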
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1607 | int __kprobes disable_kprobe(struct kprobe *kp) | 1835 | int __kprobes disable_kprobe(struct kprobe *kp) |
1608 | { | 1836 | { |
1609 | int ret = 0; | 1837 | int ret = 0; |
1610 | struct kprobe *p; | ||
1611 | 1838 | ||
1612 | mutex_lock(&kprobe_mutex); | 1839 | mutex_lock(&kprobe_mutex); |
1613 | 1840 | ||
1614 | /* Check whether specified probe is valid. */ | 1841 | /* Disable this kprobe */ |
1615 | p = __get_valid_kprobe(kp); | 1842 | if (__disable_kprobe(kp) == NULL) |
1616 | if (unlikely(p == NULL)) { | ||
1617 | ret = -EINVAL; | 1843 | ret = -EINVAL; |
1618 | goto out; | ||
1619 | } | ||
1620 | 1844 | ||
1621 | /* If the probe is already disabled (or gone), just return */ | ||
1622 | if (kprobe_disabled(kp)) | ||
1623 | goto out; | ||
1624 | |||
1625 | kp->flags |= KPROBE_FLAG_DISABLED; | ||
1626 | if (p != kp) | ||
1627 | /* When kp != p, p is always enabled. */ | ||
1628 | try_to_disable_aggr_kprobe(p); | ||
1629 | |||
1630 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
1631 | disarm_kprobe(p); | ||
1632 | out: | ||
1633 | mutex_unlock(&kprobe_mutex); | 1845 | mutex_unlock(&kprobe_mutex); |
1634 | return ret; | 1846 | return ret; |
1635 | } | 1847 | } |
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) | |||
1927 | mutex_lock(&kprobe_mutex); | 2139 | mutex_lock(&kprobe_mutex); |
1928 | 2140 | ||
1929 | /* If kprobes are already disarmed, just return */ | 2141 | /* If kprobes are already disarmed, just return */ |
1930 | if (kprobes_all_disarmed) | 2142 | if (kprobes_all_disarmed) { |
1931 | goto already_disabled; | 2143 | mutex_unlock(&kprobe_mutex); |
2144 | return; | ||
2145 | } | ||
1932 | 2146 | ||
1933 | kprobes_all_disarmed = true; | 2147 | kprobes_all_disarmed = true; |
1934 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2148 | printk(KERN_INFO "Kprobes globally disabled\n"); |
1935 | 2149 | ||
1936 | /* | ||
1937 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
1938 | * because disarming may also unoptimize kprobes. | ||
1939 | */ | ||
1940 | get_online_cpus(); | ||
1941 | mutex_lock(&text_mutex); | 2150 | mutex_lock(&text_mutex); |
1942 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2151 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1943 | head = &kprobe_table[i]; | 2152 | head = &kprobe_table[i]; |
1944 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2153 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1945 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2154 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
1946 | __disarm_kprobe(p); | 2155 | __disarm_kprobe(p, false); |
1947 | } | 2156 | } |
1948 | } | 2157 | } |
1949 | |||
1950 | mutex_unlock(&text_mutex); | 2158 | mutex_unlock(&text_mutex); |
1951 | put_online_cpus(); | ||
1952 | mutex_unlock(&kprobe_mutex); | 2159 | mutex_unlock(&kprobe_mutex); |
1953 | /* Allow all currently running kprobes to complete */ | ||
1954 | synchronize_sched(); | ||
1955 | return; | ||
1956 | 2160 | ||
1957 | already_disabled: | 2161 | /* Wait for the optimizer to disarm all kprobes */ |
1958 | mutex_unlock(&kprobe_mutex); | 2162 | wait_for_kprobe_optimizer(); |
1959 | return; | ||
1960 | } | 2163 | } |
1961 | 2164 | ||
1962 | /* | 2165 | /* |
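
Taking the kprobes changes together: hot paths no longer synchronize inline with synchronize_sched(); they queue (un)optimization requests and block in wait_for_kprobe_optimizer() until the worker drains them. A pthread sketch of that wait-for-worker shape (hypothetical names):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
	static int pending = 1;			/* queued (un)optimizations */

	static void *optimizer(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&lock);
		pending = 0;			/* drain the queue */
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	/* wait_for_kprobe_optimizer() analogue: block until the worker drains */
	static void wait_for_optimizer(void)
	{
		pthread_mutex_lock(&lock);
		while (pending)
			pthread_cond_wait(&done, &lock);
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, optimizer, NULL);
		wait_for_optimizer();
		puts("queue drained");
		pthread_join(t, NULL);
		return 0;
	}
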
diff --git a/kernel/kthread.c b/kernel/kthread.c index ca61bbdd44b2..c55afba990a3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
148 | wait_for_completion(&create.done); | 148 | wait_for_completion(&create.done); |
149 | 149 | ||
150 | if (!IS_ERR(create.result)) { | 150 | if (!IS_ERR(create.result)) { |
151 | struct sched_param param = { .sched_priority = 0 }; | 151 | static const struct sched_param param = { .sched_priority = 0 }; |
152 | va_list args; | 152 | va_list args; |
153 | 153 | ||
154 | va_start(args, namefmt); | 154 | va_start(args, namefmt); |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 17110a4a4fc2..ee74b35e528d 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) | |||
241 | seq_puts(m, "Latency Top version : v0.1\n"); | 241 | seq_puts(m, "Latency Top version : v0.1\n"); |
242 | 242 | ||
243 | for (i = 0; i < MAXLR; i++) { | 243 | for (i = 0; i < MAXLR; i++) { |
244 | if (latency_record[i].backtrace[0]) { | 244 | struct latency_record *lr = &latency_record[i]; |
245 | |||
246 | if (lr->backtrace[0]) { | ||
245 | int q; | 247 | int q; |
246 | seq_printf(m, "%i %lu %lu ", | 248 | seq_printf(m, "%i %lu %lu", |
247 | latency_record[i].count, | 249 | lr->count, lr->time, lr->max); |
248 | latency_record[i].time, | ||
249 | latency_record[i].max); | ||
250 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | 250 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { |
251 | char sym[KSYM_SYMBOL_LEN]; | 251 | unsigned long bt = lr->backtrace[q]; |
252 | char *c; | 252 | if (!bt) |
253 | if (!latency_record[i].backtrace[q]) | ||
254 | break; | 253 | break; |
255 | if (latency_record[i].backtrace[q] == ULONG_MAX) | 254 | if (bt == ULONG_MAX) |
256 | break; | 255 | break; |
257 | sprint_symbol(sym, latency_record[i].backtrace[q]); | 256 | seq_printf(m, " %ps", (void *)bt); |
258 | c = strchr(sym, '+'); | ||
259 | if (c) | ||
260 | *c = 0; | ||
261 | seq_printf(m, "%s ", sym); | ||
262 | } | 257 | } |
263 | seq_printf(m, "\n"); | 258 | seq_printf(m, "\n"); |
264 | } | 259 | } |
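
On the latencytop conversion above: the removed code rendered a symbol with sprint_symbol() and chopped the +0x.../... offset at '+' by hand, which %ps now does in one formatting step. The old trick, reduced to plain C with a sample string:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* the kind of string sprint_symbol() hands back (sample value) */
		char sym[] = "do_sys_open+0x4b/0x110";
		char *c = strchr(sym, '+');

		if (c)
			*c = 0;			/* drop the offset, as the old code did */
		printf("%s\n", sym);		/* "do_sys_open" -- what %ps prints */
		return 0;
	}
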
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
2292 | } | 2292 | } |
2293 | 2293 | ||
2294 | /* | 2294 | /* |
2295 | * Debugging helper: via this flag we know that we are in | ||
2296 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
2297 | */ | ||
2298 | static int early_boot_irqs_enabled; | ||
2299 | |||
2300 | void early_boot_irqs_off(void) | ||
2301 | { | ||
2302 | early_boot_irqs_enabled = 0; | ||
2303 | } | ||
2304 | |||
2305 | void early_boot_irqs_on(void) | ||
2306 | { | ||
2307 | early_boot_irqs_enabled = 1; | ||
2308 | } | ||
2309 | |||
2310 | /* | ||
2311 | * Hardirqs will be enabled: | 2295 | * Hardirqs will be enabled: |
2312 | */ | 2296 | */ |
2313 | void trace_hardirqs_on_caller(unsigned long ip) | 2297 | void trace_hardirqs_on_caller(unsigned long ip) |
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2319 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2303 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2320 | return; | 2304 | return; |
2321 | 2305 | ||
2322 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | 2306 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
2323 | return; | 2307 | return; |
2324 | 2308 | ||
2325 | if (unlikely(curr->hardirqs_enabled)) { | 2309 | if (unlikely(curr->hardirqs_enabled)) { |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d7..1969d2fc4b36 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
494 | namelen += 2; | 494 | namelen += 2; |
495 | 495 | ||
496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
497 | char sym[KSYM_SYMBOL_LEN]; | ||
498 | char ip[32]; | 497 | char ip[32]; |
499 | 498 | ||
500 | if (class->contention_point[i] == 0) | 499 | if (class->contention_point[i] == 0) |
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
503 | if (!i) | 502 | if (!i) |
504 | seq_line(m, '-', 40-namelen, namelen); | 503 | seq_line(m, '-', 40-namelen, namelen); |
505 | 504 | ||
506 | sprint_symbol(sym, class->contention_point[i]); | ||
507 | snprintf(ip, sizeof(ip), "[<%p>]", | 505 | snprintf(ip, sizeof(ip), "[<%p>]", |
508 | (void *)class->contention_point[i]); | 506 | (void *)class->contention_point[i]); |
509 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 507 | seq_printf(m, "%40s %14lu %29s %pS\n", |
510 | stats->contention_point[i], | 508 | name, stats->contention_point[i], |
511 | ip, sym); | 509 | ip, (void *)class->contention_point[i]); |
512 | } | 510 | } |
513 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 511 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
514 | char sym[KSYM_SYMBOL_LEN]; | ||
515 | char ip[32]; | 512 | char ip[32]; |
516 | 513 | ||
517 | if (class->contending_point[i] == 0) | 514 | if (class->contending_point[i] == 0) |
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
520 | if (!i) | 517 | if (!i) |
521 | seq_line(m, '-', 40-namelen, namelen); | 518 | seq_line(m, '-', 40-namelen, namelen); |
522 | 519 | ||
523 | sprint_symbol(sym, class->contending_point[i]); | ||
524 | snprintf(ip, sizeof(ip), "[<%p>]", | 520 | snprintf(ip, sizeof(ip), "[<%p>]", |
525 | (void *)class->contending_point[i]); | 521 | (void *)class->contending_point[i]); |
526 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 522 | seq_printf(m, "%40s %14lu %29s %pS\n", |
527 | stats->contending_point[i], | 523 | name, stats->contending_point[i], |
528 | ip, sym); | 524 | ip, (void *)class->contending_point[i]); |
529 | } | 525 | } |
530 | if (i) { | 526 | if (i) { |
531 | seq_puts(m, "\n"); | 527 | seq_puts(m, "\n"); |
diff --git a/kernel/module.c b/kernel/module.c index d190664f25ff..efa290ea94bf 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | 58 | #include <linux/jump_label.h> |
59 | #include <linux/pfn.h> | ||
59 | 60 | ||
60 | #define CREATE_TRACE_POINTS | 61 | #define CREATE_TRACE_POINTS |
61 | #include <trace/events/module.h> | 62 | #include <trace/events/module.h> |
@@ -70,6 +71,26 @@ | |||
70 | #define ARCH_SHF_SMALL 0 | 71 | #define ARCH_SHF_SMALL 0 |
71 | #endif | 72 | #endif |
72 | 73 | ||
74 | /* | ||
75 | * Modules' sections will be aligned on page boundaries | ||
76 | * to ensure complete separation of code and data, but | ||
77 | * only when CONFIG_DEBUG_SET_MODULE_RONX=y | ||
78 | */ | ||
79 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
80 | # define debug_align(X) ALIGN(X, PAGE_SIZE) | ||
81 | #else | ||
82 | # define debug_align(X) (X) | ||
83 | #endif | ||
84 | |||
85 | /* | ||
86 | * Given BASE and SIZE this macro calculates the number of pages the | ||
88 | * memory region occupies | ||
88 | */ | ||
89 | #define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ | ||
90 | (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ | ||
91 | PFN_DOWN((unsigned long)BASE) + 1) \ | ||
92 | : (0UL)) | ||
93 | |||
73 | /* If this is set, the section belongs in the init part of the module */ | 94 | /* If this is set, the section belongs in the init part of the module */ |
74 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) | 95 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) |
75 | 96 | ||
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) | |||
1542 | return 0; | 1563 | return 0; |
1543 | } | 1564 | } |
1544 | 1565 | ||
1566 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
1567 | /* | ||
1568 | * LKM RO/NX protection: protect module's text/ro-data | ||
1569 | * from modification and any data from execution. | ||
1570 | */ | ||
1571 | void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) | ||
1572 | { | ||
1573 | unsigned long begin_pfn = PFN_DOWN((unsigned long)start); | ||
1574 | unsigned long end_pfn = PFN_DOWN((unsigned long)end); | ||
1575 | |||
1576 | if (end_pfn > begin_pfn) | ||
1577 | set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
1578 | } | ||
1579 | |||
1580 | static void set_section_ro_nx(void *base, | ||
1581 | unsigned long text_size, | ||
1582 | unsigned long ro_size, | ||
1583 | unsigned long total_size) | ||
1584 | { | ||
1585 | /* begin and end PFNs of the current subsection */ | ||
1586 | unsigned long begin_pfn; | ||
1587 | unsigned long end_pfn; | ||
1588 | |||
1589 | /* | ||
1590 | * Set RO for module text and RO-data: | ||
1591 | * - Always protect first page. | ||
1592 | * - Do not protect last partial page. | ||
1593 | */ | ||
1594 | if (ro_size > 0) | ||
1595 | set_page_attributes(base, base + ro_size, set_memory_ro); | ||
1596 | |||
1597 | /* | ||
1598 | * Set NX permissions for module data: | ||
1599 | * - Do not protect first partial page. | ||
1600 | * - Always protect last page. | ||
1601 | */ | ||
1602 | if (total_size > text_size) { | ||
1603 | begin_pfn = PFN_UP((unsigned long)base + text_size); | ||
1604 | end_pfn = PFN_UP((unsigned long)base + total_size); | ||
1605 | if (end_pfn > begin_pfn) | ||
1606 | set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
1607 | } | ||
1608 | } | ||
1609 | |||
1610 | /* Set memory back to RW+NX before releasing it */ | ||
1611 | void unset_section_ro_nx(struct module *mod, void *module_region) | ||
1612 | { | ||
1613 | unsigned long total_pages; | ||
1614 | |||
1615 | if (mod->module_core == module_region) { | ||
1616 | /* Set core as NX+RW */ | ||
1617 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); | ||
1618 | set_memory_nx((unsigned long)mod->module_core, total_pages); | ||
1619 | set_memory_rw((unsigned long)mod->module_core, total_pages); | ||
1620 | |||
1621 | } else if (mod->module_init == module_region) { | ||
1622 | /* Set init as NX+RW */ | ||
1623 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); | ||
1624 | set_memory_nx((unsigned long)mod->module_init, total_pages); | ||
1625 | set_memory_rw((unsigned long)mod->module_init, total_pages); | ||
1626 | } | ||
1627 | } | ||
1628 | |||
1629 | /* Iterate through all modules and set each module's text as RW */ | ||
1630 | void set_all_modules_text_rw(void) | ||
1631 | { | ||
1632 | struct module *mod; | ||
1633 | |||
1634 | mutex_lock(&module_mutex); | ||
1635 | list_for_each_entry_rcu(mod, &modules, list) { | ||
1636 | if ((mod->module_core) && (mod->core_text_size)) { | ||
1637 | set_page_attributes(mod->module_core, | ||
1638 | mod->module_core + mod->core_text_size, | ||
1639 | set_memory_rw); | ||
1640 | } | ||
1641 | if ((mod->module_init) && (mod->init_text_size)) { | ||
1642 | set_page_attributes(mod->module_init, | ||
1643 | mod->module_init + mod->init_text_size, | ||
1644 | set_memory_rw); | ||
1645 | } | ||
1646 | } | ||
1647 | mutex_unlock(&module_mutex); | ||
1648 | } | ||
1649 | |||
1650 | /* Iterate through all modules and set each module's text as RO */ | ||
1651 | void set_all_modules_text_ro(void) | ||
1652 | { | ||
1653 | struct module *mod; | ||
1654 | |||
1655 | mutex_lock(&module_mutex); | ||
1656 | list_for_each_entry_rcu(mod, &modules, list) { | ||
1657 | if ((mod->module_core) && (mod->core_text_size)) { | ||
1658 | set_page_attributes(mod->module_core, | ||
1659 | mod->module_core + mod->core_text_size, | ||
1660 | set_memory_ro); | ||
1661 | } | ||
1662 | if ((mod->module_init) && (mod->init_text_size)) { | ||
1663 | set_page_attributes(mod->module_init, | ||
1664 | mod->module_init + mod->init_text_size, | ||
1665 | set_memory_ro); | ||
1666 | } | ||
1667 | } | ||
1668 | mutex_unlock(&module_mutex); | ||
1669 | } | ||
1670 | #else | ||
1671 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | ||
1672 | static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } | ||
1673 | #endif | ||
1674 | |||
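
The RO/NX block above uses set_memory_ro()/set_memory_nx() on whole pages, which is why sections are page-aligned first. As a userspace analogy (not the kernel API), mprotect() makes the same kind of page-granular permission change:

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long pg = sysconf(_SC_PAGESIZE);
		char *buf = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		strcpy(buf, "module text");
		mprotect(buf, pg, PROT_READ);	/* first page now read-only */
		printf("%s\n", buf);		/* reads are unaffected */
		munmap(buf, 2 * pg);
		return 0;
	}
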
1545 | /* Free a module, remove from lists, etc. */ | 1675 | /* Free a module, remove from lists, etc. */ |
1546 | static void free_module(struct module *mod) | 1676 | static void free_module(struct module *mod) |
1547 | { | 1677 | { |
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) | |||
1566 | destroy_params(mod->kp, mod->num_kp); | 1696 | destroy_params(mod->kp, mod->num_kp); |
1567 | 1697 | ||
1568 | /* This may be NULL, but that's OK */ | 1698 | /* This may be NULL, but that's OK */ |
1699 | unset_section_ro_nx(mod, mod->module_init); | ||
1569 | module_free(mod, mod->module_init); | 1700 | module_free(mod, mod->module_init); |
1570 | kfree(mod->args); | 1701 | kfree(mod->args); |
1571 | percpu_modfree(mod); | 1702 | percpu_modfree(mod); |
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) | |||
1574 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1705 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1575 | 1706 | ||
1576 | /* Finally, free the core (containing the module structure) */ | 1707 | /* Finally, free the core (containing the module structure) */ |
1708 | unset_section_ro_nx(mod, mod->module_core); | ||
1577 | module_free(mod, mod->module_core); | 1709 | module_free(mod, mod->module_core); |
1578 | 1710 | ||
1579 | #ifdef CONFIG_MPU | 1711 | #ifdef CONFIG_MPU |
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1777 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 1909 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1778 | DEBUGP("\t%s\n", name); | 1910 | DEBUGP("\t%s\n", name); |
1779 | } | 1911 | } |
1780 | if (m == 0) | 1912 | switch (m) { |
1913 | case 0: /* executable */ | ||
1914 | mod->core_size = debug_align(mod->core_size); | ||
1781 | mod->core_text_size = mod->core_size; | 1915 | mod->core_text_size = mod->core_size; |
1916 | break; | ||
1917 | case 1: /* RO: text and ro-data */ | ||
1918 | mod->core_size = debug_align(mod->core_size); | ||
1919 | mod->core_ro_size = mod->core_size; | ||
1920 | break; | ||
1921 | case 3: /* whole core */ | ||
1922 | mod->core_size = debug_align(mod->core_size); | ||
1923 | break; | ||
1924 | } | ||
1782 | } | 1925 | } |
1783 | 1926 | ||
1784 | DEBUGP("Init section allocation order:\n"); | 1927 | DEBUGP("Init section allocation order:\n"); |
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1796 | | INIT_OFFSET_MASK); | 1939 | | INIT_OFFSET_MASK); |
1797 | DEBUGP("\t%s\n", sname); | 1940 | DEBUGP("\t%s\n", sname); |
1798 | } | 1941 | } |
1799 | if (m == 0) | 1942 | switch (m) { |
1943 | case 0: /* executable */ | ||
1944 | mod->init_size = debug_align(mod->init_size); | ||
1800 | mod->init_text_size = mod->init_size; | 1945 | mod->init_text_size = mod->init_size; |
1946 | break; | ||
1947 | case 1: /* RO: text and ro-data */ | ||
1948 | mod->init_size = debug_align(mod->init_size); | ||
1949 | mod->init_ro_size = mod->init_size; | ||
1950 | break; | ||
1951 | case 3: /* whole init */ | ||
1952 | mod->init_size = debug_align(mod->init_size); | ||
1953 | break; | ||
1954 | } | ||
1801 | } | 1955 | } |
1802 | } | 1956 | } |
1803 | 1957 | ||
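
The switch above pads each class of sections with debug_align(), i.e. ALIGN(x, PAGE_SIZE) when CONFIG_DEBUG_SET_MODULE_RONX=y, so text, ro-data and writable data never share a page. The rounding itself:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
	#define debug_align(x) ALIGN(x, PAGE_SIZE)	/* RONX build assumed */

	int main(void)
	{
		unsigned long core_size = 5000;	/* bytes laid out so far */

		/* the next class of sections starts on a fresh page */
		printf("%lu\n", debug_align(core_size));	/* 8192 */
		return 0;
	}
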
@@ -2306,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2306 | #endif | 2460 | #endif |
2307 | 2461 | ||
2308 | #ifdef CONFIG_TRACEPOINTS | 2462 | #ifdef CONFIG_TRACEPOINTS |
2309 | mod->tracepoints = section_objs(info, "__tracepoints", | 2463 | mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", |
2310 | sizeof(*mod->tracepoints), | 2464 | sizeof(*mod->tracepoints_ptrs), |
2311 | &mod->num_tracepoints); | 2465 | &mod->num_tracepoints); |
2312 | #endif | 2466 | #endif |
2313 | #ifdef HAVE_JUMP_LABEL | 2467 | #ifdef HAVE_JUMP_LABEL |
2314 | mod->jump_entries = section_objs(info, "__jump_table", | 2468 | mod->jump_entries = section_objs(info, "__jump_table", |
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2722 | blocking_notifier_call_chain(&module_notify_list, | 2876 | blocking_notifier_call_chain(&module_notify_list, |
2723 | MODULE_STATE_COMING, mod); | 2877 | MODULE_STATE_COMING, mod); |
2724 | 2878 | ||
2879 | /* Set RO and NX regions for core */ | ||
2880 | set_section_ro_nx(mod->module_core, | ||
2881 | mod->core_text_size, | ||
2882 | mod->core_ro_size, | ||
2883 | mod->core_size); | ||
2884 | |||
2885 | /* Set RO and NX regions for init */ | ||
2886 | set_section_ro_nx(mod->module_init, | ||
2887 | mod->init_text_size, | ||
2888 | mod->init_ro_size, | ||
2889 | mod->init_size); | ||
2890 | |||
2725 | do_mod_ctors(mod); | 2891 | do_mod_ctors(mod); |
2726 | /* Start the module */ | 2892 | /* Start the module */ |
2727 | if (mod->init != NULL) | 2893 | if (mod->init != NULL) |
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2765 | mod->symtab = mod->core_symtab; | 2931 | mod->symtab = mod->core_symtab; |
2766 | mod->strtab = mod->core_strtab; | 2932 | mod->strtab = mod->core_strtab; |
2767 | #endif | 2933 | #endif |
2934 | unset_section_ro_nx(mod, mod->module_init); | ||
2768 | module_free(mod, mod->module_init); | 2935 | module_free(mod, mod->module_init); |
2769 | mod->module_init = NULL; | 2936 | mod->module_init = NULL; |
2770 | mod->init_size = 0; | 2937 | mod->init_size = 0; |
@@ -3226,7 +3393,7 @@ void module_layout(struct module *mod, | |||
3226 | struct modversion_info *ver, | 3393 | struct modversion_info *ver, |
3227 | struct kernel_param *kp, | 3394 | struct kernel_param *kp, |
3228 | struct kernel_symbol *ks, | 3395 | struct kernel_symbol *ks, |
3229 | struct tracepoint *tp) | 3396 | struct tracepoint * const *tp) |
3230 | { | 3397 | { |
3231 | } | 3398 | } |
3232 | EXPORT_SYMBOL(module_layout); | 3399 | EXPORT_SYMBOL(module_layout); |
@@ -3240,8 +3407,8 @@ void module_update_tracepoints(void) | |||
3240 | mutex_lock(&module_mutex); | 3407 | mutex_lock(&module_mutex); |
3241 | list_for_each_entry(mod, &modules, list) | 3408 | list_for_each_entry(mod, &modules, list) |
3242 | if (!mod->taints) | 3409 | if (!mod->taints) |
3243 | tracepoint_update_probe_range(mod->tracepoints, | 3410 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
3244 | mod->tracepoints + mod->num_tracepoints); | 3411 | mod->tracepoints_ptrs + mod->num_tracepoints); |
3245 | mutex_unlock(&module_mutex); | 3412 | mutex_unlock(&module_mutex); |
3246 | } | 3413 | } |
3247 | 3414 | ||
@@ -3265,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) | |||
3265 | else if (iter_mod > iter->module) | 3432 | else if (iter_mod > iter->module) |
3266 | iter->tracepoint = NULL; | 3433 | iter->tracepoint = NULL; |
3267 | found = tracepoint_get_iter_range(&iter->tracepoint, | 3434 | found = tracepoint_get_iter_range(&iter->tracepoint, |
3268 | iter_mod->tracepoints, | 3435 | iter_mod->tracepoints_ptrs, |
3269 | iter_mod->tracepoints | 3436 | iter_mod->tracepoints_ptrs |
3270 | + iter_mod->num_tracepoints); | 3437 | + iter_mod->num_tracepoints); |
3271 | if (found) { | 3438 | if (found) { |
3272 | iter->module = iter_mod; | 3439 | iter->module = iter_mod; |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502f..a5889fb28ecf 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
199 | * memory barriers as we'll eventually observe the right | 199 | * memory barriers as we'll eventually observe the right |
200 | * values at the cost of a few extra spins. | 200 | * values at the cost of a few extra spins. |
201 | */ | 201 | */ |
202 | cpu_relax(); | 202 | arch_mutex_cpu_relax(); |
203 | } | 203 | } |
204 | #endif | 204 | #endif |
205 | spin_lock_mutex(&lock->wait_lock, flags); | 205 | spin_lock_mutex(&lock->wait_lock, flags); |
diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88ebb..991bb87a1704 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -34,6 +34,7 @@ static int pause_on_oops_flag; | |||
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | 35 | ||
36 | int panic_timeout; | 36 | int panic_timeout; |
37 | EXPORT_SYMBOL_GPL(panic_timeout); | ||
37 | 38 | ||
38 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
39 | 40 | ||
diff --git a/kernel/params.c b/kernel/params.c index 08107d181758..0da1411222b9 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) | |||
719 | params[i].ops->free(params[i].arg); | 719 | params[i].ops->free(params[i].arg); |
720 | } | 720 | } |
721 | 721 | ||
722 | static void __init kernel_add_sysfs_param(const char *name, | 722 | static struct module_kobject * __init locate_module_kobject(const char *name) |
723 | struct kernel_param *kparam, | ||
724 | unsigned int name_skip) | ||
725 | { | 723 | { |
726 | struct module_kobject *mk; | 724 | struct module_kobject *mk; |
727 | struct kobject *kobj; | 725 | struct kobject *kobj; |
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
729 | 727 | ||
730 | kobj = kset_find_obj(module_kset, name); | 728 | kobj = kset_find_obj(module_kset, name); |
731 | if (kobj) { | 729 | if (kobj) { |
732 | /* We already have one. Remove params so we can add more. */ | ||
733 | mk = to_module_kobject(kobj); | 730 | mk = to_module_kobject(kobj); |
734 | /* We need to remove it before adding parameters. */ | ||
735 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
736 | } else { | 731 | } else { |
737 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); | 732 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
738 | BUG_ON(!mk); | 733 | BUG_ON(!mk); |
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
743 | "%s", name); | 738 | "%s", name); |
744 | if (err) { | 739 | if (err) { |
745 | kobject_put(&mk->kobj); | 740 | kobject_put(&mk->kobj); |
746 | printk(KERN_ERR "Module '%s' failed add to sysfs, " | 741 | printk(KERN_ERR |
747 | "error number %d\n", name, err); | 742 | "Module '%s' failed add to sysfs, error number %d\n", |
748 | printk(KERN_ERR "The system will be unstable now.\n"); | 743 | name, err); |
749 | return; | 744 | printk(KERN_ERR |
745 | "The system will be unstable now.\n"); | ||
746 | return NULL; | ||
750 | } | 747 | } |
751 | /* So that exit path is even. */ | 748 | |
749 | /* So that we hold a reference in both cases. */ | ||
752 | kobject_get(&mk->kobj); | 750 | kobject_get(&mk->kobj); |
753 | } | 751 | } |
754 | 752 | ||
753 | return mk; | ||
754 | } | ||
755 | |||
756 | static void __init kernel_add_sysfs_param(const char *name, | ||
757 | struct kernel_param *kparam, | ||
758 | unsigned int name_skip) | ||
759 | { | ||
760 | struct module_kobject *mk; | ||
761 | int err; | ||
762 | |||
763 | mk = locate_module_kobject(name); | ||
764 | if (!mk) | ||
765 | return; | ||
766 | |||
767 | /* We need to remove old parameters before adding more. */ | ||
768 | if (mk->mp) | ||
769 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
770 | |||
755 | /* These should not fail at boot. */ | 771 | /* These should not fail at boot. */ |
756 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); | 772 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); |
757 | BUG_ON(err); | 773 | BUG_ON(err); |
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) | |||
796 | } | 812 | } |
797 | } | 813 | } |
798 | 814 | ||
815 | ssize_t __modver_version_show(struct module_attribute *mattr, | ||
816 | struct module *mod, char *buf) | ||
817 | { | ||
818 | struct module_version_attribute *vattr = | ||
819 | container_of(mattr, struct module_version_attribute, mattr); | ||
820 | |||
821 | return sprintf(buf, "%s\n", vattr->version); | ||
822 | } | ||
823 | |||
824 | extern struct module_version_attribute __start___modver[], __stop___modver[]; | ||
825 | |||
826 | static void __init version_sysfs_builtin(void) | ||
827 | { | ||
828 | const struct module_version_attribute *vattr; | ||
829 | struct module_kobject *mk; | ||
830 | int err; | ||
831 | |||
832 | for (vattr = __start___modver; vattr < __stop___modver; vattr++) { | ||
833 | mk = locate_module_kobject(vattr->module_name); | ||
834 | if (mk) { | ||
835 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | ||
836 | kobject_uevent(&mk->kobj, KOBJ_ADD); | ||
837 | kobject_put(&mk->kobj); | ||
838 | } | ||
839 | } | ||
840 | } | ||
799 | 841 | ||
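
version_sysfs_builtin() above iterates a linker-built array between __start___modver and __stop___modver. The same pattern works in userspace on ELF with GCC/clang, which synthesize __start_/__stop_ symbols for any section named as a valid C identifier (a sketch; the section and struct here are hypothetical):

	#include <stdio.h>

	struct ver {
		const char *module;
		const char *version;
	};

	#define MODVER(mod, v) \
		static const struct ver _ver_##mod \
		__attribute__((used, section("modver"))) = { #mod, v }

	MODVER(e1000, "7.3.21-k8");
	MODVER(usbcore, "1.0");

	/* the linker provides these bounds for the "modver" section */
	extern const struct ver __start_modver[], __stop_modver[];

	int main(void)
	{
		for (const struct ver *p = __start_modver; p < __stop_modver; p++)
			printf("%s: %s\n", p->module, p->version);
		return 0;
	}
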
800 | /* module-related sysfs stuff */ | 842 | /* module-related sysfs stuff */ |
801 | 843 | ||
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) | |||
875 | } | 917 | } |
876 | module_sysfs_initialized = 1; | 918 | module_sysfs_initialized = 1; |
877 | 919 | ||
920 | version_sysfs_builtin(); | ||
878 | param_sysfs_builtin(); | 921 | param_sysfs_builtin(); |
879 | 922 | ||
880 | return 0; | 923 | return 0; |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2870feee81dd..999835b6112b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/smp.h> | 15 | #include <linux/smp.h> |
16 | #include <linux/idr.h> | ||
16 | #include <linux/file.h> | 17 | #include <linux/file.h> |
17 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
@@ -21,7 +22,9 @@ | |||
21 | #include <linux/dcache.h> | 22 | #include <linux/dcache.h> |
22 | #include <linux/percpu.h> | 23 | #include <linux/percpu.h> |
23 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
25 | #include <linux/reboot.h> | ||
24 | #include <linux/vmstat.h> | 26 | #include <linux/vmstat.h> |
27 | #include <linux/device.h> | ||
25 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
26 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
27 | #include <linux/rculist.h> | 30 | #include <linux/rculist.h> |
@@ -35,6 +38,12 @@ | |||
35 | 38 | ||
36 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
37 | 40 | ||
41 | enum event_type_t { | ||
42 | EVENT_FLEXIBLE = 0x1, | ||
43 | EVENT_PINNED = 0x2, | ||
44 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
45 | }; | ||
46 | |||
38 | atomic_t perf_task_events __read_mostly; | 47 | atomic_t perf_task_events __read_mostly; |
39 | static atomic_t nr_mmap_events __read_mostly; | 48 | static atomic_t nr_mmap_events __read_mostly; |
40 | static atomic_t nr_comm_events __read_mostly; | 49 | static atomic_t nr_comm_events __read_mostly; |
@@ -62,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
62 | 71 | ||
63 | static atomic64_t perf_event_id; | 72 | static atomic64_t perf_event_id; |
64 | 73 | ||
74 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | ||
75 | enum event_type_t event_type); | ||
76 | |||
77 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | ||
78 | enum event_type_t event_type); | ||
79 | |||
65 | void __weak perf_event_print_debug(void) { } | 80 | void __weak perf_event_print_debug(void) { } |
66 | 81 | ||
67 | extern __weak const char *perf_pmu_name(void) | 82 | extern __weak const char *perf_pmu_name(void) |
@@ -69,6 +84,11 @@ extern __weak const char *perf_pmu_name(void) | |||
69 | return "pmu"; | 84 | return "pmu"; |
70 | } | 85 | } |
71 | 86 | ||
87 | static inline u64 perf_clock(void) | ||
88 | { | ||
89 | return local_clock(); | ||
90 | } | ||
91 | |||
72 | void perf_pmu_disable(struct pmu *pmu) | 92 | void perf_pmu_disable(struct pmu *pmu) |
73 | { | 93 | { |
74 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 94 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -133,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
133 | } | 153 | } |
134 | } | 154 | } |
135 | 155 | ||
156 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
157 | { | ||
158 | /* | ||
159 | * only top level events have the pid namespace they were created in | ||
160 | */ | ||
161 | if (event->parent) | ||
162 | event = event->parent; | ||
163 | |||
164 | return task_tgid_nr_ns(p, event->ns); | ||
165 | } | ||
166 | |||
167 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
168 | { | ||
169 | /* | ||
170 | * only top level events have the pid namespace they were created in | ||
171 | */ | ||
172 | if (event->parent) | ||
173 | event = event->parent; | ||
174 | |||
175 | return task_pid_nr_ns(p, event->ns); | ||
176 | } | ||
177 | |||
136 | /* | 178 | /* |
137 | * If we inherit events we want to return the parent event id | 179 | * If we inherit events we want to return the parent event id |
138 | * to userspace. | 180 | * to userspace. |
@@ -215,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
215 | put_ctx(ctx); | 257 | put_ctx(ctx); |
216 | } | 258 | } |
217 | 259 | ||
218 | static inline u64 perf_clock(void) | ||
219 | { | ||
220 | return local_clock(); | ||
221 | } | ||
222 | |||
223 | /* | 260 | /* |
224 | * Update the record of the current time in a context. | 261 | * Update the record of the current time in a context. |
225 | */ | 262 | */ |
@@ -231,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx) | |||
231 | ctx->timestamp = now; | 268 | ctx->timestamp = now; |
232 | } | 269 | } |
233 | 270 | ||
271 | static u64 perf_event_time(struct perf_event *event) | ||
272 | { | ||
273 | struct perf_event_context *ctx = event->ctx; | ||
274 | return ctx ? ctx->time : 0; | ||
275 | } | ||
276 | |||
234 | /* | 277 | /* |
235 | * Update the total_time_enabled and total_time_running fields for an event. | 278 | * Update the total_time_enabled and total_time_running fields for an event. |
236 | */ | 279 | */ |
@@ -244,7 +287,7 @@ static void update_event_times(struct perf_event *event) | |||
244 | return; | 287 | return; |
245 | 288 | ||
246 | if (ctx->is_active) | 289 | if (ctx->is_active) |
247 | run_end = ctx->time; | 290 | run_end = perf_event_time(event); |
248 | else | 291 | else |
249 | run_end = event->tstamp_stopped; | 292 | run_end = event->tstamp_stopped; |
250 | 293 | ||
@@ -253,7 +296,7 @@ static void update_event_times(struct perf_event *event) | |||
253 | if (event->state == PERF_EVENT_STATE_INACTIVE) | 296 | if (event->state == PERF_EVENT_STATE_INACTIVE) |
254 | run_end = event->tstamp_stopped; | 297 | run_end = event->tstamp_stopped; |
255 | else | 298 | else |
256 | run_end = ctx->time; | 299 | run_end = perf_event_time(event); |
257 | 300 | ||
258 | event->total_time_running = run_end - event->tstamp_running; | 301 | event->total_time_running = run_end - event->tstamp_running; |
259 | } | 302 | } |
@@ -312,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
312 | ctx->nr_stat++; | 355 | ctx->nr_stat++; |
313 | } | 356 | } |
314 | 357 | ||
358 | /* | ||
359 | * Called at perf_event creation and when events are attached/detached from a | ||
360 | * group. | ||
361 | */ | ||
362 | static void perf_event__read_size(struct perf_event *event) | ||
363 | { | ||
364 | int entry = sizeof(u64); /* value */ | ||
365 | int size = 0; | ||
366 | int nr = 1; | ||
367 | |||
368 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
369 | size += sizeof(u64); | ||
370 | |||
371 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
372 | size += sizeof(u64); | ||
373 | |||
374 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
375 | entry += sizeof(u64); | ||
376 | |||
377 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
378 | nr += event->group_leader->nr_siblings; | ||
379 | size += sizeof(u64); | ||
380 | } | ||
381 | |||
382 | size += entry * nr; | ||
383 | event->read_size = size; | ||
384 | } | ||
385 | |||
386 | static void perf_event__header_size(struct perf_event *event) | ||
387 | { | ||
388 | struct perf_sample_data *data; | ||
389 | u64 sample_type = event->attr.sample_type; | ||
390 | u16 size = 0; | ||
391 | |||
392 | perf_event__read_size(event); | ||
393 | |||
394 | if (sample_type & PERF_SAMPLE_IP) | ||
395 | size += sizeof(data->ip); | ||
396 | |||
397 | if (sample_type & PERF_SAMPLE_ADDR) | ||
398 | size += sizeof(data->addr); | ||
399 | |||
400 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
401 | size += sizeof(data->period); | ||
402 | |||
403 | if (sample_type & PERF_SAMPLE_READ) | ||
404 | size += event->read_size; | ||
405 | |||
406 | event->header_size = size; | ||
407 | } | ||
408 | |||
409 | static void perf_event__id_header_size(struct perf_event *event) | ||
410 | { | ||
411 | struct perf_sample_data *data; | ||
412 | u64 sample_type = event->attr.sample_type; | ||
413 | u16 size = 0; | ||
414 | |||
415 | if (sample_type & PERF_SAMPLE_TID) | ||
416 | size += sizeof(data->tid_entry); | ||
417 | |||
418 | if (sample_type & PERF_SAMPLE_TIME) | ||
419 | size += sizeof(data->time); | ||
420 | |||
421 | if (sample_type & PERF_SAMPLE_ID) | ||
422 | size += sizeof(data->id); | ||
423 | |||
424 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
425 | size += sizeof(data->stream_id); | ||
426 | |||
427 | if (sample_type & PERF_SAMPLE_CPU) | ||
428 | size += sizeof(data->cpu_entry); | ||
429 | |||
430 | event->id_header_size = size; | ||
431 | } | ||
432 | |||
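
perf_event__read_size() above sizes the read() record from the read_format bits: some flags grow the fixed part, PERF_FORMAT_ID grows each entry, and PERF_FORMAT_GROUP multiplies entries by the sibling count. The same computation in standalone C (flag values mirror the perf ABI, but treat them as illustrative):

	#include <stdio.h>

	#define FMT_TOTAL_TIME_ENABLED	(1U << 0)
	#define FMT_TOTAL_TIME_RUNNING	(1U << 1)
	#define FMT_ID			(1U << 2)
	#define FMT_GROUP		(1U << 3)

	/* Same shape as perf_event__read_size() */
	static int read_size(unsigned int fmt, int nr_siblings)
	{
		int entry = 8;			/* the u64 value itself */
		int size = 0;
		int nr = 1;

		if (fmt & FMT_TOTAL_TIME_ENABLED)
			size += 8;
		if (fmt & FMT_TOTAL_TIME_RUNNING)
			size += 8;
		if (fmt & FMT_ID)
			entry += 8;
		if (fmt & FMT_GROUP) {
			nr += nr_siblings;
			size += 8;		/* the nr field itself */
		}
		return size + entry * nr;
	}

	int main(void)
	{
		/* ID + GROUP with two siblings: 8 + 16 * 3 = 56 bytes */
		printf("%d\n", read_size(FMT_ID | FMT_GROUP, 2));
		return 0;
	}
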
315 | static void perf_group_attach(struct perf_event *event) | 433 | static void perf_group_attach(struct perf_event *event) |
316 | { | 434 | { |
317 | struct perf_event *group_leader = event->group_leader; | 435 | struct perf_event *group_leader = event->group_leader, *pos; |
318 | 436 | ||
319 | /* | 437 | /* |
320 | * We can have double attach due to group movement in perf_event_open. | 438 | * We can have double attach due to group movement in perf_event_open. |
@@ -333,6 +451,11 @@ static void perf_group_attach(struct perf_event *event) | |||
333 | 451 | ||
334 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 452 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
335 | group_leader->nr_siblings++; | 453 | group_leader->nr_siblings++; |
454 | |||
455 | perf_event__header_size(group_leader); | ||
456 | |||
457 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | ||
458 | perf_event__header_size(pos); | ||
336 | } | 459 | } |
337 | 460 | ||
338 | /* | 461 | /* |
@@ -391,7 +514,7 @@ static void perf_group_detach(struct perf_event *event) | |||
391 | if (event->group_leader != event) { | 514 | if (event->group_leader != event) { |
392 | list_del_init(&event->group_entry); | 515 | list_del_init(&event->group_entry); |
393 | event->group_leader->nr_siblings--; | 516 | event->group_leader->nr_siblings--; |
394 | return; | 517 | goto out; |
395 | } | 518 | } |
396 | 519 | ||
397 | if (!list_empty(&event->group_entry)) | 520 | if (!list_empty(&event->group_entry)) |
@@ -410,6 +533,12 @@ static void perf_group_detach(struct perf_event *event) | |||
410 | /* Inherit group flags from the previous leader */ | 533 | /* Inherit group flags from the previous leader */ |
411 | sibling->group_flags = event->group_flags; | 534 | sibling->group_flags = event->group_flags; |
412 | } | 535 | } |
536 | |||
537 | out: | ||
538 | perf_event__header_size(event->group_leader); | ||
539 | |||
540 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | ||
541 | perf_event__header_size(tmp); | ||
413 | } | 542 | } |
414 | 543 | ||
415 | static inline int | 544 | static inline int |
@@ -423,6 +552,7 @@ event_sched_out(struct perf_event *event, | |||
423 | struct perf_cpu_context *cpuctx, | 552 | struct perf_cpu_context *cpuctx, |
424 | struct perf_event_context *ctx) | 553 | struct perf_event_context *ctx) |
425 | { | 554 | { |
555 | u64 tstamp = perf_event_time(event); | ||
426 | u64 delta; | 556 | u64 delta; |
427 | /* | 557 | /* |
428 | * An event which could not be activated because of | 558 | * An event which could not be activated because of |
@@ -434,7 +564,7 @@ event_sched_out(struct perf_event *event, | |||
434 | && !event_filter_match(event)) { | 564 | && !event_filter_match(event)) { |
435 | delta = ctx->time - event->tstamp_stopped; | 565 | delta = ctx->time - event->tstamp_stopped; |
436 | event->tstamp_running += delta; | 566 | event->tstamp_running += delta; |
437 | event->tstamp_stopped = ctx->time; | 567 | event->tstamp_stopped = tstamp; |
438 | } | 568 | } |
439 | 569 | ||
440 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 570 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
@@ -445,7 +575,7 @@ event_sched_out(struct perf_event *event, | |||
445 | event->pending_disable = 0; | 575 | event->pending_disable = 0; |
446 | event->state = PERF_EVENT_STATE_OFF; | 576 | event->state = PERF_EVENT_STATE_OFF; |
447 | } | 577 | } |
448 | event->tstamp_stopped = ctx->time; | 578 | event->tstamp_stopped = tstamp; |
449 | event->pmu->del(event, 0); | 579 | event->pmu->del(event, 0); |
450 | event->oncpu = -1; | 580 | event->oncpu = -1; |
451 | 581 | ||
@@ -657,6 +787,8 @@ event_sched_in(struct perf_event *event, | |||
657 | struct perf_cpu_context *cpuctx, | 787 | struct perf_cpu_context *cpuctx, |
658 | struct perf_event_context *ctx) | 788 | struct perf_event_context *ctx) |
659 | { | 789 | { |
790 | u64 tstamp = perf_event_time(event); | ||
791 | |||
660 | if (event->state <= PERF_EVENT_STATE_OFF) | 792 | if (event->state <= PERF_EVENT_STATE_OFF) |
661 | return 0; | 793 | return 0; |
662 | 794 | ||
@@ -673,9 +805,9 @@ event_sched_in(struct perf_event *event, | |||
673 | return -EAGAIN; | 805 | return -EAGAIN; |
674 | } | 806 | } |
675 | 807 | ||
676 | event->tstamp_running += ctx->time - event->tstamp_stopped; | 808 | event->tstamp_running += tstamp - event->tstamp_stopped; |
677 | 809 | ||
678 | event->shadow_ctx_time = ctx->time - ctx->timestamp; | 810 | event->shadow_ctx_time = tstamp - ctx->timestamp; |
679 | 811 | ||
680 | if (!is_software_event(event)) | 812 | if (!is_software_event(event)) |
681 | cpuctx->active_oncpu++; | 813 | cpuctx->active_oncpu++; |
@@ -787,11 +919,13 @@ static int group_can_go_on(struct perf_event *event, | |||
787 | static void add_event_to_ctx(struct perf_event *event, | 919 | static void add_event_to_ctx(struct perf_event *event, |
788 | struct perf_event_context *ctx) | 920 | struct perf_event_context *ctx) |
789 | { | 921 | { |
922 | u64 tstamp = perf_event_time(event); | ||
923 | |||
790 | list_add_event(event, ctx); | 924 | list_add_event(event, ctx); |
791 | perf_group_attach(event); | 925 | perf_group_attach(event); |
792 | event->tstamp_enabled = ctx->time; | 926 | event->tstamp_enabled = tstamp; |
793 | event->tstamp_running = ctx->time; | 927 | event->tstamp_running = tstamp; |
794 | event->tstamp_stopped = ctx->time; | 928 | event->tstamp_stopped = tstamp; |
795 | } | 929 | } |
796 | 930 | ||
797 | /* | 931 | /* |
@@ -826,7 +960,7 @@ static void __perf_install_in_context(void *info) | |||
826 | 960 | ||
827 | add_event_to_ctx(event, ctx); | 961 | add_event_to_ctx(event, ctx); |
828 | 962 | ||
829 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 963 | if (!event_filter_match(event)) |
830 | goto unlock; | 964 | goto unlock; |
831 | 965 | ||
832 | /* | 966 | /* |
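Throughout the file this patch folds the open-coded `event->cpu != -1 && event->cpu != smp_processor_id()` test into event_filter_match(). The helper itself is outside these hunks; presumably it is just that test centralized, along the following lines (hedged sketch as plain C):

```c
#include <stdio.h>

/* Hedged sketch of event_filter_match(): cpu == -1 means the event
 * follows its task to any CPU; otherwise it only counts while the
 * task runs on the CPU it was bound to. */
struct event { int cpu; };

static int event_filter_match(const struct event *e, int this_cpu)
{
	return e->cpu == -1 || e->cpu == this_cpu;
}

int main(void)
{
	struct event any = { .cpu = -1 }, bound = { .cpu = 2 };

	printf("%d %d %d\n",
	       event_filter_match(&any, 0),	/* 1: matches anywhere */
	       event_filter_match(&bound, 2),	/* 1: right CPU */
	       event_filter_match(&bound, 0));	/* 0: filtered out */
	return 0;
}
```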
@@ -931,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
931 | struct perf_event_context *ctx) | 1065 | struct perf_event_context *ctx) |
932 | { | 1066 | { |
933 | struct perf_event *sub; | 1067 | struct perf_event *sub; |
1068 | u64 tstamp = perf_event_time(event); | ||
934 | 1069 | ||
935 | event->state = PERF_EVENT_STATE_INACTIVE; | 1070 | event->state = PERF_EVENT_STATE_INACTIVE; |
936 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 1071 | event->tstamp_enabled = tstamp - event->total_time_enabled; |
937 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | 1072 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
938 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | 1073 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) |
939 | sub->tstamp_enabled = | 1074 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; |
940 | ctx->time - sub->total_time_enabled; | ||
941 | } | ||
942 | } | 1075 | } |
943 | } | 1076 | } |
944 | 1077 | ||
@@ -971,7 +1104,7 @@ static void __perf_event_enable(void *info) | |||
971 | goto unlock; | 1104 | goto unlock; |
972 | __perf_event_mark_enabled(event, ctx); | 1105 | __perf_event_mark_enabled(event, ctx); |
973 | 1106 | ||
974 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1107 | if (!event_filter_match(event)) |
975 | goto unlock; | 1108 | goto unlock; |
976 | 1109 | ||
977 | /* | 1110 | /* |
@@ -1073,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1073 | /* | 1206 | /* |
1074 | * not supported on inherited events | 1207 | * not supported on inherited events |
1075 | */ | 1208 | */ |
1076 | if (event->attr.inherit) | 1209 | if (event->attr.inherit || !is_sampling_event(event)) |
1077 | return -EINVAL; | 1210 | return -EINVAL; |
1078 | 1211 | ||
1079 | atomic_add(refresh, &event->event_limit); | 1212 | atomic_add(refresh, &event->event_limit); |
@@ -1082,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1082 | return 0; | 1215 | return 0; |
1083 | } | 1216 | } |
1084 | 1217 | ||
1085 | enum event_type_t { | ||
1086 | EVENT_FLEXIBLE = 0x1, | ||
1087 | EVENT_PINNED = 0x2, | ||
1088 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
1089 | }; | ||
1090 | |||
1091 | static void ctx_sched_out(struct perf_event_context *ctx, | 1218 | static void ctx_sched_out(struct perf_event_context *ctx, |
1092 | struct perf_cpu_context *cpuctx, | 1219 | struct perf_cpu_context *cpuctx, |
1093 | enum event_type_t event_type) | 1220 | enum event_type_t event_type) |
@@ -1324,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1324 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 1451 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
1325 | if (event->state <= PERF_EVENT_STATE_OFF) | 1452 | if (event->state <= PERF_EVENT_STATE_OFF) |
1326 | continue; | 1453 | continue; |
1327 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1454 | if (!event_filter_match(event)) |
1328 | continue; | 1455 | continue; |
1329 | 1456 | ||
1330 | if (group_can_go_on(event, cpuctx, 1)) | 1457 | if (group_can_go_on(event, cpuctx, 1)) |
@@ -1356,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1356 | * Listen to the 'cpu' scheduling filter constraint | 1483 | * Listen to the 'cpu' scheduling filter constraint |
1357 | * of events: | 1484 | * of events: |
1358 | */ | 1485 | */ |
1359 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1486 | if (!event_filter_match(event)) |
1360 | continue; | 1487 | continue; |
1361 | 1488 | ||
1362 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 1489 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
@@ -1583,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
1583 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1710 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
1584 | continue; | 1711 | continue; |
1585 | 1712 | ||
1586 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1713 | if (!event_filter_match(event)) |
1587 | continue; | 1714 | continue; |
1588 | 1715 | ||
1589 | hwc = &event->hw; | 1716 | hwc = &event->hw; |
@@ -1774,11 +1901,12 @@ static void __perf_event_read(void *info) | |||
1774 | return; | 1901 | return; |
1775 | 1902 | ||
1776 | raw_spin_lock(&ctx->lock); | 1903 | raw_spin_lock(&ctx->lock); |
1777 | update_context_time(ctx); | 1904 | if (ctx->is_active) |
1905 | update_context_time(ctx); | ||
1778 | update_event_times(event); | 1906 | update_event_times(event); |
1907 | if (event->state == PERF_EVENT_STATE_ACTIVE) | ||
1908 | event->pmu->read(event); | ||
1779 | raw_spin_unlock(&ctx->lock); | 1909 | raw_spin_unlock(&ctx->lock); |
1780 | |||
1781 | event->pmu->read(event); | ||
1782 | } | 1910 | } |
1783 | 1911 | ||
1784 | static inline u64 perf_event_count(struct perf_event *event) | 1912 | static inline u64 perf_event_count(struct perf_event *event) |
@@ -1872,8 +2000,7 @@ static int alloc_callchain_buffers(void) | |||
1872 | * accessed from NMI. Use a temporary manual per cpu allocation | 2000 | * accessed from NMI. Use a temporary manual per cpu allocation |
1873 | * until that gets sorted out. | 2001 | * until that gets sorted out. |
1874 | */ | 2002 | */ |
1875 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | 2003 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); |
1876 | num_possible_cpus(); | ||
1877 | 2004 | ||
1878 | entries = kzalloc(size, GFP_KERNEL); | 2005 | entries = kzalloc(size, GFP_KERNEL); |
1879 | if (!entries) | 2006 | if (!entries) |
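The allocation above swaps a hand-summed sizeof expression for the offsetof idiom, and sizes the array by nr_cpu_ids because cpu_entries[] is indexed by CPU number, which can be sparse, while num_possible_cpus() only counts set bits. The idiom in self-contained form:

```c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* offsetof(type, array[n]) is exactly the header size plus n
 * elements: no double-counted padding, no forgotten terms. */
struct entries {
	int rcu_head_stub;	/* stand-in for the real header fields */
	void *cpu_entries[];	/* flexible array, indexed by CPU id */
};

int main(void)
{
	size_t nr_cpu_ids = 8;	/* assumption: stand-in value */
	struct entries *e = calloc(1, offsetof(struct entries,
					       cpu_entries[nr_cpu_ids]));

	if (!e)
		return 1;
	printf("%zu bytes for %zu CPU slots\n",
	       offsetof(struct entries, cpu_entries[nr_cpu_ids]), nr_cpu_ids);
	free(e);
	return 0;
}
```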
@@ -2074,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid) | |||
2074 | if (!task) | 2201 | if (!task) |
2075 | return ERR_PTR(-ESRCH); | 2202 | return ERR_PTR(-ESRCH); |
2076 | 2203 | ||
2077 | /* | ||
2078 | * Can't attach events to a dying task. | ||
2079 | */ | ||
2080 | err = -ESRCH; | ||
2081 | if (task->flags & PF_EXITING) | ||
2082 | goto errout; | ||
2083 | |||
2084 | /* Reuse ptrace permission checks for now. */ | 2204 | /* Reuse ptrace permission checks for now. */ |
2085 | err = -EACCES; | 2205 | err = -EACCES; |
2086 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2206 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
@@ -2101,14 +2221,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
2101 | unsigned long flags; | 2221 | unsigned long flags; |
2102 | int ctxn, err; | 2222 | int ctxn, err; |
2103 | 2223 | ||
2104 | if (!task && cpu != -1) { | 2224 | if (!task) { |
2105 | /* Must be root to operate on a CPU event: */ | 2225 | /* Must be root to operate on a CPU event: */ |
2106 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2226 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) |
2107 | return ERR_PTR(-EACCES); | 2227 | return ERR_PTR(-EACCES); |
2108 | 2228 | ||
2109 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
2110 | return ERR_PTR(-EINVAL); | ||
2111 | |||
2112 | /* | 2229 | /* |
2113 | * We could be clever and allow attaching an event to an | 2230 | * We could be clever and allow attaching an event to an |
2114 | * offline CPU and activate it when the CPU comes up, but | 2231 | * offline CPU and activate it when the CPU comes up, but |
@@ -2144,14 +2261,27 @@ retry: | |||
2144 | 2261 | ||
2145 | get_ctx(ctx); | 2262 | get_ctx(ctx); |
2146 | 2263 | ||
2147 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | 2264 | err = 0; |
2148 | /* | 2265 | mutex_lock(&task->perf_event_mutex); |
2149 | * We raced with some other task; use | 2266 | /* |
2150 | * the context they set. | 2267 | * If it has already passed perf_event_exit_task(), |
2151 | */ | 2268 | * we must see PF_EXITING; it takes this mutex too. |
2269 | */ | ||
2270 | if (task->flags & PF_EXITING) | ||
2271 | err = -ESRCH; | ||
2272 | else if (task->perf_event_ctxp[ctxn]) | ||
2273 | err = -EAGAIN; | ||
2274 | else | ||
2275 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | ||
2276 | mutex_unlock(&task->perf_event_mutex); | ||
2277 | |||
2278 | if (unlikely(err)) { | ||
2152 | put_task_struct(task); | 2279 | put_task_struct(task); |
2153 | kfree(ctx); | 2280 | kfree(ctx); |
2154 | goto retry; | 2281 | |
2282 | if (err == -EAGAIN) | ||
2283 | goto retry; | ||
2284 | goto errout; | ||
2155 | } | 2285 | } |
2156 | } | 2286 | } |
2157 | 2287 | ||
@@ -2289,31 +2419,6 @@ static int perf_release(struct inode *inode, struct file *file) | |||
2289 | return perf_event_release_kernel(event); | 2419 | return perf_event_release_kernel(event); |
2290 | } | 2420 | } |
2291 | 2421 | ||
2292 | static int perf_event_read_size(struct perf_event *event) | ||
2293 | { | ||
2294 | int entry = sizeof(u64); /* value */ | ||
2295 | int size = 0; | ||
2296 | int nr = 1; | ||
2297 | |||
2298 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
2299 | size += sizeof(u64); | ||
2300 | |||
2301 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
2302 | size += sizeof(u64); | ||
2303 | |||
2304 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
2305 | entry += sizeof(u64); | ||
2306 | |||
2307 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
2308 | nr += event->group_leader->nr_siblings; | ||
2309 | size += sizeof(u64); | ||
2310 | } | ||
2311 | |||
2312 | size += entry * nr; | ||
2313 | |||
2314 | return size; | ||
2315 | } | ||
2316 | |||
2317 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 2422 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
2318 | { | 2423 | { |
2319 | struct perf_event *child; | 2424 | struct perf_event *child; |
@@ -2428,7 +2533,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | |||
2428 | if (event->state == PERF_EVENT_STATE_ERROR) | 2533 | if (event->state == PERF_EVENT_STATE_ERROR) |
2429 | return 0; | 2534 | return 0; |
2430 | 2535 | ||
2431 | if (count < perf_event_read_size(event)) | 2536 | if (count < event->read_size) |
2432 | return -ENOSPC; | 2537 | return -ENOSPC; |
2433 | 2538 | ||
2434 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2539 | WARN_ON_ONCE(event->ctx->parent_ctx); |
@@ -2514,7 +2619,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
2514 | int ret = 0; | 2619 | int ret = 0; |
2515 | u64 value; | 2620 | u64 value; |
2516 | 2621 | ||
2517 | if (!event->attr.sample_period) | 2622 | if (!is_sampling_event(event)) |
2518 | return -EINVAL; | 2623 | return -EINVAL; |
2519 | 2624 | ||
2520 | if (copy_from_user(&value, arg, sizeof(value))) | 2625 | if (copy_from_user(&value, arg, sizeof(value))) |
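This hunk and several later ones (perf_event_refresh(), __perf_event_overflow(), the swevent and hrtimer paths) replace raw sample_period checks with is_sampling_event(). The helper is not shown in the diff; judging from the call sites it presumably reduces to:

```c
#include <stdbool.h>
#include <stdint.h>

/* Hedged sketch: an event samples iff it has a period; a pure
 * counter has sample_period == 0 and must never take the sampling
 * paths (period handling, throttling, hrtimers). */
struct attr { uint64_t sample_period; };
struct event { struct attr attr; };

static inline bool is_sampling_event(const struct event *event)
{
	return event->attr.sample_period != 0;
}
```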
@@ -3305,6 +3410,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
3305 | } while (len); | 3410 | } while (len); |
3306 | } | 3411 | } |
3307 | 3412 | ||
3413 | static void __perf_event_header__init_id(struct perf_event_header *header, | ||
3414 | struct perf_sample_data *data, | ||
3415 | struct perf_event *event) | ||
3416 | { | ||
3417 | u64 sample_type = event->attr.sample_type; | ||
3418 | |||
3419 | data->type = sample_type; | ||
3420 | header->size += event->id_header_size; | ||
3421 | |||
3422 | if (sample_type & PERF_SAMPLE_TID) { | ||
3423 | /* namespace issues */ | ||
3424 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3425 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3426 | } | ||
3427 | |||
3428 | if (sample_type & PERF_SAMPLE_TIME) | ||
3429 | data->time = perf_clock(); | ||
3430 | |||
3431 | if (sample_type & PERF_SAMPLE_ID) | ||
3432 | data->id = primary_event_id(event); | ||
3433 | |||
3434 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
3435 | data->stream_id = event->id; | ||
3436 | |||
3437 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3438 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3439 | data->cpu_entry.reserved = 0; | ||
3440 | } | ||
3441 | } | ||
3442 | |||
3443 | static void perf_event_header__init_id(struct perf_event_header *header, | ||
3444 | struct perf_sample_data *data, | ||
3445 | struct perf_event *event) | ||
3446 | { | ||
3447 | if (event->attr.sample_id_all) | ||
3448 | __perf_event_header__init_id(header, data, event); | ||
3449 | } | ||
3450 | |||
3451 | static void __perf_event__output_id_sample(struct perf_output_handle *handle, | ||
3452 | struct perf_sample_data *data) | ||
3453 | { | ||
3454 | u64 sample_type = data->type; | ||
3455 | |||
3456 | if (sample_type & PERF_SAMPLE_TID) | ||
3457 | perf_output_put(handle, data->tid_entry); | ||
3458 | |||
3459 | if (sample_type & PERF_SAMPLE_TIME) | ||
3460 | perf_output_put(handle, data->time); | ||
3461 | |||
3462 | if (sample_type & PERF_SAMPLE_ID) | ||
3463 | perf_output_put(handle, data->id); | ||
3464 | |||
3465 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
3466 | perf_output_put(handle, data->stream_id); | ||
3467 | |||
3468 | if (sample_type & PERF_SAMPLE_CPU) | ||
3469 | perf_output_put(handle, data->cpu_entry); | ||
3470 | } | ||
3471 | |||
3472 | static void perf_event__output_id_sample(struct perf_event *event, | ||
3473 | struct perf_output_handle *handle, | ||
3474 | struct perf_sample_data *sample) | ||
3475 | { | ||
3476 | if (event->attr.sample_id_all) | ||
3477 | __perf_event__output_id_sample(handle, sample); | ||
3478 | } | ||
3479 | |||
3308 | int perf_output_begin(struct perf_output_handle *handle, | 3480 | int perf_output_begin(struct perf_output_handle *handle, |
3309 | struct perf_event *event, unsigned int size, | 3481 | struct perf_event *event, unsigned int size, |
3310 | int nmi, int sample) | 3482 | int nmi, int sample) |
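With attr.sample_id_all set, the helpers above append a trailer to every non-sample record, written in the fixed TID, TIME, ID, STREAM_ID, CPU order. Seen from userspace the trailer has the following shape, each field present only when its PERF_SAMPLE_* bit is set in attr.sample_type (illustrative flat layout; the real stream omits absent fields rather than zero-filling them):

```c
#include <stdint.h>

/* Illustration of the trailer emitted by
 * perf_event__output_id_sample() above; fields appear in this
 * order, and only when enabled in attr.sample_type. */
struct sample_id {
	uint32_t pid, tid;	/* PERF_SAMPLE_TID */
	uint64_t time;		/* PERF_SAMPLE_TIME */
	uint64_t id;		/* PERF_SAMPLE_ID */
	uint64_t stream_id;	/* PERF_SAMPLE_STREAM_ID */
	uint32_t cpu, res;	/* PERF_SAMPLE_CPU */
};
```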
@@ -3312,6 +3484,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3312 | struct perf_buffer *buffer; | 3484 | struct perf_buffer *buffer; |
3313 | unsigned long tail, offset, head; | 3485 | unsigned long tail, offset, head; |
3314 | int have_lost; | 3486 | int have_lost; |
3487 | struct perf_sample_data sample_data; | ||
3315 | struct { | 3488 | struct { |
3316 | struct perf_event_header header; | 3489 | struct perf_event_header header; |
3317 | u64 id; | 3490 | u64 id; |
@@ -3338,8 +3511,12 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3338 | goto out; | 3511 | goto out; |
3339 | 3512 | ||
3340 | have_lost = local_read(&buffer->lost); | 3513 | have_lost = local_read(&buffer->lost); |
3341 | if (have_lost) | 3514 | if (have_lost) { |
3342 | size += sizeof(lost_event); | 3515 | lost_event.header.size = sizeof(lost_event); |
3516 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
3517 | event); | ||
3518 | size += lost_event.header.size; | ||
3519 | } | ||
3343 | 3520 | ||
3344 | perf_output_get_handle(handle); | 3521 | perf_output_get_handle(handle); |
3345 | 3522 | ||
@@ -3370,11 +3547,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3370 | if (have_lost) { | 3547 | if (have_lost) { |
3371 | lost_event.header.type = PERF_RECORD_LOST; | 3548 | lost_event.header.type = PERF_RECORD_LOST; |
3372 | lost_event.header.misc = 0; | 3549 | lost_event.header.misc = 0; |
3373 | lost_event.header.size = sizeof(lost_event); | ||
3374 | lost_event.id = event->id; | 3550 | lost_event.id = event->id; |
3375 | lost_event.lost = local_xchg(&buffer->lost, 0); | 3551 | lost_event.lost = local_xchg(&buffer->lost, 0); |
3376 | 3552 | ||
3377 | perf_output_put(handle, lost_event); | 3553 | perf_output_put(handle, lost_event); |
3554 | perf_event__output_id_sample(event, handle, &sample_data); | ||
3378 | } | 3555 | } |
3379 | 3556 | ||
3380 | return 0; | 3557 | return 0; |
@@ -3407,28 +3584,6 @@ void perf_output_end(struct perf_output_handle *handle) | |||
3407 | rcu_read_unlock(); | 3584 | rcu_read_unlock(); |
3408 | } | 3585 | } |
3409 | 3586 | ||
3410 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
3411 | { | ||
3412 | /* | ||
3413 | * only top level events have the pid namespace they were created in | ||
3414 | */ | ||
3415 | if (event->parent) | ||
3416 | event = event->parent; | ||
3417 | |||
3418 | return task_tgid_nr_ns(p, event->ns); | ||
3419 | } | ||
3420 | |||
3421 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
3422 | { | ||
3423 | /* | ||
3424 | * only top level events have the pid namespace they were created in | ||
3425 | */ | ||
3426 | if (event->parent) | ||
3427 | event = event->parent; | ||
3428 | |||
3429 | return task_pid_nr_ns(p, event->ns); | ||
3430 | } | ||
3431 | |||
3432 | static void perf_output_read_one(struct perf_output_handle *handle, | 3587 | static void perf_output_read_one(struct perf_output_handle *handle, |
3433 | struct perf_event *event, | 3588 | struct perf_event *event, |
3434 | u64 enabled, u64 running) | 3589 | u64 enabled, u64 running) |
@@ -3603,61 +3758,16 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
3603 | { | 3758 | { |
3604 | u64 sample_type = event->attr.sample_type; | 3759 | u64 sample_type = event->attr.sample_type; |
3605 | 3760 | ||
3606 | data->type = sample_type; | ||
3607 | |||
3608 | header->type = PERF_RECORD_SAMPLE; | 3761 | header->type = PERF_RECORD_SAMPLE; |
3609 | header->size = sizeof(*header); | 3762 | header->size = sizeof(*header) + event->header_size; |
3610 | 3763 | ||
3611 | header->misc = 0; | 3764 | header->misc = 0; |
3612 | header->misc |= perf_misc_flags(regs); | 3765 | header->misc |= perf_misc_flags(regs); |
3613 | 3766 | ||
3614 | if (sample_type & PERF_SAMPLE_IP) { | 3767 | __perf_event_header__init_id(header, data, event); |
3615 | data->ip = perf_instruction_pointer(regs); | ||
3616 | |||
3617 | header->size += sizeof(data->ip); | ||
3618 | } | ||
3619 | |||
3620 | if (sample_type & PERF_SAMPLE_TID) { | ||
3621 | /* namespace issues */ | ||
3622 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3623 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3624 | |||
3625 | header->size += sizeof(data->tid_entry); | ||
3626 | } | ||
3627 | |||
3628 | if (sample_type & PERF_SAMPLE_TIME) { | ||
3629 | data->time = perf_clock(); | ||
3630 | |||
3631 | header->size += sizeof(data->time); | ||
3632 | } | ||
3633 | |||
3634 | if (sample_type & PERF_SAMPLE_ADDR) | ||
3635 | header->size += sizeof(data->addr); | ||
3636 | |||
3637 | if (sample_type & PERF_SAMPLE_ID) { | ||
3638 | data->id = primary_event_id(event); | ||
3639 | |||
3640 | header->size += sizeof(data->id); | ||
3641 | } | ||
3642 | |||
3643 | if (sample_type & PERF_SAMPLE_STREAM_ID) { | ||
3644 | data->stream_id = event->id; | ||
3645 | |||
3646 | header->size += sizeof(data->stream_id); | ||
3647 | } | ||
3648 | |||
3649 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3650 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3651 | data->cpu_entry.reserved = 0; | ||
3652 | |||
3653 | header->size += sizeof(data->cpu_entry); | ||
3654 | } | ||
3655 | |||
3656 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
3657 | header->size += sizeof(data->period); | ||
3658 | 3768 | ||
3659 | if (sample_type & PERF_SAMPLE_READ) | 3769 | if (sample_type & PERF_SAMPLE_IP) |
3660 | header->size += perf_event_read_size(event); | 3770 | data->ip = perf_instruction_pointer(regs); |
3661 | 3771 | ||
3662 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 3772 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
3663 | int size = 1; | 3773 | int size = 1; |
@@ -3722,23 +3832,26 @@ perf_event_read_event(struct perf_event *event, | |||
3722 | struct task_struct *task) | 3832 | struct task_struct *task) |
3723 | { | 3833 | { |
3724 | struct perf_output_handle handle; | 3834 | struct perf_output_handle handle; |
3835 | struct perf_sample_data sample; | ||
3725 | struct perf_read_event read_event = { | 3836 | struct perf_read_event read_event = { |
3726 | .header = { | 3837 | .header = { |
3727 | .type = PERF_RECORD_READ, | 3838 | .type = PERF_RECORD_READ, |
3728 | .misc = 0, | 3839 | .misc = 0, |
3729 | .size = sizeof(read_event) + perf_event_read_size(event), | 3840 | .size = sizeof(read_event) + event->read_size, |
3730 | }, | 3841 | }, |
3731 | .pid = perf_event_pid(event, task), | 3842 | .pid = perf_event_pid(event, task), |
3732 | .tid = perf_event_tid(event, task), | 3843 | .tid = perf_event_tid(event, task), |
3733 | }; | 3844 | }; |
3734 | int ret; | 3845 | int ret; |
3735 | 3846 | ||
3847 | perf_event_header__init_id(&read_event.header, &sample, event); | ||
3736 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 3848 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); |
3737 | if (ret) | 3849 | if (ret) |
3738 | return; | 3850 | return; |
3739 | 3851 | ||
3740 | perf_output_put(&handle, read_event); | 3852 | perf_output_put(&handle, read_event); |
3741 | perf_output_read(&handle, event); | 3853 | perf_output_read(&handle, event); |
3854 | perf_event__output_id_sample(event, &handle, &sample); | ||
3742 | 3855 | ||
3743 | perf_output_end(&handle); | 3856 | perf_output_end(&handle); |
3744 | } | 3857 | } |
@@ -3768,14 +3881,16 @@ static void perf_event_task_output(struct perf_event *event, | |||
3768 | struct perf_task_event *task_event) | 3881 | struct perf_task_event *task_event) |
3769 | { | 3882 | { |
3770 | struct perf_output_handle handle; | 3883 | struct perf_output_handle handle; |
3884 | struct perf_sample_data sample; | ||
3771 | struct task_struct *task = task_event->task; | 3885 | struct task_struct *task = task_event->task; |
3772 | int size, ret; | 3886 | int ret, size = task_event->event_id.header.size; |
3773 | 3887 | ||
3774 | size = task_event->event_id.header.size; | 3888 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
3775 | ret = perf_output_begin(&handle, event, size, 0, 0); | ||
3776 | 3889 | ||
3890 | ret = perf_output_begin(&handle, event, | ||
3891 | task_event->event_id.header.size, 0, 0); | ||
3777 | if (ret) | 3892 | if (ret) |
3778 | return; | 3893 | goto out; |
3779 | 3894 | ||
3780 | task_event->event_id.pid = perf_event_pid(event, task); | 3895 | task_event->event_id.pid = perf_event_pid(event, task); |
3781 | task_event->event_id.ppid = perf_event_pid(event, current); | 3896 | task_event->event_id.ppid = perf_event_pid(event, current); |
@@ -3785,7 +3900,11 @@ static void perf_event_task_output(struct perf_event *event, | |||
3785 | 3900 | ||
3786 | perf_output_put(&handle, task_event->event_id); | 3901 | perf_output_put(&handle, task_event->event_id); |
3787 | 3902 | ||
3903 | perf_event__output_id_sample(event, &handle, &sample); | ||
3904 | |||
3788 | perf_output_end(&handle); | 3905 | perf_output_end(&handle); |
3906 | out: | ||
3907 | task_event->event_id.header.size = size; | ||
3789 | } | 3908 | } |
3790 | 3909 | ||
3791 | static int perf_event_task_match(struct perf_event *event) | 3910 | static int perf_event_task_match(struct perf_event *event) |
@@ -3793,7 +3912,7 @@ static int perf_event_task_match(struct perf_event *event) | |||
3793 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 3912 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3794 | return 0; | 3913 | return 0; |
3795 | 3914 | ||
3796 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3915 | if (!event_filter_match(event)) |
3797 | return 0; | 3916 | return 0; |
3798 | 3917 | ||
3799 | if (event->attr.comm || event->attr.mmap || | 3918 | if (event->attr.comm || event->attr.mmap || |
@@ -3900,11 +4019,16 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3900 | struct perf_comm_event *comm_event) | 4019 | struct perf_comm_event *comm_event) |
3901 | { | 4020 | { |
3902 | struct perf_output_handle handle; | 4021 | struct perf_output_handle handle; |
4022 | struct perf_sample_data sample; | ||
3903 | int size = comm_event->event_id.header.size; | 4023 | int size = comm_event->event_id.header.size; |
3904 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4024 | int ret; |
4025 | |||
4026 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | ||
4027 | ret = perf_output_begin(&handle, event, | ||
4028 | comm_event->event_id.header.size, 0, 0); | ||
3905 | 4029 | ||
3906 | if (ret) | 4030 | if (ret) |
3907 | return; | 4031 | goto out; |
3908 | 4032 | ||
3909 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); | 4033 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); |
3910 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4034 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
@@ -3912,7 +4036,12 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3912 | perf_output_put(&handle, comm_event->event_id); | 4036 | perf_output_put(&handle, comm_event->event_id); |
3913 | perf_output_copy(&handle, comm_event->comm, | 4037 | perf_output_copy(&handle, comm_event->comm, |
3914 | comm_event->comm_size); | 4038 | comm_event->comm_size); |
4039 | |||
4040 | perf_event__output_id_sample(event, &handle, &sample); | ||
4041 | |||
3915 | perf_output_end(&handle); | 4042 | perf_output_end(&handle); |
4043 | out: | ||
4044 | comm_event->event_id.header.size = size; | ||
3916 | } | 4045 | } |
3917 | 4046 | ||
3918 | static int perf_event_comm_match(struct perf_event *event) | 4047 | static int perf_event_comm_match(struct perf_event *event) |
@@ -3920,7 +4049,7 @@ static int perf_event_comm_match(struct perf_event *event) | |||
3920 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4049 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3921 | return 0; | 4050 | return 0; |
3922 | 4051 | ||
3923 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4052 | if (!event_filter_match(event)) |
3924 | return 0; | 4053 | return 0; |
3925 | 4054 | ||
3926 | if (event->attr.comm) | 4055 | if (event->attr.comm) |
@@ -3957,7 +4086,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3957 | comm_event->comm_size = size; | 4086 | comm_event->comm_size = size; |
3958 | 4087 | ||
3959 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4088 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3960 | |||
3961 | rcu_read_lock(); | 4089 | rcu_read_lock(); |
3962 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4090 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3963 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4091 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
@@ -4038,11 +4166,15 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4038 | struct perf_mmap_event *mmap_event) | 4166 | struct perf_mmap_event *mmap_event) |
4039 | { | 4167 | { |
4040 | struct perf_output_handle handle; | 4168 | struct perf_output_handle handle; |
4169 | struct perf_sample_data sample; | ||
4041 | int size = mmap_event->event_id.header.size; | 4170 | int size = mmap_event->event_id.header.size; |
4042 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4171 | int ret; |
4043 | 4172 | ||
4173 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | ||
4174 | ret = perf_output_begin(&handle, event, | ||
4175 | mmap_event->event_id.header.size, 0, 0); | ||
4044 | if (ret) | 4176 | if (ret) |
4045 | return; | 4177 | goto out; |
4046 | 4178 | ||
4047 | mmap_event->event_id.pid = perf_event_pid(event, current); | 4179 | mmap_event->event_id.pid = perf_event_pid(event, current); |
4048 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4180 | mmap_event->event_id.tid = perf_event_tid(event, current); |
@@ -4050,7 +4182,12 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4050 | perf_output_put(&handle, mmap_event->event_id); | 4182 | perf_output_put(&handle, mmap_event->event_id); |
4051 | perf_output_copy(&handle, mmap_event->file_name, | 4183 | perf_output_copy(&handle, mmap_event->file_name, |
4052 | mmap_event->file_size); | 4184 | mmap_event->file_size); |
4185 | |||
4186 | perf_event__output_id_sample(event, &handle, &sample); | ||
4187 | |||
4053 | perf_output_end(&handle); | 4188 | perf_output_end(&handle); |
4189 | out: | ||
4190 | mmap_event->event_id.header.size = size; | ||
4054 | } | 4191 | } |
4055 | 4192 | ||
4056 | static int perf_event_mmap_match(struct perf_event *event, | 4193 | static int perf_event_mmap_match(struct perf_event *event, |
@@ -4060,7 +4197,7 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
4060 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4197 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
4061 | return 0; | 4198 | return 0; |
4062 | 4199 | ||
4063 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4200 | if (!event_filter_match(event)) |
4064 | return 0; | 4201 | return 0; |
4065 | 4202 | ||
4066 | if ((!executable && event->attr.mmap_data) || | 4203 | if ((!executable && event->attr.mmap_data) || |
@@ -4205,6 +4342,7 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
4205 | static void perf_log_throttle(struct perf_event *event, int enable) | 4342 | static void perf_log_throttle(struct perf_event *event, int enable) |
4206 | { | 4343 | { |
4207 | struct perf_output_handle handle; | 4344 | struct perf_output_handle handle; |
4345 | struct perf_sample_data sample; | ||
4208 | int ret; | 4346 | int ret; |
4209 | 4347 | ||
4210 | struct { | 4348 | struct { |
@@ -4226,11 +4364,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
4226 | if (enable) | 4364 | if (enable) |
4227 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; | 4365 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; |
4228 | 4366 | ||
4229 | ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); | 4367 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
4368 | |||
4369 | ret = perf_output_begin(&handle, event, | ||
4370 | throttle_event.header.size, 1, 0); | ||
4230 | if (ret) | 4371 | if (ret) |
4231 | return; | 4372 | return; |
4232 | 4373 | ||
4233 | perf_output_put(&handle, throttle_event); | 4374 | perf_output_put(&handle, throttle_event); |
4375 | perf_event__output_id_sample(event, &handle, &sample); | ||
4234 | perf_output_end(&handle); | 4376 | perf_output_end(&handle); |
4235 | } | 4377 | } |
4236 | 4378 | ||
@@ -4246,6 +4388,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4246 | struct hw_perf_event *hwc = &event->hw; | 4388 | struct hw_perf_event *hwc = &event->hw; |
4247 | int ret = 0; | 4389 | int ret = 0; |
4248 | 4390 | ||
4391 | /* | ||
4392 | * Non-sampling counters might still use the PMI to fold short | ||
4393 | * hardware counters; ignore those. | ||
4394 | */ | ||
4395 | if (unlikely(!is_sampling_event(event))) | ||
4396 | return 0; | ||
4397 | |||
4249 | if (!throttle) { | 4398 | if (!throttle) { |
4250 | hwc->interrupts++; | 4399 | hwc->interrupts++; |
4251 | } else { | 4400 | } else { |
@@ -4391,7 +4540,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
4391 | if (!regs) | 4540 | if (!regs) |
4392 | return; | 4541 | return; |
4393 | 4542 | ||
4394 | if (!hwc->sample_period) | 4543 | if (!is_sampling_event(event)) |
4395 | return; | 4544 | return; |
4396 | 4545 | ||
4397 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4546 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
@@ -4518,7 +4667,7 @@ int perf_swevent_get_recursion_context(void) | |||
4518 | } | 4667 | } |
4519 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4668 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4520 | 4669 | ||
4521 | void inline perf_swevent_put_recursion_context(int rctx) | 4670 | inline void perf_swevent_put_recursion_context(int rctx) |
4522 | { | 4671 | { |
4523 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 4672 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4524 | 4673 | ||
@@ -4554,7 +4703,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
4554 | struct hw_perf_event *hwc = &event->hw; | 4703 | struct hw_perf_event *hwc = &event->hw; |
4555 | struct hlist_head *head; | 4704 | struct hlist_head *head; |
4556 | 4705 | ||
4557 | if (hwc->sample_period) { | 4706 | if (is_sampling_event(event)) { |
4558 | hwc->last_period = hwc->sample_period; | 4707 | hwc->last_period = hwc->sample_period; |
4559 | perf_swevent_set_period(event); | 4708 | perf_swevent_set_period(event); |
4560 | } | 4709 | } |
@@ -4811,15 +4960,6 @@ static int perf_tp_event_init(struct perf_event *event) | |||
4811 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 4960 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
4812 | return -ENOENT; | 4961 | return -ENOENT; |
4813 | 4962 | ||
4814 | /* | ||
4815 | * Raw tracepoint data is a severe data leak, only allow root to | ||
4816 | * have these. | ||
4817 | */ | ||
4818 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | ||
4819 | perf_paranoid_tracepoint_raw() && | ||
4820 | !capable(CAP_SYS_ADMIN)) | ||
4821 | return -EPERM; | ||
4822 | |||
4823 | err = perf_trace_init(event); | 4963 | err = perf_trace_init(event); |
4824 | if (err) | 4964 | if (err) |
4825 | return err; | 4965 | return err; |
@@ -4842,7 +4982,7 @@ static struct pmu perf_tracepoint = { | |||
4842 | 4982 | ||
4843 | static inline void perf_tp_register(void) | 4983 | static inline void perf_tp_register(void) |
4844 | { | 4984 | { |
4845 | perf_pmu_register(&perf_tracepoint); | 4985 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); |
4846 | } | 4986 | } |
4847 | 4987 | ||
4848 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4988 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4932,31 +5072,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
4932 | static void perf_swevent_start_hrtimer(struct perf_event *event) | 5072 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4933 | { | 5073 | { |
4934 | struct hw_perf_event *hwc = &event->hw; | 5074 | struct hw_perf_event *hwc = &event->hw; |
5075 | s64 period; | ||
5076 | |||
5077 | if (!is_sampling_event(event)) | ||
5078 | return; | ||
4935 | 5079 | ||
4936 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 5080 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4937 | hwc->hrtimer.function = perf_swevent_hrtimer; | 5081 | hwc->hrtimer.function = perf_swevent_hrtimer; |
4938 | if (hwc->sample_period) { | ||
4939 | s64 period = local64_read(&hwc->period_left); | ||
4940 | 5082 | ||
4941 | if (period) { | 5083 | period = local64_read(&hwc->period_left); |
4942 | if (period < 0) | 5084 | if (period) { |
4943 | period = 10000; | 5085 | if (period < 0) |
5086 | period = 10000; | ||
4944 | 5087 | ||
4945 | local64_set(&hwc->period_left, 0); | 5088 | local64_set(&hwc->period_left, 0); |
4946 | } else { | 5089 | } else { |
4947 | period = max_t(u64, 10000, hwc->sample_period); | 5090 | period = max_t(u64, 10000, hwc->sample_period); |
4948 | } | 5091 | } |
4949 | __hrtimer_start_range_ns(&hwc->hrtimer, | 5092 | __hrtimer_start_range_ns(&hwc->hrtimer, |
4950 | ns_to_ktime(period), 0, | 5093 | ns_to_ktime(period), 0, |
4951 | HRTIMER_MODE_REL_PINNED, 0); | 5094 | HRTIMER_MODE_REL_PINNED, 0); |
4952 | } | ||
4953 | } | 5095 | } |
4954 | 5096 | ||
4955 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | 5097 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4956 | { | 5098 | { |
4957 | struct hw_perf_event *hwc = &event->hw; | 5099 | struct hw_perf_event *hwc = &event->hw; |
4958 | 5100 | ||
4959 | if (hwc->sample_period) { | 5101 | if (is_sampling_event(event)) { |
4960 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | 5102 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); |
4961 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | 5103 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); |
4962 | 5104 | ||
@@ -5184,8 +5326,63 @@ static void free_pmu_context(struct pmu *pmu) | |||
5184 | out: | 5326 | out: |
5185 | mutex_unlock(&pmus_lock); | 5327 | mutex_unlock(&pmus_lock); |
5186 | } | 5328 | } |
5329 | static struct idr pmu_idr; | ||
5330 | |||
5331 | static ssize_t | ||
5332 | type_show(struct device *dev, struct device_attribute *attr, char *page) | ||
5333 | { | ||
5334 | struct pmu *pmu = dev_get_drvdata(dev); | ||
5335 | |||
5336 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | ||
5337 | } | ||
5338 | |||
5339 | static struct device_attribute pmu_dev_attrs[] = { | ||
5340 | __ATTR_RO(type), | ||
5341 | __ATTR_NULL, | ||
5342 | }; | ||
5343 | |||
5344 | static int pmu_bus_running; | ||
5345 | static struct bus_type pmu_bus = { | ||
5346 | .name = "event_source", | ||
5347 | .dev_attrs = pmu_dev_attrs, | ||
5348 | }; | ||
5349 | |||
5350 | static void pmu_dev_release(struct device *dev) | ||
5351 | { | ||
5352 | kfree(dev); | ||
5353 | } | ||
5187 | 5354 | ||
5188 | int perf_pmu_register(struct pmu *pmu) | 5355 | static int pmu_dev_alloc(struct pmu *pmu) |
5356 | { | ||
5357 | int ret = -ENOMEM; | ||
5358 | |||
5359 | pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); | ||
5360 | if (!pmu->dev) | ||
5361 | goto out; | ||
5362 | |||
5363 | device_initialize(pmu->dev); | ||
5364 | ret = dev_set_name(pmu->dev, "%s", pmu->name); | ||
5365 | if (ret) | ||
5366 | goto free_dev; | ||
5367 | |||
5368 | dev_set_drvdata(pmu->dev, pmu); | ||
5369 | pmu->dev->bus = &pmu_bus; | ||
5370 | pmu->dev->release = pmu_dev_release; | ||
5371 | ret = device_add(pmu->dev); | ||
5372 | if (ret) | ||
5373 | goto free_dev; | ||
5374 | |||
5375 | out: | ||
5376 | return ret; | ||
5377 | |||
5378 | free_dev: | ||
5379 | put_device(pmu->dev); | ||
5380 | goto out; | ||
5381 | } | ||
5382 | |||
5383 | static struct lock_class_key cpuctx_mutex; | ||
5384 | |||
5385 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | ||
5189 | { | 5386 | { |
5190 | int cpu, ret; | 5387 | int cpu, ret; |
5191 | 5388 | ||
@@ -5195,19 +5392,45 @@ int perf_pmu_register(struct pmu *pmu) | |||
5195 | if (!pmu->pmu_disable_count) | 5392 | if (!pmu->pmu_disable_count) |
5196 | goto unlock; | 5393 | goto unlock; |
5197 | 5394 | ||
5395 | pmu->type = -1; | ||
5396 | if (!name) | ||
5397 | goto skip_type; | ||
5398 | pmu->name = name; | ||
5399 | |||
5400 | if (type < 0) { | ||
5401 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | ||
5402 | if (!err) | ||
5403 | goto free_pdc; | ||
5404 | |||
5405 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
5406 | if (err) { | ||
5407 | ret = err; | ||
5408 | goto free_pdc; | ||
5409 | } | ||
5410 | } | ||
5411 | pmu->type = type; | ||
5412 | |||
5413 | if (pmu_bus_running) { | ||
5414 | ret = pmu_dev_alloc(pmu); | ||
5415 | if (ret) | ||
5416 | goto free_idr; | ||
5417 | } | ||
5418 | |||
5419 | skip_type: | ||
5198 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | 5420 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); |
5199 | if (pmu->pmu_cpu_context) | 5421 | if (pmu->pmu_cpu_context) |
5200 | goto got_cpu_context; | 5422 | goto got_cpu_context; |
5201 | 5423 | ||
5202 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | 5424 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); |
5203 | if (!pmu->pmu_cpu_context) | 5425 | if (!pmu->pmu_cpu_context) |
5204 | goto free_pdc; | 5426 | goto free_dev; |
5205 | 5427 | ||
5206 | for_each_possible_cpu(cpu) { | 5428 | for_each_possible_cpu(cpu) { |
5207 | struct perf_cpu_context *cpuctx; | 5429 | struct perf_cpu_context *cpuctx; |
5208 | 5430 | ||
5209 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 5431 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
5210 | __perf_event_init_context(&cpuctx->ctx); | 5432 | __perf_event_init_context(&cpuctx->ctx); |
5433 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | ||
5211 | cpuctx->ctx.type = cpu_context; | 5434 | cpuctx->ctx.type = cpu_context; |
5212 | cpuctx->ctx.pmu = pmu; | 5435 | cpuctx->ctx.pmu = pmu; |
5213 | cpuctx->jiffies_interval = 1; | 5436 | cpuctx->jiffies_interval = 1; |
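Dynamic PMU types are drawn from an IDR seeded above PERF_TYPE_MAX, so allocated numbers can never shadow the fixed PERF_TYPE_* values that perf_init_event() now resolves through the same map. The two-step idr_pre_get()/idr_get_new_above() sequence is the allocation API of this kernel generation (long since replaced by idr_alloc()); a kernel-context sketch of the pattern used above:

```c
/* Kernel-context sketch of the allocation done above: reserve
 * memory first (idr_pre_get), then take the lowest free id at or
 * above PERF_TYPE_MAX (idr_get_new_above).  Error handling as in
 * the hunk; a production caller may also need to retry -EAGAIN. */
static int pmu_alloc_type(struct idr *idr, struct pmu *pmu)
{
	int type, err;

	if (!idr_pre_get(idr, GFP_KERNEL))
		return -ENOMEM;

	err = idr_get_new_above(idr, pmu, PERF_TYPE_MAX, &type);
	if (err)
		return err;

	return type;
}
```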
@@ -5245,6 +5468,14 @@ unlock: | |||
5245 | 5468 | ||
5246 | return ret; | 5469 | return ret; |
5247 | 5470 | ||
5471 | free_dev: | ||
5472 | device_del(pmu->dev); | ||
5473 | put_device(pmu->dev); | ||
5474 | |||
5475 | free_idr: | ||
5476 | if (pmu->type >= PERF_TYPE_MAX) | ||
5477 | idr_remove(&pmu_idr, pmu->type); | ||
5478 | |||
5248 | free_pdc: | 5479 | free_pdc: |
5249 | free_percpu(pmu->pmu_disable_count); | 5480 | free_percpu(pmu->pmu_disable_count); |
5250 | goto unlock; | 5481 | goto unlock; |
@@ -5264,6 +5495,10 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
5264 | synchronize_rcu(); | 5495 | synchronize_rcu(); |
5265 | 5496 | ||
5266 | free_percpu(pmu->pmu_disable_count); | 5497 | free_percpu(pmu->pmu_disable_count); |
5498 | if (pmu->type >= PERF_TYPE_MAX) | ||
5499 | idr_remove(&pmu_idr, pmu->type); | ||
5500 | device_del(pmu->dev); | ||
5501 | put_device(pmu->dev); | ||
5267 | free_pmu_context(pmu); | 5502 | free_pmu_context(pmu); |
5268 | } | 5503 | } |
5269 | 5504 | ||
@@ -5273,6 +5508,13 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5273 | int idx; | 5508 | int idx; |
5274 | 5509 | ||
5275 | idx = srcu_read_lock(&pmus_srcu); | 5510 | idx = srcu_read_lock(&pmus_srcu); |
5511 | |||
5512 | rcu_read_lock(); | ||
5513 | pmu = idr_find(&pmu_idr, event->attr.type); | ||
5514 | rcu_read_unlock(); | ||
5515 | if (pmu) | ||
5516 | goto unlock; | ||
5517 | |||
5276 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 5518 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5277 | int ret = pmu->event_init(event); | 5519 | int ret = pmu->event_init(event); |
5278 | if (!ret) | 5520 | if (!ret) |
@@ -5305,6 +5547,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5305 | struct hw_perf_event *hwc; | 5547 | struct hw_perf_event *hwc; |
5306 | long err; | 5548 | long err; |
5307 | 5549 | ||
5550 | if ((unsigned)cpu >= nr_cpu_ids) { | ||
5551 | if (!task || cpu != -1) | ||
5552 | return ERR_PTR(-EINVAL); | ||
5553 | } | ||
5554 | |||
5308 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 5555 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
5309 | if (!event) | 5556 | if (!event) |
5310 | return ERR_PTR(-ENOMEM); | 5557 | return ERR_PTR(-ENOMEM); |
@@ -5353,7 +5600,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5353 | 5600 | ||
5354 | if (!overflow_handler && parent_event) | 5601 | if (!overflow_handler && parent_event) |
5355 | overflow_handler = parent_event->overflow_handler; | 5602 | overflow_handler = parent_event->overflow_handler; |
5356 | 5603 | ||
5357 | event->overflow_handler = overflow_handler; | 5604 | event->overflow_handler = overflow_handler; |
5358 | 5605 | ||
5359 | if (attr->disabled) | 5606 | if (attr->disabled) |
@@ -5738,6 +5985,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5738 | mutex_unlock(¤t->perf_event_mutex); | 5985 | mutex_unlock(¤t->perf_event_mutex); |
5739 | 5986 | ||
5740 | /* | 5987 | /* |
5988 | * Precalculate sample_data sizes | ||
5989 | */ | ||
5990 | perf_event__header_size(event); | ||
5991 | perf_event__id_header_size(event); | ||
5992 | |||
5993 | /* | ||
5741 | * Drop the reference on the group_event after placing the | 5994 | * Drop the reference on the group_event after placing the |
5742 | * new event on the sibling_list. This ensures destruction | 5995 | * new event on the sibling_list. This ensures destruction |
5743 | * of the group leader will find the pointer to itself in | 5996 | * of the group leader will find the pointer to itself in |
@@ -5883,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
5883 | * scheduled, so we are now safe from rescheduling changing | 6136 | * scheduled, so we are now safe from rescheduling changing |
5884 | * our context. | 6137 | * our context. |
5885 | */ | 6138 | */ |
5886 | child_ctx = child->perf_event_ctxp[ctxn]; | 6139 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); |
5887 | task_ctx_sched_out(child_ctx, EVENT_ALL); | 6140 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
5888 | 6141 | ||
5889 | /* | 6142 | /* |
@@ -6090,6 +6343,12 @@ inherit_event(struct perf_event *parent_event, | |||
6090 | child_event->overflow_handler = parent_event->overflow_handler; | 6343 | child_event->overflow_handler = parent_event->overflow_handler; |
6091 | 6344 | ||
6092 | /* | 6345 | /* |
6346 | * Precalculate sample_data sizes | ||
6347 | */ | ||
6348 | perf_event__header_size(child_event); | ||
6349 | perf_event__id_header_size(child_event); | ||
6350 | |||
6351 | /* | ||
6093 | * Link it up in the child's context: | 6352 | * Link it up in the child's context: |
6094 | */ | 6353 | */ |
6095 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | 6354 | raw_spin_lock_irqsave(&child_ctx->lock, flags); |
@@ -6190,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6190 | unsigned long flags; | 6449 | unsigned long flags; |
6191 | int ret = 0; | 6450 | int ret = 0; |
6192 | 6451 | ||
6193 | child->perf_event_ctxp[ctxn] = NULL; | ||
6194 | |||
6195 | mutex_init(&child->perf_event_mutex); | ||
6196 | INIT_LIST_HEAD(&child->perf_event_list); | ||
6197 | |||
6198 | if (likely(!parent->perf_event_ctxp[ctxn])) | 6452 | if (likely(!parent->perf_event_ctxp[ctxn])) |
6199 | return 0; | 6453 | return 0; |
6200 | 6454 | ||
@@ -6246,7 +6500,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6246 | 6500 | ||
6247 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | 6501 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); |
6248 | parent_ctx->rotate_disable = 0; | 6502 | parent_ctx->rotate_disable = 0; |
6249 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6250 | 6503 | ||
6251 | child_ctx = child->perf_event_ctxp[ctxn]; | 6504 | child_ctx = child->perf_event_ctxp[ctxn]; |
6252 | 6505 | ||
@@ -6254,12 +6507,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6254 | /* | 6507 | /* |
6255 | * Mark the child context as a clone of the parent | 6508 | * Mark the child context as a clone of the parent |
6256 | * context, or of whatever the parent is a clone of. | 6509 | * context, or of whatever the parent is a clone of. |
6257 | * Note that if the parent is a clone, it could get | 6510 | * |
6258 | * uncloned at any point, but that doesn't matter | 6511 | * Note that if the parent is a clone, holding |
6259 | * because the list of events and the generation | 6512 | * parent_ctx->lock prevents it from being uncloned. |
6260 | * count can't have changed since we took the mutex. | ||
6261 | */ | 6513 | */ |
6262 | cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); | 6514 | cloned_ctx = parent_ctx->parent_ctx; |
6263 | if (cloned_ctx) { | 6515 | if (cloned_ctx) { |
6264 | child_ctx->parent_ctx = cloned_ctx; | 6516 | child_ctx->parent_ctx = cloned_ctx; |
6265 | child_ctx->parent_gen = parent_ctx->parent_gen; | 6517 | child_ctx->parent_gen = parent_ctx->parent_gen; |
@@ -6270,6 +6522,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6270 | get_ctx(child_ctx->parent_ctx); | 6522 | get_ctx(child_ctx->parent_ctx); |
6271 | } | 6523 | } |
6272 | 6524 | ||
6525 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6273 | mutex_unlock(&parent_ctx->mutex); | 6526 | mutex_unlock(&parent_ctx->mutex); |
6274 | 6527 | ||
6275 | perf_unpin_context(parent_ctx); | 6528 | perf_unpin_context(parent_ctx); |
@@ -6284,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child) | |||
6284 | { | 6537 | { |
6285 | int ctxn, ret; | 6538 | int ctxn, ret; |
6286 | 6539 | ||
6540 | memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); | ||
6541 | mutex_init(&child->perf_event_mutex); | ||
6542 | INIT_LIST_HEAD(&child->perf_event_list); | ||
6543 | |||
6287 | for_each_task_context_nr(ctxn) { | 6544 | for_each_task_context_nr(ctxn) { |
6288 | ret = perf_event_init_context(child, ctxn); | 6545 | ret = perf_event_init_context(child, ctxn); |
6289 | if (ret) | 6546 | if (ret) |
@@ -6320,7 +6577,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) | |||
6320 | mutex_unlock(&swhash->hlist_mutex); | 6577 | mutex_unlock(&swhash->hlist_mutex); |
6321 | } | 6578 | } |
6322 | 6579 | ||
6323 | #ifdef CONFIG_HOTPLUG_CPU | 6580 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
6324 | static void perf_pmu_rotate_stop(struct pmu *pmu) | 6581 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
6325 | { | 6582 | { |
6326 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 6583 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
@@ -6374,6 +6631,26 @@ static void perf_event_exit_cpu(int cpu) | |||
6374 | static inline void perf_event_exit_cpu(int cpu) { } | 6631 | static inline void perf_event_exit_cpu(int cpu) { } |
6375 | #endif | 6632 | #endif |
6376 | 6633 | ||
6634 | static int | ||
6635 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | ||
6636 | { | ||
6637 | int cpu; | ||
6638 | |||
6639 | for_each_online_cpu(cpu) | ||
6640 | perf_event_exit_cpu(cpu); | ||
6641 | |||
6642 | return NOTIFY_OK; | ||
6643 | } | ||
6644 | |||
6645 | /* | ||
6646 | * Run the perf reboot notifier at the very last possible moment so that | ||
6647 | * the generic watchdog code runs as long as possible. | ||
6648 | */ | ||
6649 | static struct notifier_block perf_reboot_notifier = { | ||
6650 | .notifier_call = perf_reboot, | ||
6651 | .priority = INT_MIN, | ||
6652 | }; | ||
6653 | |||
6377 | static int __cpuinit | 6654 | static int __cpuinit |
6378 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | 6655 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
6379 | { | 6656 | { |
@@ -6402,14 +6679,45 @@ void __init perf_event_init(void) | |||
6402 | { | 6679 | { |
6403 | int ret; | 6680 | int ret; |
6404 | 6681 | ||
6682 | idr_init(&pmu_idr); | ||
6683 | |||
6405 | perf_event_init_all_cpus(); | 6684 | perf_event_init_all_cpus(); |
6406 | init_srcu_struct(&pmus_srcu); | 6685 | init_srcu_struct(&pmus_srcu); |
6407 | perf_pmu_register(&perf_swevent); | 6686 | perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); |
6408 | perf_pmu_register(&perf_cpu_clock); | 6687 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
6409 | perf_pmu_register(&perf_task_clock); | 6688 | perf_pmu_register(&perf_task_clock, NULL, -1); |
6410 | perf_tp_register(); | 6689 | perf_tp_register(); |
6411 | perf_cpu_notifier(perf_cpu_notify); | 6690 | perf_cpu_notifier(perf_cpu_notify); |
6691 | register_reboot_notifier(&perf_reboot_notifier); | ||
6412 | 6692 | ||
6413 | ret = init_hw_breakpoint(); | 6693 | ret = init_hw_breakpoint(); |
6414 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | 6694 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
6415 | } | 6695 | } |
6696 | |||
6697 | static int __init perf_event_sysfs_init(void) | ||
6698 | { | ||
6699 | struct pmu *pmu; | ||
6700 | int ret; | ||
6701 | |||
6702 | mutex_lock(&pmus_lock); | ||
6703 | |||
6704 | ret = bus_register(&pmu_bus); | ||
6705 | if (ret) | ||
6706 | goto unlock; | ||
6707 | |||
6708 | list_for_each_entry(pmu, &pmus, entry) { | ||
6709 | if (!pmu->name || pmu->type < 0) | ||
6710 | continue; | ||
6711 | |||
6712 | ret = pmu_dev_alloc(pmu); | ||
6713 | WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); | ||
6714 | } | ||
6715 | pmu_bus_running = 1; | ||
6716 | ret = 0; | ||
6717 | |||
6718 | unlock: | ||
6719 | mutex_unlock(&pmus_lock); | ||
6720 | |||
6721 | return ret; | ||
6722 | } | ||
6723 | device_initcall(perf_event_sysfs_init); | ||
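The pmu_bus plumbing is user-visible: each named PMU gains a directory under /sys/bus/event_source/devices/ whose type attribute (type_show() above) reports the number to place in perf_event_attr.type. A minimal reader, assuming a kernel with this patch and therefore the "tracepoint" PMU registered by perf_tp_register():

```c
#include <stdio.h>

int main(void)
{
	/* Path follows from the "event_source" bus name and the
	 * "tracepoint" registration above. */
	FILE *f = fopen("/sys/bus/event_source/devices/tracepoint/type", "r");
	int type;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &type) == 1)
		printf("tracepoint PMU: perf_event_attr.type = %d\n", type);
	fclose(f);
	return 0;
}
```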
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736d..93bd2eb2bc53 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); | |||
145 | 145 | ||
146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); | 146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
147 | 147 | ||
148 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 148 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); |
149 | |||
150 | #define lock_timer(tid, flags) \ | ||
151 | ({ struct k_itimer *__timr; \ | ||
152 | __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ | ||
153 | __timr; \ | ||
154 | }) | ||
149 | 155 | ||
150 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | 156 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) |
151 | { | 157 | { |
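The lock_timer() rework exists for sparse: a plain prototype cannot tell the checker that the timer lock is held exactly when the return value is non-NULL, while the __cond_lock() wrapper in the macro can. The annotation is free at runtime; its definition (assumption: quoted from the contemporaneous include/linux/compiler.h) collapses to the bare expression in normal builds:

```c
/* Under sparse (__CHECKER__), __cond_lock(x, c) marks lock x as
 * acquired whenever expression c is true; in ordinary builds it
 * evaluates to c unchanged, so lock_timer() costs nothing extra. */
#ifdef __CHECKER__
# define __cond_lock(x, c)	((c) ? ({ __acquire(x); 1; }) : 0)
#else
# define __cond_lock(x, c)	(c)
#endif
```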
@@ -619,7 +625,7 @@ out: | |||
619 | * the find to the timer lock. To avoid a deadlock, the timer id MUST | 625 | * the find to the timer lock. To avoid a deadlock, the timer id MUST |
620 | * be released without holding the timer lock. | 626 | * be released without holding the timer lock. |
621 | */ | 627 | */ |
622 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) | 628 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) |
623 | { | 629 | { |
624 | struct k_itimer *timr; | 630 | struct k_itimer *timr; |
625 | /* | 631 | /* |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3ebad38..265729966ece 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG | |||
100 | depends on PM_ADVANCED_DEBUG | 100 | depends on PM_ADVANCED_DEBUG |
101 | default n | 101 | default n |
102 | 102 | ||
103 | config SUSPEND_NVS | ||
104 | bool | ||
105 | |||
106 | config SUSPEND | 103 | config SUSPEND |
107 | bool "Suspend to RAM and standby" | 104 | bool "Suspend to RAM and standby" |
108 | depends on PM && ARCH_SUSPEND_POSSIBLE | 105 | depends on PM && ARCH_SUSPEND_POSSIBLE |
109 | select SUSPEND_NVS if HAS_IOMEM | ||
110 | default y | 106 | default y |
111 | ---help--- | 107 | ---help--- |
112 | Allow the system to enter sleep states in which main memory is | 108 | Allow the system to enter sleep states in which main memory is |
@@ -140,7 +136,6 @@ config HIBERNATION | |||
140 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 136 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE |
141 | select LZO_COMPRESS | 137 | select LZO_COMPRESS |
142 | select LZO_DECOMPRESS | 138 | select LZO_DECOMPRESS |
143 | select SUSPEND_NVS if HAS_IOMEM | ||
144 | ---help--- | 139 | ---help--- |
145 | Enable the suspend to disk (STD) functionality, which is usually | 140 | Enable the suspend to disk (STD) functionality, which is usually |
146 | called "hibernation" in user interfaces. STD checkpoints the | 141 | called "hibernation" in user interfaces. STD checkpoints the |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..c350e18b53e3 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,7 +1,4 @@ | |||
1 | 1 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | |
2 | ifeq ($(CONFIG_PM_DEBUG),y) | ||
3 | EXTRA_CFLAGS += -DDEBUG | ||
4 | endif | ||
5 | 2 | ||
6 | obj-$(CONFIG_PM) += main.o | 3 | obj-$(CONFIG_PM) += main.o |
7 | obj-$(CONFIG_PM_SLEEP) += console.o | 4 | obj-$(CONFIG_PM_SLEEP) += console.o |
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
10 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 7 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
11 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 8 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
12 | block_io.o | 9 | block_io.o |
13 | obj-$(CONFIG_SUSPEND_NVS) += nvs.o | ||
14 | 10 | ||
15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 048d0b514831..1832bd264219 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -51,18 +51,18 @@ enum { | |||
51 | 51 | ||
52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
53 | 53 | ||
54 | static struct platform_hibernation_ops *hibernation_ops; | 54 | static const struct platform_hibernation_ops *hibernation_ops; |
55 | 55 | ||
56 | /** | 56 | /** |
57 | * hibernation_set_ops - set the global hibernate operations | 57 | * hibernation_set_ops - set the global hibernate operations |
58 | * @ops: the hibernation operations to use in subsequent hibernation transitions | 58 | * @ops: the hibernation operations to use in subsequent hibernation transitions |
59 | */ | 59 | */ |
60 | 60 | ||
61 | void hibernation_set_ops(struct platform_hibernation_ops *ops) | 61 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) |
62 | { | 62 | { |
63 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot | 63 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
64 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore | 64 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore |
65 | && ops->restore_cleanup)) { | 65 | && ops->restore_cleanup && ops->leave)) { |
66 | WARN_ON(1); | 66 | WARN_ON(1); |
67 | return; | 67 | return; |
68 | } | 68 | } |
@@ -278,7 +278,7 @@ static int create_image(int platform_mode) | |||
278 | goto Enable_irqs; | 278 | goto Enable_irqs; |
279 | } | 279 | } |
280 | 280 | ||
281 | if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) | 281 | if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) |
282 | goto Power_up; | 282 | goto Power_up; |
283 | 283 | ||
284 | in_suspend = 1; | 284 | in_suspend = 1; |
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void) | |||
516 | 516 | ||
517 | local_irq_disable(); | 517 | local_irq_disable(); |
518 | sysdev_suspend(PMSG_HIBERNATE); | 518 | sysdev_suspend(PMSG_HIBERNATE); |
519 | if (!pm_check_wakeup_events()) { | 519 | if (pm_wakeup_pending()) { |
520 | error = -EAGAIN; | 520 | error = -EAGAIN; |
521 | goto Power_up; | 521 | goto Power_up; |
522 | } | 522 | } |
@@ -647,6 +647,7 @@ int hibernate(void) | |||
647 | swsusp_free(); | 647 | swsusp_free(); |
648 | if (!error) | 648 | if (!error) |
649 | power_down(); | 649 | power_down(); |
650 | in_suspend = 0; | ||
650 | pm_restore_gfp_mask(); | 651 | pm_restore_gfp_mask(); |
651 | } else { | 652 | } else { |
652 | pr_debug("PM: Image restored successfully.\n"); | 653 | pr_debug("PM: Image restored successfully.\n"); |
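
hibernation_set_ops() now also insists on a ->leave callback, so a platform ops table must fill in every hook named in the WARN_ON() condition. A hedged sketch of a table that passes the stricter check (the my_* hooks are empty placeholders, and the signatures are assumed to follow struct platform_hibernation_ops as of this series):

#include <linux/suspend.h>

static int  my_begin(void)           { return 0; }
static void my_end(void)             { }
static int  my_pre_snapshot(void)    { return 0; }
static int  my_prepare(void)         { return 0; }
static void my_finish(void)          { }
static int  my_enter(void)           { return 0; }
static void my_leave(void)           { }	/* now mandatory */
static int  my_pre_restore(void)     { return 0; }
static void my_restore_cleanup(void) { }

static const struct platform_hibernation_ops my_hibernation_ops = {
	.begin           = my_begin,
	.end             = my_end,
	.pre_snapshot    = my_pre_snapshot,
	.prepare         = my_prepare,
	.finish          = my_finish,
	.enter           = my_enter,
	.leave           = my_leave,
	.pre_restore     = my_pre_restore,
	.restore_cleanup = my_restore_cleanup,
};

/* hibernation_set_ops(&my_hibernation_ops); -- the table can now be
 * const, matching the constified hibernation_ops pointer above. */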
diff --git a/kernel/power/main.c b/kernel/power/main.c index 7b5db6a8561e..701853042c28 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); | |||
326 | 326 | ||
327 | static int __init pm_start_workqueue(void) | 327 | static int __init pm_start_workqueue(void) |
328 | { | 328 | { |
329 | pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); | 329 | pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); |
330 | 330 | ||
331 | return pm_wq ? 0 : -ENOMEM; | 331 | return pm_wq ? 0 : -ENOMEM; |
332 | } | 332 | } |
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db60bbb6..000000000000 --- a/kernel/power/nvs.c +++ /dev/null | |||
@@ -1,136 +0,0 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory | ||
3 | * | ||
4 | * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | ||
5 | * | ||
6 | * This file is released under the GPLv2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/io.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/suspend.h> | ||
15 | |||
16 | /* | ||
17 | * Platforms, like ACPI, may want us to save some memory used by them during | ||
18 | * suspend and to restore the contents of this memory during the subsequent | ||
19 | * resume. The code below implements a mechanism allowing us to do that. | ||
20 | */ | ||
21 | |||
22 | struct nvs_page { | ||
23 | unsigned long phys_start; | ||
24 | unsigned int size; | ||
25 | void *kaddr; | ||
26 | void *data; | ||
27 | struct list_head node; | ||
28 | }; | ||
29 | |||
30 | static LIST_HEAD(nvs_list); | ||
31 | |||
32 | /** | ||
33 | * suspend_nvs_register - register platform NVS memory region to save | ||
34 | * @start - physical address of the region | ||
35 | * @size - size of the region | ||
36 | * | ||
37 | * The NVS region need not be page-aligned (both ends) and we arrange | ||
38 | * things so that the data from page-aligned addresses in this region will | ||
39 | * be copied into separate RAM pages. | ||
40 | */ | ||
41 | int suspend_nvs_register(unsigned long start, unsigned long size) | ||
42 | { | ||
43 | struct nvs_page *entry, *next; | ||
44 | |||
45 | while (size > 0) { | ||
46 | unsigned int nr_bytes; | ||
47 | |||
48 | entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); | ||
49 | if (!entry) | ||
50 | goto Error; | ||
51 | |||
52 | list_add_tail(&entry->node, &nvs_list); | ||
53 | entry->phys_start = start; | ||
54 | nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); | ||
55 | entry->size = (size < nr_bytes) ? size : nr_bytes; | ||
56 | |||
57 | start += entry->size; | ||
58 | size -= entry->size; | ||
59 | } | ||
60 | return 0; | ||
61 | |||
62 | Error: | ||
63 | list_for_each_entry_safe(entry, next, &nvs_list, node) { | ||
64 | list_del(&entry->node); | ||
65 | kfree(entry); | ||
66 | } | ||
67 | return -ENOMEM; | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * suspend_nvs_free - free data pages allocated for saving NVS regions | ||
72 | */ | ||
73 | void suspend_nvs_free(void) | ||
74 | { | ||
75 | struct nvs_page *entry; | ||
76 | |||
77 | list_for_each_entry(entry, &nvs_list, node) | ||
78 | if (entry->data) { | ||
79 | free_page((unsigned long)entry->data); | ||
80 | entry->data = NULL; | ||
81 | if (entry->kaddr) { | ||
82 | iounmap(entry->kaddr); | ||
83 | entry->kaddr = NULL; | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
88 | /** | ||
89 | * suspend_nvs_alloc - allocate memory necessary for saving NVS regions | ||
90 | */ | ||
91 | int suspend_nvs_alloc(void) | ||
92 | { | ||
93 | struct nvs_page *entry; | ||
94 | |||
95 | list_for_each_entry(entry, &nvs_list, node) { | ||
96 | entry->data = (void *)__get_free_page(GFP_KERNEL); | ||
97 | if (!entry->data) { | ||
98 | suspend_nvs_free(); | ||
99 | return -ENOMEM; | ||
100 | } | ||
101 | } | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * suspend_nvs_save - save NVS memory regions | ||
107 | */ | ||
108 | void suspend_nvs_save(void) | ||
109 | { | ||
110 | struct nvs_page *entry; | ||
111 | |||
112 | printk(KERN_INFO "PM: Saving platform NVS memory\n"); | ||
113 | |||
114 | list_for_each_entry(entry, &nvs_list, node) | ||
115 | if (entry->data) { | ||
116 | entry->kaddr = ioremap(entry->phys_start, entry->size); | ||
117 | memcpy(entry->data, entry->kaddr, entry->size); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | /** | ||
122 | * suspend_nvs_restore - restore NVS memory regions | ||
123 | * | ||
124 | * This function is going to be called with interrupts disabled, so it | ||
125 | * cannot iounmap the virtual addresses used to access the NVS region. | ||
126 | */ | ||
127 | void suspend_nvs_restore(void) | ||
128 | { | ||
129 | struct nvs_page *entry; | ||
130 | |||
131 | printk(KERN_INFO "PM: Restoring platform NVS memory\n"); | ||
132 | |||
133 | list_for_each_entry(entry, &nvs_list, node) | ||
134 | if (entry->data) | ||
135 | memcpy(entry->kaddr, entry->data, entry->size); | ||
136 | } | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1b2a0f..0cf3a27a6c9d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -22,7 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | #define TIMEOUT (20 * HZ) | 23 | #define TIMEOUT (20 * HZ) |
24 | 24 | ||
25 | static inline int freezeable(struct task_struct * p) | 25 | static inline int freezable(struct task_struct * p) |
26 | { | 26 | { |
27 | if ((p == current) || | 27 | if ((p == current) || |
28 | (p->flags & PF_NOFREEZE) || | 28 | (p->flags & PF_NOFREEZE) || |
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
53 | todo = 0; | 53 | todo = 0; |
54 | read_lock(&tasklist_lock); | 54 | read_lock(&tasklist_lock); |
55 | do_each_thread(g, p) { | 55 | do_each_thread(g, p) { |
56 | if (frozen(p) || !freezeable(p)) | 56 | if (frozen(p) || !freezable(p)) |
57 | continue; | 57 | continue; |
58 | 58 | ||
59 | if (!freeze_task(p, sig_only)) | 59 | if (!freeze_task(p, sig_only)) |
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
64 | * perturb a task in TASK_STOPPED or TASK_TRACED. | 64 | * perturb a task in TASK_STOPPED or TASK_TRACED. |
65 | * It is "frozen enough". If the task does wake | 65 | * It is "frozen enough". If the task does wake |
66 | * up, it will immediately call try_to_freeze. | 66 | * up, it will immediately call try_to_freeze. |
67 | * | ||
68 | * Because freeze_task() goes through p's | ||
69 | * scheduler lock after setting TIF_FREEZE, it's | ||
70 | * guaranteed that either we see TASK_RUNNING or | ||
71 | * that try_to_stop(), after schedule() in ptrace/ | ||
72 | * signal stop, sees TIF_FREEZE. | ||
67 | */ | 73 | */ |
68 | if (!task_is_stopped_or_traced(p) && | 74 | if (!task_is_stopped_or_traced(p) && |
69 | !freezer_should_skip(p)) | 75 | !freezer_should_skip(p)) |
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
79 | if (!todo || time_after(jiffies, end_time)) | 85 | if (!todo || time_after(jiffies, end_time)) |
80 | break; | 86 | break; |
81 | 87 | ||
82 | if (!pm_check_wakeup_events()) { | 88 | if (pm_wakeup_pending()) { |
83 | wakeup = true; | 89 | wakeup = true; |
84 | break; | 90 | break; |
85 | } | 91 | } |
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only) | |||
161 | 167 | ||
162 | read_lock(&tasklist_lock); | 168 | read_lock(&tasklist_lock); |
163 | do_each_thread(g, p) { | 169 | do_each_thread(g, p) { |
164 | if (!freezeable(p)) | 170 | if (!freezable(p)) |
165 | continue; | 171 | continue; |
166 | 172 | ||
167 | if (nosig_only && should_send_signal(p)) | 173 | if (nosig_only && should_send_signal(p)) |
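
The freezer hunks also flip the polarity of the wakeup check: pm_check_wakeup_events() returned true when it was safe to continue, while pm_wakeup_pending() returns true when a pending wakeup event should abort the transition. A one-line sketch of the conversion pattern:

#include <linux/suspend.h>

static bool my_should_abort_freeze(void)
{
	/* old: return !pm_check_wakeup_events(); */
	return pm_wakeup_pending();	/* true == wakeup pending, abort */
}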
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0dac75ea4456..64db648ff911 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1519,11 +1519,8 @@ static int | |||
1519 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 1519 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, |
1520 | unsigned int nr_pages, unsigned int nr_highmem) | 1520 | unsigned int nr_pages, unsigned int nr_highmem) |
1521 | { | 1521 | { |
1522 | int error = 0; | ||
1523 | |||
1524 | if (nr_highmem > 0) { | 1522 | if (nr_highmem > 0) { |
1525 | error = get_highmem_buffer(PG_ANY); | 1523 | if (get_highmem_buffer(PG_ANY)) |
1526 | if (error) | ||
1527 | goto err_out; | 1524 | goto err_out; |
1528 | if (nr_highmem > alloc_highmem) { | 1525 | if (nr_highmem > alloc_highmem) { |
1529 | nr_highmem -= alloc_highmem; | 1526 | nr_highmem -= alloc_highmem; |
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
1546 | 1543 | ||
1547 | err_out: | 1544 | err_out: |
1548 | swsusp_free(); | 1545 | swsusp_free(); |
1549 | return error; | 1546 | return -ENOMEM; |
1550 | } | 1547 | } |
1551 | 1548 | ||
1552 | asmlinkage int swsusp_save(void) | 1549 | asmlinkage int swsusp_save(void) |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ecf770509d0d..de6f86bfa303 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/suspend.h> | 24 | #include <linux/suspend.h> |
25 | #include <trace/events/power.h> | ||
25 | 26 | ||
26 | #include "power.h" | 27 | #include "power.h" |
27 | 28 | ||
@@ -30,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { | |||
30 | [PM_SUSPEND_MEM] = "mem", | 31 | [PM_SUSPEND_MEM] = "mem", |
31 | }; | 32 | }; |
32 | 33 | ||
33 | static struct platform_suspend_ops *suspend_ops; | 34 | static const struct platform_suspend_ops *suspend_ops; |
34 | 35 | ||
35 | /** | 36 | /** |
36 | * suspend_set_ops - Set the global suspend method table. | 37 | * suspend_set_ops - Set the global suspend method table. |
37 | * @ops: Pointer to ops structure. | 38 | * @ops: Pointer to ops structure. |
38 | */ | 39 | */ |
39 | void suspend_set_ops(struct platform_suspend_ops *ops) | 40 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
40 | { | 41 | { |
41 | mutex_lock(&pm_mutex); | 42 | mutex_lock(&pm_mutex); |
42 | suspend_ops = ops; | 43 | suspend_ops = ops; |
@@ -163,7 +164,7 @@ static int suspend_enter(suspend_state_t state) | |||
163 | 164 | ||
164 | error = sysdev_suspend(PMSG_SUSPEND); | 165 | error = sysdev_suspend(PMSG_SUSPEND); |
165 | if (!error) { | 166 | if (!error) { |
166 | if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { | 167 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
167 | error = suspend_ops->enter(state); | 168 | error = suspend_ops->enter(state); |
168 | events_check_enabled = false; | 169 | events_check_enabled = false; |
169 | } | 170 | } |
@@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
201 | if (!suspend_ops) | 202 | if (!suspend_ops) |
202 | return -ENOSYS; | 203 | return -ENOSYS; |
203 | 204 | ||
205 | trace_machine_suspend(state); | ||
204 | if (suspend_ops->begin) { | 206 | if (suspend_ops->begin) { |
205 | error = suspend_ops->begin(state); | 207 | error = suspend_ops->begin(state); |
206 | if (error) | 208 | if (error) |
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
229 | Close: | 231 | Close: |
230 | if (suspend_ops->end) | 232 | if (suspend_ops->end) |
231 | suspend_ops->end(); | 233 | suspend_ops->end(); |
234 | trace_machine_suspend(PWR_EVENT_EXIT); | ||
232 | return error; | 235 | return error; |
233 | 236 | ||
234 | Recover_platform: | 237 | Recover_platform: |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c7e4832b9be..7c97c3a0eee3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void) | |||
224 | return res; | 224 | return res; |
225 | 225 | ||
226 | root_swap = res; | 226 | root_swap = res; |
227 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE); | 227 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); |
228 | if (res) | 228 | if (res) |
229 | return res; | 229 | return res; |
230 | 230 | ||
@@ -888,7 +888,7 @@ out_finish: | |||
888 | /** | 888 | /** |
889 | * swsusp_read - read the hibernation image. | 889 | * swsusp_read - read the hibernation image. |
890 | * @flags_p: flags passed by the "frozen" kernel in the image header should | 890 | * @flags_p: flags passed by the "frozen" kernel in the image header should |
891 | * be written into this memeory location | 891 | * be written into this memory location |
892 | */ | 892 | */ |
893 | 893 | ||
894 | int swsusp_read(unsigned int *flags_p) | 894 | int swsusp_read(unsigned int *flags_p) |
@@ -930,7 +930,8 @@ int swsusp_check(void) | |||
930 | { | 930 | { |
931 | int error; | 931 | int error; |
932 | 932 | ||
933 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 933 | hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, |
934 | FMODE_READ, NULL); | ||
934 | if (!IS_ERR(hib_resume_bdev)) { | 935 | if (!IS_ERR(hib_resume_bdev)) { |
935 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 936 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
936 | clear_page(swsusp_header); | 937 | clear_page(swsusp_header); |
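
Both hunks track the block-layer API change in this merge window: open_by_devnum() becomes blkdev_get_by_dev(), and blkdev_get() grows a holder argument that only matters for exclusive (FMODE_EXCL) opens. A hedged sketch of the conversion (my_open_resume_dev() is illustrative):

#include <linux/fs.h>

static struct block_device *my_open_resume_dev(dev_t dev)
{
	/* old: return open_by_devnum(dev, FMODE_READ); */
	return blkdev_get_by_dev(dev, FMODE_READ, NULL);  /* NULL: no exclusive claim */
}

As in the hunk above, the return value is a block_device pointer or an ERR_PTR() and must be checked with IS_ERR().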
diff --git a/kernel/printk.c b/kernel/printk.c index a23315dc4498..36231525e22f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -39,16 +39,11 @@ | |||
39 | #include <linux/syslog.h> | 39 | #include <linux/syslog.h> |
40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/rculist.h> | ||
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
44 | 45 | ||
45 | /* | 46 | /* |
46 | * for_each_console() allows you to iterate on each console | ||
47 | */ | ||
48 | #define for_each_console(con) \ | ||
49 | for (con = console_drivers; con != NULL; con = con->next) | ||
50 | |||
51 | /* | ||
52 | * Architectures can override it: | 47 | * Architectures can override it: |
53 | */ | 48 | */ |
54 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | 49 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) |
@@ -102,7 +97,7 @@ static int console_locked, console_suspended; | |||
102 | /* | 97 | /* |
103 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | 98 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars |
104 | * It is also used in interesting ways to provide interlocking in | 99 | * It is also used in interesting ways to provide interlocking in |
105 | * release_console_sem(). | 100 | * console_unlock(). |
106 | */ | 101 | */ |
107 | static DEFINE_SPINLOCK(logbuf_lock); | 102 | static DEFINE_SPINLOCK(logbuf_lock); |
108 | 103 | ||
@@ -267,25 +262,47 @@ int dmesg_restrict = 1; | |||
267 | int dmesg_restrict; | 262 | int dmesg_restrict; |
268 | #endif | 263 | #endif |
269 | 264 | ||
265 | static int syslog_action_restricted(int type) | ||
266 | { | ||
267 | if (dmesg_restrict) | ||
268 | return 1; | ||
269 | /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ | ||
270 | return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; | ||
271 | } | ||
272 | |||
273 | static int check_syslog_permissions(int type, bool from_file) | ||
274 | { | ||
275 | /* | ||
276 | * If this is from /proc/kmsg and we've already opened it, then we've | ||
277 | * already done the capabilities checks at open time. | ||
278 | */ | ||
279 | if (from_file && type != SYSLOG_ACTION_OPEN) | ||
280 | return 0; | ||
281 | |||
282 | if (syslog_action_restricted(type)) { | ||
283 | if (capable(CAP_SYSLOG)) | ||
284 | return 0; | ||
285 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | ||
286 | if (capable(CAP_SYS_ADMIN)) { | ||
287 | WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " | ||
288 | "but no CAP_SYSLOG (deprecated).\n"); | ||
289 | return 0; | ||
290 | } | ||
291 | return -EPERM; | ||
292 | } | ||
293 | return 0; | ||
294 | } | ||
295 | |||
270 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 296 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
271 | { | 297 | { |
272 | unsigned i, j, limit, count; | 298 | unsigned i, j, limit, count; |
273 | int do_clear = 0; | 299 | int do_clear = 0; |
274 | char c; | 300 | char c; |
275 | int error = 0; | 301 | int error; |
276 | 302 | ||
277 | /* | 303 | error = check_syslog_permissions(type, from_file); |
278 | * If this is from /proc/kmsg we only do the capabilities checks | 304 | if (error) |
279 | * at open time. | 305 | goto out; |
280 | */ | ||
281 | if (type == SYSLOG_ACTION_OPEN || !from_file) { | ||
282 | if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) | ||
283 | return -EPERM; | ||
284 | if ((type != SYSLOG_ACTION_READ_ALL && | ||
285 | type != SYSLOG_ACTION_SIZE_BUFFER) && | ||
286 | !capable(CAP_SYS_ADMIN)) | ||
287 | return -EPERM; | ||
288 | } | ||
289 | 306 | ||
290 | error = security_syslog(type); | 307 | error = security_syslog(type); |
291 | if (error) | 308 | if (error) |
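
The factored-out helpers keep the old policy intact: unless dmesg_restrict is set, only the non-destructive actions SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER are open to everybody, and CAP_SYSLOG now replaces CAP_SYS_ADMIN as the proper capability. A hedged userspace sketch of the unprivileged path (action values 3 and 10 per include/linux/syslog.h):

#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	char buf[8192];
	int size, n;

	size = klogctl(10, NULL, 0);       /* SYSLOG_ACTION_SIZE_BUFFER */
	n = klogctl(3, buf, sizeof(buf));  /* SYSLOG_ACTION_READ_ALL */
	if (n > 0)
		fwrite(buf, 1, n, stdout);
	return (size < 0 || n < 0) ? 1 : 0;
}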
@@ -500,7 +517,7 @@ static void _call_console_drivers(unsigned start, | |||
500 | /* | 517 | /* |
501 | * Call the console drivers, asking them to write out | 518 | * Call the console drivers, asking them to write out |
502 | * log_buf[start] to log_buf[end - 1]. | 519 | * log_buf[start] to log_buf[end - 1]. |
503 | * The console_sem must be held. | 520 | * The console_lock must be held. |
504 | */ | 521 | */ |
505 | static void call_console_drivers(unsigned start, unsigned end) | 522 | static void call_console_drivers(unsigned start, unsigned end) |
506 | { | 523 | { |
@@ -603,11 +620,11 @@ static int have_callable_console(void) | |||
603 | * | 620 | * |
604 | * This is printk(). It can be called from any context. We want it to work. | 621 | * This is printk(). It can be called from any context. We want it to work. |
605 | * | 622 | * |
606 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 623 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and |
607 | * call the console drivers. If we fail to get the semaphore we place the output | 624 | * call the console drivers. If we fail to get the semaphore we place the output |
608 | * into the log buffer and return. The current holder of the console_sem will | 625 | * into the log buffer and return. The current holder of the console_sem will |
609 | * notice the new output in release_console_sem() and will send it to the | 626 | * notice the new output in console_unlock() and will send it to the |
610 | * consoles before releasing the semaphore. | 627 | * consoles before releasing the lock. |
611 | * | 628 | * |
612 | * One effect of this deferred printing is that code which calls printk() and | 629 | * One effect of this deferred printing is that code which calls printk() and |
613 | * then changes console_loglevel may break. This is because console_loglevel | 630 | * then changes console_loglevel may break. This is because console_loglevel |
@@ -658,19 +675,19 @@ static inline int can_use_console(unsigned int cpu) | |||
658 | /* | 675 | /* |
659 | * Try to get console ownership to actually show the kernel | 676 | * Try to get console ownership to actually show the kernel |
660 | * messages from a 'printk'. Return true (and with the | 677 | * messages from a 'printk'. Return true (and with the |
661 | * console_semaphore held, and 'console_locked' set) if it | 678 | * console_lock held, and 'console_locked' set) if it |
662 | * is successful, false otherwise. | 679 | * is successful, false otherwise. |
663 | * | 680 | * |
664 | * This gets called with the 'logbuf_lock' spinlock held and | 681 | * This gets called with the 'logbuf_lock' spinlock held and |
665 | * interrupts disabled. It should return with 'logbuf_lock' | 682 | * interrupts disabled. It should return with 'logbuf_lock' |
666 | * released but interrupts still disabled. | 683 | * released but interrupts still disabled. |
667 | */ | 684 | */ |
668 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | 685 | static int console_trylock_for_printk(unsigned int cpu) |
669 | __releases(&logbuf_lock) | 686 | __releases(&logbuf_lock) |
670 | { | 687 | { |
671 | int retval = 0; | 688 | int retval = 0; |
672 | 689 | ||
673 | if (!try_acquire_console_sem()) { | 690 | if (console_trylock()) { |
674 | retval = 1; | 691 | retval = 1; |
675 | 692 | ||
676 | /* | 693 | /* |
@@ -826,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
826 | * actual magic (print out buffers, wake up klogd, | 843 | * actual magic (print out buffers, wake up klogd, |
827 | * etc). | 844 | * etc). |
828 | * | 845 | * |
829 | * The acquire_console_semaphore_for_printk() function | 846 | * The console_trylock_for_printk() function |
830 | * will release 'logbuf_lock' regardless of whether it | 847 | * will release 'logbuf_lock' regardless of whether it |
831 | * actually gets the semaphore or not. | 848 | * actually gets the semaphore or not. |
832 | */ | 849 | */ |
833 | if (acquire_console_semaphore_for_printk(this_cpu)) | 850 | if (console_trylock_for_printk(this_cpu)) |
834 | release_console_sem(); | 851 | console_unlock(); |
835 | 852 | ||
836 | lockdep_on(); | 853 | lockdep_on(); |
837 | out_restore_irqs: | 854 | out_restore_irqs: |
@@ -992,7 +1009,7 @@ void suspend_console(void) | |||
992 | if (!console_suspend_enabled) | 1009 | if (!console_suspend_enabled) |
993 | return; | 1010 | return; |
994 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1011 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
995 | acquire_console_sem(); | 1012 | console_lock(); |
996 | console_suspended = 1; | 1013 | console_suspended = 1; |
997 | up(&console_sem); | 1014 | up(&console_sem); |
998 | } | 1015 | } |
@@ -1003,7 +1020,7 @@ void resume_console(void) | |||
1003 | return; | 1020 | return; |
1004 | down(&console_sem); | 1021 | down(&console_sem); |
1005 | console_suspended = 0; | 1022 | console_suspended = 0; |
1006 | release_console_sem(); | 1023 | console_unlock(); |
1007 | } | 1024 | } |
1008 | 1025 | ||
1009 | /** | 1026 | /** |
@@ -1026,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1026 | case CPU_DYING: | 1043 | case CPU_DYING: |
1027 | case CPU_DOWN_FAILED: | 1044 | case CPU_DOWN_FAILED: |
1028 | case CPU_UP_CANCELED: | 1045 | case CPU_UP_CANCELED: |
1029 | acquire_console_sem(); | 1046 | console_lock(); |
1030 | release_console_sem(); | 1047 | console_unlock(); |
1031 | } | 1048 | } |
1032 | return NOTIFY_OK; | 1049 | return NOTIFY_OK; |
1033 | } | 1050 | } |
1034 | 1051 | ||
1035 | /** | 1052 | /** |
1036 | * acquire_console_sem - lock the console system for exclusive use. | 1053 | * console_lock - lock the console system for exclusive use. |
1037 | * | 1054 | * |
1038 | * Acquires a semaphore which guarantees that the caller has | 1055 | * Acquires a lock which guarantees that the caller has |
1039 | * exclusive access to the console system and the console_drivers list. | 1056 | * exclusive access to the console system and the console_drivers list. |
1040 | * | 1057 | * |
1041 | * Can sleep, returns nothing. | 1058 | * Can sleep, returns nothing. |
1042 | */ | 1059 | */ |
1043 | void acquire_console_sem(void) | 1060 | void console_lock(void) |
1044 | { | 1061 | { |
1045 | BUG_ON(in_interrupt()); | 1062 | BUG_ON(in_interrupt()); |
1046 | down(&console_sem); | 1063 | down(&console_sem); |
@@ -1049,21 +1066,29 @@ void acquire_console_sem(void) | |||
1049 | console_locked = 1; | 1066 | console_locked = 1; |
1050 | console_may_schedule = 1; | 1067 | console_may_schedule = 1; |
1051 | } | 1068 | } |
1052 | EXPORT_SYMBOL(acquire_console_sem); | 1069 | EXPORT_SYMBOL(console_lock); |
1053 | 1070 | ||
1054 | int try_acquire_console_sem(void) | 1071 | /** |
1072 | * console_trylock - try to lock the console system for exclusive use. | ||
1073 | * | ||
1074 | * Tries to acquire a lock which guarantees that the caller has | ||
1075 | * exclusive access to the console system and the console_drivers list. | ||
1076 | * | ||
1077 | * returns 1 on success, and 0 on failure to acquire the lock. | ||
1078 | */ | ||
1079 | int console_trylock(void) | ||
1055 | { | 1080 | { |
1056 | if (down_trylock(&console_sem)) | 1081 | if (down_trylock(&console_sem)) |
1057 | return -1; | 1082 | return 0; |
1058 | if (console_suspended) { | 1083 | if (console_suspended) { |
1059 | up(&console_sem); | 1084 | up(&console_sem); |
1060 | return -1; | 1085 | return 0; |
1061 | } | 1086 | } |
1062 | console_locked = 1; | 1087 | console_locked = 1; |
1063 | console_may_schedule = 0; | 1088 | console_may_schedule = 0; |
1064 | return 0; | 1089 | return 1; |
1065 | } | 1090 | } |
1066 | EXPORT_SYMBOL(try_acquire_console_sem); | 1091 | EXPORT_SYMBOL(console_trylock); |
1067 | 1092 | ||
1068 | int is_console_locked(void) | 1093 | int is_console_locked(void) |
1069 | { | 1094 | { |
@@ -1074,17 +1099,17 @@ static DEFINE_PER_CPU(int, printk_pending); | |||
1074 | 1099 | ||
1075 | void printk_tick(void) | 1100 | void printk_tick(void) |
1076 | { | 1101 | { |
1077 | if (__get_cpu_var(printk_pending)) { | 1102 | if (__this_cpu_read(printk_pending)) { |
1078 | __get_cpu_var(printk_pending) = 0; | 1103 | __this_cpu_write(printk_pending, 0); |
1079 | wake_up_interruptible(&log_wait); | 1104 | wake_up_interruptible(&log_wait); |
1080 | } | 1105 | } |
1081 | } | 1106 | } |
1082 | 1107 | ||
1083 | int printk_needs_cpu(int cpu) | 1108 | int printk_needs_cpu(int cpu) |
1084 | { | 1109 | { |
1085 | if (unlikely(cpu_is_offline(cpu))) | 1110 | if (cpu_is_offline(cpu)) |
1086 | printk_tick(); | 1111 | printk_tick(); |
1087 | return per_cpu(printk_pending, cpu); | 1112 | return __this_cpu_read(printk_pending); |
1088 | } | 1113 | } |
1089 | 1114 | ||
1090 | void wake_up_klogd(void) | 1115 | void wake_up_klogd(void) |
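
The printk_tick() hunk swaps __get_cpu_var() lvalue accesses for __this_cpu_read()/__this_cpu_write() operations, which can compile down to single instructions on architectures with per-cpu addressing and which, like the originals, assume the caller already runs with preemption disabled. A minimal sketch of the idiom (my_pending is an illustrative variable):

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_pending);

static void my_tick(void)	/* called with preemption disabled */
{
	if (__this_cpu_read(my_pending))	/* old: __get_cpu_var(my_pending) */
		__this_cpu_write(my_pending, 0);
}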
@@ -1094,20 +1119,20 @@ void wake_up_klogd(void) | |||
1094 | } | 1119 | } |
1095 | 1120 | ||
1096 | /** | 1121 | /** |
1097 | * release_console_sem - unlock the console system | 1122 | * console_unlock - unlock the console system |
1098 | * | 1123 | * |
1099 | * Releases the semaphore which the caller holds on the console system | 1124 | * Releases the console_lock which the caller holds on the console system |
1100 | * and the console driver list. | 1125 | * and the console driver list. |
1101 | * | 1126 | * |
1102 | * While the semaphore was held, console output may have been buffered | 1127 | * While the console_lock was held, console output may have been buffered |
1103 | * by printk(). If this is the case, release_console_sem() emits | 1128 | * by printk(). If this is the case, console_unlock() emits |
1104 | * the output prior to releasing the semaphore. | 1129 | * the output prior to releasing the lock. |
1105 | * | 1130 | * |
1106 | * If there is output waiting for klogd, we wake it up. | 1131 | * If there is output waiting for klogd, we wake it up. |
1107 | * | 1132 | * |
1108 | * release_console_sem() may be called from any context. | 1133 | * console_unlock() may be called from any context. |
1109 | */ | 1134 | */ |
1110 | void release_console_sem(void) | 1135 | void console_unlock(void) |
1111 | { | 1136 | { |
1112 | unsigned long flags; | 1137 | unsigned long flags; |
1113 | unsigned _con_start, _log_end; | 1138 | unsigned _con_start, _log_end; |
@@ -1140,7 +1165,7 @@ void release_console_sem(void) | |||
1140 | if (wake_klogd) | 1165 | if (wake_klogd) |
1141 | wake_up_klogd(); | 1166 | wake_up_klogd(); |
1142 | } | 1167 | } |
1143 | EXPORT_SYMBOL(release_console_sem); | 1168 | EXPORT_SYMBOL(console_unlock); |
1144 | 1169 | ||
1145 | /** | 1170 | /** |
1146 | * console_conditional_schedule - yield the CPU if required | 1171 | * console_conditional_schedule - yield the CPU if required |
@@ -1149,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem); | |||
1149 | * if this CPU should yield the CPU to another task, do | 1174 | * if this CPU should yield the CPU to another task, do |
1150 | * so here. | 1175 | * so here. |
1151 | * | 1176 | * |
1152 | * Must be called within acquire_console_sem(). | 1177 | * Must be called within console_lock(). |
1153 | */ | 1178 | */ |
1154 | void __sched console_conditional_schedule(void) | 1179 | void __sched console_conditional_schedule(void) |
1155 | { | 1180 | { |
@@ -1170,14 +1195,14 @@ void console_unblank(void) | |||
1170 | if (down_trylock(&console_sem) != 0) | 1195 | if (down_trylock(&console_sem) != 0) |
1171 | return; | 1196 | return; |
1172 | } else | 1197 | } else |
1173 | acquire_console_sem(); | 1198 | console_lock(); |
1174 | 1199 | ||
1175 | console_locked = 1; | 1200 | console_locked = 1; |
1176 | console_may_schedule = 0; | 1201 | console_may_schedule = 0; |
1177 | for_each_console(c) | 1202 | for_each_console(c) |
1178 | if ((c->flags & CON_ENABLED) && c->unblank) | 1203 | if ((c->flags & CON_ENABLED) && c->unblank) |
1179 | c->unblank(); | 1204 | c->unblank(); |
1180 | release_console_sem(); | 1205 | console_unlock(); |
1181 | } | 1206 | } |
1182 | 1207 | ||
1183 | /* | 1208 | /* |
@@ -1188,7 +1213,7 @@ struct tty_driver *console_device(int *index) | |||
1188 | struct console *c; | 1213 | struct console *c; |
1189 | struct tty_driver *driver = NULL; | 1214 | struct tty_driver *driver = NULL; |
1190 | 1215 | ||
1191 | acquire_console_sem(); | 1216 | console_lock(); |
1192 | for_each_console(c) { | 1217 | for_each_console(c) { |
1193 | if (!c->device) | 1218 | if (!c->device) |
1194 | continue; | 1219 | continue; |
@@ -1196,7 +1221,7 @@ struct tty_driver *console_device(int *index) | |||
1196 | if (driver) | 1221 | if (driver) |
1197 | break; | 1222 | break; |
1198 | } | 1223 | } |
1199 | release_console_sem(); | 1224 | console_unlock(); |
1200 | return driver; | 1225 | return driver; |
1201 | } | 1226 | } |
1202 | 1227 | ||
@@ -1207,17 +1232,17 @@ struct tty_driver *console_device(int *index) | |||
1207 | */ | 1232 | */ |
1208 | void console_stop(struct console *console) | 1233 | void console_stop(struct console *console) |
1209 | { | 1234 | { |
1210 | acquire_console_sem(); | 1235 | console_lock(); |
1211 | console->flags &= ~CON_ENABLED; | 1236 | console->flags &= ~CON_ENABLED; |
1212 | release_console_sem(); | 1237 | console_unlock(); |
1213 | } | 1238 | } |
1214 | EXPORT_SYMBOL(console_stop); | 1239 | EXPORT_SYMBOL(console_stop); |
1215 | 1240 | ||
1216 | void console_start(struct console *console) | 1241 | void console_start(struct console *console) |
1217 | { | 1242 | { |
1218 | acquire_console_sem(); | 1243 | console_lock(); |
1219 | console->flags |= CON_ENABLED; | 1244 | console->flags |= CON_ENABLED; |
1220 | release_console_sem(); | 1245 | console_unlock(); |
1221 | } | 1246 | } |
1222 | EXPORT_SYMBOL(console_start); | 1247 | EXPORT_SYMBOL(console_start); |
1223 | 1248 | ||
@@ -1339,7 +1364,7 @@ void register_console(struct console *newcon) | |||
1339 | * Put this console in the list - keep the | 1364 | * Put this console in the list - keep the |
1340 | * preferred driver at the head of the list. | 1365 | * preferred driver at the head of the list. |
1341 | */ | 1366 | */ |
1342 | acquire_console_sem(); | 1367 | console_lock(); |
1343 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { | 1368 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { |
1344 | newcon->next = console_drivers; | 1369 | newcon->next = console_drivers; |
1345 | console_drivers = newcon; | 1370 | console_drivers = newcon; |
@@ -1351,14 +1376,15 @@ void register_console(struct console *newcon) | |||
1351 | } | 1376 | } |
1352 | if (newcon->flags & CON_PRINTBUFFER) { | 1377 | if (newcon->flags & CON_PRINTBUFFER) { |
1353 | /* | 1378 | /* |
1354 | * release_console_sem() will print out the buffered messages | 1379 | * console_unlock() will print out the buffered messages |
1355 | * for us. | 1380 | * for us. |
1356 | */ | 1381 | */ |
1357 | spin_lock_irqsave(&logbuf_lock, flags); | 1382 | spin_lock_irqsave(&logbuf_lock, flags); |
1358 | con_start = log_start; | 1383 | con_start = log_start; |
1359 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1384 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1360 | } | 1385 | } |
1361 | release_console_sem(); | 1386 | console_unlock(); |
1387 | console_sysfs_notify(); | ||
1362 | 1388 | ||
1363 | /* | 1389 | /* |
1364 | * By unregistering the bootconsoles after we enable the real console | 1390 | * By unregistering the bootconsoles after we enable the real console |
@@ -1394,7 +1420,7 @@ int unregister_console(struct console *console) | |||
1394 | return braille_unregister_console(console); | 1420 | return braille_unregister_console(console); |
1395 | #endif | 1421 | #endif |
1396 | 1422 | ||
1397 | acquire_console_sem(); | 1423 | console_lock(); |
1398 | if (console_drivers == console) { | 1424 | if (console_drivers == console) { |
1399 | console_drivers=console->next; | 1425 | console_drivers=console->next; |
1400 | res = 0; | 1426 | res = 0; |
@@ -1416,7 +1442,8 @@ int unregister_console(struct console *console) | |||
1416 | if (console_drivers != NULL && console->flags & CON_CONSDEV) | 1442 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
1417 | console_drivers->flags |= CON_CONSDEV; | 1443 | console_drivers->flags |= CON_CONSDEV; |
1418 | 1444 | ||
1419 | release_console_sem(); | 1445 | console_unlock(); |
1446 | console_sysfs_notify(); | ||
1420 | return res; | 1447 | return res; |
1421 | } | 1448 | } |
1422 | EXPORT_SYMBOL(unregister_console); | 1449 | EXPORT_SYMBOL(unregister_console); |
@@ -1500,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) | |||
1500 | /* Don't allow registering multiple times */ | 1527 | /* Don't allow registering multiple times */ |
1501 | if (!dumper->registered) { | 1528 | if (!dumper->registered) { |
1502 | dumper->registered = 1; | 1529 | dumper->registered = 1; |
1503 | list_add_tail(&dumper->list, &dump_list); | 1530 | list_add_tail_rcu(&dumper->list, &dump_list); |
1504 | err = 0; | 1531 | err = 0; |
1505 | } | 1532 | } |
1506 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1533 | spin_unlock_irqrestore(&dump_list_lock, flags); |
@@ -1524,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1524 | spin_lock_irqsave(&dump_list_lock, flags); | 1551 | spin_lock_irqsave(&dump_list_lock, flags); |
1525 | if (dumper->registered) { | 1552 | if (dumper->registered) { |
1526 | dumper->registered = 0; | 1553 | dumper->registered = 0; |
1527 | list_del(&dumper->list); | 1554 | list_del_rcu(&dumper->list); |
1528 | err = 0; | 1555 | err = 0; |
1529 | } | 1556 | } |
1530 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1557 | spin_unlock_irqrestore(&dump_list_lock, flags); |
1558 | synchronize_rcu(); | ||
1531 | 1559 | ||
1532 | return err; | 1560 | return err; |
1533 | } | 1561 | } |
1534 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 1562 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1535 | 1563 | ||
1536 | static const char * const kmsg_reasons[] = { | ||
1537 | [KMSG_DUMP_OOPS] = "oops", | ||
1538 | [KMSG_DUMP_PANIC] = "panic", | ||
1539 | [KMSG_DUMP_KEXEC] = "kexec", | ||
1540 | }; | ||
1541 | |||
1542 | static const char *kmsg_to_str(enum kmsg_dump_reason reason) | ||
1543 | { | ||
1544 | if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) | ||
1545 | return "unknown"; | ||
1546 | |||
1547 | return kmsg_reasons[reason]; | ||
1548 | } | ||
1549 | |||
1550 | /** | 1564 | /** |
1551 | * kmsg_dump - dump kernel log to kernel message dumpers. | 1565 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1552 | * @reason: the reason (oops, panic etc) for dumping | 1566 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1585,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1585 | l2 = chars; | 1599 | l2 = chars; |
1586 | } | 1600 | } |
1587 | 1601 | ||
1588 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { | 1602 | rcu_read_lock(); |
1589 | printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", | 1603 | list_for_each_entry_rcu(dumper, &dump_list, list) |
1590 | kmsg_to_str(reason)); | ||
1591 | return; | ||
1592 | } | ||
1593 | list_for_each_entry(dumper, &dump_list, list) | ||
1594 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 1604 | dumper->dump(dumper, reason, s1, l1, s2, l2); |
1595 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1605 | rcu_read_unlock(); |
1596 | } | 1606 | } |
1597 | #endif | 1607 | #endif |
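
With dump_list now traversed under rcu_read_lock(), kmsg_dump() no longer bails out when dump_list_lock is contended at panic time; registration and unregistration still serialize on the spinlock, and kmsg_dump_unregister() waits for in-flight readers via synchronize_rcu(). A hedged sketch of a dumper using this interface (my_dump() is a placeholder; the callback signature matches the dumper->dump() call above):

#include <linux/kmsg_dump.h>

static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
		    const char *s1, unsigned long l1,
		    const char *s2, unsigned long l2)
{
	/* Persist s1[0..l1) then s2[0..l2), the two halves of the log ring. */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_dump,
};

/* kmsg_dump_register(&my_dumper); later: kmsg_dump_unregister(&my_dumper); */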
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 99bbaa3e5b0d..1708b1e2972d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
313 | child->exit_code = data; | 313 | child->exit_code = data; |
314 | dead = __ptrace_detach(current, child); | 314 | dead = __ptrace_detach(current, child); |
315 | if (!child->exit_state) | 315 | if (!child->exit_state) |
316 | wake_up_process(child); | 316 | wake_up_state(child, TASK_TRACED | TASK_STOPPED); |
317 | } | 317 | } |
318 | write_unlock_irq(&tasklist_lock); | 318 | write_unlock_irq(&tasklist_lock); |
319 | 319 | ||
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d806735342ac..0c343b9a46d5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -36,31 +36,16 @@ | |||
36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
38 | 38 | ||
39 | /* Global control variables for rcupdate callback mechanism. */ | 39 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ |
40 | struct rcu_ctrlblk { | 40 | static struct task_struct *rcu_kthread_task; |
41 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | 41 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); |
42 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | 42 | static unsigned long have_rcu_kthread_work; |
43 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | 43 | static void invoke_rcu_kthread(void); |
44 | }; | ||
45 | |||
46 | /* Definition for rcupdate control block. */ | ||
47 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
48 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
49 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
50 | }; | ||
51 | |||
52 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
53 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
54 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
55 | }; | ||
56 | |||
57 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
58 | int rcu_scheduler_active __read_mostly; | ||
59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
61 | 44 | ||
62 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
63 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 46 | struct rcu_ctrlblk; |
47 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
48 | static int rcu_kthread(void *arg); | ||
64 | static void __call_rcu(struct rcu_head *head, | 49 | static void __call_rcu(struct rcu_head *head, |
65 | void (*func)(struct rcu_head *rcu), | 50 | void (*func)(struct rcu_head *rcu), |
66 | struct rcu_ctrlblk *rcp); | 51 | struct rcu_ctrlblk *rcp); |
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu) | |||
123 | { | 108 | { |
124 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
125 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
126 | raise_softirq(RCU_SOFTIRQ); | 111 | invoke_rcu_kthread(); |
127 | } | 112 | } |
128 | 113 | ||
129 | /* | 114 | /* |
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu) | |||
132 | void rcu_bh_qs(int cpu) | 117 | void rcu_bh_qs(int cpu) |
133 | { | 118 | { |
134 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
135 | raise_softirq(RCU_SOFTIRQ); | 120 | invoke_rcu_kthread(); |
136 | } | 121 | } |
137 | 122 | ||
138 | /* | 123 | /* |
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user) | |||
152 | } | 137 | } |
153 | 138 | ||
154 | /* | 139 | /* |
155 | * Helper function for rcu_process_callbacks() that operates on the | 140 | * Invoke the RCU callbacks on the specified rcu_ctrlblk structure |
156 | * specified rcu_ctrlblk structure. | 141 | * whose grace period has elapsed. |
157 | */ | 142 | */ |
158 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 143 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
159 | { | 144 | { |
160 | struct rcu_head *next, *list; | 145 | struct rcu_head *next, *list; |
161 | unsigned long flags; | 146 | unsigned long flags; |
147 | RCU_TRACE(int cb_count = 0); | ||
162 | 148 | ||
163 | /* If no RCU callbacks ready to invoke, just return. */ | 149 | /* If no RCU callbacks ready to invoke, just return. */ |
164 | if (&rcp->rcucblist == rcp->donetail) | 150 | if (&rcp->rcucblist == rcp->donetail) |
@@ -180,19 +166,59 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
180 | next = list->next; | 166 | next = list->next; |
181 | prefetch(next); | 167 | prefetch(next); |
182 | debug_rcu_head_unqueue(list); | 168 | debug_rcu_head_unqueue(list); |
169 | local_bh_disable(); | ||
183 | list->func(list); | 170 | list->func(list); |
171 | local_bh_enable(); | ||
184 | list = next; | 172 | list = next; |
173 | RCU_TRACE(cb_count++); | ||
185 | } | 174 | } |
175 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | ||
186 | } | 176 | } |
187 | 177 | ||
188 | /* | 178 | /* |
189 | * Invoke any callbacks whose grace period has completed. | 179 | * This kthread invokes RCU callbacks whose grace periods have |
180 | * elapsed. It is awakened as needed, and takes the place of the | ||
181 | * RCU_SOFTIRQ that was used previously for this purpose. | ||
182 | * This is a kthread, but it is never stopped, at least not until | ||
183 | * the system goes down. | ||
190 | */ | 184 | */ |
191 | static void rcu_process_callbacks(struct softirq_action *unused) | 185 | static int rcu_kthread(void *arg) |
192 | { | 186 | { |
193 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 187 | unsigned long work; |
194 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 188 | unsigned long morework; |
195 | rcu_preempt_process_callbacks(); | 189 | unsigned long flags; |
190 | |||
191 | for (;;) { | ||
192 | wait_event_interruptible(rcu_kthread_wq, | ||
193 | have_rcu_kthread_work != 0); | ||
194 | morework = rcu_boost(); | ||
195 | local_irq_save(flags); | ||
196 | work = have_rcu_kthread_work; | ||
197 | have_rcu_kthread_work = morework; | ||
198 | local_irq_restore(flags); | ||
199 | if (work) { | ||
200 | rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
201 | rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
202 | rcu_preempt_process_callbacks(); | ||
203 | } | ||
204 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
205 | } | ||
206 | |||
207 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
212 | * or to boost readers. | ||
213 | */ | ||
214 | static void invoke_rcu_kthread(void) | ||
215 | { | ||
216 | unsigned long flags; | ||
217 | |||
218 | local_irq_save(flags); | ||
219 | have_rcu_kthread_work = 1; | ||
220 | wake_up(&rcu_kthread_wq); | ||
221 | local_irq_restore(flags); | ||
196 | } | 222 | } |
197 | 223 | ||
198 | /* | 224 | /* |
@@ -230,6 +256,7 @@ static void __call_rcu(struct rcu_head *head, | |||
230 | local_irq_save(flags); | 256 | local_irq_save(flags); |
231 | *rcp->curtail = head; | 257 | *rcp->curtail = head; |
232 | rcp->curtail = &head->next; | 258 | rcp->curtail = &head->next; |
259 | RCU_TRACE(rcp->qlen++); | ||
233 | local_irq_restore(flags); | 260 | local_irq_restore(flags); |
234 | } | 261 | } |
235 | 262 | ||
@@ -282,7 +309,16 @@ void rcu_barrier_sched(void) | |||
282 | } | 309 | } |
283 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 310 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
284 | 311 | ||
285 | void __init rcu_init(void) | 312 | /* |
313 | * Spawn the kthread that invokes RCU callbacks. | ||
314 | */ | ||
315 | static int __init rcu_spawn_kthreads(void) | ||
286 | { | 316 | { |
287 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 317 | struct sched_param sp; |
318 | |||
319 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
320 | sp.sched_priority = RCU_BOOST_PRIO; | ||
321 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
322 | return 0; | ||
288 | } | 323 | } |
324 | early_initcall(rcu_spawn_kthreads); | ||
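
The caller-visible contract is unchanged by the kthread conversion: callbacks queued with call_rcu() (or call_rcu_bh()/call_rcu_sched()) still run after a grace period, only now from rcu_kthread() with bottom halves disabled around each invocation instead of from RCU_SOFTIRQ. A standard usage sketch (struct my_node is illustrative):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_node {
	int value;
	struct rcu_head rcu;
};

static void my_node_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct my_node, rcu));
}

static void my_node_release(struct my_node *node)
{
	/* Freed only after all pre-existing readers are done. */
	call_rcu(&node->rcu, my_node_free_cb);
}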
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 6ceca4f745ff..015abaea962a 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -22,6 +22,40 @@ | |||
22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/kthread.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/seq_file.h> | ||
28 | |||
29 | #ifdef CONFIG_RCU_TRACE | ||
30 | #define RCU_TRACE(stmt) stmt | ||
31 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
32 | #define RCU_TRACE(stmt) | ||
33 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
34 | |||
35 | /* Global control variables for rcupdate callback mechanism. */ | ||
36 | struct rcu_ctrlblk { | ||
37 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
38 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
39 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
40 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | ||
41 | }; | ||
42 | |||
43 | /* Definition for rcupdate control block. */ | ||
44 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
45 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
46 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
47 | }; | ||
48 | |||
49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
52 | }; | ||
53 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
55 | int rcu_scheduler_active __read_mostly; | ||
56 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
57 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
58 | |||
25 | #ifdef CONFIG_TINY_PREEMPT_RCU | 59 | #ifdef CONFIG_TINY_PREEMPT_RCU |
26 | 60 | ||
27 | #include <linux/delay.h> | 61 | #include <linux/delay.h> |
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk { | |||
46 | struct list_head *gp_tasks; | 80 | struct list_head *gp_tasks; |
47 | /* Pointer to the first task blocking the */ | 81 | /* Pointer to the first task blocking the */ |
48 | /* current grace period, or NULL if there */ | 82 | /* current grace period, or NULL if there */ |
49 | /* is not such task. */ | 83 | /* is no such task. */ |
50 | struct list_head *exp_tasks; | 84 | struct list_head *exp_tasks; |
51 | /* Pointer to first task blocking the */ | 85 | /* Pointer to first task blocking the */ |
52 | /* current expedited grace period, or NULL */ | 86 | /* current expedited grace period, or NULL */ |
53 | /* if there is no such task. If there */ | 87 | /* if there is no such task. If there */ |
54 | /* is no current expedited grace period, */ | 88 | /* is no current expedited grace period, */ |
55 | /* then there cannot be any such task. */ | 89 | /* then there cannot be any such task. */ |
90 | #ifdef CONFIG_RCU_BOOST | ||
91 | struct list_head *boost_tasks; | ||
92 | /* Pointer to first task that needs to be */ | ||
93 | /* priority-boosted, or NULL if no priority */ | ||
94 | /* boosting is needed. If there is no */ | ||
95 | /* current or expedited grace period, there */ | ||
96 | /* can be no such task. */ | ||
97 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
56 | u8 gpnum; /* Current grace period. */ | 98 | u8 gpnum; /* Current grace period. */ |
57 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | 99 | u8 gpcpu; /* Last grace period blocked by the CPU. */ |
58 | u8 completed; /* Last grace period completed. */ | 100 | u8 completed; /* Last grace period completed. */ |
59 | /* If all three are equal, RCU is idle. */ | 101 | /* If all three are equal, RCU is idle. */ |
102 | #ifdef CONFIG_RCU_BOOST | ||
103 | s8 boosted_this_gp; /* Has boosting already happened? */ | ||
104 | unsigned long boost_time; /* When to start boosting (jiffies) */ | ||
105 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
106 | #ifdef CONFIG_RCU_TRACE | ||
107 | unsigned long n_grace_periods; | ||
108 | #ifdef CONFIG_RCU_BOOST | ||
109 | unsigned long n_tasks_boosted; | ||
110 | unsigned long n_exp_boosts; | ||
111 | unsigned long n_normal_boosts; | ||
112 | unsigned long n_normal_balk_blkd_tasks; | ||
113 | unsigned long n_normal_balk_gp_tasks; | ||
114 | unsigned long n_normal_balk_boost_tasks; | ||
115 | unsigned long n_normal_balk_boosted; | ||
116 | unsigned long n_normal_balk_notyet; | ||
117 | unsigned long n_normal_balk_nos; | ||
118 | unsigned long n_exp_balk_blkd_tasks; | ||
119 | unsigned long n_exp_balk_nos; | ||
120 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
121 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
60 | }; | 122 | }; |
61 | 123 | ||
62 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | 124 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { |
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void) | |||
122 | } | 184 | } |
123 | 185 | ||
124 | /* | 186 | /* |
187 | * Advance a ->blkd_tasks-list pointer to the next entry, | ||
188 | * returning NULL instead if at the end of the list. | ||
189 | */ | ||
190 | static struct list_head *rcu_next_node_entry(struct task_struct *t) | ||
191 | { | ||
192 | struct list_head *np; | ||
193 | |||
194 | np = t->rcu_node_entry.next; | ||
195 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
196 | np = NULL; | ||
197 | return np; | ||
198 | } | ||
199 | |||
200 | #ifdef CONFIG_RCU_TRACE | ||
201 | |||
202 | #ifdef CONFIG_RCU_BOOST | ||
203 | static void rcu_initiate_boost_trace(void); | ||
204 | static void rcu_initiate_exp_boost_trace(void); | ||
205 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
206 | |||
207 | /* | ||
208 | * Dump additional statistics for TINY_PREEMPT_RCU. | ||
209 | */ | ||
210 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
211 | { | ||
212 | seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", | ||
213 | rcu_preempt_ctrlblk.rcb.qlen, | ||
214 | rcu_preempt_ctrlblk.n_grace_periods, | ||
215 | rcu_preempt_ctrlblk.gpnum, | ||
216 | rcu_preempt_ctrlblk.gpcpu, | ||
217 | rcu_preempt_ctrlblk.completed, | ||
218 | "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], | ||
219 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | ||
220 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | ||
221 | #ifdef CONFIG_RCU_BOOST | ||
222 | seq_printf(m, " ttb=%c btg=", | ||
223 | "B."[!rcu_preempt_ctrlblk.boost_tasks]); | ||
224 | switch (rcu_preempt_ctrlblk.boosted_this_gp) { | ||
225 | case -1: | ||
226 | seq_puts(m, "exp"); | ||
227 | break; | ||
228 | case 0: | ||
229 | seq_puts(m, "no"); | ||
230 | break; | ||
231 | case 1: | ||
232 | seq_puts(m, "begun"); | ||
233 | break; | ||
234 | case 2: | ||
235 | seq_puts(m, "done"); | ||
236 | break; | ||
237 | default: | ||
238 | seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | ||
239 | } | ||
240 | seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
241 | rcu_preempt_ctrlblk.n_tasks_boosted, | ||
242 | rcu_preempt_ctrlblk.n_exp_boosts, | ||
243 | rcu_preempt_ctrlblk.n_normal_boosts, | ||
244 | (int)(jiffies & 0xffff), | ||
245 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | ||
246 | seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | ||
247 | "normal balk", | ||
248 | rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | ||
249 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | ||
250 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | ||
251 | rcu_preempt_ctrlblk.n_normal_balk_boosted, | ||
252 | rcu_preempt_ctrlblk.n_normal_balk_notyet, | ||
253 | rcu_preempt_ctrlblk.n_normal_balk_nos); | ||
254 | seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | ||
255 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | ||
256 | rcu_preempt_ctrlblk.n_exp_balk_nos); | ||
257 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
258 | } | ||
259 | |||
260 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
261 | |||
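The seq_printf() calls in show_tiny_preempt_stats() use string-literal indexing to emit one-character status flags: "T."[x] evaluates to 'T' when x is 0 and '.' when x is 1, so a zero/nonzero predicate selects between the two glyphs. A runnable sketch of just that idiom:

    #include <stdio.h>

    int main(void)
    {
    	int list_is_empty = 1;

    	/* "T."[0] == 'T', "T."[1] == '.' */
    	printf("tasks=%c\n", "T."[list_is_empty]);	/* prints "tasks=." */
    	return 0;
    }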
262 | #ifdef CONFIG_RCU_BOOST | ||
263 | |||
264 | #include "rtmutex_common.h" | ||
265 | |||
266 | /* | ||
267 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | ||
268 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | ||
269 | */ | ||
270 | static int rcu_boost(void) | ||
271 | { | ||
272 | unsigned long flags; | ||
273 | struct rt_mutex mtx; | ||
274 | struct list_head *np; | ||
275 | struct task_struct *t; | ||
276 | |||
277 | if (rcu_preempt_ctrlblk.boost_tasks == NULL) | ||
278 | return 0; /* Nothing to boost. */ | ||
279 | raw_local_irq_save(flags); | ||
280 | rcu_preempt_ctrlblk.boosted_this_gp++; | ||
281 | t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | ||
282 | rcu_node_entry); | ||
283 | np = rcu_next_node_entry(t); | ||
284 | rt_mutex_init_proxy_locked(&mtx, t); | ||
285 | t->rcu_boost_mutex = &mtx; | ||
286 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
287 | raw_local_irq_restore(flags); | ||
288 | rt_mutex_lock(&mtx); | ||
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
290 | rcu_preempt_ctrlblk.boosted_this_gp++; | ||
291 | rt_mutex_unlock(&mtx); | ||
292 | return rcu_preempt_ctrlblk.boost_tasks != NULL; | ||
293 | } | ||
294 | |||
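rcu_boost() above leans on the rt-mutex priority-inheritance machinery: the booster initializes an on-stack mutex as already owned by the preempted reader (a "proxy lock"), records it in the task, and then blocks acquiring it, which lends the booster's real-time priority to the reader until the reader drops the mutex in rcu_read_unlock_special(). A condensed kernel-style sketch of that handoff; it is not standalone-compilable, and boost_task() is an illustrative name:

    #include "rtmutex_common.h"

    static void boost_task(struct task_struct *reader)
    {
    	struct rt_mutex mtx;	/* lives on the booster's stack */

    	/* Make 'reader' the owner without it calling rt_mutex_lock(). */
    	rt_mutex_init_proxy_locked(&mtx, reader);
    	reader->rcu_boost_mutex = &mtx;	/* reader unlocks it later */

    	/* Blocks here; PI raises 'reader' to our RT priority. */
    	rt_mutex_lock(&mtx);
    	rt_mutex_unlock(&mtx);
    	/* Reader has left its critical section and is unboosted. */
    }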
295 | /* | ||
296 | * Check to see if it is now time to start boosting RCU readers blocking | ||
297 | * the current grace period, and, if so, tell the rcu_kthread_task to | ||
298 | * start boosting them. If there is an expedited boost in progress, | ||
299 | * we wait for it to complete. | ||
300 | * | ||
301 | * If there are no blocked readers blocking the current grace period, | ||
302 | * return 0 to let the caller know, otherwise return 1. Note that this | ||
303 | * return value is independent of whether or not boosting was done. | ||
304 | */ | ||
305 | static int rcu_initiate_boost(void) | ||
306 | { | ||
307 | if (!rcu_preempt_blocked_readers_cgp()) { | ||
308 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | ||
309 | return 0; | ||
310 | } | ||
311 | if (rcu_preempt_ctrlblk.gp_tasks != NULL && | ||
312 | rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
313 | rcu_preempt_ctrlblk.boosted_this_gp == 0 && | ||
314 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | ||
315 | rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | ||
316 | invoke_rcu_kthread(); | ||
317 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
318 | } else | ||
319 | RCU_TRACE(rcu_initiate_boost_trace()); | ||
320 | return 1; | ||
321 | } | ||
322 | |||
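rcu_initiate_boost() compares jiffies to ->boost_time with ULONG_CMP_GE rather than a plain >=, so the test survives counter wraparound. The macro below matches the kernel's modular-arithmetic definition of this era, but treat the example as illustrative:

    #include <limits.h>
    #include <stdio.h>

    /* Wraparound-safe "a is at or after b" for free-running counters. */
    #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

    int main(void)
    {
    	unsigned long b = ULONG_MAX - 5;	/* deadline just before wrap */
    	unsigned long a = 10;			/* "now", after the wrap */

    	/* Plain >= says the deadline is far away; the modular compare
    	 * correctly says it has passed. */
    	printf("plain=%d modular=%d\n", a >= b, ULONG_CMP_GE(a, b));
    	return 0;
    }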
323 | /* | ||
324 | * Initiate boosting for an expedited grace period. | ||
325 | */ | ||
326 | static void rcu_initiate_expedited_boost(void) | ||
327 | { | ||
328 | unsigned long flags; | ||
329 | |||
330 | raw_local_irq_save(flags); | ||
331 | if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | ||
332 | rcu_preempt_ctrlblk.boost_tasks = | ||
333 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
334 | rcu_preempt_ctrlblk.boosted_this_gp = -1; | ||
335 | invoke_rcu_kthread(); | ||
336 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
337 | } else | ||
338 | RCU_TRACE(rcu_initiate_exp_boost_trace()); | ||
339 | raw_local_irq_restore(flags); | ||
340 | } | ||
341 | |||
342 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
343 | |||
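RCU_BOOST_DELAY_JIFFIES converts the CONFIG_RCU_BOOST_DELAY value, in milliseconds, into jiffies, rounding up so that a nonzero configured delay can never truncate to zero ticks. A worked example under assumed values (500 ms delay, HZ=250; neither is from the patch):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
    	int delay_ms = 500, hz = 250;	/* assumed configuration */

    	/* 500 * 250 / 1000 = 125 exactly; the round-up only matters
    	 * when delay_ms * hz is not a multiple of 1000. */
    	printf("%d jiffies\n", DIV_ROUND_UP(delay_ms * hz, 1000));
    	return 0;
    }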
344 | /* | ||
345 | * Do priority-boost accounting for the start of a new grace period. | ||
346 | */ | ||
347 | static void rcu_preempt_boost_start_gp(void) | ||
348 | { | ||
349 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
350 | if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | ||
351 | rcu_preempt_ctrlblk.boosted_this_gp = 0; | ||
352 | } | ||
353 | |||
354 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
355 | |||
356 | /* | ||
357 | * If there is no RCU priority boosting, we don't boost. | ||
358 | */ | ||
359 | static int rcu_boost(void) | ||
360 | { | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * If there is no RCU priority boosting, we don't initiate boosting, | ||
366 | * but we do indicate whether there are blocked readers blocking the | ||
367 | * current grace period. | ||
368 | */ | ||
369 | static int rcu_initiate_boost(void) | ||
370 | { | ||
371 | return rcu_preempt_blocked_readers_cgp(); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * If there is no RCU priority boosting, we don't initiate expedited boosting. | ||
376 | */ | ||
377 | static void rcu_initiate_expedited_boost(void) | ||
378 | { | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * If there is no RCU priority boosting, nothing to do at grace-period start. | ||
383 | */ | ||
384 | static void rcu_preempt_boost_start_gp(void) | ||
385 | { | ||
386 | } | ||
387 | |||
388 | #endif /* else #ifdef CONFIG_RCU_BOOST */ | ||
389 | |||
390 | /* | ||
125 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 391 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
126 | * that this just means that the task currently running on the CPU is | 392 | * that this just means that the task currently running on the CPU is |
127 | * in a quiescent state. There might be any number of tasks blocked | 393 | * in a quiescent state. There might be any number of tasks blocked |
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void) | |||
148 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | 414 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; |
149 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 415 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
150 | 416 | ||
417 | /* If there is no GP then there is nothing more to do. */ | ||
418 | if (!rcu_preempt_gp_in_progress()) | ||
419 | return; | ||
151 | /* | 420 | /* |
152 | * If there is no GP, or if blocked readers are still blocking GP, | 421 | * Check up on boosting. If there are no readers blocking the |
153 | * then there is nothing more to do. | 422 | * current grace period, leave. |
154 | */ | 423 | */ |
155 | if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | 424 | if (rcu_initiate_boost()) |
156 | return; | 425 | return; |
157 | 426 | ||
158 | /* Advance callbacks. */ | 427 | /* Advance callbacks. */ |
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void) | |||
164 | if (!rcu_preempt_blocked_readers_any()) | 433 | if (!rcu_preempt_blocked_readers_any()) |
165 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | 434 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; |
166 | 435 | ||
167 | /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | 436 | /* If there are done callbacks, cause them to be invoked. */ |
168 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | 437 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) |
169 | raise_softirq(RCU_SOFTIRQ); | 438 | invoke_rcu_kthread(); |
170 | } | 439 | } |
171 | 440 | ||
172 | /* | 441 | /* |
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void) | |||
178 | 447 | ||
179 | /* Official start of GP. */ | 448 | /* Official start of GP. */ |
180 | rcu_preempt_ctrlblk.gpnum++; | 449 | rcu_preempt_ctrlblk.gpnum++; |
450 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | ||
181 | 451 | ||
182 | /* Any blocked RCU readers block new GP. */ | 452 | /* Any blocked RCU readers block new GP. */ |
183 | if (rcu_preempt_blocked_readers_any()) | 453 | if (rcu_preempt_blocked_readers_any()) |
184 | rcu_preempt_ctrlblk.gp_tasks = | 454 | rcu_preempt_ctrlblk.gp_tasks = |
185 | rcu_preempt_ctrlblk.blkd_tasks.next; | 455 | rcu_preempt_ctrlblk.blkd_tasks.next; |
186 | 456 | ||
457 | /* Set up for RCU priority boosting. */ | ||
458 | rcu_preempt_boost_start_gp(); | ||
459 | |||
187 | /* If there is no running reader, CPU is done with GP. */ | 460 | /* If there is no running reader, CPU is done with GP. */ |
188 | if (!rcu_preempt_running_reader()) | 461 | if (!rcu_preempt_running_reader()) |
189 | rcu_preempt_cpu_qs(); | 462 | rcu_preempt_cpu_qs(); |
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
304 | */ | 577 | */ |
305 | empty = !rcu_preempt_blocked_readers_cgp(); | 578 | empty = !rcu_preempt_blocked_readers_cgp(); |
306 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | 579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
307 | np = t->rcu_node_entry.next; | 580 | np = rcu_next_node_entry(t); |
308 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
309 | np = NULL; | ||
310 | list_del(&t->rcu_node_entry); | 581 | list_del(&t->rcu_node_entry); |
311 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | 582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
312 | rcu_preempt_ctrlblk.gp_tasks = np; | 583 | rcu_preempt_ctrlblk.gp_tasks = np; |
313 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | 584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
314 | rcu_preempt_ctrlblk.exp_tasks = np; | 585 | rcu_preempt_ctrlblk.exp_tasks = np; |
586 | #ifdef CONFIG_RCU_BOOST | ||
587 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | ||
588 | rcu_preempt_ctrlblk.boost_tasks = np; | ||
589 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
315 | INIT_LIST_HEAD(&t->rcu_node_entry); | 590 | INIT_LIST_HEAD(&t->rcu_node_entry); |
316 | 591 | ||
317 | /* | 592 | /* |
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
331 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | 606 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) |
332 | rcu_report_exp_done(); | 607 | rcu_report_exp_done(); |
333 | } | 608 | } |
609 | #ifdef CONFIG_RCU_BOOST | ||
610 | /* Unboost self if we were boosted. */ | ||
611 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
612 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | ||
613 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
614 | t->rcu_boost_mutex = NULL; | ||
615 | } | ||
616 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
334 | local_irq_restore(flags); | 617 | local_irq_restore(flags); |
335 | } | 618 | } |
336 | 619 | ||
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void) | |||
374 | rcu_preempt_cpu_qs(); | 657 | rcu_preempt_cpu_qs(); |
375 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | 658 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != |
376 | rcu_preempt_ctrlblk.rcb.donetail) | 659 | rcu_preempt_ctrlblk.rcb.donetail) |
377 | raise_softirq(RCU_SOFTIRQ); | 660 | invoke_rcu_kthread(); |
378 | if (rcu_preempt_gp_in_progress() && | 661 | if (rcu_preempt_gp_in_progress() && |
379 | rcu_cpu_blocking_cur_gp() && | 662 | rcu_cpu_blocking_cur_gp() && |
380 | rcu_preempt_running_reader()) | 663 | rcu_preempt_running_reader()) |
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void) | |||
383 | 666 | ||
384 | /* | 667 | /* |
385 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | 668 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to |
386 | * update, so this is invoked from __rcu_process_callbacks() to | 669 | * update, so this is invoked from rcu_process_callbacks() to |
387 | * handle that case. Of course, it is invoked for all flavors of | 670 | * handle that case. Of course, it is invoked for all flavors of |
388 | * RCU, but RCU callbacks can appear only on one of the lists, and | 671 | * RCU, but RCU callbacks can appear only on one of the lists, and |
389 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | 672 | * neither ->nexttail nor ->donetail can possibly be NULL, so there |
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | |||
400 | */ | 683 | */ |
401 | static void rcu_preempt_process_callbacks(void) | 684 | static void rcu_preempt_process_callbacks(void) |
402 | { | 685 | { |
403 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | 686 | rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); |
404 | } | 687 | } |
405 | 688 | ||
406 | /* | 689 | /* |
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
417 | local_irq_save(flags); | 700 | local_irq_save(flags); |
418 | *rcu_preempt_ctrlblk.nexttail = head; | 701 | *rcu_preempt_ctrlblk.nexttail = head; |
419 | rcu_preempt_ctrlblk.nexttail = &head->next; | 702 | rcu_preempt_ctrlblk.nexttail = &head->next; |
703 | RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); | ||
420 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | 704 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ |
421 | local_irq_restore(flags); | 705 | local_irq_restore(flags); |
422 | } | 706 | } |
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void) | |||
532 | 816 | ||
533 | /* Wait for tail of ->blkd_tasks list to drain. */ | 817 | /* Wait for tail of ->blkd_tasks list to drain. */ |
534 | if (rcu_preempted_readers_exp()) | 818 | if (rcu_preempted_readers_exp()) |
819 | rcu_initiate_expedited_boost(); | ||
535 | wait_event(sync_rcu_preempt_exp_wq, | 820 | wait_event(sync_rcu_preempt_exp_wq, |
536 | !rcu_preempted_readers_exp()); | 821 | !rcu_preempted_readers_exp()); |
537 | 822 | ||
@@ -572,6 +857,27 @@ void exit_rcu(void) | |||
572 | 857 | ||
573 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 858 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
574 | 859 | ||
860 | #ifdef CONFIG_RCU_TRACE | ||
861 | |||
862 | /* | ||
863 | * Because preemptible RCU does not exist, it is not necessary to | ||
864 | * dump out its statistics. | ||
865 | */ | ||
866 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
867 | { | ||
868 | } | ||
869 | |||
870 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
871 | |||
872 | /* | ||
873 | * Because preemptible RCU does not exist, it is never necessary to | ||
874 | * boost preempted RCU readers. | ||
875 | */ | ||
876 | static int rcu_boost(void) | ||
877 | { | ||
878 | return 0; | ||
879 | } | ||
880 | |||
575 | /* | 881 | /* |
576 | * Because preemptible RCU does not exist, it never has any callbacks | 882 | * Because preemptible RCU does not exist, it never has any callbacks |
577 | * to check. | 883 | * to check. |
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void) | |||
599 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | 905 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ |
600 | 906 | ||
601 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 907 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
602 | |||
603 | #include <linux/kernel_stat.h> | 908 | #include <linux/kernel_stat.h> |
604 | 909 | ||
605 | /* | 910 | /* |
606 | * During boot, we forgive RCU lockdep issues. After this function is | 911 | * During boot, we forgive RCU lockdep issues. After this function is |
607 | * invoked, we start taking RCU lockdep issues seriously. | 912 | * invoked, we start taking RCU lockdep issues seriously. |
608 | */ | 913 | */ |
609 | void rcu_scheduler_starting(void) | 914 | void __init rcu_scheduler_starting(void) |
610 | { | 915 | { |
611 | WARN_ON(nr_context_switches() > 0); | 916 | WARN_ON(nr_context_switches() > 0); |
612 | rcu_scheduler_active = 1; | 917 | rcu_scheduler_active = 1; |
613 | } | 918 | } |
614 | 919 | ||
615 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 920 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
921 | |||
922 | #ifdef CONFIG_RCU_BOOST | ||
923 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
924 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
925 | #define RCU_BOOST_PRIO 1 | ||
926 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
927 | |||
928 | #ifdef CONFIG_RCU_TRACE | ||
929 | |||
930 | #ifdef CONFIG_RCU_BOOST | ||
931 | |||
932 | static void rcu_initiate_boost_trace(void) | ||
933 | { | ||
934 | if (rcu_preempt_ctrlblk.gp_tasks == NULL) | ||
935 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | ||
936 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | ||
937 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | ||
938 | else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | ||
939 | rcu_preempt_ctrlblk.n_normal_balk_boosted++; | ||
940 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | ||
941 | rcu_preempt_ctrlblk.n_normal_balk_notyet++; | ||
942 | else | ||
943 | rcu_preempt_ctrlblk.n_normal_balk_nos++; | ||
944 | } | ||
945 | |||
946 | static void rcu_initiate_exp_boost_trace(void) | ||
947 | { | ||
948 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
949 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | ||
950 | else | ||
951 | rcu_preempt_ctrlblk.n_exp_balk_nos++; | ||
952 | } | ||
953 | |||
954 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
955 | |||
956 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | ||
957 | { | ||
958 | unsigned long flags; | ||
959 | |||
960 | raw_local_irq_save(flags); | ||
961 | rcp->qlen -= n; | ||
962 | raw_local_irq_restore(flags); | ||
963 | } | ||
964 | |||
965 | /* | ||
966 | * Dump statistics for TINY_RCU, such as they are. | ||
967 | */ | ||
968 | static int show_tiny_stats(struct seq_file *m, void *unused) | ||
969 | { | ||
970 | show_tiny_preempt_stats(m); | ||
971 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | ||
972 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | ||
973 | return 0; | ||
974 | } | ||
975 | |||
976 | static int show_tiny_stats_open(struct inode *inode, struct file *file) | ||
977 | { | ||
978 | return single_open(file, show_tiny_stats, NULL); | ||
979 | } | ||
980 | |||
981 | static const struct file_operations show_tiny_stats_fops = { | ||
982 | .owner = THIS_MODULE, | ||
983 | .open = show_tiny_stats_open, | ||
984 | .read = seq_read, | ||
985 | .llseek = seq_lseek, | ||
986 | .release = single_release, | ||
987 | }; | ||
988 | |||
989 | static struct dentry *rcudir; | ||
990 | |||
991 | static int __init rcutiny_trace_init(void) | ||
992 | { | ||
993 | struct dentry *retval; | ||
994 | |||
995 | rcudir = debugfs_create_dir("rcu", NULL); | ||
996 | if (!rcudir) | ||
997 | goto free_out; | ||
998 | retval = debugfs_create_file("rcudata", 0444, rcudir, | ||
999 | NULL, &show_tiny_stats_fops); | ||
1000 | if (!retval) | ||
1001 | goto free_out; | ||
1002 | return 0; | ||
1003 | free_out: | ||
1004 | debugfs_remove_recursive(rcudir); | ||
1005 | return 1; | ||
1006 | } | ||
1007 | |||
1008 | static void __exit rcutiny_trace_cleanup(void) | ||
1009 | { | ||
1010 | debugfs_remove_recursive(rcudir); | ||
1011 | } | ||
1012 | |||
1013 | module_init(rcutiny_trace_init); | ||
1014 | module_exit(rcutiny_trace_cleanup); | ||
1015 | |||
1016 | MODULE_AUTHOR("Paul E. McKenney"); | ||
1017 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | ||
1018 | MODULE_LICENSE("GPL"); | ||
1019 | |||
1020 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
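The tracing code above is the stock debugfs-plus-seq_file recipe: a show function, a single_open() wrapper, a file_operations table, and paired init/cleanup around debugfs_create_dir()/debugfs_create_file(). A condensed kernel-style sketch of that shape, using the era's API in which the create functions return NULL on failure; the names are illustrative and it will not compile outside a kernel tree:

    #include <linux/module.h>
    #include <linux/debugfs.h>
    #include <linux/seq_file.h>

    static int my_show(struct seq_file *m, void *unused)
    {
    	seq_printf(m, "stats go here\n");
    	return 0;
    }

    static int my_open(struct inode *inode, struct file *file)
    {
    	return single_open(file, my_show, NULL);
    }

    static const struct file_operations my_fops = {
    	.owner   = THIS_MODULE,
    	.open    = my_open,
    	.read    = seq_read,
    	.llseek  = seq_lseek,
    	.release = single_release,
    };

    static struct dentry *dir;

    static int __init my_init(void)
    {
    	dir = debugfs_create_dir("mydir", NULL);
    	if (!dir || !debugfs_create_file("stats", 0444, dir,
    					 NULL, &my_fops)) {
    		debugfs_remove_recursive(dir);	/* NULL-safe */
    		return -ENOMEM;
    	}
    	return 0;
    }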
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9d8e8fb2515f..89613f97ff26 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
50 | #include <linux/sched.h> | ||
50 | 51 | ||
51 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 65 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 66 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
68 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
69 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
70 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 71 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
68 | 72 | ||
69 | module_param(nreaders, int, 0444); | 73 | module_param(nreaders, int, 0444); |
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444); | |||
88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 92 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
89 | module_param(fqs_stutter, int, 0444); | 93 | module_param(fqs_stutter, int, 0444); |
90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 94 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
95 | module_param(test_boost, int, 0444); | ||
96 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | ||
97 | module_param(test_boost_interval, int, 0444); | ||
98 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
99 | module_param(test_boost_duration, int, 0444); | ||
100 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | ||
91 | module_param(torture_type, charp, 0444); | 101 | module_param(torture_type, charp, 0444); |
92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 102 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
93 | 103 | ||
@@ -109,6 +119,7 @@ static struct task_struct *stats_task; | |||
109 | static struct task_struct *shuffler_task; | 119 | static struct task_struct *shuffler_task; |
110 | static struct task_struct *stutter_task; | 120 | static struct task_struct *stutter_task; |
111 | static struct task_struct *fqs_task; | 121 | static struct task_struct *fqs_task; |
122 | static struct task_struct *boost_tasks[NR_CPUS]; | ||
112 | 123 | ||
113 | #define RCU_TORTURE_PIPE_LEN 10 | 124 | #define RCU_TORTURE_PIPE_LEN 10 |
114 | 125 | ||
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
134 | static atomic_t n_rcu_torture_free; | 145 | static atomic_t n_rcu_torture_free; |
135 | static atomic_t n_rcu_torture_mberror; | 146 | static atomic_t n_rcu_torture_mberror; |
136 | static atomic_t n_rcu_torture_error; | 147 | static atomic_t n_rcu_torture_error; |
148 | static long n_rcu_torture_boost_ktrerror; | ||
149 | static long n_rcu_torture_boost_rterror; | ||
150 | static long n_rcu_torture_boost_allocerror; | ||
151 | static long n_rcu_torture_boost_afferror; | ||
152 | static long n_rcu_torture_boost_failure; | ||
153 | static long n_rcu_torture_boosts; | ||
137 | static long n_rcu_torture_timers; | 154 | static long n_rcu_torture_timers; |
138 | static struct list_head rcu_torture_removed; | 155 | static struct list_head rcu_torture_removed; |
139 | static cpumask_var_t shuffle_tmp_mask; | 156 | static cpumask_var_t shuffle_tmp_mask; |
@@ -147,6 +164,16 @@ static int stutter_pause_test; | |||
147 | #endif | 164 | #endif |
148 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 165 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
149 | 166 | ||
167 | #ifdef CONFIG_RCU_BOOST | ||
168 | #define rcu_can_boost() 1 | ||
169 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
170 | #define rcu_can_boost() 0 | ||
171 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
172 | |||
173 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | ||
174 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | ||
175 | /* and boost task create/destroy. */ | ||
176 | |||
150 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 177 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
151 | 178 | ||
152 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ | 179 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ |
@@ -277,6 +304,7 @@ struct rcu_torture_ops { | |||
277 | void (*fqs)(void); | 304 | void (*fqs)(void); |
278 | int (*stats)(char *page); | 305 | int (*stats)(char *page); |
279 | int irq_capable; | 306 | int irq_capable; |
307 | int can_boost; | ||
280 | char *name; | 308 | char *name; |
281 | }; | 309 | }; |
282 | 310 | ||
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
366 | .fqs = rcu_force_quiescent_state, | 394 | .fqs = rcu_force_quiescent_state, |
367 | .stats = NULL, | 395 | .stats = NULL, |
368 | .irq_capable = 1, | 396 | .irq_capable = 1, |
397 | .can_boost = rcu_can_boost(), | ||
369 | .name = "rcu" | 398 | .name = "rcu" |
370 | }; | 399 | }; |
371 | 400 | ||
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
408 | .fqs = rcu_force_quiescent_state, | 437 | .fqs = rcu_force_quiescent_state, |
409 | .stats = NULL, | 438 | .stats = NULL, |
410 | .irq_capable = 1, | 439 | .irq_capable = 1, |
440 | .can_boost = rcu_can_boost(), | ||
411 | .name = "rcu_sync" | 441 | .name = "rcu_sync" |
412 | }; | 442 | }; |
413 | 443 | ||
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
424 | .fqs = rcu_force_quiescent_state, | 454 | .fqs = rcu_force_quiescent_state, |
425 | .stats = NULL, | 455 | .stats = NULL, |
426 | .irq_capable = 1, | 456 | .irq_capable = 1, |
457 | .can_boost = rcu_can_boost(), | ||
427 | .name = "rcu_expedited" | 458 | .name = "rcu_expedited" |
428 | }; | 459 | }; |
429 | 460 | ||
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
684 | }; | 715 | }; |
685 | 716 | ||
686 | /* | 717 | /* |
718 | * RCU torture priority-boost testing. Runs one real-time thread per | ||
719 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | ||
720 | * spinning waiting for them to be invoked. If a given callback takes | ||
721 | * too long to be invoked, we assume that priority inversion has occurred. | ||
722 | */ | ||
723 | |||
724 | struct rcu_boost_inflight { | ||
725 | struct rcu_head rcu; | ||
726 | int inflight; | ||
727 | }; | ||
728 | |||
729 | static void rcu_torture_boost_cb(struct rcu_head *head) | ||
730 | { | ||
731 | struct rcu_boost_inflight *rbip = | ||
732 | container_of(head, struct rcu_boost_inflight, rcu); | ||
733 | |||
734 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | ||
735 | rbip->inflight = 0; | ||
736 | } | ||
737 | |||
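rcu_torture_boost() keeps at most one callback in flight: the poster sets ->inflight behind a full barrier before calling call_rcu(), and rcu_torture_boost_cb() issues a barrier before clearing it, so neither side's accesses can leak across the handoff. Below is a userspace C11-atomics analog of the pairing; it is illustrative only, since the kernel code uses plain stores plus smp_mb():

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int inflight;

    /* Poster: publish the work only after marking it in flight. */
    static bool try_post(void (*post)(void))
    {
    	if (atomic_load_explicit(&inflight, memory_order_acquire))
    		return false;		/* previous callback still pending */
    	atomic_store_explicit(&inflight, 1, memory_order_release);
    	post();				/* call_rcu() in the original */
    	return true;
    }

    /* Callback: everything it did precedes the flag clearing. */
    static void completion_cb(void)
    {
    	atomic_store_explicit(&inflight, 0, memory_order_release);
    }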
738 | static int rcu_torture_boost(void *arg) | ||
739 | { | ||
740 | unsigned long call_rcu_time; | ||
741 | unsigned long endtime; | ||
742 | unsigned long oldstarttime; | ||
743 | struct rcu_boost_inflight rbi = { .inflight = 0 }; | ||
744 | struct sched_param sp; | ||
745 | |||
746 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | ||
747 | |||
748 | /* Set real-time priority. */ | ||
749 | sp.sched_priority = 1; | ||
750 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | ||
751 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | ||
752 | n_rcu_torture_boost_rterror++; | ||
753 | } | ||
754 | |||
755 | /* Each pass through the following loop does one boost-test cycle. */ | ||
756 | do { | ||
757 | /* Wait for the next test interval. */ | ||
758 | oldstarttime = boost_starttime; | ||
759 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | ||
760 | schedule_timeout_uninterruptible(1); | ||
761 | rcu_stutter_wait("rcu_torture_boost"); | ||
762 | if (kthread_should_stop() || | ||
763 | fullstop != FULLSTOP_DONTSTOP) | ||
764 | goto checkwait; | ||
765 | } | ||
766 | |||
767 | /* Do one boost-test interval. */ | ||
768 | endtime = oldstarttime + test_boost_duration * HZ; | ||
769 | call_rcu_time = jiffies; | ||
770 | while (jiffies - endtime > ULONG_MAX / 2) { | ||
771 | /* If we don't have a callback in flight, post one. */ | ||
772 | if (!rbi.inflight) { | ||
773 | smp_mb(); /* RCU core before ->inflight = 1. */ | ||
774 | rbi.inflight = 1; | ||
775 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); | ||
776 | if (jiffies - call_rcu_time > | ||
777 | test_boost_duration * HZ - HZ / 2) { | ||
778 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | ||
779 | n_rcu_torture_boost_failure++; | ||
780 | } | ||
781 | call_rcu_time = jiffies; | ||
782 | } | ||
783 | cond_resched(); | ||
784 | rcu_stutter_wait("rcu_torture_boost"); | ||
785 | if (kthread_should_stop() || | ||
786 | fullstop != FULLSTOP_DONTSTOP) | ||
787 | goto checkwait; | ||
788 | } | ||
789 | |||
790 | /* | ||
791 | * Set the start time of the next test interval. | ||
792 | * Yes, this is vulnerable to long delays, but such | ||
793 | * delays simply cause a false negative for the next | ||
794 | * interval. Besides, we are running at RT priority, | ||
795 | * so delays should be relatively rare. | ||
796 | */ | ||
797 | while (oldstarttime == boost_starttime) { | ||
798 | if (mutex_trylock(&boost_mutex)) { | ||
799 | boost_starttime = jiffies + | ||
800 | test_boost_interval * HZ; | ||
801 | n_rcu_torture_boosts++; | ||
802 | mutex_unlock(&boost_mutex); | ||
803 | break; | ||
804 | } | ||
805 | schedule_timeout_uninterruptible(1); | ||
806 | } | ||
807 | |||
808 | /* Go do the stutter. */ | ||
809 | checkwait: rcu_stutter_wait("rcu_torture_boost"); | ||
810 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
811 | |||
812 | /* Clean up and exit. */ | ||
813 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | ||
814 | rcutorture_shutdown_absorb("rcu_torture_boost"); | ||
815 | while (!kthread_should_stop() || rbi.inflight) | ||
816 | schedule_timeout_uninterruptible(1); | ||
817 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | ||
818 | return 0; | ||
819 | } | ||
820 | |||
821 | /* | ||
687 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 822 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
688 | * bursts of calls to force_quiescent_state(), increasing the probability | 823 | * bursts of calls to force_quiescent_state(), increasing the probability |
689 | * of occurrence of some important types of race conditions. | 824 | * of occurrence of some important types of race conditions. |
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page) | |||
933 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1068 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
934 | cnt += sprintf(&page[cnt], | 1069 | cnt += sprintf(&page[cnt], |
935 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1070 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
936 | "rtmbe: %d nt: %ld", | 1071 | "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " |
1072 | "rtbf: %ld rtb: %ld nt: %ld", | ||
937 | rcu_torture_current, | 1073 | rcu_torture_current, |
938 | rcu_torture_current_version, | 1074 | rcu_torture_current_version, |
939 | list_empty(&rcu_torture_freelist), | 1075 | list_empty(&rcu_torture_freelist), |
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page) | |||
941 | atomic_read(&n_rcu_torture_alloc_fail), | 1077 | atomic_read(&n_rcu_torture_alloc_fail), |
942 | atomic_read(&n_rcu_torture_free), | 1078 | atomic_read(&n_rcu_torture_free), |
943 | atomic_read(&n_rcu_torture_mberror), | 1079 | atomic_read(&n_rcu_torture_mberror), |
1080 | n_rcu_torture_boost_ktrerror, | ||
1081 | n_rcu_torture_boost_rterror, | ||
1082 | n_rcu_torture_boost_allocerror, | ||
1083 | n_rcu_torture_boost_afferror, | ||
1084 | n_rcu_torture_boost_failure, | ||
1085 | n_rcu_torture_boosts, | ||
944 | n_rcu_torture_timers); | 1086 | n_rcu_torture_timers); |
945 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 1087 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1088 | n_rcu_torture_boost_ktrerror != 0 || | ||
1089 | n_rcu_torture_boost_rterror != 0 || | ||
1090 | n_rcu_torture_boost_allocerror != 0 || | ||
1091 | n_rcu_torture_boost_afferror != 0 || | ||
1092 | n_rcu_torture_boost_failure != 0) | ||
946 | cnt += sprintf(&page[cnt], " !!!"); | 1093 | cnt += sprintf(&page[cnt], " !!!"); |
947 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1094 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
948 | if (i > 1) { | 1095 | if (i > 1) { |
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg) | |||
1094 | } | 1241 | } |
1095 | 1242 | ||
1096 | static inline void | 1243 | static inline void |
1097 | rcu_torture_print_module_parms(char *tag) | 1244 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
1098 | { | 1245 | { |
1099 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1246 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1100 | "--- %s: nreaders=%d nfakewriters=%d " | 1247 | "--- %s: nreaders=%d nfakewriters=%d " |
1101 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1248 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1102 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1249 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1103 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | 1250 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1251 | "test_boost=%d/%d test_boost_interval=%d " | ||
1252 | "test_boost_duration=%d\n", | ||
1104 | torture_type, tag, nrealreaders, nfakewriters, | 1253 | torture_type, tag, nrealreaders, nfakewriters, |
1105 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1254 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1106 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); | 1255 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1256 | test_boost, cur_ops->can_boost, | ||
1257 | test_boost_interval, test_boost_duration); | ||
1107 | } | 1258 | } |
1108 | 1259 | ||
1109 | static struct notifier_block rcutorture_nb = { | 1260 | static struct notifier_block rcutorture_shutdown_nb = { |
1110 | .notifier_call = rcutorture_shutdown_notify, | 1261 | .notifier_call = rcutorture_shutdown_notify, |
1111 | }; | 1262 | }; |
1112 | 1263 | ||
1264 | static void rcutorture_booster_cleanup(int cpu) | ||
1265 | { | ||
1266 | struct task_struct *t; | ||
1267 | |||
1268 | if (boost_tasks[cpu] == NULL) | ||
1269 | return; | ||
1270 | mutex_lock(&boost_mutex); | ||
1271 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | ||
1272 | t = boost_tasks[cpu]; | ||
1273 | boost_tasks[cpu] = NULL; | ||
1274 | mutex_unlock(&boost_mutex); | ||
1275 | |||
1276 | /* This must be outside of the mutex, otherwise deadlock! */ | ||
1277 | kthread_stop(t); | ||
1278 | } | ||
1279 | |||
1280 | static int rcutorture_booster_init(int cpu) | ||
1281 | { | ||
1282 | int retval; | ||
1283 | |||
1284 | if (boost_tasks[cpu] != NULL) | ||
1285 | return 0; /* Already created, nothing more to do. */ | ||
1286 | |||
1287 | /* Don't allow time recalculation while creating a new task. */ | ||
1288 | mutex_lock(&boost_mutex); | ||
1289 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | ||
1290 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | ||
1291 | "rcu_torture_boost"); | ||
1292 | if (IS_ERR(boost_tasks[cpu])) { | ||
1293 | retval = PTR_ERR(boost_tasks[cpu]); | ||
1294 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | ||
1295 | n_rcu_torture_boost_ktrerror++; | ||
1296 | boost_tasks[cpu] = NULL; | ||
1297 | mutex_unlock(&boost_mutex); | ||
1298 | return retval; | ||
1299 | } | ||
1300 | kthread_bind(boost_tasks[cpu], cpu); | ||
1301 | wake_up_process(boost_tasks[cpu]); | ||
1302 | mutex_unlock(&boost_mutex); | ||
1303 | return 0; | ||
1304 | } | ||
1305 | |||
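rcutorture_booster_init() follows the standard lifecycle for a CPU-bound kthread: create it stopped, bind it to its CPU before the first wakeup, wake it, and later tear it down with kthread_stop(), which the thread polls via kthread_should_stop(). A kernel-style sketch of that lifecycle with illustrative names:

    #include <linux/kthread.h>

    static struct task_struct *worker;

    static int worker_fn(void *arg)
    {
    	while (!kthread_should_stop())
    		schedule_timeout_interruptible(HZ);	/* per-CPU work here */
    	return 0;
    }

    static int start_worker_on(int cpu)
    {
    	worker = kthread_create(worker_fn, NULL, "worker/%d", cpu);
    	if (IS_ERR(worker))
    		return PTR_ERR(worker);
    	kthread_bind(worker, cpu);	/* before the first wakeup */
    	wake_up_process(worker);
    	return 0;
    }

    static void stop_worker(void)
    {
    	kthread_stop(worker);	/* blocks until worker_fn() returns */
    }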
1306 | static int rcutorture_cpu_notify(struct notifier_block *self, | ||
1307 | unsigned long action, void *hcpu) | ||
1308 | { | ||
1309 | long cpu = (long)hcpu; | ||
1310 | |||
1311 | switch (action) { | ||
1312 | case CPU_ONLINE: | ||
1313 | case CPU_DOWN_FAILED: | ||
1314 | (void)rcutorture_booster_init(cpu); | ||
1315 | break; | ||
1316 | case CPU_DOWN_PREPARE: | ||
1317 | rcutorture_booster_cleanup(cpu); | ||
1318 | break; | ||
1319 | default: | ||
1320 | break; | ||
1321 | } | ||
1322 | return NOTIFY_OK; | ||
1323 | } | ||
1324 | |||
1325 | static struct notifier_block rcutorture_cpu_nb = { | ||
1326 | .notifier_call = rcutorture_cpu_notify, | ||
1327 | }; | ||
1328 | |||
1113 | static void | 1329 | static void |
1114 | rcu_torture_cleanup(void) | 1330 | rcu_torture_cleanup(void) |
1115 | { | 1331 | { |
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void) | |||
1127 | } | 1343 | } |
1128 | fullstop = FULLSTOP_RMMOD; | 1344 | fullstop = FULLSTOP_RMMOD; |
1129 | mutex_unlock(&fullstop_mutex); | 1345 | mutex_unlock(&fullstop_mutex); |
1130 | unregister_reboot_notifier(&rcutorture_nb); | 1346 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1131 | if (stutter_task) { | 1347 | if (stutter_task) { |
1132 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1348 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
1133 | kthread_stop(stutter_task); | 1349 | kthread_stop(stutter_task); |
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void) | |||
1184 | kthread_stop(fqs_task); | 1400 | kthread_stop(fqs_task); |
1185 | } | 1401 | } |
1186 | fqs_task = NULL; | 1402 | fqs_task = NULL; |
1403 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
1404 | test_boost == 2) { | ||
1405 | unregister_cpu_notifier(&rcutorture_cpu_nb); | ||
1406 | for_each_possible_cpu(i) | ||
1407 | rcutorture_booster_cleanup(i); | ||
1408 | } | ||
1187 | 1409 | ||
1188 | /* Wait for all RCU callbacks to fire. */ | 1410 | /* Wait for all RCU callbacks to fire. */ |
1189 | 1411 | ||
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void) | |||
1195 | if (cur_ops->cleanup) | 1417 | if (cur_ops->cleanup) |
1196 | cur_ops->cleanup(); | 1418 | cur_ops->cleanup(); |
1197 | if (atomic_read(&n_rcu_torture_error)) | 1419 | if (atomic_read(&n_rcu_torture_error)) |
1198 | rcu_torture_print_module_parms("End of test: FAILURE"); | 1420 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1199 | else | 1421 | else |
1200 | rcu_torture_print_module_parms("End of test: SUCCESS"); | 1422 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1201 | } | 1423 | } |
1202 | 1424 | ||
1203 | static int __init | 1425 | static int __init |
@@ -1242,7 +1464,7 @@ rcu_torture_init(void) | |||
1242 | nrealreaders = nreaders; | 1464 | nrealreaders = nreaders; |
1243 | else | 1465 | else |
1244 | nrealreaders = 2 * num_online_cpus(); | 1466 | nrealreaders = 2 * num_online_cpus(); |
1245 | rcu_torture_print_module_parms("Start of test"); | 1467 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
1246 | fullstop = FULLSTOP_DONTSTOP; | 1468 | fullstop = FULLSTOP_DONTSTOP; |
1247 | 1469 | ||
1248 | /* Set up the freelist. */ | 1470 | /* Set up the freelist. */ |
@@ -1263,6 +1485,12 @@ rcu_torture_init(void) | |||
1263 | atomic_set(&n_rcu_torture_free, 0); | 1485 | atomic_set(&n_rcu_torture_free, 0); |
1264 | atomic_set(&n_rcu_torture_mberror, 0); | 1486 | atomic_set(&n_rcu_torture_mberror, 0); |
1265 | atomic_set(&n_rcu_torture_error, 0); | 1487 | atomic_set(&n_rcu_torture_error, 0); |
1488 | n_rcu_torture_boost_ktrerror = 0; | ||
1489 | n_rcu_torture_boost_rterror = 0; | ||
1490 | n_rcu_torture_boost_allocerror = 0; | ||
1491 | n_rcu_torture_boost_afferror = 0; | ||
1492 | n_rcu_torture_boost_failure = 0; | ||
1493 | n_rcu_torture_boosts = 0; | ||
1266 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1494 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1267 | atomic_set(&rcu_torture_wcount[i], 0); | 1495 | atomic_set(&rcu_torture_wcount[i], 0); |
1268 | for_each_possible_cpu(cpu) { | 1496 | for_each_possible_cpu(cpu) { |
@@ -1376,7 +1604,27 @@ rcu_torture_init(void) | |||
1376 | goto unwind; | 1604 | goto unwind; |
1377 | } | 1605 | } |
1378 | } | 1606 | } |
1379 | register_reboot_notifier(&rcutorture_nb); | 1607 | if (test_boost_interval < 1) |
1608 | test_boost_interval = 1; | ||
1609 | if (test_boost_duration < 2) | ||
1610 | test_boost_duration = 2; | ||
1611 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
1612 | test_boost == 2) { | ||
1613 | int retval; | ||
1614 | |||
1615 | boost_starttime = jiffies + test_boost_interval * HZ; | ||
1616 | register_cpu_notifier(&rcutorture_cpu_nb); | ||
1617 | for_each_possible_cpu(i) { | ||
1618 | if (cpu_is_offline(i)) | ||
1619 | continue; /* Heuristic: CPU can go offline. */ | ||
1620 | retval = rcutorture_booster_init(i); | ||
1621 | if (retval < 0) { | ||
1622 | firsterr = retval; | ||
1623 | goto unwind; | ||
1624 | } | ||
1625 | } | ||
1626 | } | ||
1627 | register_reboot_notifier(&rcutorture_shutdown_nb); | ||
1380 | mutex_unlock(&fullstop_mutex); | 1628 | mutex_unlock(&fullstop_mutex); |
1381 | return 0; | 1629 | return 0; |
1382 | 1630 | ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c47981..dd4aea806f8e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
67 | .gpnum = -300, \ | 67 | .gpnum = -300, \ |
68 | .completed = -300, \ | 68 | .completed = -300, \ |
69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ | 69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ |
70 | .orphan_cbs_list = NULL, \ | ||
71 | .orphan_cbs_tail = &structname.orphan_cbs_list, \ | ||
72 | .orphan_qlen = 0, \ | ||
73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ | 70 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ |
74 | .n_force_qs = 0, \ | 71 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 72 | .n_force_qs_ngp = 0, \ |
@@ -367,8 +364,8 @@ void rcu_irq_exit(void) | |||
367 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | 364 | WARN_ON_ONCE(rdtp->dynticks & 0x1); |
368 | 365 | ||
369 | /* If the interrupt queued a callback, get out of dyntick mode. */ | 366 | /* If the interrupt queued a callback, get out of dyntick mode. */ |
370 | if (__get_cpu_var(rcu_sched_data).nxtlist || | 367 | if (__this_cpu_read(rcu_sched_data.nxtlist) || |
371 | __get_cpu_var(rcu_bh_data).nxtlist) | 368 | __this_cpu_read(rcu_bh_data.nxtlist)) |
372 | set_need_resched(); | 369 | set_need_resched(); |
373 | } | 370 | } |
374 | 371 | ||
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void) | |||
620 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 617 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
621 | { | 618 | { |
622 | if (rdp->gpnum != rnp->gpnum) { | 619 | if (rdp->gpnum != rnp->gpnum) { |
623 | rdp->qs_pending = 1; | 620 | /* |
624 | rdp->passed_quiesc = 0; | 621 | * If the current grace period is waiting for this CPU, |
622 | * set up to detect a quiescent state, otherwise don't | ||
623 | * go looking for one. | ||
624 | */ | ||
625 | rdp->gpnum = rnp->gpnum; | 625 | rdp->gpnum = rnp->gpnum; |
626 | if (rnp->qsmask & rdp->grpmask) { | ||
627 | rdp->qs_pending = 1; | ||
628 | rdp->passed_quiesc = 0; | ||
629 | } else | ||
630 | rdp->qs_pending = 0; | ||
626 | } | 631 | } |
627 | } | 632 | } |
628 | 633 | ||
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
681 | 686 | ||
682 | /* Remember that we saw this grace-period completion. */ | 687 | /* Remember that we saw this grace-period completion. */ |
683 | rdp->completed = rnp->completed; | 688 | rdp->completed = rnp->completed; |
689 | |||
690 | /* | ||
691 | * If we were in an extended quiescent state, we may have | ||
692 | * missed some grace periods that other CPUs handled on | ||
693 | * our behalf. Catch up with this state to avoid noting | ||
694 | * spurious new grace periods. If another grace period | ||
695 | * has started, then rnp->gpnum will have advanced, so | ||
696 | * we will detect this later on. | ||
697 | */ | ||
698 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | ||
699 | rdp->gpnum = rdp->completed; | ||
700 | |||
701 | /* | ||
702 | * If RCU does not need a quiescent state from this CPU, | ||
703 | * then make sure that this CPU doesn't go looking for one. | ||
704 | */ | ||
705 | if ((rnp->qsmask & rdp->grpmask) == 0) | ||
706 | rdp->qs_pending = 0; | ||
684 | } | 707 | } |
685 | } | 708 | } |
686 | 709 | ||
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
984 | #ifdef CONFIG_HOTPLUG_CPU | 1007 | #ifdef CONFIG_HOTPLUG_CPU |
985 | 1008 | ||
986 | /* | 1009 | /* |
987 | * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the | 1010 | * Move a dying CPU's RCU callbacks to an online CPU's callback list. |
988 | * specified flavor of RCU. The callbacks will be adopted by the next | 1011 | * Synchronization is not required because this function executes |
989 | * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever | 1012 | * in stop_machine() context. |
990 | * comes first. Because this is invoked from the CPU_DYING notifier, | ||
991 | * irqs are already disabled. | ||
992 | */ | 1013 | */ |
993 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1014 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
994 | { | 1015 | { |
995 | int i; | 1016 | int i; |
1017 | /* current DYING CPU is cleared in the cpu_online_mask */ | ||
1018 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
996 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1019 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1020 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
997 | 1021 | ||
998 | if (rdp->nxtlist == NULL) | 1022 | if (rdp->nxtlist == NULL) |
999 | return; /* irqs disabled, so comparison is stable. */ | 1023 | return; /* irqs disabled, so comparison is stable. */ |
1000 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1024 | |
1001 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 1025 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; |
1002 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 1026 | receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; |
1027 | receive_rdp->qlen += rdp->qlen; | ||
1028 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1029 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1030 | |||
1003 | rdp->nxtlist = NULL; | 1031 | rdp->nxtlist = NULL; |
1004 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1032 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1005 | rdp->nxttail[i] = &rdp->nxtlist; | 1033 | rdp->nxttail[i] = &rdp->nxtlist; |
1006 | rsp->orphan_qlen += rdp->qlen; | ||
1007 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1008 | rdp->qlen = 0; | 1034 | rdp->qlen = 0; |
1009 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | ||
1010 | } | ||
1011 | |||
1012 | /* | ||
1013 | * Adopt previously orphaned RCU callbacks. | ||
1014 | */ | ||
1015 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1016 | { | ||
1017 | unsigned long flags; | ||
1018 | struct rcu_data *rdp; | ||
1019 | |||
1020 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
1021 | rdp = this_cpu_ptr(rsp->rda); | ||
1022 | if (rsp->orphan_cbs_list == NULL) { | ||
1023 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
1024 | return; | ||
1025 | } | ||
1026 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | ||
1027 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | ||
1028 | rdp->qlen += rsp->orphan_qlen; | ||
1029 | rdp->n_cbs_adopted += rsp->orphan_qlen; | ||
1030 | rsp->orphan_cbs_list = NULL; | ||
1031 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | ||
1032 | rsp->orphan_qlen = 0; | ||
1033 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
1034 | } | 1035 | } |
1035 | 1036 | ||
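rcu_send_cbs_to_online() splices the dying CPU's singly linked callback list onto the receiver in O(1) by exploiting the tail pointer, which addresses either the list head or the last node's ->next field: store the donor's head through the receiver's tail, then adopt the donor's tail. A self-contained sketch of the idiom:

    #include <stddef.h>

    struct cb { struct cb *next; };

    struct cblist {
    	struct cb *head;
    	struct cb **tail;	/* &head when empty, else &last->next */
    };

    /* Append everything on 'from' to 'to' in O(1), emptying 'from'. */
    static void splice(struct cblist *to, struct cblist *from)
    {
    	if (from->head == NULL)
    		return;
    	*to->tail = from->head;	/* link donor after receiver's tail */
    	to->tail = from->tail;	/* adopt donor's tail pointer */
    	from->head = NULL;
    	from->tail = &from->head;
    }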
1036 | /* | 1037 | /* |
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1082 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1083 | rcu_report_exp_rnp(rsp, rnp); | 1084 | rcu_report_exp_rnp(rsp, rnp); |
1084 | |||
1085 | rcu_adopt_orphan_cbs(rsp); | ||
1086 | } | 1085 | } |
1087 | 1086 | ||
1088 | /* | 1087 | /* |
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu) | |||
1100 | 1099 | ||
1101 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1100 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1102 | 1101 | ||
1103 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1102 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
1104 | { | ||
1105 | } | ||
1106 | |||
1107 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1108 | { | 1103 | { |
1109 | } | 1104 | } |
1110 | 1105 | ||
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1440 | */ | 1435 | */ |
1441 | local_irq_save(flags); | 1436 | local_irq_save(flags); |
1442 | rdp = this_cpu_ptr(rsp->rda); | 1437 | rdp = this_cpu_ptr(rsp->rda); |
1443 | rcu_process_gp_end(rsp, rdp); | ||
1444 | check_for_new_grace_period(rsp, rdp); | ||
1445 | 1438 | ||
1446 | /* Add the callback to our list. */ | 1439 | /* Add the callback to our list. */ |
1447 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1448 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1449 | 1442 | ||
1450 | /* Start a new grace period if one not already started. */ | ||
1451 | if (!rcu_gp_in_progress(rsp)) { | ||
1452 | unsigned long nestflag; | ||
1453 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1454 | |||
1455 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1456 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | ||
1457 | } | ||
1458 | |||
1459 | /* | 1443 | /* |
1460 | * Force the grace period if too many callbacks or too long waiting. | 1444 | * Force the grace period if too many callbacks or too long waiting. |
1461 | * Enforce hysteresis, and don't invoke force_quiescent_state() | 1445 | * Enforce hysteresis, and don't invoke force_quiescent_state() |
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1464 | * is the only one waiting for a grace period to complete. | 1448 | * is the only one waiting for a grace period to complete. |
1465 | */ | 1449 | */ |
1466 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
1467 | rdp->blimit = LONG_MAX; | 1451 | |
1468 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 1452 | /* Are we ignoring a completed grace period? */ |
1469 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 1453 | rcu_process_gp_end(rsp, rdp); |
1470 | force_quiescent_state(rsp, 0); | 1454 | check_for_new_grace_period(rsp, rdp); |
1471 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1455 | |
1472 | rdp->qlen_last_fqs_check = rdp->qlen; | 1456 | /* Start a new grace period if one not already started. */ |
1457 | if (!rcu_gp_in_progress(rsp)) { | ||
1458 | unsigned long nestflag; | ||
1459 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1460 | |||
1461 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1462 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */ | ||
1463 | } else { | ||
1464 | /* Give the grace period a kick. */ | ||
1465 | rdp->blimit = LONG_MAX; | ||
1466 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
1467 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
1468 | force_quiescent_state(rsp, 0); | ||
1469 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
1470 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
1471 | } | ||
1473 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | 1472 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1474 | force_quiescent_state(rsp, 1); | 1473 | force_quiescent_state(rsp, 1); |
1475 | local_irq_restore(flags); | 1474 | local_irq_restore(flags); |
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
1699 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 1698 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU |
1700 | * might complete its grace period before all of the other CPUs | 1699 | * might complete its grace period before all of the other CPUs |
1701 | * did their increment, causing this function to return too | 1700 | * did their increment, causing this function to return too |
1702 | * early. | 1701 | * early. Note that on_each_cpu() disables irqs, which prevents |
1702 | * any CPUs from coming online or going offline until each online | ||
1703 | * CPU has queued its RCU-barrier callback. | ||
1703 | */ | 1704 | */ |
1704 | atomic_set(&rcu_barrier_cpu_count, 1); | 1705 | atomic_set(&rcu_barrier_cpu_count, 1); |
1705 | preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ | ||
1706 | rcu_adopt_orphan_cbs(rsp); | ||
1707 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 1706 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); |
1708 | preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ | ||
1709 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 1707 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
1710 | complete(&rcu_barrier_completion); | 1708 | complete(&rcu_barrier_completion); |
1711 | wait_for_completion(&rcu_barrier_completion); | 1709 | wait_for_completion(&rcu_barrier_completion); |
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1831 | case CPU_DYING: | 1829 | case CPU_DYING: |
1832 | case CPU_DYING_FROZEN: | 1830 | case CPU_DYING_FROZEN: |
1833 | /* | 1831 | /* |
1834 | * preempt_disable() in _rcu_barrier() prevents stop_machine(), | 1832 | * The whole machine is "stopped" except this CPU, so we can |
1835 | * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" | 1833 | * touch any data without introducing corruption. We send the |
1836 | * returns, all online cpus have queued rcu_barrier_func(). | 1834 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
1837 | * The dying CPU clears its cpu_online_mask bit and | ||
1838 | * moves all of its RCU callbacks to ->orphan_cbs_list | ||
1839 | * in the context of stop_machine(), so subsequent calls | ||
1840 | * to _rcu_barrier() will adopt these callbacks and only | ||
1841 | * then queue rcu_barrier_func() on all remaining CPUs. | ||
1842 | */ | 1835 | */ |
1843 | rcu_send_cbs_to_orphanage(&rcu_bh_state); | 1836 | rcu_send_cbs_to_online(&rcu_bh_state); |
1844 | rcu_send_cbs_to_orphanage(&rcu_sched_state); | 1837 | rcu_send_cbs_to_online(&rcu_sched_state); |
1845 | rcu_preempt_send_cbs_to_orphanage(); | 1838 | rcu_preempt_send_cbs_to_online(); |
1846 | break; | 1839 | break; |
1847 | case CPU_DEAD: | 1840 | case CPU_DEAD: |
1848 | case CPU_DEAD_FROZEN: | 1841 | case CPU_DEAD_FROZEN: |
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1880 | { | 1873 | { |
1881 | int i; | 1874 | int i; |
1882 | 1875 | ||
1883 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) | 1876 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
1884 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 1877 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
1878 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | ||
1885 | } | 1879 | } |
1886 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 1880 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
1887 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 1881 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 91d4170c5c13..e8f057e44e3e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -31,46 +31,51 @@ | |||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. |
33 | * In theory, it should be possible to add more levels straightforwardly. | 33 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this has not been tested, so there is probably some | 34 | * In practice, this did work well going from three levels to four. |
35 | * bug somewhere. | 35 | * Of course, your mileage may vary. |
36 | */ | 36 | */ |
37 | #define MAX_RCU_LVLS 4 | 37 | #define MAX_RCU_LVLS 4 |
38 | #define RCU_FANOUT (CONFIG_RCU_FANOUT) | 38 | #if CONFIG_RCU_FANOUT > 16 |
39 | #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) | 39 | #define RCU_FANOUT_LEAF 16 |
40 | #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) | 40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ |
41 | #define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) | 41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) |
42 | 42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | |
43 | #if NR_CPUS <= RCU_FANOUT | 43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) |
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | ||
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | ||
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | ||
47 | |||
48 | #if NR_CPUS <= RCU_FANOUT_1 | ||
44 | # define NUM_RCU_LVLS 1 | 49 | # define NUM_RCU_LVLS 1 |
45 | # define NUM_RCU_LVL_0 1 | 50 | # define NUM_RCU_LVL_0 1 |
46 | # define NUM_RCU_LVL_1 (NR_CPUS) | 51 | # define NUM_RCU_LVL_1 (NR_CPUS) |
47 | # define NUM_RCU_LVL_2 0 | 52 | # define NUM_RCU_LVL_2 0 |
48 | # define NUM_RCU_LVL_3 0 | 53 | # define NUM_RCU_LVL_3 0 |
49 | # define NUM_RCU_LVL_4 0 | 54 | # define NUM_RCU_LVL_4 0 |
50 | #elif NR_CPUS <= RCU_FANOUT_SQ | 55 | #elif NR_CPUS <= RCU_FANOUT_2 |
51 | # define NUM_RCU_LVLS 2 | 56 | # define NUM_RCU_LVLS 2 |
52 | # define NUM_RCU_LVL_0 1 | 57 | # define NUM_RCU_LVL_0 1 |
53 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 58 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
54 | # define NUM_RCU_LVL_2 (NR_CPUS) | 59 | # define NUM_RCU_LVL_2 (NR_CPUS) |
55 | # define NUM_RCU_LVL_3 0 | 60 | # define NUM_RCU_LVL_3 0 |
56 | # define NUM_RCU_LVL_4 0 | 61 | # define NUM_RCU_LVL_4 0 |
57 | #elif NR_CPUS <= RCU_FANOUT_CUBE | 62 | #elif NR_CPUS <= RCU_FANOUT_3 |
58 | # define NUM_RCU_LVLS 3 | 63 | # define NUM_RCU_LVLS 3 |
59 | # define NUM_RCU_LVL_0 1 | 64 | # define NUM_RCU_LVL_0 1 |
60 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 65 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
61 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 66 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
62 | # define NUM_RCU_LVL_3 NR_CPUS | 67 | # define NUM_RCU_LVL_3 (NR_CPUS) |
63 | # define NUM_RCU_LVL_4 0 | 68 | # define NUM_RCU_LVL_4 0 |
64 | #elif NR_CPUS <= RCU_FANOUT_FOURTH | 69 | #elif NR_CPUS <= RCU_FANOUT_4 |
65 | # define NUM_RCU_LVLS 4 | 70 | # define NUM_RCU_LVLS 4 |
66 | # define NUM_RCU_LVL_0 1 | 71 | # define NUM_RCU_LVL_0 1 |
67 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) | 72 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
68 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 73 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
69 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 74 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
70 | # define NUM_RCU_LVL_4 NR_CPUS | 75 | # define NUM_RCU_LVL_4 (NR_CPUS) |
71 | #else | 76 | #else |
72 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | 77 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
73 | #endif /* #if (NR_CPUS) <= RCU_FANOUT */ | 78 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ |
74 | 79 | ||
75 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | 80 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
76 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | 81 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
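
To make the geometry these macros produce concrete, here is a stand-alone sketch that replays the arithmetic for one assumed configuration; CONFIG_RCU_FANOUT = 64 and NR_CPUS = 4096 are illustrative values, not defaults:

        #include <stdio.h>

        #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

        #define CONFIG_RCU_FANOUT 64    /* assumed for illustration */
        #define NR_CPUS 4096            /* assumed for illustration */

        #if CONFIG_RCU_FANOUT > 16
        #define RCU_FANOUT_LEAF 16
        #else
        #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
        #endif
        #define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
        #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) /* 1024 */
        #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) /* 65536 */

        int main(void)
        {
                /* 4096 > RCU_FANOUT_2 but <= RCU_FANOUT_3, so NUM_RCU_LVLS = 3. */
                int lvl1 = DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2); /* 4 inner nodes */
                int lvl2 = DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1); /* 256 leaf nodes */

                printf("rcu_node count: %d\n", 1 + lvl1 + lvl2); /* 261 */
                return 0;
        }

Under the old RCU_FANOUT_SQ macros the same configuration fit in two levels with 64 CPUs per leaf; capping the leaf fanout at 16 trades a few more rcu_node structures for a quarter of the worst-case contention on each leaf node's lock.
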
@@ -203,8 +208,8 @@ struct rcu_data { | |||
203 | long qlen_last_fqs_check; | 208 | long qlen_last_fqs_check; |
204 | /* qlen at last check for QS forcing */ | 209 | /* qlen at last check for QS forcing */ |
205 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 210 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
206 | unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | 211 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ |
207 | unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | 212 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ |
208 | unsigned long n_force_qs_snap; | 213 | unsigned long n_force_qs_snap; |
209 | /* did other CPU force QS recently? */ | 214 | /* did other CPU force QS recently? */ |
210 | long blimit; /* Upper limit on a processed batch */ | 215 | long blimit; /* Upper limit on a processed batch */ |
@@ -309,15 +314,7 @@ struct rcu_state { | |||
309 | /* End of fields guarded by root rcu_node's lock. */ | 314 | /* End of fields guarded by root rcu_node's lock. */ |
310 | 315 | ||
311 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 316 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
312 | /* starting new GP. Also */ | 317 | /* starting new GP. */ |
313 | /* protects the following */ | ||
314 | /* orphan_cbs fields. */ | ||
315 | struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ | ||
316 | /* orphaned by all CPUs in */ | ||
317 | /* a given leaf rcu_node */ | ||
318 | /* going offline. */ | ||
319 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | ||
320 | long orphan_qlen; /* Number of orphaned cbs. */ | ||
321 | raw_spinlock_t fqslock; /* Only one task forcing */ | 318 | raw_spinlock_t fqslock; /* Only one task forcing */ |
322 | /* quiescent states. */ | 319 | /* quiescent states. */ |
323 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 320 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | |||
390 | static int rcu_preempt_pending(int cpu); | 387 | static int rcu_preempt_pending(int cpu); |
391 | static int rcu_preempt_needs_cpu(int cpu); | 388 | static int rcu_preempt_needs_cpu(int cpu); |
392 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 389 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
393 | static void rcu_preempt_send_cbs_to_orphanage(void); | 390 | static void rcu_preempt_send_cbs_to_online(void); |
394 | static void __init __rcu_init_preempt(void); | 391 | static void __init __rcu_init_preempt(void); |
395 | static void rcu_needs_cpu_flush(void); | 392 | static void rcu_needs_cpu_flush(void); |
396 | 393 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71a4147473f9..a3638710dc67 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -25,6 +25,7 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/stop_machine.h> | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * Check the RCU kernel configuration parameters and print informative | 31 | * Check the RCU kernel configuration parameters and print informative |
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
773 | } | 774 | } |
774 | 775 | ||
775 | /* | 776 | /* |
776 | * Move preemptable RCU's callbacks to ->orphan_cbs_list. | 777 | * Move preemptable RCU's callbacks from the dying CPU to another online CPU. |
777 | */ | 778 | */ |
778 | static void rcu_preempt_send_cbs_to_orphanage(void) | 779 | static void rcu_preempt_send_cbs_to_online(void) |
779 | { | 780 | { |
780 | rcu_send_cbs_to_orphanage(&rcu_preempt_state); | 781 | rcu_send_cbs_to_online(&rcu_preempt_state); |
781 | } | 782 | } |
782 | 783 | ||
783 | /* | 784 | /* |
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
1001 | /* | 1002 | /* |
1002 | * Because there is no preemptable RCU, there are no callbacks to move. | 1003 | * Because there is no preemptable RCU, there are no callbacks to move. |
1003 | */ | 1004 | */ |
1004 | static void rcu_preempt_send_cbs_to_orphanage(void) | 1005 | static void rcu_preempt_send_cbs_to_online(void) |
1005 | { | 1006 | { |
1006 | } | 1007 | } |
1007 | 1008 | ||
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void) | |||
1014 | 1015 | ||
1015 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1016 | 1017 | ||
1018 | #ifndef CONFIG_SMP | ||
1019 | |||
1020 | void synchronize_sched_expedited(void) | ||
1021 | { | ||
1022 | cond_resched(); | ||
1023 | } | ||
1024 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1025 | |||
1026 | #else /* #ifndef CONFIG_SMP */ | ||
1027 | |||
1028 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
1029 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
1030 | |||
1031 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
1032 | { | ||
1033 | /* | ||
1034 | * There must be a full memory barrier on each affected CPU | ||
1035 | * between the time that try_stop_cpus() is called and the | ||
1036 | * time that it returns. | ||
1037 | * | ||
1038 | * In the current initial implementation of cpu_stop, the | ||
1039 | * above condition is already met when the control reaches | ||
1040 | * this point and the following smp_mb() is not strictly | ||
1041 | * necessary. Do smp_mb() anyway for documentation and | ||
1042 | * robustness against future implementation changes. | ||
1043 | */ | ||
1044 | smp_mb(); /* See above comment block. */ | ||
1045 | return 0; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Wait for an rcu-sched grace period to elapse, but use a "big hammer" | ||
1050 | * approach to force the grace period to end quickly. This consumes | ||
1051 | * significant time on all CPUs, and is thus not recommended for | ||
1052 | * any sort of common-case code. | ||
1053 | * | ||
1054 | * Note that it is illegal to call this function while holding any | ||
1055 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
1056 | * observe this restriction will result in deadlock. | ||
1057 | * | ||
1058 | * This implementation can be thought of as an application of ticket | ||
1059 | * locking to RCU, with sync_sched_expedited_started and | ||
1060 | * sync_sched_expedited_done taking on the roles of the halves | ||
1061 | * of the ticket-lock word. Each task atomically increments | ||
1062 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
1063 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
1064 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
1065 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
1066 | * update sync_sched_expedited_done to match our snapshot -- but | ||
1067 | * only if someone else has not already advanced past our snapshot. | ||
1068 | * | ||
1069 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
1070 | * of sync_sched_expedited_done. If it has advanced past our | ||
1071 | * initial snapshot, then someone else must have forced a grace period | ||
1072 | * some time after we took our snapshot. In this case, our work is | ||
1073 | * done for us, and we can simply return. Otherwise, we try again, | ||
1074 | * but keep our initial snapshot for purposes of checking for someone | ||
1075 | * doing our work for us. | ||
1076 | * | ||
1077 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
1078 | */ | ||
1079 | void synchronize_sched_expedited(void) | ||
1080 | { | ||
1081 | int firstsnap, s, snap, trycount = 0; | ||
1082 | |||
1083 | /* Note that atomic_inc_return() implies full memory barrier. */ | ||
1084 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | ||
1085 | get_online_cpus(); | ||
1086 | |||
1087 | /* | ||
1088 | * Each pass through the following loop attempts to force a | ||
1089 | * context switch on each CPU. | ||
1090 | */ | ||
1091 | while (try_stop_cpus(cpu_online_mask, | ||
1092 | synchronize_sched_expedited_cpu_stop, | ||
1093 | NULL) == -EAGAIN) { | ||
1094 | put_online_cpus(); | ||
1095 | |||
1096 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
1097 | if (trycount++ < 10) | ||
1098 | udelay(trycount * num_online_cpus()); | ||
1099 | else { | ||
1100 | synchronize_sched(); | ||
1101 | return; | ||
1102 | } | ||
1103 | |||
1104 | /* Check to see if someone else did our work for us. */ | ||
1105 | s = atomic_read(&sync_sched_expedited_done); | ||
1106 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | ||
1107 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1108 | return; | ||
1109 | } | ||
1110 | |||
1111 | /* | ||
1112 | * Refetching sync_sched_expedited_started allows later | ||
1113 | * callers to piggyback on our grace period. We subtract | ||
1114 | * 1 to get the same token that the last incrementer got. | ||
1115 | * We retry after they started, so our grace period works | ||
1116 | * for them, and they started after our first try, so their | ||
1117 | * grace period works for us. | ||
1118 | */ | ||
1119 | get_online_cpus(); | ||
1120 | snap = atomic_read(&sync_sched_expedited_started) - 1; | ||
1121 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
1122 | } | ||
1123 | |||
1124 | /* | ||
1125 | * Everyone up to our most recent fetch is covered by our grace | ||
1126 | * period. Update the counter, but only if our work is still | ||
1127 | * relevant -- which it won't be if someone who started later | ||
1128 | * than we did beat us to the punch. | ||
1129 | */ | ||
1130 | do { | ||
1131 | s = atomic_read(&sync_sched_expedited_done); | ||
1132 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | ||
1133 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1134 | break; | ||
1135 | } | ||
1136 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | ||
1137 | |||
1138 | put_online_cpus(); | ||
1139 | } | ||
1140 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1141 | |||
1142 | #endif /* #else #ifndef CONFIG_SMP */ | ||
1143 | |||
1017 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | 1144 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
1018 | 1145 | ||
1019 | /* | 1146 | /* |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d15430b9d122..c8e97853b970 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
166 | 166 | ||
167 | gpnum = rsp->gpnum; | 167 | gpnum = rsp->gpnum; |
168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
169 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 169 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
170 | rsp->completed, gpnum, rsp->signaled, | 170 | rsp->completed, gpnum, rsp->signaled, |
171 | (long)(rsp->jiffies_force_qs - jiffies), | 171 | (long)(rsp->jiffies_force_qs - jiffies), |
172 | (int)(jiffies & 0xffff), | 172 | (int)(jiffies & 0xffff), |
173 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 173 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
174 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 174 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
175 | rsp->n_force_qs_lh, rsp->orphan_qlen); | 175 | rsp->n_force_qs_lh); |
176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
177 | if (rnp->level != level) { | 177 | if (rnp->level != level) { |
178 | seq_puts(m, "\n"); | 178 | seq_puts(m, "\n"); |
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = { | |||
300 | 300 | ||
301 | static struct dentry *rcudir; | 301 | static struct dentry *rcudir; |
302 | 302 | ||
303 | static int __init rcuclassic_trace_init(void) | 303 | static int __init rcutree_trace_init(void) |
304 | { | 304 | { |
305 | struct dentry *retval; | 305 | struct dentry *retval; |
306 | 306 | ||
@@ -337,14 +337,14 @@ free_out: | |||
337 | return 1; | 337 | return 1; |
338 | } | 338 | } |
339 | 339 | ||
340 | static void __exit rcuclassic_trace_cleanup(void) | 340 | static void __exit rcutree_trace_cleanup(void) |
341 | { | 341 | { |
342 | debugfs_remove_recursive(rcudir); | 342 | debugfs_remove_recursive(rcudir); |
343 | } | 343 | } |
344 | 344 | ||
345 | 345 | ||
346 | module_init(rcuclassic_trace_init); | 346 | module_init(rcutree_trace_init); |
347 | module_exit(rcuclassic_trace_cleanup); | 347 | module_exit(rcutree_trace_cleanup); |
348 | 348 | ||
349 | MODULE_AUTHOR("Paul E. McKenney"); | 349 | MODULE_AUTHOR("Paul E. McKenney"); |
350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); | 350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); |
diff --git a/kernel/sched.c b/kernel/sched.c index 297d1a0eedb0..18d38e4ec7ba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -75,9 +75,11 @@ | |||
75 | 75 | ||
76 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
78 | #include <asm/mutex.h> | ||
78 | 79 | ||
79 | #include "sched_cpupri.h" | 80 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 81 | #include "workqueue_sched.h" |
82 | #include "sched_autogroup.h" | ||
81 | 83 | ||
82 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
83 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
@@ -253,6 +255,8 @@ struct task_group { | |||
253 | /* runqueue "owned" by this group on each cpu */ | 255 | /* runqueue "owned" by this group on each cpu */ |
254 | struct cfs_rq **cfs_rq; | 256 | struct cfs_rq **cfs_rq; |
255 | unsigned long shares; | 257 | unsigned long shares; |
258 | |||
259 | atomic_t load_weight; | ||
256 | #endif | 260 | #endif |
257 | 261 | ||
258 | #ifdef CONFIG_RT_GROUP_SCHED | 262 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -268,25 +272,18 @@ struct task_group { | |||
268 | struct task_group *parent; | 272 | struct task_group *parent; |
269 | struct list_head siblings; | 273 | struct list_head siblings; |
270 | struct list_head children; | 274 | struct list_head children; |
271 | }; | ||
272 | 275 | ||
273 | #define root_task_group init_task_group | 276 | #ifdef CONFIG_SCHED_AUTOGROUP |
277 | struct autogroup *autogroup; | ||
278 | #endif | ||
279 | }; | ||
274 | 280 | ||
275 | /* task_group_lock serializes add/remove of task groups and also changes to | 281 | /* task_group_lock serializes the addition/removal of task groups */ |
276 | * a task group's cpu shares. | ||
277 | */ | ||
278 | static DEFINE_SPINLOCK(task_group_lock); | 282 | static DEFINE_SPINLOCK(task_group_lock); |
279 | 283 | ||
280 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
281 | 285 | ||
282 | #ifdef CONFIG_SMP | 286 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
283 | static int root_task_group_empty(void) | ||
284 | { | ||
285 | return list_empty(&root_task_group.children); | ||
286 | } | ||
287 | #endif | ||
288 | |||
289 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
290 | 287 | ||
291 | /* | 288 | /* |
292 | * A weight of 0 or 1 can cause arithmetic problems. | 289 | * A weight of 0 or 1 can cause arithmetic problems. |
@@ -299,13 +296,13 @@ static int root_task_group_empty(void) | |||
299 | #define MIN_SHARES 2 | 296 | #define MIN_SHARES 2 |
300 | #define MAX_SHARES (1UL << 18) | 297 | #define MAX_SHARES (1UL << 18) |
301 | 298 | ||
302 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 299 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
303 | #endif | 300 | #endif |
304 | 301 | ||
305 | /* Default task group. | 302 | /* Default task group. |
306 | * Every task in the system belongs to this group at bootup. | 303 | * Every task in the system belongs to this group at bootup. |
307 | */ | 304 | */ |
308 | struct task_group init_task_group; | 305 | struct task_group root_task_group; |
309 | 306 | ||
310 | #endif /* CONFIG_CGROUP_SCHED */ | 307 | #endif /* CONFIG_CGROUP_SCHED */ |
311 | 308 | ||
@@ -342,6 +339,7 @@ struct cfs_rq { | |||
342 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 339 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
343 | * list is used during load balance. | 340 | * list is used during load balance. |
344 | */ | 341 | */ |
342 | int on_list; | ||
345 | struct list_head leaf_cfs_rq_list; | 343 | struct list_head leaf_cfs_rq_list; |
346 | struct task_group *tg; /* group that "owns" this runqueue */ | 344 | struct task_group *tg; /* group that "owns" this runqueue */ |
347 | 345 | ||
@@ -360,14 +358,17 @@ struct cfs_rq { | |||
360 | unsigned long h_load; | 358 | unsigned long h_load; |
361 | 359 | ||
362 | /* | 360 | /* |
363 | * this cpu's part of tg->shares | 361 | * Maintaining per-cpu shares distribution for group scheduling |
362 | * | ||
363 | * load_stamp is the last time we updated the load average | ||
364 | * load_last is the last time we updated the load average and saw load | ||
365 | * load_unacc_exec_time is currently unaccounted execution time | ||
364 | */ | 366 | */ |
365 | unsigned long shares; | 367 | u64 load_avg; |
368 | u64 load_period; | ||
369 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
366 | 370 | ||
367 | /* | 371 | unsigned long load_contribution; |
368 | * load.weight at the time we set shares | ||
369 | */ | ||
370 | unsigned long rq_weight; | ||
371 | #endif | 372 | #endif |
372 | #endif | 373 | #endif |
373 | }; | 374 | }; |
@@ -552,9 +553,6 @@ struct rq { | |||
552 | /* try_to_wake_up() stats */ | 553 | /* try_to_wake_up() stats */ |
553 | unsigned int ttwu_count; | 554 | unsigned int ttwu_count; |
554 | unsigned int ttwu_local; | 555 | unsigned int ttwu_local; |
555 | |||
556 | /* BKL stats */ | ||
557 | unsigned int bkl_count; | ||
558 | #endif | 556 | #endif |
559 | }; | 557 | }; |
560 | 558 | ||
@@ -605,11 +603,17 @@ static inline int cpu_of(struct rq *rq) | |||
605 | */ | 603 | */ |
606 | static inline struct task_group *task_group(struct task_struct *p) | 604 | static inline struct task_group *task_group(struct task_struct *p) |
607 | { | 605 | { |
606 | struct task_group *tg; | ||
608 | struct cgroup_subsys_state *css; | 607 | struct cgroup_subsys_state *css; |
609 | 608 | ||
609 | if (p->flags & PF_EXITING) | ||
610 | return &root_task_group; | ||
611 | |||
610 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
611 | lockdep_is_held(&task_rq(p)->lock)); | 613 | lockdep_is_held(&task_rq(p)->lock)); |
612 | return container_of(css, struct task_group, css); | 614 | tg = container_of(css, struct task_group, css); |
615 | |||
616 | return autogroup_task_group(p, tg); | ||
613 | } | 617 | } |
614 | 618 | ||
615 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 619 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -737,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
737 | buf[cnt] = 0; | 741 | buf[cnt] = 0; |
738 | cmp = strstrip(buf); | 742 | cmp = strstrip(buf); |
739 | 743 | ||
740 | if (strncmp(buf, "NO_", 3) == 0) { | 744 | if (strncmp(cmp, "NO_", 3) == 0) { |
741 | neg = 1; | 745 | neg = 1; |
742 | cmp += 3; | 746 | cmp += 3; |
743 | } | 747 | } |
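
This one-character fix matters because writes to the sched features file may carry leading whitespace: strstrip() skips it in cmp, while buf still begins with the spaces, so the NO_ prefix test must look at cmp. A user-space sketch with a simplified strstrip of its own (the kernel's lives in lib/string.c); the feature name is hypothetical:

        #include <ctype.h>
        #include <stdio.h>
        #include <string.h>

        static char *strstrip_(char *s) /* simplified strstrip() */
        {
                char *end;

                while (isspace((unsigned char)*s))
                        s++;
                end = s + strlen(s);
                while (end > s && isspace((unsigned char)end[-1]))
                        *--end = '\0';
                return s;
        }

        int main(void)
        {
                char buf[] = "  NO_SOME_FEATURE\n"; /* hypothetical input */
                char *cmp = strstrip_(buf);

                printf("buf has NO_ prefix: %d\n", strncmp(buf, "NO_", 3) == 0); /* 0 */
                printf("cmp has NO_ prefix: %d\n", strncmp(cmp, "NO_", 3) == 0); /* 1 */
                return 0;
        }
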
@@ -793,20 +797,6 @@ late_initcall(sched_init_debug); | |||
793 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 797 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
794 | 798 | ||
795 | /* | 799 | /* |
796 | * ratelimit for updating the group shares. | ||
797 | * default: 0.25ms | ||
798 | */ | ||
799 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
800 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
801 | |||
802 | /* | ||
803 | * Inject some fuzzyness into changing the per-cpu group shares | ||
804 | * this avoids remote rq-locks at the expense of fairness. | ||
805 | * default: 4 | ||
806 | */ | ||
807 | unsigned int sysctl_sched_shares_thresh = 4; | ||
808 | |||
809 | /* | ||
810 | * period over which we average the RT time consumption, measured | 800 | * period over which we average the RT time consumption, measured |
811 | * in ms. | 801 | * in ms. |
812 | * | 802 | * |
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1355 | lw->inv_weight = 0; | 1345 | lw->inv_weight = 0; |
1356 | } | 1346 | } |
1357 | 1347 | ||
1348 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1349 | { | ||
1350 | lw->weight = w; | ||
1351 | lw->inv_weight = 0; | ||
1352 | } | ||
1353 | |||
1358 | /* | 1354 | /* |
1359 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1355 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1360 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1356 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1543 | 1539 | ||
1544 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1545 | 1541 | ||
1546 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1547 | |||
1548 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1549 | |||
1550 | /* | ||
1551 | * Calculate and set the cpu's group shares. | ||
1552 | */ | ||
1553 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1554 | unsigned long sd_shares, | ||
1555 | unsigned long sd_rq_weight, | ||
1556 | unsigned long *usd_rq_weight) | ||
1557 | { | ||
1558 | unsigned long shares, rq_weight; | ||
1559 | int boost = 0; | ||
1560 | |||
1561 | rq_weight = usd_rq_weight[cpu]; | ||
1562 | if (!rq_weight) { | ||
1563 | boost = 1; | ||
1564 | rq_weight = NICE_0_LOAD; | ||
1565 | } | ||
1566 | |||
1567 | /* | ||
1568 | * \Sum_j shares_j * rq_weight_i | ||
1569 | * shares_i = ----------------------------- | ||
1570 | * \Sum_j rq_weight_j | ||
1571 | */ | ||
1572 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1573 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1574 | |||
1575 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1576 | sysctl_sched_shares_thresh) { | ||
1577 | struct rq *rq = cpu_rq(cpu); | ||
1578 | unsigned long flags; | ||
1579 | |||
1580 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1581 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1582 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1583 | __set_se_shares(tg->se[cpu], shares); | ||
1584 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1585 | } | ||
1586 | } | ||
1587 | |||
1588 | /* | ||
1589 | * Re-compute the task group their per cpu shares over the given domain. | ||
1590 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1591 | * parent group depends on the shares of its child groups. | ||
1592 | */ | ||
1593 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1594 | { | ||
1595 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1596 | unsigned long *usd_rq_weight; | ||
1597 | struct sched_domain *sd = data; | ||
1598 | unsigned long flags; | ||
1599 | int i; | ||
1600 | |||
1601 | if (!tg->se[0]) | ||
1602 | return 0; | ||
1603 | |||
1604 | local_irq_save(flags); | ||
1605 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1606 | |||
1607 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1608 | weight = tg->cfs_rq[i]->load.weight; | ||
1609 | usd_rq_weight[i] = weight; | ||
1610 | |||
1611 | rq_weight += weight; | ||
1612 | /* | ||
1613 | * If there are currently no tasks on the cpu pretend there | ||
1614 | * is one of average load so that when a new task gets to | ||
1615 | * run here it will not get delayed by group starvation. | ||
1616 | */ | ||
1617 | if (!weight) | ||
1618 | weight = NICE_0_LOAD; | ||
1619 | |||
1620 | sum_weight += weight; | ||
1621 | shares += tg->cfs_rq[i]->shares; | ||
1622 | } | ||
1623 | |||
1624 | if (!rq_weight) | ||
1625 | rq_weight = sum_weight; | ||
1626 | |||
1627 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1628 | shares = tg->shares; | ||
1629 | |||
1630 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1631 | shares = tg->shares; | ||
1632 | |||
1633 | for_each_cpu(i, sched_domain_span(sd)) | ||
1634 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1635 | |||
1636 | local_irq_restore(flags); | ||
1637 | |||
1638 | return 0; | ||
1639 | } | ||
1640 | |||
1641 | /* | 1542 | /* |
1642 | * Compute the cpu's hierarchical load factor for each task group. | 1543 | * Compute the cpu's hierarchical load factor for each task group. |
1643 | * This needs to be done in a top-down fashion because the load of a child | 1544 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1652 | load = cpu_rq(cpu)->load.weight; | 1553 | load = cpu_rq(cpu)->load.weight; |
1653 | } else { | 1554 | } else { |
1654 | load = tg->parent->cfs_rq[cpu]->h_load; | 1555 | load = tg->parent->cfs_rq[cpu]->h_load; |
1655 | load *= tg->cfs_rq[cpu]->shares; | 1556 | load *= tg->se[cpu]->load.weight; |
1656 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1557 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1657 | } | 1558 | } |
1658 | 1559 | ||
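
With the per-cpu shares field gone, tg_load_down() now scales the parent's h_load by the group entity's actual load.weight on that CPU. A small numeric sketch of the updated lines; all three weights are assumed, illustrative values:

        #include <stdio.h>

        int main(void)
        {
                unsigned long load = 2048;          /* parent cfs_rq's h_load */
                unsigned long se_weight = 512;      /* tg->se[cpu]->load.weight */
                unsigned long parent_weight = 1024; /* parent cfs_rq's load.weight */

                load *= se_weight;
                load /= parent_weight + 1; /* +1 guards against dividing by zero */

                printf("h_load = %lu\n", load); /* 1023: about half the parent's load */
                return 0;
        }

So a group entity holding half of its parent's queue weight inherits about half of the parent's hierarchical load, as expected.
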
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1661 | return 0; | 1562 | return 0; |
1662 | } | 1563 | } |
1663 | 1564 | ||
1664 | static void update_shares(struct sched_domain *sd) | ||
1665 | { | ||
1666 | s64 elapsed; | ||
1667 | u64 now; | ||
1668 | |||
1669 | if (root_task_group_empty()) | ||
1670 | return; | ||
1671 | |||
1672 | now = local_clock(); | ||
1673 | elapsed = now - sd->last_update; | ||
1674 | |||
1675 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1676 | sd->last_update = now; | ||
1677 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | static void update_h_load(long cpu) | 1565 | static void update_h_load(long cpu) |
1682 | { | 1566 | { |
1683 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1567 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1684 | } | 1568 | } |
1685 | 1569 | ||
1686 | #else | ||
1687 | |||
1688 | static inline void update_shares(struct sched_domain *sd) | ||
1689 | { | ||
1690 | } | ||
1691 | |||
1692 | #endif | 1570 | #endif |
1693 | 1571 | ||
1694 | #ifdef CONFIG_PREEMPT | 1572 | #ifdef CONFIG_PREEMPT |
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1810 | 1688 | ||
1811 | #endif | 1689 | #endif |
1812 | 1690 | ||
1813 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1814 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1815 | { | ||
1816 | #ifdef CONFIG_SMP | ||
1817 | cfs_rq->shares = shares; | ||
1818 | #endif | ||
1819 | } | ||
1820 | #endif | ||
1821 | |||
1822 | static void calc_load_account_idle(struct rq *this_rq); | 1691 | static void calc_load_account_idle(struct rq *this_rq); |
1823 | static void update_sysctl(void); | 1692 | static void update_sysctl(void); |
1824 | static int get_update_sysctl_factor(void); | 1693 | static int get_update_sysctl_factor(void); |
@@ -2063,6 +1932,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
2063 | #include "sched_idletask.c" | 1932 | #include "sched_idletask.c" |
2064 | #include "sched_fair.c" | 1933 | #include "sched_fair.c" |
2065 | #include "sched_rt.c" | 1934 | #include "sched_rt.c" |
1935 | #include "sched_autogroup.c" | ||
2066 | #include "sched_stoptask.c" | 1936 | #include "sched_stoptask.c" |
2067 | #ifdef CONFIG_SCHED_DEBUG | 1937 | #ifdef CONFIG_SCHED_DEBUG |
2068 | # include "sched_debug.c" | 1938 | # include "sched_debug.c" |
@@ -2255,10 +2125,8 @@ static int migration_cpu_stop(void *data); | |||
2255 | * The task's runqueue lock must be held. | 2125 | * The task's runqueue lock must be held. |
2256 | * Returns true if you have to wait for migration thread. | 2126 | * Returns true if you have to wait for migration thread. |
2257 | */ | 2127 | */ |
2258 | static bool migrate_task(struct task_struct *p, int dest_cpu) | 2128 | static bool migrate_task(struct task_struct *p, struct rq *rq) |
2259 | { | 2129 | { |
2260 | struct rq *rq = task_rq(p); | ||
2261 | |||
2262 | /* | 2130 | /* |
2263 | * If the task is not on a runqueue (and not running), then | 2131 | * If the task is not on a runqueue (and not running), then |
2264 | * the next wake-up will properly place the task. | 2132 | * the next wake-up will properly place the task. |
@@ -2438,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2438 | return dest_cpu; | 2306 | return dest_cpu; |
2439 | 2307 | ||
2440 | /* No more Mr. Nice Guy. */ | 2308 | /* No more Mr. Nice Guy. */ |
2441 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2309 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2442 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2310 | /* |
2443 | /* | 2311 | * Don't tell them about moving exiting tasks or |
2444 | * Don't tell them about moving exiting tasks or | 2312 | * kernel threads (both mm NULL), since they never |
2445 | * kernel threads (both mm NULL), since they never | 2313 | * leave kernel. |
2446 | * leave kernel. | 2314 | */ |
2447 | */ | 2315 | if (p->mm && printk_ratelimit()) { |
2448 | if (p->mm && printk_ratelimit()) { | 2316 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2449 | printk(KERN_INFO "process %d (%s) no " | 2317 | task_pid_nr(p), p->comm, cpu); |
2450 | "longer affine to cpu%d\n", | ||
2451 | task_pid_nr(p), p->comm, cpu); | ||
2452 | } | ||
2453 | } | 2318 | } |
2454 | 2319 | ||
2455 | return dest_cpu; | 2320 | return dest_cpu; |
@@ -2640,7 +2505,7 @@ out: | |||
2640 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2505 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2641 | * @p: the thread to be awakened | 2506 | * @p: the thread to be awakened |
2642 | * | 2507 | * |
2643 | * Put @p on the run-queue if it's not alredy there. The caller must | 2508 | * Put @p on the run-queue if it's not already there. The caller must |
2644 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2509 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2645 | * the current task. this_rq() stays locked over invocation. | 2510 | * the current task. this_rq() stays locked over invocation. |
2646 | */ | 2511 | */ |
@@ -2785,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2785 | /* Want to start with kernel preemption disabled. */ | 2650 | /* Want to start with kernel preemption disabled. */ |
2786 | task_thread_info(p)->preempt_count = 1; | 2651 | task_thread_info(p)->preempt_count = 1; |
2787 | #endif | 2652 | #endif |
2653 | #ifdef CONFIG_SMP | ||
2788 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2654 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2655 | #endif | ||
2789 | 2656 | ||
2790 | put_cpu(); | 2657 | put_cpu(); |
2791 | } | 2658 | } |
@@ -3549,7 +3416,7 @@ void sched_exec(void) | |||
3549 | * select_task_rq() can race against ->cpus_allowed | 3416 | * select_task_rq() can race against ->cpus_allowed |
3550 | */ | 3417 | */ |
3551 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | 3418 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && |
3552 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | 3419 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { |
3553 | struct migration_arg arg = { p, dest_cpu }; | 3420 | struct migration_arg arg = { p, dest_cpu }; |
3554 | 3421 | ||
3555 | task_rq_unlock(rq, &flags); | 3422 | task_rq_unlock(rq, &flags); |
@@ -4020,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4020 | schedstat_inc(this_rq(), sched_count); | 3887 | schedstat_inc(this_rq(), sched_count); |
4021 | #ifdef CONFIG_SCHEDSTATS | 3888 | #ifdef CONFIG_SCHEDSTATS |
4022 | if (unlikely(prev->lock_depth >= 0)) { | 3889 | if (unlikely(prev->lock_depth >= 0)) { |
4023 | schedstat_inc(this_rq(), bkl_count); | 3890 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); |
4024 | schedstat_inc(prev, sched_info.bkl_count); | 3891 | schedstat_inc(prev, sched_info.bkl_count); |
4025 | } | 3892 | } |
4026 | #endif | 3893 | #endif |
@@ -4214,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
4214 | if (task_thread_info(rq->curr) != owner || need_resched()) | 4081 | if (task_thread_info(rq->curr) != owner || need_resched()) |
4215 | return 0; | 4082 | return 0; |
4216 | 4083 | ||
4217 | cpu_relax(); | 4084 | arch_mutex_cpu_relax(); |
4218 | } | 4085 | } |
4219 | 4086 | ||
4220 | return 1; | 4087 | return 1; |
@@ -4526,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4526 | * This waits for either a completion of a specific task to be signaled or for a | 4393 | * This waits for either a completion of a specific task to be signaled or for a |
4527 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4394 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4528 | */ | 4395 | */ |
4529 | unsigned long __sched | 4396 | long __sched |
4530 | wait_for_completion_interruptible_timeout(struct completion *x, | 4397 | wait_for_completion_interruptible_timeout(struct completion *x, |
4531 | unsigned long timeout) | 4398 | unsigned long timeout) |
4532 | { | 4399 | { |
@@ -4559,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4559 | * signaled or for a specified timeout to expire. It can be | 4426 | * signaled or for a specified timeout to expire. It can be |
4560 | * interrupted by a kill signal. The timeout is in jiffies. | 4427 | * interrupted by a kill signal. The timeout is in jiffies. |
4561 | */ | 4428 | */ |
4562 | unsigned long __sched | 4429 | long __sched |
4563 | wait_for_completion_killable_timeout(struct completion *x, | 4430 | wait_for_completion_killable_timeout(struct completion *x, |
4564 | unsigned long timeout) | 4431 | unsigned long timeout) |
4565 | { | 4432 | { |
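
Both timeout variants can return -ERESTARTSYS, so switching the return type from unsigned long to long is what lets callers detect the error with a plain ret < 0 test. A sketch of the failure mode the old signature invited; ERESTARTSYS's value is taken from the kernel's errno headers:

        #include <stdio.h>

        #define ERESTARTSYS 512 /* as in include/linux/errno.h */

        int main(void)
        {
                long ret = -ERESTARTSYS;                /* what the wait produced */
                unsigned long old = (unsigned long)ret; /* old return type */

                printf("signed:   %ld  -> caller sees an error\n", ret);
                printf("unsigned: %lu -> looks like jiffies remaining\n", old);
                return 0;
        }
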
@@ -4901,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4901 | } | 4768 | } |
4902 | 4769 | ||
4903 | static int __sched_setscheduler(struct task_struct *p, int policy, | 4770 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4904 | struct sched_param *param, bool user) | 4771 | const struct sched_param *param, bool user) |
4905 | { | 4772 | { |
4906 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4773 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4907 | unsigned long flags; | 4774 | unsigned long flags; |
@@ -5004,7 +4871,8 @@ recheck: | |||
5004 | * assigned. | 4871 | * assigned. |
5005 | */ | 4872 | */ |
5006 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 4873 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5007 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 4874 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
4875 | !task_group_is_autogroup(task_group(p))) { | ||
5008 | __task_rq_unlock(rq); | 4876 | __task_rq_unlock(rq); |
5009 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 4877 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5010 | return -EPERM; | 4878 | return -EPERM; |
@@ -5056,7 +4924,7 @@ recheck: | |||
5056 | * NOTE that the task may be already dead. | 4924 | * NOTE that the task may be already dead. |
5057 | */ | 4925 | */ |
5058 | int sched_setscheduler(struct task_struct *p, int policy, | 4926 | int sched_setscheduler(struct task_struct *p, int policy, |
5059 | struct sched_param *param) | 4927 | const struct sched_param *param) |
5060 | { | 4928 | { |
5061 | return __sched_setscheduler(p, policy, param, true); | 4929 | return __sched_setscheduler(p, policy, param, true); |
5062 | } | 4930 | } |
@@ -5074,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
5074 | * but our caller might not have that capability. | 4942 | * but our caller might not have that capability. |
5075 | */ | 4943 | */ |
5076 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 4944 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
5077 | struct sched_param *param) | 4945 | const struct sched_param *param) |
5078 | { | 4946 | { |
5079 | return __sched_setscheduler(p, policy, param, false); | 4947 | return __sched_setscheduler(p, policy, param, false); |
5080 | } | 4948 | } |
@@ -5590,7 +5458,7 @@ void sched_show_task(struct task_struct *p) | |||
5590 | unsigned state; | 5458 | unsigned state; |
5591 | 5459 | ||
5592 | state = p->state ? __ffs(p->state) + 1 : 0; | 5460 | state = p->state ? __ffs(p->state) + 1 : 0; |
5593 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5461 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5594 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5462 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5595 | #if BITS_PER_LONG == 32 | 5463 | #if BITS_PER_LONG == 32 |
5596 | if (state == TASK_RUNNING) | 5464 | if (state == TASK_RUNNING) |
@@ -5754,7 +5622,6 @@ static void update_sysctl(void) | |||
5754 | SET_SYSCTL(sched_min_granularity); | 5622 | SET_SYSCTL(sched_min_granularity); |
5755 | SET_SYSCTL(sched_latency); | 5623 | SET_SYSCTL(sched_latency); |
5756 | SET_SYSCTL(sched_wakeup_granularity); | 5624 | SET_SYSCTL(sched_wakeup_granularity); |
5757 | SET_SYSCTL(sched_shares_ratelimit); | ||
5758 | #undef SET_SYSCTL | 5625 | #undef SET_SYSCTL |
5759 | } | 5626 | } |
5760 | 5627 | ||
@@ -5830,7 +5697,7 @@ again: | |||
5830 | goto out; | 5697 | goto out; |
5831 | 5698 | ||
5832 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5699 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5833 | if (migrate_task(p, dest_cpu)) { | 5700 | if (migrate_task(p, rq)) { |
5834 | struct migration_arg arg = { p, dest_cpu }; | 5701 | struct migration_arg arg = { p, dest_cpu }; |
5835 | /* Need help from migration thread: drop lock and wait. */ | 5702 | /* Need help from migration thread: drop lock and wait. */ |
5836 | task_rq_unlock(rq, &flags); | 5703 | task_rq_unlock(rq, &flags); |
@@ -5912,29 +5779,20 @@ static int migration_cpu_stop(void *data) | |||
5912 | } | 5779 | } |
5913 | 5780 | ||
5914 | #ifdef CONFIG_HOTPLUG_CPU | 5781 | #ifdef CONFIG_HOTPLUG_CPU |
5782 | |||
5915 | /* | 5783 | /* |
5916 | * Figure out where task on dead CPU should go, use force if necessary. | 5784 | * Ensures that the idle task is using init_mm right before its cpu goes |
5785 | * offline. | ||
5917 | */ | 5786 | */ |
5918 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5787 | void idle_task_exit(void) |
5919 | { | 5788 | { |
5920 | struct rq *rq = cpu_rq(dead_cpu); | 5789 | struct mm_struct *mm = current->active_mm; |
5921 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5922 | unsigned long flags; | ||
5923 | 5790 | ||
5924 | local_irq_save(flags); | 5791 | BUG_ON(cpu_online(smp_processor_id())); |
5925 | 5792 | ||
5926 | raw_spin_lock(&rq->lock); | 5793 | if (mm != &init_mm) |
5927 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 5794 | switch_mm(mm, &init_mm, current); |
5928 | if (needs_cpu) | 5795 | mmdrop(mm); |
5929 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5930 | raw_spin_unlock(&rq->lock); | ||
5931 | /* | ||
5932 | * It can only fail if we race with set_cpus_allowed(), | ||
5933 | * in the racer should migrate the task anyway. | ||
5934 | */ | ||
5935 | if (needs_cpu) | ||
5936 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5937 | local_irq_restore(flags); | ||
5938 | } | 5796 | } |
5939 | 5797 | ||
5940 | /* | 5798 | /* |
@@ -5947,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5947 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5805 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5948 | { | 5806 | { |
5949 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 5807 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5950 | unsigned long flags; | ||
5951 | 5808 | ||
5952 | local_irq_save(flags); | ||
5953 | double_rq_lock(rq_src, rq_dest); | ||
5954 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5809 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5955 | rq_src->nr_uninterruptible = 0; | 5810 | rq_src->nr_uninterruptible = 0; |
5956 | double_rq_unlock(rq_src, rq_dest); | ||
5957 | local_irq_restore(flags); | ||
5958 | } | ||
5959 | |||
5960 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5961 | static void migrate_live_tasks(int src_cpu) | ||
5962 | { | ||
5963 | struct task_struct *p, *t; | ||
5964 | |||
5965 | read_lock(&tasklist_lock); | ||
5966 | |||
5967 | do_each_thread(t, p) { | ||
5968 | if (p == current) | ||
5969 | continue; | ||
5970 | |||
5971 | if (task_cpu(p) == src_cpu) | ||
5972 | move_task_off_dead_cpu(src_cpu, p); | ||
5973 | } while_each_thread(t, p); | ||
5974 | |||
5975 | read_unlock(&tasklist_lock); | ||
5976 | } | 5811 | } |
5977 | 5812 | ||
5978 | /* | 5813 | /* |
5979 | * Schedules idle task to be the next runnable task on current CPU. | 5814 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5980 | * It does so by boosting its priority to highest possible. | ||
5981 | * Used by CPU offline code. | ||
5982 | */ | 5815 | */ |
5983 | void sched_idle_next(void) | 5816 | static void calc_global_load_remove(struct rq *rq) |
5984 | { | 5817 | { |
5985 | int this_cpu = smp_processor_id(); | 5818 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5986 | struct rq *rq = cpu_rq(this_cpu); | 5819 | rq->calc_load_active = 0; |
5987 | struct task_struct *p = rq->idle; | ||
5988 | unsigned long flags; | ||
5989 | |||
5990 | /* cpu has to be offline */ | ||
5991 | BUG_ON(cpu_online(this_cpu)); | ||
5992 | |||
5993 | /* | ||
5994 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5995 | * and interrupts disabled on the current cpu. | ||
5996 | */ | ||
5997 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5998 | |||
5999 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
6000 | |||
6001 | activate_task(rq, p, 0); | ||
6002 | |||
6003 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
6004 | } | 5820 | } |
6005 | 5821 | ||
6006 | /* | 5822 | /* |
6007 | * Ensures that the idle task is using init_mm right before its cpu goes | 5823 | * Migrate all tasks from the rq; sleeping tasks will be migrated by |
6008 | * offline. | 5824 | * try_to_wake_up()->select_task_rq(). |
5825 | * | ||
5826 | * Called with rq->lock held even though we're in stop_machine() and | ||
5827 | * there's no concurrency possible, we hold the required locks anyway | ||
5828 | * because of lock validation efforts. | ||
6009 | */ | 5829 | */ |
6010 | void idle_task_exit(void) | 5830 | static void migrate_tasks(unsigned int dead_cpu) |
6011 | { | ||
6012 | struct mm_struct *mm = current->active_mm; | ||
6013 | |||
6014 | BUG_ON(cpu_online(smp_processor_id())); | ||
6015 | |||
6016 | if (mm != &init_mm) | ||
6017 | switch_mm(mm, &init_mm, current); | ||
6018 | mmdrop(mm); | ||
6019 | } | ||
6020 | |||
6021 | /* called under rq->lock with disabled interrupts */ | ||
6022 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
6023 | { | 5831 | { |
6024 | struct rq *rq = cpu_rq(dead_cpu); | 5832 | struct rq *rq = cpu_rq(dead_cpu); |
6025 | 5833 | struct task_struct *next, *stop = rq->stop; | |
6026 | /* Must be exiting, otherwise would be on tasklist. */ | 5834 | int dest_cpu; |
6027 | BUG_ON(!p->exit_state); | ||
6028 | |||
6029 | /* Cannot have done final schedule yet: would have vanished. */ | ||
6030 | BUG_ON(p->state == TASK_DEAD); | ||
6031 | |||
6032 | get_task_struct(p); | ||
6033 | 5835 | ||
6034 | /* | 5836 | /* |
6035 | * Drop lock around migration; if someone else moves it, | 5837 | * Fudge the rq selection such that the below task selection loop |
6036 | * that's OK. No task can be added to this CPU, so iteration is | 5838 | * doesn't get stuck on the currently eligible stop task. |
6037 | * fine. | 5839 | * |
5840 | * We're currently inside stop_machine() and the rq is either stuck | ||
5841 | * in the stop_machine_cpu_stop() loop, or we're executing this code; | ||
5842 | * either way we should never end up calling schedule() until we're | ||
5843 | * done here. | ||
6038 | */ | 5844 | */ |
6039 | raw_spin_unlock_irq(&rq->lock); | 5845 | rq->stop = NULL; |
6040 | move_task_off_dead_cpu(dead_cpu, p); | ||
6041 | raw_spin_lock_irq(&rq->lock); | ||
6042 | |||
6043 | put_task_struct(p); | ||
6044 | } | ||
6045 | |||
6046 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
6047 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
6048 | { | ||
6049 | struct rq *rq = cpu_rq(dead_cpu); | ||
6050 | struct task_struct *next; | ||
6051 | 5846 | ||
6052 | for ( ; ; ) { | 5847 | for ( ; ; ) { |
6053 | if (!rq->nr_running) | 5848 | /* |
5849 | * There's this thread running, bail when that's the only | ||
5850 | * remaining thread. | ||
5851 | */ | ||
5852 | if (rq->nr_running == 1) | ||
6054 | break; | 5853 | break; |
5854 | |||
6055 | next = pick_next_task(rq); | 5855 | next = pick_next_task(rq); |
6056 | if (!next) | 5856 | BUG_ON(!next); |
6057 | break; | ||
6058 | next->sched_class->put_prev_task(rq, next); | 5857 | next->sched_class->put_prev_task(rq, next); |
6059 | migrate_dead(dead_cpu, next); | ||
6060 | 5858 | ||
5859 | /* Find suitable destination for @next, with force if needed. */ | ||
5860 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
5861 | raw_spin_unlock(&rq->lock); | ||
5862 | |||
5863 | __migrate_task(next, dead_cpu, dest_cpu); | ||
5864 | |||
5865 | raw_spin_lock(&rq->lock); | ||
6061 | } | 5866 | } |
6062 | } | ||
6063 | 5867 | ||
6064 | /* | 5868 | rq->stop = stop; |
6065 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
6066 | */ | ||
6067 | static void calc_global_load_remove(struct rq *rq) | ||
6068 | { | ||
6069 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
6070 | rq->calc_load_active = 0; | ||
6071 | } | 5869 | } |
5870 | |||
6072 | #endif /* CONFIG_HOTPLUG_CPU */ | 5871 | #endif /* CONFIG_HOTPLUG_CPU */ |
6073 | 5872 | ||
6074 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5873 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -6278,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6278 | unsigned long flags; | 6077 | unsigned long flags; |
6279 | struct rq *rq = cpu_rq(cpu); | 6078 | struct rq *rq = cpu_rq(cpu); |
6280 | 6079 | ||
6281 | switch (action) { | 6080 | switch (action & ~CPU_TASKS_FROZEN) { |
6282 | 6081 | ||
6283 | case CPU_UP_PREPARE: | 6082 | case CPU_UP_PREPARE: |
6284 | case CPU_UP_PREPARE_FROZEN: | ||
6285 | rq->calc_load_update = calc_load_update; | 6083 | rq->calc_load_update = calc_load_update; |
6286 | break; | 6084 | break; |
6287 | 6085 | ||
6288 | case CPU_ONLINE: | 6086 | case CPU_ONLINE: |
6289 | case CPU_ONLINE_FROZEN: | ||
6290 | /* Update our root-domain */ | 6087 | /* Update our root-domain */ |
6291 | raw_spin_lock_irqsave(&rq->lock, flags); | 6088 | raw_spin_lock_irqsave(&rq->lock, flags); |
6292 | if (rq->rd) { | 6089 | if (rq->rd) { |
@@ -6298,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6298 | break; | 6095 | break; |
6299 | 6096 | ||
6300 | #ifdef CONFIG_HOTPLUG_CPU | 6097 | #ifdef CONFIG_HOTPLUG_CPU |
6301 | case CPU_DEAD: | ||
6302 | case CPU_DEAD_FROZEN: | ||
6303 | migrate_live_tasks(cpu); | ||
6304 | /* Idle task back to normal (off runqueue, low prio) */ | ||
6305 | raw_spin_lock_irq(&rq->lock); | ||
6306 | deactivate_task(rq, rq->idle, 0); | ||
6307 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
6308 | rq->idle->sched_class = &idle_sched_class; | ||
6309 | migrate_dead_tasks(cpu); | ||
6310 | raw_spin_unlock_irq(&rq->lock); | ||
6311 | migrate_nr_uninterruptible(rq); | ||
6312 | BUG_ON(rq->nr_running != 0); | ||
6313 | calc_global_load_remove(rq); | ||
6314 | break; | ||
6315 | |||
6316 | case CPU_DYING: | 6098 | case CPU_DYING: |
6317 | case CPU_DYING_FROZEN: | ||
6318 | /* Update our root-domain */ | 6099 | /* Update our root-domain */ |
6319 | raw_spin_lock_irqsave(&rq->lock, flags); | 6100 | raw_spin_lock_irqsave(&rq->lock, flags); |
6320 | if (rq->rd) { | 6101 | if (rq->rd) { |
6321 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6102 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6322 | set_rq_offline(rq); | 6103 | set_rq_offline(rq); |
6323 | } | 6104 | } |
6105 | migrate_tasks(cpu); | ||
6106 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
6324 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6107 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6108 | |||
6109 | migrate_nr_uninterruptible(rq); | ||
6110 | calc_global_load_remove(rq); | ||
6325 | break; | 6111 | break; |
6326 | #endif | 6112 | #endif |
6327 | } | 6113 | } |
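
Masking the action with ~CPU_TASKS_FROZEN lets one case label cover both the runtime notification and its suspend/resume (_FROZEN) twin, which is why the *_FROZEN labels could be dropped above. Illustration using the notifier constants as defined in linux/cpu.h of this era:

        #include <stdio.h>

        #define CPU_ONLINE        0x0002
        #define CPU_TASKS_FROZEN  0x0010
        #define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)

        int main(void)
        {
                unsigned long action = CPU_ONLINE_FROZEN;

                /* Both variants collapse onto the same case value. */
                printf("%d\n", (int)(action & ~CPU_TASKS_FROZEN) == CPU_ONLINE); /* 1 */
                return 0;
        }
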
@@ -8052,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
8052 | 7838 | ||
8053 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7839 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8054 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 7840 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
8055 | struct sched_entity *se, int cpu, int add, | 7841 | struct sched_entity *se, int cpu, |
8056 | struct sched_entity *parent) | 7842 | struct sched_entity *parent) |
8057 | { | 7843 | { |
8058 | struct rq *rq = cpu_rq(cpu); | 7844 | struct rq *rq = cpu_rq(cpu); |
8059 | tg->cfs_rq[cpu] = cfs_rq; | 7845 | tg->cfs_rq[cpu] = cfs_rq; |
8060 | init_cfs_rq(cfs_rq, rq); | 7846 | init_cfs_rq(cfs_rq, rq); |
8061 | cfs_rq->tg = tg; | 7847 | cfs_rq->tg = tg; |
8062 | if (add) | ||
8063 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
8064 | 7848 | ||
8065 | tg->se[cpu] = se; | 7849 | tg->se[cpu] = se; |
8066 | /* se could be NULL for init_task_group */ | 7850 | /* se could be NULL for root_task_group */ |
8067 | if (!se) | 7851 | if (!se) |
8068 | return; | 7852 | return; |
8069 | 7853 | ||
@@ -8073,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
8073 | se->cfs_rq = parent->my_q; | 7857 | se->cfs_rq = parent->my_q; |
8074 | 7858 | ||
8075 | se->my_q = cfs_rq; | 7859 | se->my_q = cfs_rq; |
8076 | se->load.weight = tg->shares; | 7860 | update_load_set(&se->load, 0); |
8077 | se->load.inv_weight = 0; | ||
8078 | se->parent = parent; | 7861 | se->parent = parent; |
8079 | } | 7862 | } |
8080 | #endif | 7863 | #endif |
8081 | 7864 | ||
8082 | #ifdef CONFIG_RT_GROUP_SCHED | 7865 | #ifdef CONFIG_RT_GROUP_SCHED |
8083 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 7866 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
8084 | struct sched_rt_entity *rt_se, int cpu, int add, | 7867 | struct sched_rt_entity *rt_se, int cpu, |
8085 | struct sched_rt_entity *parent) | 7868 | struct sched_rt_entity *parent) |
8086 | { | 7869 | { |
8087 | struct rq *rq = cpu_rq(cpu); | 7870 | struct rq *rq = cpu_rq(cpu); |
@@ -8090,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
8090 | init_rt_rq(rt_rq, rq); | 7873 | init_rt_rq(rt_rq, rq); |
8091 | rt_rq->tg = tg; | 7874 | rt_rq->tg = tg; |
8092 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7875 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
8093 | if (add) | ||
8094 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
8095 | 7876 | ||
8096 | tg->rt_se[cpu] = rt_se; | 7877 | tg->rt_se[cpu] = rt_se; |
8097 | if (!rt_se) | 7878 | if (!rt_se) |
@@ -8126,18 +7907,18 @@ void __init sched_init(void) | |||
8126 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7907 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
8127 | 7908 | ||
8128 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7909 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8129 | init_task_group.se = (struct sched_entity **)ptr; | 7910 | root_task_group.se = (struct sched_entity **)ptr; |
8130 | ptr += nr_cpu_ids * sizeof(void **); | 7911 | ptr += nr_cpu_ids * sizeof(void **); |
8131 | 7912 | ||
8132 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7913 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
8133 | ptr += nr_cpu_ids * sizeof(void **); | 7914 | ptr += nr_cpu_ids * sizeof(void **); |
8134 | 7915 | ||
8135 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7916 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8136 | #ifdef CONFIG_RT_GROUP_SCHED | 7917 | #ifdef CONFIG_RT_GROUP_SCHED |
8137 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7918 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
8138 | ptr += nr_cpu_ids * sizeof(void **); | 7919 | ptr += nr_cpu_ids * sizeof(void **); |
8139 | 7920 | ||
8140 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7921 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
8141 | ptr += nr_cpu_ids * sizeof(void **); | 7922 | ptr += nr_cpu_ids * sizeof(void **); |
8142 | 7923 | ||
8143 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7924 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -8157,20 +7938,16 @@ void __init sched_init(void) | |||
8157 | global_rt_period(), global_rt_runtime()); | 7938 | global_rt_period(), global_rt_runtime()); |
8158 | 7939 | ||
8159 | #ifdef CONFIG_RT_GROUP_SCHED | 7940 | #ifdef CONFIG_RT_GROUP_SCHED |
8160 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7941 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
8161 | global_rt_period(), global_rt_runtime()); | 7942 | global_rt_period(), global_rt_runtime()); |
8162 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7943 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8163 | 7944 | ||
8164 | #ifdef CONFIG_CGROUP_SCHED | 7945 | #ifdef CONFIG_CGROUP_SCHED |
8165 | list_add(&init_task_group.list, &task_groups); | 7946 | list_add(&root_task_group.list, &task_groups); |
8166 | INIT_LIST_HEAD(&init_task_group.children); | 7947 | INIT_LIST_HEAD(&root_task_group.children); |
8167 | 7948 | autogroup_init(&init_task); | |
8168 | #endif /* CONFIG_CGROUP_SCHED */ | 7949 | #endif /* CONFIG_CGROUP_SCHED */ |
8169 | 7950 | ||
8170 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
8171 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
8172 | __alignof__(unsigned long)); | ||
8173 | #endif | ||
8174 | for_each_possible_cpu(i) { | 7951 | for_each_possible_cpu(i) { |
8175 | struct rq *rq; | 7952 | struct rq *rq; |
8176 | 7953 | ||
@@ -8182,38 +7959,34 @@ void __init sched_init(void) | |||
8182 | init_cfs_rq(&rq->cfs, rq); | 7959 | init_cfs_rq(&rq->cfs, rq); |
8183 | init_rt_rq(&rq->rt, rq); | 7960 | init_rt_rq(&rq->rt, rq); |
8184 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7961 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8185 | init_task_group.shares = init_task_group_load; | 7962 | root_task_group.shares = root_task_group_load; |
8186 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7963 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
8187 | #ifdef CONFIG_CGROUP_SCHED | ||
8188 | /* | 7964 | /* |
8189 | * How much cpu bandwidth does init_task_group get? | 7965 | * How much cpu bandwidth does root_task_group get? |
8190 | * | 7966 | * |
8191 | * In case of task-groups formed through the cgroup filesystem, it | 7967 | * In case of task-groups formed through the cgroup filesystem, it |
8192 | * gets 100% of the cpu resources in the system. This overall | 7968 | * gets 100% of the cpu resources in the system. This overall |
8193 | * system cpu resource is divided among the tasks of | 7969 | * system cpu resource is divided among the tasks of |
8194 | * init_task_group and its child task-groups in a fair manner, | 7970 | * root_task_group and its child task-groups in a fair manner, |
8195 | * based on each entity's (task or task-group's) weight | 7971 | * based on each entity's (task or task-group's) weight |
8196 | * (se->load.weight). | 7972 | * (se->load.weight). |
8197 | * | 7973 | * |
8198 | * In other words, if init_task_group has 10 tasks (each of weight | 7974 | * In other words, if root_task_group has 10 tasks (each of weight |
8199 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 7975 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
8200 | * then A0's share of the cpu resource is: | 7976 | * then A0's share of the cpu resource is: |
8201 | * | 7977 | * |
8202 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 7978 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
8203 | * | 7979 | * |
8204 | * We achieve this by letting init_task_group's tasks sit | 7980 | * We achieve this by letting root_task_group's tasks sit |
8205 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7981 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8206 | */ | 7982 | */ |
8207 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7983 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8208 | #endif | ||
8209 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7984 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
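The 8.33% figure in the comment above follows directly from the weight arithmetic. A throwaway userspace sketch (plain C, not kernel code; the weights are the ones quoted in the comment) reproduces it:

	#include <stdio.h>

	int main(void)
	{
		/* 10 root tasks of weight 1024, plus groups A0 and A1 (1024 each) */
		long total = 10 * 1024 + 1024 + 1024;

		/* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) ~= 8.33% */
		printf("A0 bandwidth = %.2f%%\n", 100.0 * 1024 / total);
		return 0;
	}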
8210 | 7985 | ||
8211 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 7986 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
8212 | #ifdef CONFIG_RT_GROUP_SCHED | 7987 | #ifdef CONFIG_RT_GROUP_SCHED |
8213 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7988 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
8214 | #ifdef CONFIG_CGROUP_SCHED | 7989 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
8215 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
8216 | #endif | ||
8217 | #endif | 7990 | #endif |
8218 | 7991 | ||
8219 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7992 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -8293,8 +8066,6 @@ void __init sched_init(void) | |||
8293 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8066 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8294 | #endif /* SMP */ | 8067 | #endif /* SMP */ |
8295 | 8068 | ||
8296 | perf_event_init(); | ||
8297 | |||
8298 | scheduler_running = 1; | 8069 | scheduler_running = 1; |
8299 | } | 8070 | } |
8300 | 8071 | ||
@@ -8488,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8488 | if (!se) | 8259 | if (!se) |
8489 | goto err_free_rq; | 8260 | goto err_free_rq; |
8490 | 8261 | ||
8491 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8262 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8492 | } | 8263 | } |
8493 | 8264 | ||
8494 | return 1; | 8265 | return 1; |
@@ -8499,15 +8270,21 @@ err: | |||
8499 | return 0; | 8270 | return 0; |
8500 | } | 8271 | } |
8501 | 8272 | ||
8502 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8503 | { | ||
8504 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8505 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8506 | } | ||
8507 | |||
8508 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8273 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8509 | { | 8274 | { |
8510 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8275 | struct rq *rq = cpu_rq(cpu); |
8276 | unsigned long flags; | ||
8277 | |||
8278 | /* | ||
8279 | * Only empty task groups can be destroyed; so we can speculatively | ||
8280 | * check on_list without danger of it being re-added. | ||
8281 | */ | ||
8282 | if (!tg->cfs_rq[cpu]->on_list) | ||
8283 | return; | ||
8284 | |||
8285 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8286 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8287 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8511 | } | 8288 | } |
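The speculative on_list test above is an instance of a common pattern: read a flag locklessly and take the lock only when there is real work, which is safe here solely because destruction implies the group is empty and its cfs_rq cannot race back onto the list. A reduced illustration of the pattern (plain C with a pthread mutex standing in for rq->lock; all names are illustrative):

	#include <pthread.h>

	struct item {
		int on_list;		/* written only under lock */
		pthread_mutex_t lock;
	};

	/*
	 * The caller guarantees no concurrent re-add is possible, so a
	 * racy read of on_list can only yield a harmless early return.
	 */
	static void remove_if_listed(struct item *it)
	{
		if (!it->on_list)	/* speculative, lock-free check */
			return;

		pthread_mutex_lock(&it->lock);
		it->on_list = 0;	/* the actual list_del would go here */
		pthread_mutex_unlock(&it->lock);
	}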
8512 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8289 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8513 | static inline void free_fair_sched_group(struct task_group *tg) | 8290 | static inline void free_fair_sched_group(struct task_group *tg) |
@@ -8520,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8520 | return 1; | 8297 | return 1; |
8521 | } | 8298 | } |
8522 | 8299 | ||
8523 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8524 | { | ||
8525 | } | ||
8526 | |||
8527 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8300 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8528 | { | 8301 | { |
8529 | } | 8302 | } |
@@ -8578,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8578 | if (!rt_se) | 8351 | if (!rt_se) |
8579 | goto err_free_rq; | 8352 | goto err_free_rq; |
8580 | 8353 | ||
8581 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8354 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8582 | } | 8355 | } |
8583 | 8356 | ||
8584 | return 1; | 8357 | return 1; |
@@ -8588,17 +8361,6 @@ err_free_rq: | |||
8588 | err: | 8361 | err: |
8589 | return 0; | 8362 | return 0; |
8590 | } | 8363 | } |
8591 | |||
8592 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8593 | { | ||
8594 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8595 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8596 | } | ||
8597 | |||
8598 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8599 | { | ||
8600 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8601 | } | ||
8602 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8364 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8603 | static inline void free_rt_sched_group(struct task_group *tg) | 8365 | static inline void free_rt_sched_group(struct task_group *tg) |
8604 | { | 8366 | { |
@@ -8609,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8609 | { | 8371 | { |
8610 | return 1; | 8372 | return 1; |
8611 | } | 8373 | } |
8612 | |||
8613 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8614 | { | ||
8615 | } | ||
8616 | |||
8617 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8618 | { | ||
8619 | } | ||
8620 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8374 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8621 | 8375 | ||
8622 | #ifdef CONFIG_CGROUP_SCHED | 8376 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8624,6 +8378,7 @@ static void free_sched_group(struct task_group *tg) | |||
8624 | { | 8378 | { |
8625 | free_fair_sched_group(tg); | 8379 | free_fair_sched_group(tg); |
8626 | free_rt_sched_group(tg); | 8380 | free_rt_sched_group(tg); |
8381 | autogroup_free(tg); | ||
8627 | kfree(tg); | 8382 | kfree(tg); |
8628 | } | 8383 | } |
8629 | 8384 | ||
@@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8632 | { | 8387 | { |
8633 | struct task_group *tg; | 8388 | struct task_group *tg; |
8634 | unsigned long flags; | 8389 | unsigned long flags; |
8635 | int i; | ||
8636 | 8390 | ||
8637 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8391 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8638 | if (!tg) | 8392 | if (!tg) |
@@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8645 | goto err; | 8399 | goto err; |
8646 | 8400 | ||
8647 | spin_lock_irqsave(&task_group_lock, flags); | 8401 | spin_lock_irqsave(&task_group_lock, flags); |
8648 | for_each_possible_cpu(i) { | ||
8649 | register_fair_sched_group(tg, i); | ||
8650 | register_rt_sched_group(tg, i); | ||
8651 | } | ||
8652 | list_add_rcu(&tg->list, &task_groups); | 8402 | list_add_rcu(&tg->list, &task_groups); |
8653 | 8403 | ||
8654 | WARN_ON(!parent); /* root should already exist */ | 8404 | WARN_ON(!parent); /* root should already exist */ |
@@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8678 | unsigned long flags; | 8428 | unsigned long flags; |
8679 | int i; | 8429 | int i; |
8680 | 8430 | ||
8681 | spin_lock_irqsave(&task_group_lock, flags); | 8431 | /* end participation in shares distribution */ |
8682 | for_each_possible_cpu(i) { | 8432 | for_each_possible_cpu(i) |
8683 | unregister_fair_sched_group(tg, i); | 8433 | unregister_fair_sched_group(tg, i); |
8684 | unregister_rt_sched_group(tg, i); | 8434 | |
8685 | } | 8435 | spin_lock_irqsave(&task_group_lock, flags); |
8686 | list_del_rcu(&tg->list); | 8436 | list_del_rcu(&tg->list); |
8687 | list_del_rcu(&tg->siblings); | 8437 | list_del_rcu(&tg->siblings); |
8688 | spin_unlock_irqrestore(&task_group_lock, flags); | 8438 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) | |||
8729 | #endif /* CONFIG_CGROUP_SCHED */ | 8479 | #endif /* CONFIG_CGROUP_SCHED */ |
8730 | 8480 | ||
8731 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8481 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8732 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8733 | { | ||
8734 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8735 | int on_rq; | ||
8736 | |||
8737 | on_rq = se->on_rq; | ||
8738 | if (on_rq) | ||
8739 | dequeue_entity(cfs_rq, se, 0); | ||
8740 | |||
8741 | se->load.weight = shares; | ||
8742 | se->load.inv_weight = 0; | ||
8743 | |||
8744 | if (on_rq) | ||
8745 | enqueue_entity(cfs_rq, se, 0); | ||
8746 | } | ||
8747 | |||
8748 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8749 | { | ||
8750 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8751 | struct rq *rq = cfs_rq->rq; | ||
8752 | unsigned long flags; | ||
8753 | |||
8754 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8755 | __set_se_shares(se, shares); | ||
8756 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8757 | } | ||
8758 | |||
8759 | static DEFINE_MUTEX(shares_mutex); | 8482 | static DEFINE_MUTEX(shares_mutex); |
8760 | 8483 | ||
8761 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8484 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
@@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8778 | if (tg->shares == shares) | 8501 | if (tg->shares == shares) |
8779 | goto done; | 8502 | goto done; |
8780 | 8503 | ||
8781 | spin_lock_irqsave(&task_group_lock, flags); | ||
8782 | for_each_possible_cpu(i) | ||
8783 | unregister_fair_sched_group(tg, i); | ||
8784 | list_del_rcu(&tg->siblings); | ||
8785 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8786 | |||
8787 | /* wait for any ongoing reference to this group to finish */ | ||
8788 | synchronize_sched(); | ||
8789 | |||
8790 | /* | ||
8791 | * Now we are free to modify the group's share on each cpu | ||
8792 | * w/o tripping rebalance_share or load_balance_fair. | ||
8793 | */ | ||
8794 | tg->shares = shares; | 8504 | tg->shares = shares; |
8795 | for_each_possible_cpu(i) { | 8505 | for_each_possible_cpu(i) { |
8796 | /* | 8506 | struct rq *rq = cpu_rq(i); |
8797 | * force a rebalance | 8507 | struct sched_entity *se; |
8798 | */ | 8508 | |
8799 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8509 | se = tg->se[i]; |
8800 | set_se_shares(tg->se[i], shares); | 8510 | /* Propagate contribution to hierarchy */ |
8511 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8512 | for_each_sched_entity(se) | ||
8513 | update_cfs_shares(group_cfs_rq(se), 0); | ||
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8801 | } | 8515 | } |
8802 | 8516 | ||
8803 | /* | ||
8804 | * Enable load balance activity on this group, by inserting it back on | ||
8805 | * each cpu's rq->leaf_cfs_rq_list. | ||
8806 | */ | ||
8807 | spin_lock_irqsave(&task_group_lock, flags); | ||
8808 | for_each_possible_cpu(i) | ||
8809 | register_fair_sched_group(tg, i); | ||
8810 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8811 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8812 | done: | 8517 | done: |
8813 | mutex_unlock(&shares_mutex); | 8518 | mutex_unlock(&shares_mutex); |
8814 | return 0; | 8519 | return 0; |
@@ -9107,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9107 | 8812 | ||
9108 | if (!cgrp->parent) { | 8813 | if (!cgrp->parent) { |
9109 | /* This is early initialization for the top cgroup */ | 8814 | /* This is early initialization for the top cgroup */ |
9110 | return &init_task_group.css; | 8815 | return &root_task_group.css; |
9111 | } | 8816 | } |
9112 | 8817 | ||
9113 | parent = cgroup_tg(cgrp->parent); | 8818 | parent = cgroup_tg(cgrp->parent); |
@@ -9178,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
9178 | } | 8883 | } |
9179 | } | 8884 | } |
9180 | 8885 | ||
8886 | static void | ||
8887 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
8888 | { | ||
8889 | /* | ||
8890 | * cgroup_exit() is called in the copy_process() failure path. | ||
8891 | * Ignore this case since the task hasn't run yet; this avoids | ||
8892 | * trying to poke a half freed task state from generic code. | ||
8893 | */ | ||
8894 | if (!(task->flags & PF_EXITING)) | ||
8895 | return; | ||
8896 | |||
8897 | sched_move_task(task); | ||
8898 | } | ||
8899 | |||
9181 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8900 | #ifdef CONFIG_FAIR_GROUP_SCHED |
9182 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8901 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
9183 | u64 shareval) | 8902 | u64 shareval) |
@@ -9250,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9250 | .destroy = cpu_cgroup_destroy, | 8969 | .destroy = cpu_cgroup_destroy, |
9251 | .can_attach = cpu_cgroup_can_attach, | 8970 | .can_attach = cpu_cgroup_can_attach, |
9252 | .attach = cpu_cgroup_attach, | 8971 | .attach = cpu_cgroup_attach, |
8972 | .exit = cpu_cgroup_exit, | ||
9253 | .populate = cpu_cgroup_populate, | 8973 | .populate = cpu_cgroup_populate, |
9254 | .subsys_id = cpu_cgroup_subsys_id, | 8974 | .subsys_id = cpu_cgroup_subsys_id, |
9255 | .early_init = 1, | 8975 | .early_init = 1, |
@@ -9534,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9534 | }; | 9254 | }; |
9535 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9255 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9536 | 9256 | ||
9537 | #ifndef CONFIG_SMP | ||
9538 | |||
9539 | void synchronize_sched_expedited(void) | ||
9540 | { | ||
9541 | barrier(); | ||
9542 | } | ||
9543 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9544 | |||
9545 | #else /* #ifndef CONFIG_SMP */ | ||
9546 | |||
9547 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9548 | |||
9549 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9550 | { | ||
9551 | /* | ||
9552 | * There must be a full memory barrier on each affected CPU | ||
9553 | * between the time that try_stop_cpus() is called and the | ||
9554 | * time that it returns. | ||
9555 | * | ||
9556 | * In the current initial implementation of cpu_stop, the | ||
9557 | * above condition is already met when the control reaches | ||
9558 | * this point and the following smp_mb() is not strictly | ||
9559 | * necessary. Do smp_mb() anyway for documentation and | ||
9560 | * robustness against future implementation changes. | ||
9561 | */ | ||
9562 | smp_mb(); /* See above comment block. */ | ||
9563 | return 0; | ||
9564 | } | ||
9565 | |||
9566 | /* | ||
9567 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9568 | * approach to force grace period to end quickly. This consumes | ||
9569 | * significant time on all CPUs, and is thus not recommended for | ||
9570 | * any sort of common-case code. | ||
9571 | * | ||
9572 | * Note that it is illegal to call this function while holding any | ||
9573 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9574 | * observe this restriction will result in deadlock. | ||
9575 | */ | ||
9576 | void synchronize_sched_expedited(void) | ||
9577 | { | ||
9578 | int snap, trycount = 0; | ||
9579 | |||
9580 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9581 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9582 | get_online_cpus(); | ||
9583 | while (try_stop_cpus(cpu_online_mask, | ||
9584 | synchronize_sched_expedited_cpu_stop, | ||
9585 | NULL) == -EAGAIN) { | ||
9586 | put_online_cpus(); | ||
9587 | if (trycount++ < 10) | ||
9588 | udelay(trycount * num_online_cpus()); | ||
9589 | else { | ||
9590 | synchronize_sched(); | ||
9591 | return; | ||
9592 | } | ||
9593 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9594 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9595 | return; | ||
9596 | } | ||
9597 | get_online_cpus(); | ||
9598 | } | ||
9599 | atomic_inc(&synchronize_sched_expedited_count); | ||
9600 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9601 | put_online_cpus(); | ||
9602 | } | ||
9603 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9604 | |||
9605 | #endif /* #else #ifndef CONFIG_SMP */ | ||
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 000000000000..9fb656283157 --- /dev/null +++ b/kernel/sched_autogroup.c | |||
@@ -0,0 +1,270 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | #include <linux/proc_fs.h> | ||
4 | #include <linux/seq_file.h> | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/utsname.h> | ||
7 | |||
8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | ||
9 | static struct autogroup autogroup_default; | ||
10 | static atomic_t autogroup_seq_nr; | ||
11 | |||
12 | static void __init autogroup_init(struct task_struct *init_task) | ||
13 | { | ||
14 | autogroup_default.tg = &root_task_group; | ||
15 | root_task_group.autogroup = &autogroup_default; | ||
16 | kref_init(&autogroup_default.kref); | ||
17 | init_rwsem(&autogroup_default.lock); | ||
18 | init_task->signal->autogroup = &autogroup_default; | ||
19 | } | ||
20 | |||
21 | static inline void autogroup_free(struct task_group *tg) | ||
22 | { | ||
23 | kfree(tg->autogroup); | ||
24 | } | ||
25 | |||
26 | static inline void autogroup_destroy(struct kref *kref) | ||
27 | { | ||
28 | struct autogroup *ag = container_of(kref, struct autogroup, kref); | ||
29 | |||
30 | #ifdef CONFIG_RT_GROUP_SCHED | ||
31 | /* We've redirected RT tasks to the root task group... */ | ||
32 | ag->tg->rt_se = NULL; | ||
33 | ag->tg->rt_rq = NULL; | ||
34 | #endif | ||
35 | sched_destroy_group(ag->tg); | ||
36 | } | ||
37 | |||
38 | static inline void autogroup_kref_put(struct autogroup *ag) | ||
39 | { | ||
40 | kref_put(&ag->kref, autogroup_destroy); | ||
41 | } | ||
42 | |||
43 | static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) | ||
44 | { | ||
45 | kref_get(&ag->kref); | ||
46 | return ag; | ||
47 | } | ||
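The kref handling above is the standard kernel idiom: kref_init() when the object is created, kref_get() for each extra holder, and kref_put() with a release callback that frees the object on the final drop. A minimal self-contained sketch of that idiom (the widget struct and helpers are illustrative, not part of this patch):

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct widget {
		struct kref kref;
		/* payload ... */
	};

	static struct widget *widget_alloc(void)
	{
		struct widget *w = kzalloc(sizeof(*w), GFP_KERNEL);

		if (w)
			kref_init(&w->kref);	/* refcount starts at 1 */
		return w;
	}

	static void widget_release(struct kref *kref)
	{
		/* called exactly once, by the final kref_put() */
		kfree(container_of(kref, struct widget, kref));
	}

	static struct widget *widget_get(struct widget *w)
	{
		kref_get(&w->kref);	/* new holder: count++ */
		return w;
	}

	static void widget_put(struct widget *w)
	{
		kref_put(&w->kref, widget_release);
	}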
48 | |||
49 | static inline struct autogroup *autogroup_task_get(struct task_struct *p) | ||
50 | { | ||
51 | struct autogroup *ag; | ||
52 | unsigned long flags; | ||
53 | |||
54 | if (!lock_task_sighand(p, &flags)) | ||
55 | return autogroup_kref_get(&autogroup_default); | ||
56 | |||
57 | ag = autogroup_kref_get(p->signal->autogroup); | ||
58 | unlock_task_sighand(p, &flags); | ||
59 | |||
60 | return ag; | ||
61 | } | ||
62 | |||
63 | #ifdef CONFIG_RT_GROUP_SCHED | ||
64 | static void free_rt_sched_group(struct task_group *tg); | ||
65 | #endif | ||
66 | |||
67 | static inline struct autogroup *autogroup_create(void) | ||
68 | { | ||
69 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | ||
70 | struct task_group *tg; | ||
71 | |||
72 | if (!ag) | ||
73 | goto out_fail; | ||
74 | |||
75 | tg = sched_create_group(&root_task_group); | ||
76 | |||
77 | if (IS_ERR(tg)) | ||
78 | goto out_free; | ||
79 | |||
80 | kref_init(&ag->kref); | ||
81 | init_rwsem(&ag->lock); | ||
82 | ag->id = atomic_inc_return(&autogroup_seq_nr); | ||
83 | ag->tg = tg; | ||
84 | #ifdef CONFIG_RT_GROUP_SCHED | ||
85 | /* | ||
86 | * Autogroup RT tasks are redirected to the root task group | ||
87 | * so we don't have to move tasks around upon policy change, | ||
88 | * or flail around trying to allocate bandwidth on the fly. | ||
89 | * A bandwidth exception in __sched_setscheduler() allows | ||
90 | * the policy change to proceed. Thereafter, task_group() | ||
91 | * returns &root_task_group, so zero bandwidth is required. | ||
92 | */ | ||
93 | free_rt_sched_group(tg); | ||
94 | tg->rt_se = root_task_group.rt_se; | ||
95 | tg->rt_rq = root_task_group.rt_rq; | ||
96 | #endif | ||
97 | tg->autogroup = ag; | ||
98 | |||
99 | return ag; | ||
100 | |||
101 | out_free: | ||
102 | kfree(ag); | ||
103 | out_fail: | ||
104 | if (printk_ratelimit()) { | ||
105 | printk(KERN_WARNING "autogroup_create: %s failure.\n", | ||
106 | ag ? "sched_create_group()" : "kzalloc()"); | ||
107 | } | ||
108 | |||
109 | return autogroup_kref_get(&autogroup_default); | ||
110 | } | ||
111 | |||
112 | static inline bool | ||
113 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
114 | { | ||
115 | if (tg != &root_task_group) | ||
116 | return false; | ||
117 | |||
118 | if (p->sched_class != &fair_sched_class) | ||
119 | return false; | ||
120 | |||
121 | /* | ||
122 | * We can only assume the task group can't go away on us if | ||
123 | * autogroup_move_group() can see us on ->thread_group list. | ||
124 | */ | ||
125 | if (p->flags & PF_EXITING) | ||
126 | return false; | ||
127 | |||
128 | return true; | ||
129 | } | ||
130 | |||
131 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
132 | { | ||
133 | return tg != &root_task_group && tg->autogroup; | ||
134 | } | ||
135 | |||
136 | static inline struct task_group * | ||
137 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
138 | { | ||
139 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
140 | |||
141 | if (enabled && task_wants_autogroup(p, tg)) | ||
142 | return p->signal->autogroup->tg; | ||
143 | |||
144 | return tg; | ||
145 | } | ||
146 | |||
147 | static void | ||
148 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | ||
149 | { | ||
150 | struct autogroup *prev; | ||
151 | struct task_struct *t; | ||
152 | unsigned long flags; | ||
153 | |||
154 | BUG_ON(!lock_task_sighand(p, &flags)); | ||
155 | |||
156 | prev = p->signal->autogroup; | ||
157 | if (prev == ag) { | ||
158 | unlock_task_sighand(p, &flags); | ||
159 | return; | ||
160 | } | ||
161 | |||
162 | p->signal->autogroup = autogroup_kref_get(ag); | ||
163 | |||
164 | t = p; | ||
165 | do { | ||
166 | sched_move_task(t); | ||
167 | } while_each_thread(p, t); | ||
168 | |||
169 | unlock_task_sighand(p, &flags); | ||
170 | autogroup_kref_put(prev); | ||
171 | } | ||
172 | |||
173 | /* Allocates with GFP_KERNEL; cannot be called under any spinlock */ | ||
174 | void sched_autogroup_create_attach(struct task_struct *p) | ||
175 | { | ||
176 | struct autogroup *ag = autogroup_create(); | ||
177 | |||
178 | autogroup_move_group(p, ag); | ||
179 | /* drop extra reference added by autogroup_create() */ | ||
180 | autogroup_kref_put(ag); | ||
181 | } | ||
182 | EXPORT_SYMBOL(sched_autogroup_create_attach); | ||
183 | |||
184 | /* Cannot be called under siglock. Currently has no users */ | ||
185 | void sched_autogroup_detach(struct task_struct *p) | ||
186 | { | ||
187 | autogroup_move_group(p, &autogroup_default); | ||
188 | } | ||
189 | EXPORT_SYMBOL(sched_autogroup_detach); | ||
190 | |||
191 | void sched_autogroup_fork(struct signal_struct *sig) | ||
192 | { | ||
193 | sig->autogroup = autogroup_task_get(current); | ||
194 | } | ||
195 | |||
196 | void sched_autogroup_exit(struct signal_struct *sig) | ||
197 | { | ||
198 | autogroup_kref_put(sig->autogroup); | ||
199 | } | ||
200 | |||
201 | static int __init setup_autogroup(char *str) | ||
202 | { | ||
203 | sysctl_sched_autogroup_enabled = 0; | ||
204 | |||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | __setup("noautogroup", setup_autogroup); | ||
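So booting with noautogroup leaves the feature compiled in but disabled. Because sysctl_sched_autogroup_enabled is a plain integer, it can also be flipped at runtime once the sysctl entry (registered outside this hunk, presumably as kernel.sched_autogroup_enabled) is in place; expected usage looks roughly like this:

	# disable at boot: append "noautogroup" to the kernel command line
	# toggle at runtime:
	echo 0 > /proc/sys/kernel/sched_autogroup_enabled
	echo 1 > /proc/sys/kernel/sched_autogroup_enabled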
209 | |||
210 | #ifdef CONFIG_PROC_FS | ||
211 | |||
212 | int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | ||
213 | { | ||
214 | static unsigned long next = INITIAL_JIFFIES; | ||
215 | struct autogroup *ag; | ||
216 | int err; | ||
217 | |||
218 | if (*nice < -20 || *nice > 19) | ||
219 | return -EINVAL; | ||
220 | |||
221 | err = security_task_setnice(current, *nice); | ||
222 | if (err) | ||
223 | return err; | ||
224 | |||
225 | if (*nice < 0 && !can_nice(current, *nice)) | ||
226 | return -EPERM; | ||
227 | |||
228 | /* this is a heavy operation taking global locks.. */ | ||
229 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | ||
230 | return -EAGAIN; | ||
231 | |||
232 | next = HZ / 10 + jiffies; | ||
233 | ag = autogroup_task_get(p); | ||
234 | |||
235 | down_write(&ag->lock); | ||
236 | err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); | ||
237 | if (!err) | ||
238 | ag->nice = *nice; | ||
239 | up_write(&ag->lock); | ||
240 | |||
241 | autogroup_kref_put(ag); | ||
242 | |||
243 | return err; | ||
244 | } | ||
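The prio_to_weight[*nice + 20] lookup above converts the requested nice level to the scheduler's task-weight scale, where nice 0 maps to 1024 and each nice step changes the weight by roughly 1.25x. A small userspace sketch of the index arithmetic (the excerpted weights match the scheduler's well-known table; the snippet itself is illustrative):

	#include <stdio.h>

	/* excerpt of the scheduler's nice-to-weight table around nice 0 */
	static const int prio_to_weight_excerpt[] = {
		/* nice -2 .. +2 */ 1586, 1277, 1024, 820, 655,
	};

	int main(void)
	{
		int nice = 0;
		/* sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]) */
		printf("nice %d -> weight %d\n", nice,
		       prio_to_weight_excerpt[nice + 2]);
		return 0;
	}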
245 | |||
246 | void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | ||
247 | { | ||
248 | struct autogroup *ag = autogroup_task_get(p); | ||
249 | |||
250 | down_read(&ag->lock); | ||
251 | seq_printf(m, "/autogroup-%lu nice %d\n", ag->id, ag->nice); | ||
252 | up_read(&ag->lock); | ||
253 | |||
254 | autogroup_kref_put(ag); | ||
255 | } | ||
256 | #endif /* CONFIG_PROC_FS */ | ||
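Together with proc plumbing added elsewhere in this series, the two helpers above back /proc/<pid>/autogroup: a read prints the group in the seq_printf() format shown, and a write feeds proc_sched_autogroup_set_nice(). Usage looks roughly like this (the group id shown is illustrative):

	$ cat /proc/self/autogroup
	/autogroup-15 nice 0
	$ echo 10 > /proc/self/autogroup    # renice every task in the group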
257 | |||
258 | #ifdef CONFIG_SCHED_DEBUG | ||
259 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
260 | { | ||
261 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
262 | |||
263 | if (!enabled || !tg->autogroup) | ||
264 | return 0; | ||
265 | |||
266 | return snprintf(buf, buflen, "%s-%lu", "/autogroup", tg->autogroup->id); | ||
267 | } | ||
268 | #endif /* CONFIG_SCHED_DEBUG */ | ||
269 | |||
270 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 000000000000..7b859ffe5dad --- /dev/null +++ b/kernel/sched_autogroup.h | |||
@@ -0,0 +1,36 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | struct autogroup { | ||
4 | struct kref kref; | ||
5 | struct task_group *tg; | ||
6 | struct rw_semaphore lock; | ||
7 | unsigned long id; | ||
8 | int nice; | ||
9 | }; | ||
10 | |||
11 | static inline struct task_group * | ||
12 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | ||
13 | |||
14 | #else /* !CONFIG_SCHED_AUTOGROUP */ | ||
15 | |||
16 | static inline void autogroup_init(struct task_struct *init_task) { } | ||
17 | static inline void autogroup_free(struct task_group *tg) { } | ||
18 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
19 | { | ||
20 | return 0; | ||
21 | } | ||
22 | |||
23 | static inline struct task_group * | ||
24 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
25 | { | ||
26 | return tg; | ||
27 | } | ||
28 | |||
29 | #ifdef CONFIG_SCHED_DEBUG | ||
30 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
31 | { | ||
32 | return 0; | ||
33 | } | ||
34 | #endif | ||
35 | |||
36 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb1..9d8af0b3fb64 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
79 | } | 79 | } |
80 | EXPORT_SYMBOL_GPL(sched_clock); | 80 | EXPORT_SYMBOL_GPL(sched_clock); |
81 | 81 | ||
82 | static __read_mostly int sched_clock_running; | 82 | __read_mostly int sched_clock_running; |
83 | 83 | ||
84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
85 | __read_mostly int sched_clock_stable; | 85 | __read_mostly int sched_clock_stable; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9b..eb6cb8edd075 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | 18 | ||
19 | static DEFINE_SPINLOCK(sched_debug_lock); | ||
20 | |||
19 | /* | 21 | /* |
20 | * This allows printing both to /proc/sched_debug and | 22 | * This allows printing both to /proc/sched_debug and |
21 | * to the console | 23 | * to the console |
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | 56 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) |
55 | 57 | ||
56 | #ifdef CONFIG_FAIR_GROUP_SCHED | 58 | #ifdef CONFIG_FAIR_GROUP_SCHED |
57 | static void print_cfs_group_stats(struct seq_file *m, int cpu, | 59 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
58 | struct task_group *tg) | ||
59 | { | 60 | { |
60 | struct sched_entity *se = tg->se[cpu]; | 61 | struct sched_entity *se = tg->se[cpu]; |
61 | if (!se) | 62 | if (!se) |
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, | |||
87 | } | 88 | } |
88 | #endif | 89 | #endif |
89 | 90 | ||
91 | #ifdef CONFIG_CGROUP_SCHED | ||
92 | static char group_path[PATH_MAX]; | ||
93 | |||
94 | static char *task_group_path(struct task_group *tg) | ||
95 | { | ||
96 | if (autogroup_path(tg, group_path, PATH_MAX)) | ||
97 | return group_path; | ||
98 | |||
99 | /* | ||
100 | * May be NULL if the underlying cgroup isn't fully-created yet | ||
101 | */ | ||
102 | if (!tg->css.cgroup) { | ||
103 | group_path[0] = '\0'; | ||
104 | return group_path; | ||
105 | } | ||
106 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | ||
107 | return group_path; | ||
108 | } | ||
109 | #endif | ||
110 | |||
90 | static void | 111 | static void |
91 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 112 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
92 | { | 113 | { |
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
109 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 130 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
110 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 131 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
111 | #endif | 132 | #endif |
112 | |||
113 | #ifdef CONFIG_CGROUP_SCHED | 133 | #ifdef CONFIG_CGROUP_SCHED |
114 | { | 134 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
115 | char path[64]; | ||
116 | |||
117 | rcu_read_lock(); | ||
118 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
119 | rcu_read_unlock(); | ||
120 | SEQ_printf(m, " %s", path); | ||
121 | } | ||
122 | #endif | 135 | #endif |
136 | |||
123 | SEQ_printf(m, "\n"); | 137 | SEQ_printf(m, "\n"); |
124 | } | 138 | } |
125 | 139 | ||
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
147 | read_unlock_irqrestore(&tasklist_lock, flags); | 161 | read_unlock_irqrestore(&tasklist_lock, flags); |
148 | } | 162 | } |
149 | 163 | ||
150 | #if defined(CONFIG_CGROUP_SCHED) && \ | ||
151 | (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) | ||
152 | static void task_group_path(struct task_group *tg, char *buf, int buflen) | ||
153 | { | ||
154 | /* may be NULL if the underlying cgroup isn't fully-created yet */ | ||
155 | if (!tg->css.cgroup) { | ||
156 | buf[0] = '\0'; | ||
157 | return; | ||
158 | } | ||
159 | cgroup_path(tg->css.cgroup, buf, buflen); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 164 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
164 | { | 165 | { |
165 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, | 166 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
168 | struct sched_entity *last; | 169 | struct sched_entity *last; |
169 | unsigned long flags; | 170 | unsigned long flags; |
170 | 171 | ||
171 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) | 172 | #ifdef CONFIG_FAIR_GROUP_SCHED |
172 | char path[128]; | 173 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); |
173 | struct task_group *tg = cfs_rq->tg; | ||
174 | |||
175 | task_group_path(tg, path, sizeof(path)); | ||
176 | |||
177 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
178 | #else | 174 | #else |
179 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | 175 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); |
180 | #endif | 176 | #endif |
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | spread0 = min_vruntime - rq0_min_vruntime; | 198 | spread0 = min_vruntime - rq0_min_vruntime; |
203 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | 199 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", |
204 | SPLIT_NS(spread0)); | 200 | SPLIT_NS(spread0)); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
207 | |||
208 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 201 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
209 | cfs_rq->nr_spread_over); | 202 | cfs_rq->nr_spread_over); |
203 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
204 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
210 | #ifdef CONFIG_FAIR_GROUP_SCHED | 205 | #ifdef CONFIG_FAIR_GROUP_SCHED |
211 | #ifdef CONFIG_SMP | 206 | #ifdef CONFIG_SMP |
212 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | 207 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", |
208 | SPLIT_NS(cfs_rq->load_avg)); | ||
209 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | ||
210 | SPLIT_NS(cfs_rq->load_period)); | ||
211 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | ||
212 | cfs_rq->load_contribution); | ||
213 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | ||
214 | atomic_read(&cfs_rq->tg->load_weight)); | ||
213 | #endif | 215 | #endif |
216 | |||
214 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 217 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
215 | #endif | 218 | #endif |
216 | } | 219 | } |
217 | 220 | ||
218 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | 221 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) |
219 | { | 222 | { |
220 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) | 223 | #ifdef CONFIG_RT_GROUP_SCHED |
221 | char path[128]; | 224 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); |
222 | struct task_group *tg = rt_rq->tg; | ||
223 | |||
224 | task_group_path(tg, path, sizeof(path)); | ||
225 | |||
226 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); | ||
227 | #else | 225 | #else |
228 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | 226 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); |
229 | #endif | 227 | #endif |
230 | 228 | ||
231 | |||
232 | #define P(x) \ | 229 | #define P(x) \ |
233 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 230 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
234 | #define PN(x) \ | 231 | #define PN(x) \ |
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
243 | #undef P | 240 | #undef P |
244 | } | 241 | } |
245 | 242 | ||
243 | extern __read_mostly int sched_clock_running; | ||
244 | |||
246 | static void print_cpu(struct seq_file *m, int cpu) | 245 | static void print_cpu(struct seq_file *m, int cpu) |
247 | { | 246 | { |
248 | struct rq *rq = cpu_rq(cpu); | 247 | struct rq *rq = cpu_rq(cpu); |
248 | unsigned long flags; | ||
249 | 249 | ||
250 | #ifdef CONFIG_X86 | 250 | #ifdef CONFIG_X86 |
251 | { | 251 | { |
@@ -296,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
296 | P(ttwu_count); | 296 | P(ttwu_count); |
297 | P(ttwu_local); | 297 | P(ttwu_local); |
298 | 298 | ||
299 | P(bkl_count); | 299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", |
300 | rq->rq_sched_info.bkl_count); | ||
300 | 301 | ||
301 | #undef P | 302 | #undef P |
303 | #undef P64 | ||
302 | #endif | 304 | #endif |
305 | spin_lock_irqsave(&sched_debug_lock, flags); | ||
303 | print_cfs_stats(m, cpu); | 306 | print_cfs_stats(m, cpu); |
304 | print_rt_stats(m, cpu); | 307 | print_rt_stats(m, cpu); |
305 | 308 | ||
309 | rcu_read_lock(); | ||
306 | print_rq(m, rq, cpu); | 310 | print_rq(m, rq, cpu); |
311 | rcu_read_unlock(); | ||
312 | spin_unlock_irqrestore(&sched_debug_lock, flags); | ||
307 | } | 313 | } |
308 | 314 | ||
309 | static const char *sched_tunable_scaling_names[] = { | 315 | static const char *sched_tunable_scaling_names[] = { |
@@ -314,21 +320,42 @@ static const char *sched_tunable_scaling_names[] = { | |||
314 | 320 | ||
315 | static int sched_debug_show(struct seq_file *m, void *v) | 321 | static int sched_debug_show(struct seq_file *m, void *v) |
316 | { | 322 | { |
317 | u64 now = ktime_to_ns(ktime_get()); | 323 | u64 ktime, sched_clk, cpu_clk; |
324 | unsigned long flags; | ||
318 | int cpu; | 325 | int cpu; |
319 | 326 | ||
320 | SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", | 327 | local_irq_save(flags); |
328 | ktime = ktime_to_ns(ktime_get()); | ||
329 | sched_clk = sched_clock(); | ||
330 | cpu_clk = local_clock(); | ||
331 | local_irq_restore(flags); | ||
332 | |||
333 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | ||
321 | init_utsname()->release, | 334 | init_utsname()->release, |
322 | (int)strcspn(init_utsname()->version, " "), | 335 | (int)strcspn(init_utsname()->version, " "), |
323 | init_utsname()->version); | 336 | init_utsname()->version); |
324 | 337 | ||
325 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); | 338 | #define P(x) \ |
339 | SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) | ||
340 | #define PN(x) \ | ||
341 | SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
342 | PN(ktime); | ||
343 | PN(sched_clk); | ||
344 | PN(cpu_clk); | ||
345 | P(jiffies); | ||
346 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
347 | P(sched_clock_stable); | ||
348 | #endif | ||
349 | #undef PN | ||
350 | #undef P | ||
351 | |||
352 | SEQ_printf(m, "\n"); | ||
353 | SEQ_printf(m, "sysctl_sched\n"); | ||
326 | 354 | ||
327 | #define P(x) \ | 355 | #define P(x) \ |
328 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | 356 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) |
329 | #define PN(x) \ | 357 | #define PN(x) \ |
330 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | 358 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) |
331 | P(jiffies); | ||
332 | PN(sysctl_sched_latency); | 359 | PN(sysctl_sched_latency); |
333 | PN(sysctl_sched_min_granularity); | 360 | PN(sysctl_sched_min_granularity); |
334 | PN(sysctl_sched_wakeup_granularity); | 361 | PN(sysctl_sched_wakeup_granularity); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 00ebd7686676..0c26e2df450e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | |||
89 | 89 | ||
90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
91 | 91 | ||
92 | /* | ||
93 | * The exponential sliding window over which load is averaged for shares | ||
94 | * distribution. | ||
95 | * (default: 10msec) | ||
96 | */ | ||
97 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | ||
98 | |||
92 | static const struct sched_class fair_sched_class; | 99 | static const struct sched_class fair_sched_class; |
93 | 100 | ||
94 | /************************************************************** | 101 | /************************************************************** |
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
143 | return cfs_rq->tg->cfs_rq[this_cpu]; | 150 | return cfs_rq->tg->cfs_rq[this_cpu]; |
144 | } | 151 | } |
145 | 152 | ||
153 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
154 | { | ||
155 | if (!cfs_rq->on_list) { | ||
156 | /* | ||
157 | * Ensure we either appear before our parent (if already | ||
158 | * enqueued) or force our parent to appear after us when it is | ||
159 | * enqueued. The fact that we always enqueue bottom-up | ||
160 | * reduces this to two cases. | ||
161 | */ | ||
162 | if (cfs_rq->tg->parent && | ||
163 | cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { | ||
164 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
165 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
166 | } else { | ||
167 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
168 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
169 | } | ||
170 | |||
171 | cfs_rq->on_list = 1; | ||
172 | } | ||
173 | } | ||
174 | |||
175 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
176 | { | ||
177 | if (cfs_rq->on_list) { | ||
178 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
179 | cfs_rq->on_list = 0; | ||
180 | } | ||
181 | } | ||
182 | |||
146 | /* Iterate through all leaf cfs_rq's on a runqueue */ | 183 | /* Iterate through all leaf cfs_rq's on a runqueue */ |
147 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 184 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
148 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 185 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
246 | return &cpu_rq(this_cpu)->cfs; | 283 | return &cpu_rq(this_cpu)->cfs; |
247 | } | 284 | } |
248 | 285 | ||
286 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
287 | { | ||
288 | } | ||
289 | |||
290 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
291 | { | ||
292 | } | ||
293 | |||
249 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 294 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
250 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 295 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
251 | 296 | ||
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
417 | WRT_SYSCTL(sched_min_granularity); | 462 | WRT_SYSCTL(sched_min_granularity); |
418 | WRT_SYSCTL(sched_latency); | 463 | WRT_SYSCTL(sched_latency); |
419 | WRT_SYSCTL(sched_wakeup_granularity); | 464 | WRT_SYSCTL(sched_wakeup_granularity); |
420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
421 | #undef WRT_SYSCTL | 465 | #undef WRT_SYSCTL |
422 | 466 | ||
423 | return 0; | 467 | return 0; |
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
495 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 539 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
496 | } | 540 | } |
497 | 541 | ||
542 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
543 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); | ||
544 | |||
498 | /* | 545 | /* |
499 | * Update the current task's runtime statistics. Skip current tasks that | 546 | * Update the current task's runtime statistics. Skip current tasks that |
500 | * are not in our scheduling class. | 547 | * are not in our scheduling class. |
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
514 | 561 | ||
515 | curr->vruntime += delta_exec_weighted; | 562 | curr->vruntime += delta_exec_weighted; |
516 | update_min_vruntime(cfs_rq); | 563 | update_min_vruntime(cfs_rq); |
564 | |||
565 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
566 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
567 | #endif | ||
517 | } | 568 | } |
518 | 569 | ||
519 | static void update_curr(struct cfs_rq *cfs_rq) | 570 | static void update_curr(struct cfs_rq *cfs_rq) |
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
633 | list_add(&se->group_node, &cfs_rq->tasks); | 684 | list_add(&se->group_node, &cfs_rq->tasks); |
634 | } | 685 | } |
635 | cfs_rq->nr_running++; | 686 | cfs_rq->nr_running++; |
636 | se->on_rq = 1; | ||
637 | } | 687 | } |
638 | 688 | ||
639 | static void | 689 | static void |
@@ -647,9 +697,165 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
647 | list_del_init(&se->group_node); | 697 | list_del_init(&se->group_node); |
648 | } | 698 | } |
649 | cfs_rq->nr_running--; | 699 | cfs_rq->nr_running--; |
650 | se->on_rq = 0; | ||
651 | } | 700 | } |
652 | 701 | ||
702 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
703 | # ifdef CONFIG_SMP | ||
704 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
705 | int global_update) | ||
706 | { | ||
707 | struct task_group *tg = cfs_rq->tg; | ||
708 | long load_avg; | ||
709 | |||
710 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
711 | load_avg -= cfs_rq->load_contribution; | ||
712 | |||
713 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
714 | atomic_add(load_avg, &tg->load_weight); | ||
715 | cfs_rq->load_contribution += load_avg; | ||
716 | } | ||
717 | } | ||
718 | |||
719 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
720 | { | ||
721 | u64 period = sysctl_sched_shares_window; | ||
722 | u64 now, delta; | ||
723 | unsigned long load = cfs_rq->load.weight; | ||
724 | |||
725 | if (cfs_rq->tg == &root_task_group) | ||
726 | return; | ||
727 | |||
728 | now = rq_of(cfs_rq)->clock_task; | ||
729 | delta = now - cfs_rq->load_stamp; | ||
730 | |||
731 | /* truncate load history at 4 idle periods */ | ||
732 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
733 | now - cfs_rq->load_last > 4 * period) { | ||
734 | cfs_rq->load_period = 0; | ||
735 | cfs_rq->load_avg = 0; | ||
736 | } | ||
737 | |||
738 | cfs_rq->load_stamp = now; | ||
739 | cfs_rq->load_unacc_exec_time = 0; | ||
740 | cfs_rq->load_period += delta; | ||
741 | if (load) { | ||
742 | cfs_rq->load_last = now; | ||
743 | cfs_rq->load_avg += delta * load; | ||
744 | } | ||
745 | |||
746 | /* consider updating load contribution on each fold or truncate */ | ||
747 | if (global_update || cfs_rq->load_period > period | ||
748 | || !cfs_rq->load_period) | ||
749 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
750 | |||
751 | while (cfs_rq->load_period > period) { | ||
752 | /* | ||
753 | * Inline assembly required to prevent the compiler | ||
754 | * optimising this loop into a divmod call. | ||
755 | * See __iter_div_u64_rem() for another example of this. | ||
756 | */ | ||
757 | asm("" : "+rm" (cfs_rq->load_period)); | ||
758 | cfs_rq->load_period /= 2; | ||
759 | cfs_rq->load_avg /= 2; | ||
760 | } | ||
761 | |||
762 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
763 | list_del_leaf_cfs_rq(cfs_rq); | ||
764 | } | ||
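The folding loop above implements a cheap exponential average: the accumulated (time x weight) area and the covered period are halved together each time a full window elapses, so each older window contributes geometrically less. A standalone userspace sketch of the same decay (plain C; the window constant mirrors sysctl_sched_shares_window, everything else is illustrative):

	#include <stdio.h>
	#include <stdint.h>

	#define WINDOW 10000000ULL	/* 10 ms in ns, as sysctl_sched_shares_window */

	struct win_avg {
		uint64_t period;	/* ns of history currently folded in */
		uint64_t area;		/* sum of delta * load over that history */
	};

	static uint64_t win_avg_update(struct win_avg *w, uint64_t delta,
				       unsigned long load)
	{
		w->period += delta;
		w->area += delta * load;

		/* halve period and area together: old windows decay by 1/2 each */
		while (w->period > WINDOW) {
			w->period /= 2;
			w->area /= 2;
		}
		/* the +1 guards against dividing by a zero-length period */
		return w->area / (w->period + 1);
	}

	int main(void)
	{
		struct win_avg w = { 0, 0 };

		/* 3 ms at load 2048, then 12 ms idle: the average decays */
		printf("avg = %llu\n",
		       (unsigned long long)win_avg_update(&w, 3000000, 2048));
		printf("avg = %llu\n",
		       (unsigned long long)win_avg_update(&w, 12000000, 0));
		return 0;
	}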
765 | |||
766 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | ||
767 | long weight_delta) | ||
768 | { | ||
769 | long load_weight, load, shares; | ||
770 | |||
771 | load = cfs_rq->load.weight + weight_delta; | ||
772 | |||
773 | load_weight = atomic_read(&tg->load_weight); | ||
774 | load_weight -= cfs_rq->load_contribution; | ||
775 | load_weight += load; | ||
776 | |||
777 | shares = (tg->shares * load); | ||
778 | if (load_weight) | ||
779 | shares /= load_weight; | ||
780 | |||
781 | if (shares < MIN_SHARES) | ||
782 | shares = MIN_SHARES; | ||
783 | if (shares > tg->shares) | ||
784 | shares = tg->shares; | ||
785 | |||
786 | return shares; | ||
787 | } | ||
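A quick numeric check of the shares formula above: with tg->shares = 1024, a local queue weight of 2048, and a group-wide load_weight of 4096 (2048 of it already contributed by this cpu), the cpu ends up with half of the group's shares. A userspace sketch under those illustrative numbers (MIN_SHARES is assumed to be 2, its usual value):

	#include <stdio.h>

	#define MIN_SHARES 2	/* assumed; matches the usual kernel value */

	static long calc_shares(long tg_shares, long tg_load_weight,
				long local_contrib, long local_load)
	{
		/* replace our stale contribution with the current local load */
		long load_weight = tg_load_weight - local_contrib + local_load;
		long shares = tg_shares * local_load;

		if (load_weight)
			shares /= load_weight;

		if (shares < MIN_SHARES)
			shares = MIN_SHARES;
		if (shares > tg_shares)
			shares = tg_shares;
		return shares;
	}

	int main(void)
	{
		/* this cpu carries half the group's load -> half the shares */
		printf("shares = %ld\n", calc_shares(1024, 4096, 2048, 2048));
		return 0;
	}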
788 | |||
789 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
790 | { | ||
791 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
792 | update_cfs_load(cfs_rq, 0); | ||
793 | update_cfs_shares(cfs_rq, 0); | ||
794 | } | ||
795 | } | ||
796 | # else /* CONFIG_SMP */ | ||
797 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
798 | { | ||
799 | } | ||
800 | |||
801 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | ||
802 | long weight_delta) | ||
803 | { | ||
804 | return tg->shares; | ||
805 | } | ||
806 | |||
807 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
808 | { | ||
809 | } | ||
810 | # endif /* CONFIG_SMP */ | ||
811 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
812 | unsigned long weight) | ||
813 | { | ||
814 | if (se->on_rq) { | ||
815 | /* commit outstanding execution time */ | ||
816 | if (cfs_rq->curr == se) | ||
817 | update_curr(cfs_rq); | ||
818 | account_entity_dequeue(cfs_rq, se); | ||
819 | } | ||
820 | |||
821 | update_load_set(&se->load, weight); | ||
822 | |||
823 | if (se->on_rq) | ||
824 | account_entity_enqueue(cfs_rq, se); | ||
825 | } | ||
826 | |||
827 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | ||
828 | { | ||
829 | struct task_group *tg; | ||
830 | struct sched_entity *se; | ||
831 | long shares; | ||
832 | |||
833 | tg = cfs_rq->tg; | ||
834 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | ||
835 | if (!se) | ||
836 | return; | ||
837 | #ifndef CONFIG_SMP | ||
838 | if (likely(se->load.weight == tg->shares)) | ||
839 | return; | ||
840 | #endif | ||
841 | shares = calc_cfs_shares(cfs_rq, tg, weight_delta); | ||
842 | |||
843 | reweight_entity(cfs_rq_of(se), se, shares); | ||
844 | } | ||
845 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
846 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
847 | { | ||
848 | } | ||
849 | |||
850 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | ||
851 | { | ||
852 | } | ||
853 | |||
854 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
855 | { | ||
856 | } | ||
857 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
858 | |||
653 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 859 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
654 | { | 860 | { |
655 | #ifdef CONFIG_SCHEDSTATS | 861 | #ifdef CONFIG_SCHEDSTATS |
@@ -771,6 +977,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
771 | * Update run-time statistics of the 'current'. | 977 | * Update run-time statistics of the 'current'. |
772 | */ | 978 | */ |
773 | update_curr(cfs_rq); | 979 | update_curr(cfs_rq); |
980 | update_cfs_load(cfs_rq, 0); | ||
981 | update_cfs_shares(cfs_rq, se->load.weight); | ||
774 | account_entity_enqueue(cfs_rq, se); | 982 | account_entity_enqueue(cfs_rq, se); |
775 | 983 | ||
776 | if (flags & ENQUEUE_WAKEUP) { | 984 | if (flags & ENQUEUE_WAKEUP) { |
@@ -782,6 +990,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
782 | check_spread(cfs_rq, se); | 990 | check_spread(cfs_rq, se); |
783 | if (se != cfs_rq->curr) | 991 | if (se != cfs_rq->curr) |
784 | __enqueue_entity(cfs_rq, se); | 992 | __enqueue_entity(cfs_rq, se); |
993 | se->on_rq = 1; | ||
994 | |||
995 | if (cfs_rq->nr_running == 1) | ||
996 | list_add_leaf_cfs_rq(cfs_rq); | ||
785 | } | 997 | } |
786 | 998 | ||
787 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 999 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -825,8 +1037,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
825 | 1037 | ||
826 | if (se != cfs_rq->curr) | 1038 | if (se != cfs_rq->curr) |
827 | __dequeue_entity(cfs_rq, se); | 1039 | __dequeue_entity(cfs_rq, se); |
1040 | se->on_rq = 0; | ||
1041 | update_cfs_load(cfs_rq, 0); | ||
828 | account_entity_dequeue(cfs_rq, se); | 1042 | account_entity_dequeue(cfs_rq, se); |
829 | update_min_vruntime(cfs_rq); | 1043 | update_min_vruntime(cfs_rq); |
1044 | update_cfs_shares(cfs_rq, 0); | ||
830 | 1045 | ||
831 | /* | 1046 | /* |
832 | * Normalize the entity after updating the min_vruntime because the | 1047 | * Normalize the entity after updating the min_vruntime because the |
@@ -872,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
872 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1087 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
873 | s64 delta = curr->vruntime - se->vruntime; | 1088 | s64 delta = curr->vruntime - se->vruntime; |
874 | 1089 | ||
1090 | if (delta < 0) | ||
1091 | return; | ||
1092 | |||
875 | if (delta > ideal_runtime) | 1093 | if (delta > ideal_runtime) |
876 | resched_task(rq_of(cfs_rq)->curr); | 1094 | resched_task(rq_of(cfs_rq)->curr); |
877 | } | 1095 | } |
@@ -955,6 +1173,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
955 | */ | 1173 | */ |
956 | update_curr(cfs_rq); | 1174 | update_curr(cfs_rq); |
957 | 1175 | ||
1176 | /* | ||
1177 | * Update share accounting for long-running entities. | ||
1178 | */ | ||
1179 | update_entity_shares_tick(cfs_rq); | ||
1180 | |||
958 | #ifdef CONFIG_SCHED_HRTICK | 1181 | #ifdef CONFIG_SCHED_HRTICK |
959 | /* | 1182 | /* |
960 | * queued ticks are scheduled to match the slice, so don't bother | 1183 | * queued ticks are scheduled to match the slice, so don't bother |
@@ -1055,6 +1278,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1055 | flags = ENQUEUE_WAKEUP; | 1278 | flags = ENQUEUE_WAKEUP; |
1056 | } | 1279 | } |
1057 | 1280 | ||
1281 | for_each_sched_entity(se) { | ||
1282 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1283 | |||
1284 | update_cfs_load(cfs_rq, 0); | ||
1285 | update_cfs_shares(cfs_rq, 0); | ||
1286 | } | ||
1287 | |||
1058 | hrtick_update(rq); | 1288 | hrtick_update(rq); |
1059 | } | 1289 | } |
1060 | 1290 | ||
@@ -1071,12 +1301,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1071 | for_each_sched_entity(se) { | 1301 | for_each_sched_entity(se) { |
1072 | cfs_rq = cfs_rq_of(se); | 1302 | cfs_rq = cfs_rq_of(se); |
1073 | dequeue_entity(cfs_rq, se, flags); | 1303 | dequeue_entity(cfs_rq, se, flags); |
1304 | |||
1074 | /* Don't dequeue parent if it has other entities besides us */ | 1305 | /* Don't dequeue parent if it has other entities besides us */ |
1075 | if (cfs_rq->load.weight) | 1306 | if (cfs_rq->load.weight) |
1076 | break; | 1307 | break; |
1077 | flags |= DEQUEUE_SLEEP; | 1308 | flags |= DEQUEUE_SLEEP; |
1078 | } | 1309 | } |
1079 | 1310 | ||
1311 | for_each_sched_entity(se) { | ||
1312 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1313 | |||
1314 | update_cfs_load(cfs_rq, 0); | ||
1315 | update_cfs_shares(cfs_rq, 0); | ||
1316 | } | ||
1317 | |||
1080 | hrtick_update(rq); | 1318 | hrtick_update(rq); |
1081 | } | 1319 | } |
1082 | 1320 | ||
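Both fast paths now finish by walking the remaining ancestors so that group weights track the new per-cpu load immediately, instead of waiting for a ratelimited global update. What update_cfs_shares() ultimately computes is, roughly, this cpu's proportional slice of the group's shares. A simplified sketch under assumed names (calc_group_weight is illustrative; 2 stands in for MIN_SHARES):

/*
 * Illustrative only: a group se's weight on one cpu is its share of
 * tg->shares, proportional to this cpu's part of the group load.
 */
static unsigned long calc_group_weight(unsigned long tg_shares,
				       unsigned long cpu_load,
				       unsigned long total_load)
{
	unsigned long w;

	if (!total_load)
		return tg_shares;	/* no load anywhere: full shares */

	w = tg_shares * cpu_load / total_load;
	if (w < 2)			/* MIN_SHARES, assumed */
		w = 2;
	return w;
}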
@@ -1143,67 +1381,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) | |||
1143 | * Adding load to a group doesn't make a group heavier, but can cause movement | 1381 | * Adding load to a group doesn't make a group heavier, but can cause movement |
1144 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 1382 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
1145 | * can calculate the shift in shares. | 1383 | * can calculate the shift in shares. |
1146 | * | ||
1147 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
1148 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
1149 | * this change. | ||
1150 | * | ||
1151 | * We compensate this by not only taking the current delta into account, but | ||
1152 | * also considering the delta between when the shares were last adjusted and | ||
1153 | * now. | ||
1154 | * | ||
1155 | * We still saw a performance dip, some tracing learned us that between | ||
1156 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
1157 | * significantly. Therefore try to bias the error in direction of failing | ||
1158 | * the affine wakeup. | ||
1159 | * | ||
1160 | */ | 1384 | */ |
1161 | static long effective_load(struct task_group *tg, int cpu, | 1385 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
1162 | long wl, long wg) | ||
1163 | { | 1386 | { |
1164 | struct sched_entity *se = tg->se[cpu]; | 1387 | struct sched_entity *se = tg->se[cpu]; |
1165 | 1388 | ||
1166 | if (!tg->parent) | 1389 | if (!tg->parent) |
1167 | return wl; | 1390 | return wl; |
1168 | 1391 | ||
1169 | /* | ||
1170 | * By not taking the decrease of shares on the other cpu into | ||
1171 | * account our error leans towards reducing the affine wakeups. | ||
1172 | */ | ||
1173 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
1174 | return wl; | ||
1175 | |||
1176 | for_each_sched_entity(se) { | 1392 | for_each_sched_entity(se) { |
1177 | long S, rw, s, a, b; | 1393 | long lw, w; |
1178 | long more_w; | ||
1179 | |||
1180 | /* | ||
1181 | * Instead of using this increment, also add the difference | ||
1182 | * between when the shares were last updated and now. | ||
1183 | */ | ||
1184 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1185 | wl += more_w; | ||
1186 | wg += more_w; | ||
1187 | 1394 | ||
1188 | S = se->my_q->tg->shares; | 1395 | tg = se->my_q->tg; |
1189 | s = se->my_q->shares; | 1396 | w = se->my_q->load.weight; |
1190 | rw = se->my_q->rq_weight; | ||
1191 | 1397 | ||
1192 | a = S*(rw + wl); | 1398 | /* use this cpu's instantaneous contribution */ |
1193 | b = S*rw + s*wg; | 1399 | lw = atomic_read(&tg->load_weight); |
1400 | lw -= se->my_q->load_contribution; | ||
1401 | lw += w + wg; | ||
1194 | 1402 | ||
1195 | wl = s*(a-b); | 1403 | wl += w; |
1196 | 1404 | ||
1197 | if (likely(b)) | 1405 | if (lw > 0 && wl < lw) |
1198 | wl /= b; | 1406 | wl = (wl * tg->shares) / lw; |
1407 | else | ||
1408 | wl = tg->shares; | ||
1199 | 1409 | ||
1200 | /* | 1410 | /* zero point is MIN_SHARES */ |
1201 | * Assume the group is already running and will | 1411 | if (wl < MIN_SHARES) |
1202 | * thus already be accounted for in the weight. | 1412 | wl = MIN_SHARES; |
1203 | * | 1413 | wl -= se->load.weight; |
1204 | * That is, moving shares between CPUs, does not | ||
1205 | * alter the group weight. | ||
1206 | */ | ||
1207 | wg = 0; | 1414 | wg = 0; |
1208 | } | 1415 | } |
1209 | 1416 | ||
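The rewrite replaces the ratelimited share alignment with an instantaneous estimate built from tg->load_weight. Plugging in numbers: with tg->shares = 1024, a group load of lw = 2048 after adding the waking task, and wl = 1024 on this cpu, the scaling step gives wl = 1024 * 1024 / 2048 = 512; subtracting the entity's current weight then yields the net change seen one level up. The result can legitimately be negative, which is why the next hunk widens wake_affine()'s accumulators to s64. A rough standalone port of one level of the loop (names and the MIN_SHARES value of 2 are assumptions):

/* Illustrative port of one iteration of the new loop, not kernel code. */
static long effective_load_step(long tg_shares, long group_lw,
				long wl, long se_weight)
{
	if (group_lw > 0 && wl < group_lw)
		wl = (wl * tg_shares) / group_lw;
	else
		wl = tg_shares;

	if (wl < 2)		/* MIN_SHARES, assumed */
		wl = 2;

	return wl - se_weight;	/* net weight delta at the parent level */
}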
@@ -1222,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1222 | 1429 | ||
1223 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 1430 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
1224 | { | 1431 | { |
1225 | unsigned long this_load, load; | 1432 | s64 this_load, load; |
1226 | int idx, this_cpu, prev_cpu; | 1433 | int idx, this_cpu, prev_cpu; |
1227 | unsigned long tl_per_task; | 1434 | unsigned long tl_per_task; |
1228 | struct task_group *tg; | 1435 | struct task_group *tg; |
@@ -1261,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1261 | * Otherwise check if either cpus are near enough in load to allow this | 1468 | * Otherwise check if either cpus are near enough in load to allow this |
1262 | * task to be woken on this_cpu. | 1469 | * task to be woken on this_cpu. |
1263 | */ | 1470 | */ |
1264 | if (this_load) { | 1471 | if (this_load > 0) { |
1265 | unsigned long this_eff_load, prev_eff_load; | 1472 | s64 this_eff_load, prev_eff_load; |
1266 | 1473 | ||
1267 | this_eff_load = 100; | 1474 | this_eff_load = 100; |
1268 | this_eff_load *= power_of(prev_cpu); | 1475 | this_eff_load *= power_of(prev_cpu); |
@@ -1508,23 +1715,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1508 | sd = tmp; | 1715 | sd = tmp; |
1509 | } | 1716 | } |
1510 | 1717 | ||
1511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1512 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
1513 | /* | ||
1514 | * Pick the largest domain to update shares over | ||
1515 | */ | ||
1516 | tmp = sd; | ||
1517 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) | ||
1518 | tmp = affine_sd; | ||
1519 | |||
1520 | if (tmp) { | ||
1521 | raw_spin_unlock(&rq->lock); | ||
1522 | update_shares(tmp); | ||
1523 | raw_spin_lock(&rq->lock); | ||
1524 | } | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1528 | if (affine_sd) { | 1718 | if (affine_sd) { |
1529 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1719 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1530 | return select_idle_sibling(p, cpu); | 1720 | return select_idle_sibling(p, cpu); |
@@ -1909,6 +2099,48 @@ out: | |||
1909 | } | 2099 | } |
1910 | 2100 | ||
1911 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2101 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2102 | /* | ||
2103 | * update tg->load_weight by folding this cpu's load_avg | ||
2104 | */ | ||
2105 | static int update_shares_cpu(struct task_group *tg, int cpu) | ||
2106 | { | ||
2107 | struct cfs_rq *cfs_rq; | ||
2108 | unsigned long flags; | ||
2109 | struct rq *rq; | ||
2110 | |||
2111 | if (!tg->se[cpu]) | ||
2112 | return 0; | ||
2113 | |||
2114 | rq = cpu_rq(cpu); | ||
2115 | cfs_rq = tg->cfs_rq[cpu]; | ||
2116 | |||
2117 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
2118 | |||
2119 | update_rq_clock(rq); | ||
2120 | update_cfs_load(cfs_rq, 1); | ||
2121 | |||
2122 | /* | ||
2123 | * We need to update shares after updating tg->load_weight in | ||
2124 | * order to adjust the weight of groups with long running tasks. | ||
2125 | */ | ||
2126 | update_cfs_shares(cfs_rq, 0); | ||
2127 | |||
2128 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
2129 | |||
2130 | return 0; | ||
2131 | } | ||
2132 | |||
2133 | static void update_shares(int cpu) | ||
2134 | { | ||
2135 | struct cfs_rq *cfs_rq; | ||
2136 | struct rq *rq = cpu_rq(cpu); | ||
2137 | |||
2138 | rcu_read_lock(); | ||
2139 | for_each_leaf_cfs_rq(rq, cfs_rq) | ||
2140 | update_shares_cpu(cfs_rq->tg, cpu); | ||
2141 | rcu_read_unlock(); | ||
2142 | } | ||
2143 | |||
1912 | static unsigned long | 2144 | static unsigned long |
1913 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2145 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1914 | unsigned long max_load_move, | 2146 | unsigned long max_load_move, |
@@ -1956,6 +2188,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1956 | return max_load_move - rem_load_move; | 2188 | return max_load_move - rem_load_move; |
1957 | } | 2189 | } |
1958 | #else | 2190 | #else |
2191 | static inline void update_shares(int cpu) | ||
2192 | { | ||
2193 | } | ||
2194 | |||
1959 | static unsigned long | 2195 | static unsigned long |
1960 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2196 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1961 | unsigned long max_load_move, | 2197 | unsigned long max_load_move, |
@@ -3032,7 +3268,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3032 | schedstat_inc(sd, lb_count[idle]); | 3268 | schedstat_inc(sd, lb_count[idle]); |
3033 | 3269 | ||
3034 | redo: | 3270 | redo: |
3035 | update_shares(sd); | ||
3036 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3271 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3037 | cpus, balance); | 3272 | cpus, balance); |
3038 | 3273 | ||
@@ -3174,8 +3409,6 @@ out_one_pinned: | |||
3174 | else | 3409 | else |
3175 | ld_moved = 0; | 3410 | ld_moved = 0; |
3176 | out: | 3411 | out: |
3177 | if (ld_moved) | ||
3178 | update_shares(sd); | ||
3179 | return ld_moved; | 3412 | return ld_moved; |
3180 | } | 3413 | } |
3181 | 3414 | ||
@@ -3199,6 +3432,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3199 | */ | 3432 | */ |
3200 | raw_spin_unlock(&this_rq->lock); | 3433 | raw_spin_unlock(&this_rq->lock); |
3201 | 3434 | ||
3435 | update_shares(this_cpu); | ||
3202 | for_each_domain(this_cpu, sd) { | 3436 | for_each_domain(this_cpu, sd) { |
3203 | unsigned long interval; | 3437 | unsigned long interval; |
3204 | int balance = 1; | 3438 | int balance = 1; |
@@ -3569,6 +3803,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3569 | int update_next_balance = 0; | 3803 | int update_next_balance = 0; |
3570 | int need_serialize; | 3804 | int need_serialize; |
3571 | 3805 | ||
3806 | update_shares(cpu); | ||
3807 | |||
3572 | for_each_domain(cpu, sd) { | 3808 | for_each_domain(cpu, sd) { |
3573 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3809 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3574 | continue; | 3810 | continue; |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a2..68e69acc29b9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) | |||
52 | SCHED_FEAT(HRTICK, 0) | 52 | SCHED_FEAT(HRTICK, 0) |
53 | SCHED_FEAT(DOUBLE_TICK, 0) | 53 | SCHED_FEAT(DOUBLE_TICK, 0) |
54 | SCHED_FEAT(LB_BIAS, 1) | 54 | SCHED_FEAT(LB_BIAS, 1) |
55 | SCHED_FEAT(LB_SHARES_UPDATE, 1) | ||
56 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
57 | 55 | ||
58 | /* | 56 | /* |
59 | * Spin-wait on mutex acquisition when the mutex owner is running on | 57 | * Spin-wait on mutex acquisition when the mutex owner is running on |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9c..ad6267714c84 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
184 | } | 184 | } |
185 | 185 | ||
186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
187 | { | ||
188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | ||
189 | &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); | ||
190 | } | ||
191 | |||
192 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
193 | { | ||
194 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
195 | } | ||
196 | |||
186 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 197 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
187 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | 198 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) |
188 | 199 | ||
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
276 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 287 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
277 | } | 288 | } |
278 | 289 | ||
290 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
291 | { | ||
292 | } | ||
293 | |||
294 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
295 | { | ||
296 | } | ||
297 | |||
279 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 298 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
280 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | 299 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) |
281 | 300 | ||
@@ -606,7 +625,7 @@ static void update_curr_rt(struct rq *rq) | |||
606 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 625 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
607 | u64 delta_exec; | 626 | u64 delta_exec; |
608 | 627 | ||
609 | if (!task_has_rt_policy(curr)) | 628 | if (curr->sched_class != &rt_sched_class) |
610 | return; | 629 | return; |
611 | 630 | ||
612 | delta_exec = rq->clock_task - curr->se.exec_start; | 631 | delta_exec = rq->clock_task - curr->se.exec_start; |
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 844 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
826 | return; | 845 | return; |
827 | 846 | ||
847 | if (!rt_rq->rt_nr_running) | ||
848 | list_add_leaf_rt_rq(rt_rq); | ||
849 | |||
828 | if (head) | 850 | if (head) |
829 | list_add(&rt_se->run_list, queue); | 851 | list_add(&rt_se->run_list, queue); |
830 | else | 852 | else |
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
844 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 866 | __clear_bit(rt_se_prio(rt_se), array->bitmap); |
845 | 867 | ||
846 | dec_rt_tasks(rt_se, rt_rq); | 868 | dec_rt_tasks(rt_se, rt_rq); |
869 | if (!rt_rq->rt_nr_running) | ||
870 | list_del_leaf_rt_rq(rt_rq); | ||
847 | } | 871 | } |
848 | 872 | ||
849 | /* | 873 | /* |
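The pairing above is classic first-on/last-off list maintenance: publish the rt_rq on the per-rq leaf list when its task count goes 0 to 1, unpublish when it drops back to 0, so for_each_leaf_rt_rq() only walks populated queues. The same pattern in a generic sketch (illustrative types; the kernel uses an RCU-protected list_head instead of this singly linked list):

struct rq_like {
	struct queue	*leaves;	/* head of populated queues */
};

struct queue {
	int		nr_running;
	struct queue	*next_leaf;
};

static void enqueue(struct rq_like *rq, struct queue *q)
{
	if (!q->nr_running) {		/* 0 -> 1: publish as a leaf */
		q->next_leaf = rq->leaves;
		rq->leaves = q;
	}
	q->nr_running++;
}

static void dequeue(struct rq_like *rq, struct queue *q)
{
	struct queue **pp = &rq->leaves;

	if (--q->nr_running)
		return;
	while (*pp != q)		/* 1 -> 0: unpublish */
		pp = &(*pp)->next_leaf;
	*pp = q->next_leaf;
}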
diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b013e2d..9910744f0856 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | ||
16 | static struct { | 17 | static struct { |
17 | struct list_head queue; | 18 | struct list_head queue; |
18 | raw_spinlock_t lock; | 19 | raw_spinlock_t lock; |
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void) | |||
193 | */ | 194 | */ |
194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
195 | int refs; | 196 | int refs; |
197 | void (*func) (void *info); | ||
196 | 198 | ||
197 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) | 199 | /* |
200 | * Since we walk the list without any locks, we might | ||
201 | * see an entry that was completed, removed from the | ||
202 | * list and is in the process of being reused. | ||
203 | * | ||
204 | * We must check that the cpu is in the cpumask before | ||
205 | * checking the refs, and both must be set before | ||
206 | * executing the callback on this cpu. | ||
207 | */ | ||
208 | |||
209 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
210 | continue; | ||
211 | |||
212 | smp_rmb(); | ||
213 | |||
214 | if (atomic_read(&data->refs) == 0) | ||
198 | continue; | 215 | continue; |
199 | 216 | ||
217 | func = data->csd.func; /* for later warn */ | ||
200 | data->csd.func(data->csd.info); | 218 | data->csd.func(data->csd.info); |
201 | 219 | ||
220 | /* | ||
221 | * If the cpu mask is no longer set, this cpu enabled interrupts, | ||
222 | * took another smp interrupt, and executed the function | ||
223 | * twice. In theory that second run decremented refs. | ||
224 | */ | ||
225 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
226 | WARN(1, "%pS enabled interrupts and double executed\n", | ||
227 | func); | ||
228 | continue; | ||
229 | } | ||
230 | |||
202 | refs = atomic_dec_return(&data->refs); | 231 | refs = atomic_dec_return(&data->refs); |
203 | WARN_ON(refs < 0); | 232 | WARN_ON(refs < 0); |
204 | if (!refs) { | ||
205 | raw_spin_lock(&call_function.lock); | ||
206 | list_del_rcu(&data->csd.list); | ||
207 | raw_spin_unlock(&call_function.lock); | ||
208 | } | ||
209 | 233 | ||
210 | if (refs) | 234 | if (refs) |
211 | continue; | 235 | continue; |
212 | 236 | ||
237 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
238 | |||
239 | raw_spin_lock(&call_function.lock); | ||
240 | list_del_rcu(&data->csd.list); | ||
241 | raw_spin_unlock(&call_function.lock); | ||
242 | |||
213 | csd_unlock(&data->csd); | 243 | csd_unlock(&data->csd); |
214 | } | 244 | } |
215 | 245 | ||
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
429 | * can't happen. | 459 | * can't happen. |
430 | */ | 460 | */ |
431 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 461 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
432 | && !oops_in_progress); | 462 | && !oops_in_progress && !early_boot_irqs_disabled); |
433 | 463 | ||
434 | /* So, what's a CPU they want? Ignoring this one. */ | 464 | /* So, what's a CPU they want? Ignoring this one. */ |
435 | cpu = cpumask_first_and(mask, cpu_online_mask); | 465 | cpu = cpumask_first_and(mask, cpu_online_mask); |
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask, | |||
453 | 483 | ||
454 | data = &__get_cpu_var(cfd_data); | 484 | data = &__get_cpu_var(cfd_data); |
455 | csd_lock(&data->csd); | 485 | csd_lock(&data->csd); |
486 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
456 | 487 | ||
457 | data->csd.func = func; | 488 | data->csd.func = func; |
458 | data->csd.info = info; | 489 | data->csd.info = info; |
459 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 490 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
460 | cpumask_clear_cpu(this_cpu, data->cpumask); | 491 | cpumask_clear_cpu(this_cpu, data->cpumask); |
492 | |||
493 | /* | ||
494 | * To ensure the interrupt handler gets a complete view | ||
495 | * we order the cpumask and refs writes and order the read | ||
496 | * of them in the interrupt handler. In addition we may | ||
497 | * only clear our own cpu bit from the mask. | ||
498 | */ | ||
499 | smp_wmb(); | ||
500 | |||
461 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 501 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); |
462 | 502 | ||
463 | raw_spin_lock_irqsave(&call_function.lock, flags); | 503 | raw_spin_lock_irqsave(&call_function.lock, flags); |
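The new BUG_ON documents the csd reuse invariant (refs drained and cpumask empty before the slot is rearmed), and the relaxed WARN_ON_ONCE now tolerates early boot. For reference, a hypothetical caller; smp_call_function_many() must run with preemption disabled and interrupts enabled:

/* Hypothetical driver-side use: run do_flush on every cpu in mask,
 * except the local one, and wait for completion. */
static void do_flush(void *info)
{
	/* per-cpu work here */
}

static void flush_others(const struct cpumask *mask)
{
	preempt_disable();
	smp_call_function_many(mask, do_flush, NULL, true);
	preempt_enable();
}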
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void) | |||
529 | { | 569 | { |
530 | raw_spin_unlock_irq(&call_function.lock); | 570 | raw_spin_unlock_irq(&call_function.lock); |
531 | } | 571 | } |
572 | #endif /* USE_GENERIC_SMP_HELPERS */ | ||
573 | |||
574 | /* | ||
575 | * Call a function on all processors. May be used during early boot while | ||
576 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead | ||
577 | * of local_irq_disable/enable(). | ||
578 | */ | ||
579 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
580 | { | ||
581 | unsigned long flags; | ||
582 | int ret = 0; | ||
583 | |||
584 | preempt_disable(); | ||
585 | ret = smp_call_function(func, info, wait); | ||
586 | local_irq_save(flags); | ||
587 | func(info); | ||
588 | local_irq_restore(flags); | ||
589 | preempt_enable(); | ||
590 | return ret; | ||
591 | } | ||
592 | EXPORT_SYMBOL(on_each_cpu); | ||
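This replaces the copy removed from softirq.c further down; switching to local_irq_save/restore() is what makes the early-boot case safe. The contract for callers is unchanged, e.g. hypothetically:

/* Hypothetical: run a per-cpu invalidation everywhere and wait. */
static void drop_caches(void *unused)
{
	/* runs on every online cpu, the local one included */
}

static void drop_all(void)
{
	on_each_cpu(drop_caches, NULL, 1 /* wait */);
}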
diff --git a/kernel/softirq.c b/kernel/softirq.c index 18f4be0d5fe0..68eb5efec388 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { | |||
70 | static void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
71 | { | 71 | { |
72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | 73 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); |
74 | 74 | ||
75 | if (tsk && tsk->state != TASK_RUNNING) | 75 | if (tsk && tsk->state != TASK_RUNNING) |
76 | wake_up_process(tsk); | 76 | wake_up_process(tsk); |
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
388 | 388 | ||
389 | local_irq_save(flags); | 389 | local_irq_save(flags); |
390 | t->next = NULL; | 390 | t->next = NULL; |
391 | *__get_cpu_var(tasklet_vec).tail = t; | 391 | *__this_cpu_read(tasklet_vec.tail) = t; |
392 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 392 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
393 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 393 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
394 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
395 | } | 395 | } |
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
402 | 402 | ||
403 | local_irq_save(flags); | 403 | local_irq_save(flags); |
404 | t->next = NULL; | 404 | t->next = NULL; |
405 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 405 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
406 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 406 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
407 | raise_softirq_irqoff(HI_SOFTIRQ); | 407 | raise_softirq_irqoff(HI_SOFTIRQ); |
408 | local_irq_restore(flags); | 408 | local_irq_restore(flags); |
409 | } | 409 | } |
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
414 | { | 414 | { |
415 | BUG_ON(!irqs_disabled()); | 415 | BUG_ON(!irqs_disabled()); |
416 | 416 | ||
417 | t->next = __get_cpu_var(tasklet_hi_vec).head; | 417 | t->next = __this_cpu_read(tasklet_hi_vec.head); |
418 | __get_cpu_var(tasklet_hi_vec).head = t; | 418 | __this_cpu_write(tasklet_hi_vec.head, t); |
419 | __raise_softirq_irqoff(HI_SOFTIRQ); | 419 | __raise_softirq_irqoff(HI_SOFTIRQ); |
420 | } | 420 | } |
421 | 421 | ||
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a) | |||
426 | struct tasklet_struct *list; | 426 | struct tasklet_struct *list; |
427 | 427 | ||
428 | local_irq_disable(); | 428 | local_irq_disable(); |
429 | list = __get_cpu_var(tasklet_vec).head; | 429 | list = __this_cpu_read(tasklet_vec.head); |
430 | __get_cpu_var(tasklet_vec).head = NULL; | 430 | __this_cpu_write(tasklet_vec.head, NULL); |
431 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | 431 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); |
432 | local_irq_enable(); | 432 | local_irq_enable(); |
433 | 433 | ||
434 | while (list) { | 434 | while (list) { |
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a) | |||
449 | 449 | ||
450 | local_irq_disable(); | 450 | local_irq_disable(); |
451 | t->next = NULL; | 451 | t->next = NULL; |
452 | *__get_cpu_var(tasklet_vec).tail = t; | 452 | *__this_cpu_read(tasklet_vec.tail) = t; |
453 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 453 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
454 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 454 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
455 | local_irq_enable(); | 455 | local_irq_enable(); |
456 | } | 456 | } |
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
461 | struct tasklet_struct *list; | 461 | struct tasklet_struct *list; |
462 | 462 | ||
463 | local_irq_disable(); | 463 | local_irq_disable(); |
464 | list = __get_cpu_var(tasklet_hi_vec).head; | 464 | list = __this_cpu_read(tasklet_hi_vec.head); |
465 | __get_cpu_var(tasklet_hi_vec).head = NULL; | 465 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
466 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | 466 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); |
467 | local_irq_enable(); | 467 | local_irq_enable(); |
468 | 468 | ||
469 | while (list) { | 469 | while (list) { |
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
484 | 484 | ||
485 | local_irq_disable(); | 485 | local_irq_disable(); |
486 | t->next = NULL; | 486 | t->next = NULL; |
487 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 487 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
488 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 488 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
489 | __raise_softirq_irqoff(HI_SOFTIRQ); | 489 | __raise_softirq_irqoff(HI_SOFTIRQ); |
490 | local_irq_enable(); | 490 | local_irq_enable(); |
491 | } | 491 | } |
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu) | |||
802 | 802 | ||
803 | /* Find end, append list for that CPU. */ | 803 | /* Find end, append list for that CPU. */ |
804 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { | 804 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
805 | *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; | 805 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; |
806 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; | 806 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); |
807 | per_cpu(tasklet_vec, cpu).head = NULL; | 807 | per_cpu(tasklet_vec, cpu).head = NULL; |
808 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | 808 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; |
809 | } | 809 | } |
810 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 810 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
811 | 811 | ||
812 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { | 812 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { |
813 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; | 813 | *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; |
814 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; | 814 | __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); |
815 | per_cpu(tasklet_hi_vec, cpu).head = NULL; | 815 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
816 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | 816 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; |
817 | } | 817 | } |
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
853 | cpumask_any(cpu_online_mask)); | 853 | cpumask_any(cpu_online_mask)); |
854 | case CPU_DEAD: | 854 | case CPU_DEAD: |
855 | case CPU_DEAD_FROZEN: { | 855 | case CPU_DEAD_FROZEN: { |
856 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 856 | static const struct sched_param param = { |
857 | .sched_priority = MAX_RT_PRIO-1 | ||
858 | }; | ||
857 | 859 | ||
858 | p = per_cpu(ksoftirqd, hotcpu); | 860 | p = per_cpu(ksoftirqd, hotcpu); |
859 | per_cpu(ksoftirqd, hotcpu) = NULL; | 861 | per_cpu(ksoftirqd, hotcpu) = NULL; |
@@ -883,25 +885,6 @@ static __init int spawn_ksoftirqd(void) | |||
883 | } | 885 | } |
884 | early_initcall(spawn_ksoftirqd); | 886 | early_initcall(spawn_ksoftirqd); |
885 | 887 | ||
886 | #ifdef CONFIG_SMP | ||
887 | /* | ||
888 | * Call a function on all processors | ||
889 | */ | ||
890 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
891 | { | ||
892 | int ret = 0; | ||
893 | |||
894 | preempt_disable(); | ||
895 | ret = smp_call_function(func, info, wait); | ||
896 | local_irq_disable(); | ||
897 | func(info); | ||
898 | local_irq_enable(); | ||
899 | preempt_enable(); | ||
900 | return ret; | ||
901 | } | ||
902 | EXPORT_SYMBOL(on_each_cpu); | ||
903 | #endif | ||
904 | |||
905 | /* | 888 | /* |
906 | * [ These __weak aliases are kept in a separate compilation unit, so that | 889 | * [ These __weak aliases are kept in a separate compilation unit, so that |
907 | * GCC does not inline them incorrectly. ] | 890 | * GCC does not inline them incorrectly. ] |
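The pattern behind all of these conversions is mechanical: a field access through __get_cpu_var() becomes a single read or write of that field, which lets architectures with fast per-cpu addressing (e.g. x86 segment-relative moves) emit one instruction instead of first computing the per-cpu base address. Schematically (kernel-style sketch, not compilable standalone):

struct tasklet_struct *t;

/* before: address calculation, then a plain dereference */
t = __get_cpu_var(tasklet_vec).head;
__get_cpu_var(tasklet_vec).head = NULL;

/* after: single per-cpu read/write operations */
t = __this_cpu_read(tasklet_vec.head);
__this_cpu_write(tasklet_vec.head, NULL);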
diff --git a/kernel/srcu.c b/kernel/srcu.c index c71e07500536..73ce23feaea9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
33 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
34 | #include <linux/delay.h> | ||
34 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
35 | 36 | ||
36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 37 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
@@ -155,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) | |||
155 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
156 | 157 | ||
157 | /* | 158 | /* |
159 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
160 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
161 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
162 | * sections. If there are still some readers after 10 microseconds, | ||
163 | * we repeatedly block for 1-millisecond time periods. This approach | ||
164 | * has done well in testing, so there is no need for a config parameter. | ||
165 | */ | ||
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | ||
167 | |||
168 | /* | ||
158 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
159 | */ | 170 | */ |
160 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
@@ -203,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
203 | * all srcu_read_lock() calls using the old counters have completed. | 214 | * all srcu_read_lock() calls using the old counters have completed. |
204 | * Their corresponding critical sections might well be still | 215 | * Their corresponding critical sections might well be still |
205 | * executing, but the srcu_read_lock() primitives themselves | 216 | * executing, but the srcu_read_lock() primitives themselves |
206 | * will have finished executing. | 217 | * will have finished executing. We initially give readers |
218 | * an arbitrarily chosen 10 microseconds to get out of their | ||
219 | * SRCU read-side critical sections, then loop waiting 1/HZ | ||
220 | * seconds per iteration. The 10-microsecond value has done | ||
221 | * very well in testing. | ||
207 | */ | 222 | */ |
208 | 223 | ||
224 | if (srcu_readers_active_idx(sp, idx)) | ||
225 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
209 | while (srcu_readers_active_idx(sp, idx)) | 226 | while (srcu_readers_active_idx(sp, idx)) |
210 | schedule_timeout_interruptible(1); | 227 | schedule_timeout_interruptible(1); |
211 | 228 | ||
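The control flow amounts to a two-phase wait: spin briefly for the common fast case, then fall back to sleeping. A userspace sketch of the same strategy, where readers_active() is an assumed predicate and the 1ms sleep stands in for the kernel's one-jiffy schedule_timeout_interruptible():

#include <unistd.h>
#include <stdbool.h>

extern bool readers_active(int idx);	/* assumed predicate */

static void wait_for_readers(int idx)
{
	/* Fast path: give readers ~10us to leave their read-side
	 * critical sections before paying for a sleep. */
	if (readers_active(idx))
		usleep(10);

	/* Slow path: block in small chunks until the side drains. */
	while (readers_active(idx))
		usleep(1000);
}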
diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a9..18da702ec813 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -43,6 +43,8 @@ | |||
43 | #include <linux/kprobes.h> | 43 | #include <linux/kprobes.h> |
44 | #include <linux/user_namespace.h> | 44 | #include <linux/user_namespace.h> |
45 | 45 | ||
46 | #include <linux/kmsg_dump.h> | ||
47 | |||
46 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
47 | #include <asm/io.h> | 49 | #include <asm/io.h> |
48 | #include <asm/unistd.h> | 50 | #include <asm/unistd.h> |
@@ -285,6 +287,7 @@ out_unlock: | |||
285 | */ | 287 | */ |
286 | void emergency_restart(void) | 288 | void emergency_restart(void) |
287 | { | 289 | { |
290 | kmsg_dump(KMSG_DUMP_EMERG); | ||
288 | machine_emergency_restart(); | 291 | machine_emergency_restart(); |
289 | } | 292 | } |
290 | EXPORT_SYMBOL_GPL(emergency_restart); | 293 | EXPORT_SYMBOL_GPL(emergency_restart); |
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd) | |||
312 | printk(KERN_EMERG "Restarting system.\n"); | 315 | printk(KERN_EMERG "Restarting system.\n"); |
313 | else | 316 | else |
314 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | 317 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); |
318 | kmsg_dump(KMSG_DUMP_RESTART); | ||
315 | machine_restart(cmd); | 319 | machine_restart(cmd); |
316 | } | 320 | } |
317 | EXPORT_SYMBOL_GPL(kernel_restart); | 321 | EXPORT_SYMBOL_GPL(kernel_restart); |
@@ -333,6 +337,7 @@ void kernel_halt(void) | |||
333 | kernel_shutdown_prepare(SYSTEM_HALT); | 337 | kernel_shutdown_prepare(SYSTEM_HALT); |
334 | sysdev_shutdown(); | 338 | sysdev_shutdown(); |
335 | printk(KERN_EMERG "System halted.\n"); | 339 | printk(KERN_EMERG "System halted.\n"); |
340 | kmsg_dump(KMSG_DUMP_HALT); | ||
336 | machine_halt(); | 341 | machine_halt(); |
337 | } | 342 | } |
338 | 343 | ||
@@ -351,6 +356,7 @@ void kernel_power_off(void) | |||
351 | disable_nonboot_cpus(); | 356 | disable_nonboot_cpus(); |
352 | sysdev_shutdown(); | 357 | sysdev_shutdown(); |
353 | printk(KERN_EMERG "Power down.\n"); | 358 | printk(KERN_EMERG "Power down.\n"); |
359 | kmsg_dump(KMSG_DUMP_POWEROFF); | ||
354 | machine_power_off(); | 360 | machine_power_off(); |
355 | } | 361 | } |
356 | EXPORT_SYMBOL_GPL(kernel_power_off); | 362 | EXPORT_SYMBOL_GPL(kernel_power_off); |
@@ -1080,8 +1086,10 @@ SYSCALL_DEFINE0(setsid) | |||
1080 | err = session; | 1086 | err = session; |
1081 | out: | 1087 | out: |
1082 | write_unlock_irq(&tasklist_lock); | 1088 | write_unlock_irq(&tasklist_lock); |
1083 | if (err > 0) | 1089 | if (err > 0) { |
1084 | proc_sid_connector(group_leader); | 1090 | proc_sid_connector(group_leader); |
1091 | sched_autogroup_create_attach(group_leader); | ||
1092 | } | ||
1085 | return err; | 1093 | return err; |
1086 | } | 1094 | } |
1087 | 1095 | ||
@@ -1377,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1377 | const struct cred *cred = current_cred(), *tcred; | 1385 | const struct cred *cred = current_cred(), *tcred; |
1378 | 1386 | ||
1379 | tcred = __task_cred(task); | 1387 | tcred = __task_cred(task); |
1380 | if ((cred->uid != tcred->euid || | 1388 | if (current != task && |
1389 | (cred->uid != tcred->euid || | ||
1381 | cred->uid != tcred->suid || | 1390 | cred->uid != tcred->suid || |
1382 | cred->uid != tcred->uid || | 1391 | cred->uid != tcred->uid || |
1383 | cred->gid != tcred->egid || | 1392 | cred->gid != tcred->egid || |
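The new kmsg_dump() calls give registered dumpers a last chance to save the log buffer on each transition. A hypothetical consumer, assuming the two-segment callback API of this era (names here are illustrative):

#include <linux/kmsg_dump.h>

/* Hypothetical: persist the tail of the printk buffer on shutdown. */
static void my_dump(struct kmsg_dumper *dumper,
		    enum kmsg_dump_reason reason,
		    const char *s1, unsigned long l1,
		    const char *s2, unsigned long l2)
{
	if (reason != KMSG_DUMP_HALT && reason != KMSG_DUMP_POWEROFF)
		return;
	/* write s1[0..l1) then s2[0..l2) to persistent storage */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_dump,
};

/* somewhere in init: kmsg_dump_register(&my_dumper); */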
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5abfa1518554..0f1bd83db985 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
27 | #include <linux/printk.h> | ||
27 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
28 | #include <linux/security.h> | 29 | #include <linux/security.h> |
29 | #include <linux/ctype.h> | 30 | #include <linux/ctype.h> |
@@ -169,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write, | |||
169 | #endif | 170 | #endif |
170 | 171 | ||
171 | #ifdef CONFIG_MAGIC_SYSRQ | 172 | #ifdef CONFIG_MAGIC_SYSRQ |
172 | static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ | 173 | /* Note: sysrq code uses its own private copy */
174 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | ||
173 | 175 | ||
174 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 176 | static int sysrq_sysctl_handler(ctl_table *table, int write, |
175 | void __user *buffer, size_t *lenp, | 177 | void __user *buffer, size_t *lenp, |
@@ -245,10 +247,6 @@ static struct ctl_table root_table[] = { | |||
245 | .mode = 0555, | 247 | .mode = 0555, |
246 | .child = dev_table, | 248 | .child = dev_table, |
247 | }, | 249 | }, |
248 | /* | ||
249 | * NOTE: do not add new entries to this table unless you have read | ||
250 | * Documentation/sysctl/ctl_unnumbered.txt | ||
251 | */ | ||
252 | { } | 250 | { } |
253 | }; | 251 | }; |
254 | 252 | ||
@@ -259,8 +257,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ | |||
259 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 257 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 258 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 259 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
262 | static int min_sched_shares_ratelimit = 100000; /* 100 usec */ | ||
263 | static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ | ||
264 | #endif | 260 | #endif |
265 | 261 | ||
266 | #ifdef CONFIG_COMPACTION | 262 | #ifdef CONFIG_COMPACTION |
@@ -305,15 +301,6 @@ static struct ctl_table kern_table[] = { | |||
305 | .extra2 = &max_wakeup_granularity_ns, | 301 | .extra2 = &max_wakeup_granularity_ns, |
306 | }, | 302 | }, |
307 | { | 303 | { |
308 | .procname = "sched_shares_ratelimit", | ||
309 | .data = &sysctl_sched_shares_ratelimit, | ||
310 | .maxlen = sizeof(unsigned int), | ||
311 | .mode = 0644, | ||
312 | .proc_handler = sched_proc_update_handler, | ||
313 | .extra1 = &min_sched_shares_ratelimit, | ||
314 | .extra2 = &max_sched_shares_ratelimit, | ||
315 | }, | ||
316 | { | ||
317 | .procname = "sched_tunable_scaling", | 304 | .procname = "sched_tunable_scaling", |
318 | .data = &sysctl_sched_tunable_scaling, | 305 | .data = &sysctl_sched_tunable_scaling, |
319 | .maxlen = sizeof(enum sched_tunable_scaling), | 306 | .maxlen = sizeof(enum sched_tunable_scaling), |
@@ -323,14 +310,6 @@ static struct ctl_table kern_table[] = { | |||
323 | .extra2 = &max_sched_tunable_scaling, | 310 | .extra2 = &max_sched_tunable_scaling, |
324 | }, | 311 | }, |
325 | { | 312 | { |
326 | .procname = "sched_shares_thresh", | ||
327 | .data = &sysctl_sched_shares_thresh, | ||
328 | .maxlen = sizeof(unsigned int), | ||
329 | .mode = 0644, | ||
330 | .proc_handler = proc_dointvec_minmax, | ||
331 | .extra1 = &zero, | ||
332 | }, | ||
333 | { | ||
334 | .procname = "sched_migration_cost", | 313 | .procname = "sched_migration_cost", |
335 | .data = &sysctl_sched_migration_cost, | 314 | .data = &sysctl_sched_migration_cost, |
336 | .maxlen = sizeof(unsigned int), | 315 | .maxlen = sizeof(unsigned int), |
@@ -352,6 +331,13 @@ static struct ctl_table kern_table[] = { | |||
352 | .proc_handler = proc_dointvec, | 331 | .proc_handler = proc_dointvec, |
353 | }, | 332 | }, |
354 | { | 333 | { |
334 | .procname = "sched_shares_window", | ||
335 | .data = &sysctl_sched_shares_window, | ||
336 | .maxlen = sizeof(unsigned int), | ||
337 | .mode = 0644, | ||
338 | .proc_handler = proc_dointvec, | ||
339 | }, | ||
340 | { | ||
355 | .procname = "timer_migration", | 341 | .procname = "timer_migration", |
356 | .data = &sysctl_timer_migration, | 342 | .data = &sysctl_timer_migration, |
357 | .maxlen = sizeof(unsigned int), | 343 | .maxlen = sizeof(unsigned int), |
@@ -382,6 +368,17 @@ static struct ctl_table kern_table[] = { | |||
382 | .mode = 0644, | 368 | .mode = 0644, |
383 | .proc_handler = proc_dointvec, | 369 | .proc_handler = proc_dointvec, |
384 | }, | 370 | }, |
371 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
372 | { | ||
373 | .procname = "sched_autogroup_enabled", | ||
374 | .data = &sysctl_sched_autogroup_enabled, | ||
375 | .maxlen = sizeof(unsigned int), | ||
376 | .mode = 0644, | ||
377 | .proc_handler = proc_dointvec, | ||
378 | .extra1 = &zero, | ||
379 | .extra2 = &one, | ||
380 | }, | ||
381 | #endif | ||
385 | #ifdef CONFIG_PROVE_LOCKING | 382 | #ifdef CONFIG_PROVE_LOCKING |
386 | { | 383 | { |
387 | .procname = "prove_locking", | 384 | .procname = "prove_locking", |
@@ -711,6 +708,15 @@ static struct ctl_table kern_table[] = { | |||
711 | .extra1 = &zero, | 708 | .extra1 = &zero, |
712 | .extra2 = &one, | 709 | .extra2 = &one, |
713 | }, | 710 | }, |
711 | { | ||
712 | .procname = "kptr_restrict", | ||
713 | .data = &kptr_restrict, | ||
714 | .maxlen = sizeof(int), | ||
715 | .mode = 0644, | ||
716 | .proc_handler = proc_dointvec_minmax, | ||
717 | .extra1 = &zero, | ||
718 | .extra2 = &two, | ||
719 | }, | ||
714 | #endif | 720 | #endif |
715 | { | 721 | { |
716 | .procname = "ngroups_max", | 722 | .procname = "ngroups_max", |
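kptr_restrict is consumed by the %pK vsprintf extension introduced alongside it: pointers printed that way are shown to privileged readers and zeroed otherwise once the sysctl is raised. A hedged example of the intended call site (obj is hypothetical):

printk(KERN_DEBUG "object at %pK\n", obj);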
@@ -745,21 +751,21 @@ static struct ctl_table kern_table[] = { | |||
745 | .extra1 = &zero, | 751 | .extra1 = &zero, |
746 | .extra2 = &one, | 752 | .extra2 = &one, |
747 | }, | 753 | }, |
748 | #endif | ||
749 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
750 | { | 754 | { |
751 | .procname = "unknown_nmi_panic", | 755 | .procname = "nmi_watchdog", |
752 | .data = &unknown_nmi_panic, | 756 | .data = &watchdog_enabled, |
753 | .maxlen = sizeof (int), | 757 | .maxlen = sizeof (int), |
754 | .mode = 0644, | 758 | .mode = 0644, |
755 | .proc_handler = proc_dointvec, | 759 | .proc_handler = proc_dowatchdog_enabled, |
756 | }, | 760 | }, |
761 | #endif | ||
762 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
757 | { | 763 | { |
758 | .procname = "nmi_watchdog", | 764 | .procname = "unknown_nmi_panic", |
759 | .data = &nmi_watchdog_enabled, | 765 | .data = &unknown_nmi_panic, |
760 | .maxlen = sizeof (int), | 766 | .maxlen = sizeof (int), |
761 | .mode = 0644, | 767 | .mode = 0644, |
762 | .proc_handler = proc_nmi_enabled, | 768 | .proc_handler = proc_dointvec, |
763 | }, | 769 | }, |
764 | #endif | 770 | #endif |
765 | #if defined(CONFIG_X86) | 771 | #if defined(CONFIG_X86) |
@@ -963,10 +969,6 @@ static struct ctl_table kern_table[] = { | |||
963 | .proc_handler = proc_dointvec, | 969 | .proc_handler = proc_dointvec, |
964 | }, | 970 | }, |
965 | #endif | 971 | #endif |
966 | /* | ||
967 | * NOTE: do not add new entries to this table unless you have read | ||
968 | * Documentation/sysctl/ctl_unnumbered.txt | ||
969 | */ | ||
970 | { } | 972 | { } |
971 | }; | 973 | }; |
972 | 974 | ||
@@ -1327,11 +1329,6 @@ static struct ctl_table vm_table[] = { | |||
1327 | .extra2 = &one, | 1329 | .extra2 = &one, |
1328 | }, | 1330 | }, |
1329 | #endif | 1331 | #endif |
1330 | |||
1331 | /* | ||
1332 | * NOTE: do not add new entries to this table unless you have read | ||
1333 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1334 | */ | ||
1335 | { } | 1332 | { } |
1336 | }; | 1333 | }; |
1337 | 1334 | ||
@@ -1487,10 +1484,6 @@ static struct ctl_table fs_table[] = { | |||
1487 | .proc_handler = &pipe_proc_fn, | 1484 | .proc_handler = &pipe_proc_fn, |
1488 | .extra1 = &pipe_min_size, | 1485 | .extra1 = &pipe_min_size, |
1489 | }, | 1486 | }, |
1490 | /* | ||
1491 | * NOTE: do not add new entries to this table unless you have read | ||
1492 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1493 | */ | ||
1494 | { } | 1487 | { } |
1495 | }; | 1488 | }; |
1496 | 1489 | ||
@@ -2900,7 +2893,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
2900 | } | 2893 | } |
2901 | } | 2894 | } |
2902 | 2895 | ||
2903 | #else /* CONFIG_PROC_FS */ | 2896 | #else /* CONFIG_PROC_SYSCTL */ |
2904 | 2897 | ||
2905 | int proc_dostring(struct ctl_table *table, int write, | 2898 | int proc_dostring(struct ctl_table *table, int write, |
2906 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2899 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -2952,7 +2945,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | |||
2952 | } | 2945 | } |
2953 | 2946 | ||
2954 | 2947 | ||
2955 | #endif /* CONFIG_PROC_FS */ | 2948 | #endif /* CONFIG_PROC_SYSCTL */ |
2956 | 2949 | ||
2957 | /* | 2950 | /* |
2958 | * No sense putting this after each symbol definition, twice, | 2951 | * No sense putting this after each symbol definition, twice, |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c5786064..b875bedf7c9a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { | |||
136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, | 136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, |
137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
139 | { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, | ||
140 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
141 | {} | 140 | {} |
142 | }; | 141 | }; |
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1193 | 1192 | ||
1194 | buf[result] = '\0'; | 1193 | buf[result] = '\0'; |
1195 | 1194 | ||
1196 | /* Convert the decnet addresss to binary */ | 1195 | /* Convert the decnet address to binary */ |
1197 | result = -EIO; | 1196 | result = -EIO; |
1198 | nodep = strchr(buf, '.') + 1; | 1197 | nodep = strchr(buf, '.') + 1; |
1199 | if (!nodep) | 1198 | if (!nodep) |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3308fd7f1b52..3971c6b9d58d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
89 | return -ENOMEM; | 89 | return -ENOMEM; |
90 | 90 | ||
91 | if (!info) { | 91 | if (!info) { |
92 | int seq = get_cpu_var(taskstats_seqnum)++; | 92 | int seq = this_cpu_inc_return(taskstats_seqnum) - 1; |
93 | put_cpu_var(taskstats_seqnum); | ||
94 | 93 | ||
95 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); | 94 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); |
96 | } else | 95 | } else |
@@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) | |||
349 | return ret; | 348 | return ret; |
350 | } | 349 | } |
351 | 350 | ||
352 | #ifdef CONFIG_IA64 | 351 | #if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
353 | #define TASKSTATS_NEEDS_PADDING 1 | 352 | #define TASKSTATS_NEEDS_PADDING 1 |
354 | #endif | 353 | #endif |
355 | 354 | ||
@@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
612 | fill_tgid_exit(tsk); | 611 | fill_tgid_exit(tsk); |
613 | } | 612 | } |
614 | 613 | ||
615 | listeners = &__raw_get_cpu_var(listener_array); | 614 | listeners = __this_cpu_ptr(&listener_array); |
616 | if (list_empty(&listeners->list)) | 615 | if (list_empty(&listeners->list)) |
617 | return; | 616 | return; |
618 | 617 | ||
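this_cpu_inc_return() hands back the post-increment value, so the "- 1" preserves the old fetch-then-increment semantics while dropping the explicit get/put_cpu_var() preemption dance. Side by side (schematic):

/* before: two steps under explicit preemption protection */
seq = get_cpu_var(taskstats_seqnum)++;
put_cpu_var(taskstats_seqnum);

/* after: one preempt-safe read-modify-write; the "- 1" recovers the
 * value before the increment, matching the old post-increment. */
seq = this_cpu_inc_return(taskstats_seqnum) - 1;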
diff --git a/kernel/time.c b/kernel/time.c index ba9b338d1835..32174359576f 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); | |||
238 | * Avoid unnecessary multiplications/divisions in the | 238 | * Avoid unnecessary multiplications/divisions in the |
239 | * two most common HZ cases: | 239 | * two most common HZ cases: |
240 | */ | 240 | */ |
241 | unsigned int inline jiffies_to_msecs(const unsigned long j) | 241 | inline unsigned int jiffies_to_msecs(const unsigned long j) |
242 | { | 242 | { |
243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
244 | return (MSEC_PER_SEC / HZ) * j; | 244 | return (MSEC_PER_SEC / HZ) * j; |
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) | |||
254 | } | 254 | } |
255 | EXPORT_SYMBOL(jiffies_to_msecs); | 255 | EXPORT_SYMBOL(jiffies_to_msecs); |
256 | 256 | ||
257 | unsigned int inline jiffies_to_usecs(const unsigned long j) | 257 | inline unsigned int jiffies_to_usecs(const unsigned long j) |
258 | { | 258 | { |
259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) |
260 | return (USEC_PER_SEC / HZ) * j; | 260 | return (USEC_PER_SEC / HZ) * j; |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4b..6519cf62d9cd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
113 | * @shift: pointer to shift variable | 113 | * @shift: pointer to shift variable |
114 | * @from: frequency to convert from | 114 | * @from: frequency to convert from |
115 | * @to: frequency to convert to | 115 | * @to: frequency to convert to |
116 | * @minsec: guaranteed runtime conversion range in seconds | 116 | * @maxsec: guaranteed runtime conversion range in seconds |
117 | * | 117 | * |
118 | * The function evaluates the shift/mult pair for the scaled math | 118 | * The function evaluates the shift/mult pair for the scaled math |
119 | * operations of clocksources and clockevents. | 119 | * operations of clocksources and clockevents. |
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock | 122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock |
123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. | 123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. |
124 | * | 124 | * |
125 | * The @minsec conversion range argument controls the time frame in | 125 | * The @maxsec conversion range argument controls the time frame in |
126 | * seconds which must be covered by the runtime conversion with the | 126 | * seconds which must be covered by the runtime conversion with the |
127 | * calculated mult and shift factors. This guarantees that no 64bit | 127 | * calculated mult and shift factors. This guarantees that no 64bit |
128 | * overflow happens when the input value of the conversion is | 128 | * overflow happens when the input value of the conversion is |
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
131 | * factors. | 131 | * factors. |
132 | */ | 132 | */ |
133 | void | 133 | void |
134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | 134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) |
135 | { | 135 | { |
136 | u64 tmp; | 136 | u64 tmp; |
137 | u32 sft, sftacc= 32; | 137 | u32 sft, sftacc= 32; |
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
140 | * Calculate the shift factor which is limiting the conversion | 140 | * Calculate the shift factor which is limiting the conversion |
141 | * range: | 141 | * range: |
142 | */ | 142 | */ |
143 | tmp = ((u64)minsec * from) >> 32; | 143 | tmp = ((u64)maxsec * from) >> 32; |
144 | while (tmp) { | 144 | while (tmp) { |
145 | tmp >>=1; | 145 | tmp >>=1; |
146 | sftacc--; | 146 | sftacc--; |
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
152 | */ | 152 | */ |
153 | for (sft = 32; sft > 0; sft--) { | 153 | for (sft = 32; sft > 0; sft--) { |
154 | tmp = (u64) to << sft; | 154 | tmp = (u64) to << sft; |
155 | tmp += from / 2; | ||
155 | do_div(tmp, from); | 156 | do_div(tmp, from); |
156 | if ((tmp >> sftacc) == 0) | 157 | if ((tmp >> sftacc) == 0) |
157 | break; | 158 | break; |
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | |||
678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 679 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) |
679 | { | 680 | { |
680 | 681 | ||
681 | /* Intialize mult/shift and max_idle_ns */ | 682 | /* Initialize mult/shift and max_idle_ns */ |
682 | __clocksource_updatefreq_scale(cs, scale, freq); | 683 | __clocksource_updatefreq_scale(cs, scale, freq); |
683 | 684 | ||
684 | /* Add clocksource to the clocksource list */ | 685 | /* Add clocksource to the clocksource list */
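The added "tmp += from / 2" rounds the division to nearest instead of truncating, which tightens the conversion error noticeably for coarse clocks. A self-contained userspace port for experimentation, here converting a 32768 Hz counter to nanoseconds (prints mult=4000000000 shift=17):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t from = 32768, to = 1000000000, maxsec = 600;
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* Limit the shift so maxsec seconds of input cannot overflow. */
	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/* Find the highest shift whose mult still fits. */
	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;	/* the new rounding step */
		tmp /= from;		/* userspace stand-in for do_div() */
		if ((tmp >> sftacc) == 0)
			break;
	}
	printf("mult=%llu shift=%u\n", (unsigned long long)tmp, sft);
	return 0;
}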
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538f..5c00242fa921 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/timex.h> | 14 | #include <linux/timex.h> |
15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/module.h> | ||
17 | 18 | ||
18 | /* | 19 | /* |
19 | * NTP timekeeping variables: | 20 | * NTP timekeeping variables: |
@@ -74,6 +75,162 @@ static long time_adjust; | |||
74 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ | 75 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ |
75 | static s64 ntp_tick_adj; | 76 | static s64 ntp_tick_adj; |
76 | 77 | ||
78 | #ifdef CONFIG_NTP_PPS | ||
79 | |||
80 | /* | ||
81 | * The following variables are used when a pulse-per-second (PPS) signal | ||
82 | * is available. They establish the engineering parameters of the clock | ||
83 | * discipline loop when controlled by the PPS signal. | ||
84 | */ | ||
85 | #define PPS_VALID 10 /* PPS signal watchdog max (s) */ | ||
86 | #define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ | ||
87 | #define PPS_INTMIN 2 /* min freq interval (s) (shift) */ | ||
88 | #define PPS_INTMAX 8 /* max freq interval (s) (shift) */ | ||
89 | #define PPS_INTCOUNT 4 /* number of consecutive good intervals to | ||
90 | increase pps_shift or consecutive bad | ||
91 | intervals to decrease it */ | ||
92 | #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ | ||
93 | |||
94 | static int pps_valid; /* signal watchdog counter */ | ||
95 | static long pps_tf[3]; /* phase median filter */ | ||
96 | static long pps_jitter; /* current jitter (ns) */ | ||
97 | static struct timespec pps_fbase; /* beginning of the last freq interval */ | ||
98 | static int pps_shift; /* current interval duration (s) (shift) */ | ||
99 | static int pps_intcnt; /* interval counter */ | ||
100 | static s64 pps_freq; /* frequency offset (scaled ns/s) */ | ||
101 | static long pps_stabil; /* current stability (scaled ns/s) */ | ||
102 | |||
103 | /* | ||
104 | * PPS signal quality monitors | ||
105 | */ | ||
106 | static long pps_calcnt; /* calibration intervals */ | ||
107 | static long pps_jitcnt; /* jitter limit exceeded */ | ||
108 | static long pps_stbcnt; /* stability limit exceeded */ | ||
109 | static long pps_errcnt; /* calibration errors */ | ||
110 | |||
111 | |||
112 | /* PPS kernel consumer compensates the whole phase error immediately. | ||
113 | * Otherwise, reduce the offset by a fixed factor times the time constant. | ||
114 | */ | ||
115 | static inline s64 ntp_offset_chunk(s64 offset) | ||
116 | { | ||
117 | if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) | ||
118 | return offset; | ||
119 | else | ||
120 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
121 | } | ||
122 | |||
123 | static inline void pps_reset_freq_interval(void) | ||
124 | { | ||
125 | /* the PPS calibration interval may end | ||
126 | surprisingly early */ | ||
127 | pps_shift = PPS_INTMIN; | ||
128 | pps_intcnt = 0; | ||
129 | } | ||
130 | |||
131 | /** | ||
132 | * pps_clear - Clears the PPS state variables | ||
133 | * | ||
134 | * Must be called while holding a write on the xtime_lock | ||
135 | */ | ||
136 | static inline void pps_clear(void) | ||
137 | { | ||
138 | pps_reset_freq_interval(); | ||
139 | pps_tf[0] = 0; | ||
140 | pps_tf[1] = 0; | ||
141 | pps_tf[2] = 0; | ||
142 | pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; | ||
143 | pps_freq = 0; | ||
144 | } | ||
145 | |||
146 | /* Decrease pps_valid to indicate that another second has passed since | ||
147 | * the last PPS signal. When it reaches 0, indicate that PPS signal is | ||
148 | * missing. | ||
149 | * | ||
150 | * Must be called while holding a write on the xtime_lock | ||
151 | */ | ||
152 | static inline void pps_dec_valid(void) | ||
153 | { | ||
154 | if (pps_valid > 0) | ||
155 | pps_valid--; | ||
156 | else { | ||
157 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | ||
158 | STA_PPSWANDER | STA_PPSERROR); | ||
159 | pps_clear(); | ||
160 | } | ||
161 | } | ||
162 | |||
163 | static inline void pps_set_freq(s64 freq) | ||
164 | { | ||
165 | pps_freq = freq; | ||
166 | } | ||
167 | |||
168 | static inline int is_error_status(int status) | ||
169 | { | ||
170 | return (status & (STA_UNSYNC|STA_CLOCKERR)) | ||
171 | /* PPS signal lost when either PPS time or | ||
172 | * PPS frequency synchronization requested | ||
173 | */ | ||
174 | || ((status & (STA_PPSFREQ|STA_PPSTIME)) | ||
175 | && !(status & STA_PPSSIGNAL)) | ||
176 | /* PPS jitter exceeded when | ||
177 | * PPS time synchronization requested */ | ||
178 | || ((status & (STA_PPSTIME|STA_PPSJITTER)) | ||
179 | == (STA_PPSTIME|STA_PPSJITTER)) | ||
180 | /* PPS wander exceeded or calibration error when | ||
181 | * PPS frequency synchronization requested | ||
182 | */ | ||
183 | || ((status & STA_PPSFREQ) | ||
184 | && (status & (STA_PPSWANDER|STA_PPSERROR))); | ||
185 | } | ||
186 | |||
187 | static inline void pps_fill_timex(struct timex *txc) | ||
188 | { | ||
189 | txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * | ||
190 | PPM_SCALE_INV, NTP_SCALE_SHIFT); | ||
191 | txc->jitter = pps_jitter; | ||
192 | if (!(time_status & STA_NANO)) | ||
193 | txc->jitter /= NSEC_PER_USEC; | ||
194 | txc->shift = pps_shift; | ||
195 | txc->stabil = pps_stabil; | ||
196 | txc->jitcnt = pps_jitcnt; | ||
197 | txc->calcnt = pps_calcnt; | ||
198 | txc->errcnt = pps_errcnt; | ||
199 | txc->stbcnt = pps_stbcnt; | ||
200 | } | ||
201 | |||
202 | #else /* !CONFIG_NTP_PPS */ | ||
203 | |||
204 | static inline s64 ntp_offset_chunk(s64 offset) | ||
205 | { | ||
206 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
207 | } | ||
208 | |||
209 | static inline void pps_reset_freq_interval(void) {} | ||
210 | static inline void pps_clear(void) {} | ||
211 | static inline void pps_dec_valid(void) {} | ||
212 | static inline void pps_set_freq(s64 freq) {} | ||
213 | |||
214 | static inline int is_error_status(int status) | ||
215 | { | ||
216 | return status & (STA_UNSYNC|STA_CLOCKERR); | ||
217 | } | ||
218 | |||
219 | static inline void pps_fill_timex(struct timex *txc) | ||
220 | { | ||
221 | /* PPS is not implemented, so these are zero */ | ||
222 | txc->ppsfreq = 0; | ||
223 | txc->jitter = 0; | ||
224 | txc->shift = 0; | ||
225 | txc->stabil = 0; | ||
226 | txc->jitcnt = 0; | ||
227 | txc->calcnt = 0; | ||
228 | txc->errcnt = 0; | ||
229 | txc->stbcnt = 0; | ||
230 | } | ||
231 | |||
232 | #endif /* CONFIG_NTP_PPS */ | ||
233 | |||
77 | /* | 234 | /* |
78 | * NTP methods: | 235 | * NTP methods: |
79 | */ | 236 | */ |
@@ -185,6 +342,9 @@ void ntp_clear(void) | |||
185 | 342 | ||
186 | tick_length = tick_length_base; | 343 | tick_length = tick_length_base; |
187 | time_offset = 0; | 344 | time_offset = 0; |
345 | |||
346 | /* Clear PPS state variables */ | ||
347 | pps_clear(); | ||
188 | } | 348 | } |
189 | 349 | ||
190 | /* | 350 | /* |
@@ -250,16 +410,16 @@ void second_overflow(void) | |||
250 | time_status |= STA_UNSYNC; | 410 | time_status |= STA_UNSYNC; |
251 | } | 411 | } |
252 | 412 | ||
253 | /* | 413 | /* Compute the phase adjustment for the next second */ |
254 | * Compute the phase adjustment for the next second. The offset is | ||
255 | * reduced by a fixed factor times the time constant. | ||
256 | */ | ||
257 | tick_length = tick_length_base; | 414 | tick_length = tick_length_base; |
258 | 415 | ||
259 | delta = shift_right(time_offset, SHIFT_PLL + time_constant); | 416 | delta = ntp_offset_chunk(time_offset); |
260 | time_offset -= delta; | 417 | time_offset -= delta; |
261 | tick_length += delta; | 418 | tick_length += delta; |
262 | 419 | ||
420 | /* Check PPS signal */ | ||
421 | pps_dec_valid(); | ||
422 | |||
263 | if (!time_adjust) | 423 | if (!time_adjust) |
264 | return; | 424 | return; |
265 | 425 | ||
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
369 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { | 529 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { |
370 | time_state = TIME_OK; | 530 | time_state = TIME_OK; |
371 | time_status = STA_UNSYNC; | 531 | time_status = STA_UNSYNC; |
532 | /* restart PPS frequency calibration */ | ||
533 | pps_reset_freq_interval(); | ||
372 | } | 534 | } |
373 | 535 | ||
374 | /* | 536 | /* |
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts | |||
418 | time_freq = txc->freq * PPM_SCALE; | 580 | time_freq = txc->freq * PPM_SCALE; |
419 | time_freq = min(time_freq, MAXFREQ_SCALED); | 581 | time_freq = min(time_freq, MAXFREQ_SCALED); |
420 | time_freq = max(time_freq, -MAXFREQ_SCALED); | 582 | time_freq = max(time_freq, -MAXFREQ_SCALED); |
583 | /* update pps_freq */ | ||
584 | pps_set_freq(time_freq); | ||
421 | } | 585 | } |
422 | 586 | ||
423 | if (txc->modes & ADJ_MAXERROR) | 587 | if (txc->modes & ADJ_MAXERROR) |
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc) | |||
508 | } | 672 | } |
509 | 673 | ||
510 | result = time_state; /* mostly `TIME_OK' */ | 674 | result = time_state; /* mostly `TIME_OK' */ |
511 | if (time_status & (STA_UNSYNC|STA_CLOCKERR)) | 675 | /* check for errors */ |
676 | if (is_error_status(time_status)) | ||
512 | result = TIME_ERROR; | 677 | result = TIME_ERROR; |
513 | 678 | ||
514 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * | 679 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * |
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc) | |||
522 | txc->tick = tick_usec; | 687 | txc->tick = tick_usec; |
523 | txc->tai = time_tai; | 688 | txc->tai = time_tai; |
524 | 689 | ||
525 | /* PPS is not implemented, so these are zero */ | 690 | /* fill PPS status fields */ |
526 | txc->ppsfreq = 0; | 691 | pps_fill_timex(txc); |
527 | txc->jitter = 0; | ||
528 | txc->shift = 0; | ||
529 | txc->stabil = 0; | ||
530 | txc->jitcnt = 0; | ||
531 | txc->calcnt = 0; | ||
532 | txc->errcnt = 0; | ||
533 | txc->stbcnt = 0; | ||
534 | 692 | ||
535 | write_sequnlock_irq(&xtime_lock); | 693 | write_sequnlock_irq(&xtime_lock); |
536 | 694 | ||
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc) | |||
544 | return result; | 702 | return result; |
545 | } | 703 | } |
546 | 704 | ||
705 | #ifdef CONFIG_NTP_PPS | ||
706 | |||
707 | /* struct pps_normtime has the same layout as struct timespec, but is | ||
708 | * semantically different (which is why it exists): | ||
709 | * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] | ||
710 | * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ | ||
711 | struct pps_normtime { | ||
712 | __kernel_time_t sec; /* seconds */ | ||
713 | long nsec; /* nanoseconds */ | ||
714 | }; | ||
715 | |||
716 | /* normalize the timestamp so that nsec is in the | ||
717 | ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ | ||
718 | static inline struct pps_normtime pps_normalize_ts(struct timespec ts) | ||
719 | { | ||
720 | struct pps_normtime norm = { | ||
721 | .sec = ts.tv_sec, | ||
722 | .nsec = ts.tv_nsec | ||
723 | }; | ||
724 | |||
725 | if (norm.nsec > (NSEC_PER_SEC >> 1)) { | ||
726 | norm.nsec -= NSEC_PER_SEC; | ||
727 | norm.sec++; | ||
728 | } | ||
729 | |||
730 | return norm; | ||
731 | } | ||
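A worked example of the normalization as a standalone program (illustrative values): a pulse stamped at 5.9 s is closer to second 6, so it becomes 6 s with a negative 100 ms residual.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

int main(void)
{
    long sec = 5, nsec = 900000000;      /* 5.9 s, tv_nsec in [0, 1s) */

    if (nsec > (NSEC_PER_SEC >> 1)) {    /* same test as pps_normalize_ts() */
        nsec -= NSEC_PER_SEC;
        sec++;
    }
    printf("normalized: %ld s %+ld ns\n", sec, nsec);  /* 6 s -100000000 ns */
    return 0;
}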
732 | |||
733 | /* get current phase correction and jitter */ | ||
734 | static inline long pps_phase_filter_get(long *jitter) | ||
735 | { | ||
736 | *jitter = pps_tf[0] - pps_tf[1]; | ||
737 | if (*jitter < 0) | ||
738 | *jitter = -*jitter; | ||
739 | |||
740 | /* TODO: test various filters */ | ||
741 | return pps_tf[0]; | ||
742 | } | ||
743 | |||
744 | /* add the sample to the phase filter */ | ||
745 | static inline void pps_phase_filter_add(long err) | ||
746 | { | ||
747 | pps_tf[2] = pps_tf[1]; | ||
748 | pps_tf[1] = pps_tf[0]; | ||
749 | pps_tf[0] = err; | ||
750 | } | ||
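The filter above currently returns the newest sample and estimates jitter from the two most recent ones; the TODO invites experimenting. A hypothetical 3-tap median over pps_tf[], one obvious candidate, could look like this (a sketch, not the kernel's choice):

static long pps_phase_median(const long tf[3])
{
    long a = tf[0], b = tf[1], c = tf[2], t;

    /* three-element sorting network; b ends up holding the median */
    if (a > b) { t = a; a = b; b = t; }
    if (b > c) { t = b; b = c; c = t; }
    if (a > b) { t = a; a = b; b = t; }
    return b;
}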
751 | |||
752 | /* decrease frequency calibration interval length. | ||
753 | * It is halved after four consecutive unstable intervals. | ||
754 | */ | ||
755 | static inline void pps_dec_freq_interval(void) | ||
756 | { | ||
757 | if (--pps_intcnt <= -PPS_INTCOUNT) { | ||
758 | pps_intcnt = -PPS_INTCOUNT; | ||
759 | if (pps_shift > PPS_INTMIN) { | ||
760 | pps_shift--; | ||
761 | pps_intcnt = 0; | ||
762 | } | ||
763 | } | ||
764 | } | ||
765 | |||
766 | /* increase frequency calibration interval length. | ||
767 | * It is doubled after four consecutive stable intervals. | ||
768 | */ | ||
769 | static inline void pps_inc_freq_interval(void) | ||
770 | { | ||
771 | if (++pps_intcnt >= PPS_INTCOUNT) { | ||
772 | pps_intcnt = PPS_INTCOUNT; | ||
773 | if (pps_shift < PPS_INTMAX) { | ||
774 | pps_shift++; | ||
775 | pps_intcnt = 0; | ||
776 | } | ||
777 | } | ||
778 | } | ||
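The calibration interval implied by pps_shift is 1 << pps_shift seconds, so with PPS_INTMIN = 2 and PPS_INTMAX = 8 it ranges from 4 s to 256 s, and PPS_INTCOUNT = 4 stable intervals must accumulate before each doubling. A standalone sketch of that hysteresis:

#include <stdio.h>

#define PPS_INTMIN   2
#define PPS_INTMAX   8
#define PPS_INTCOUNT 4

static int pps_shift = PPS_INTMIN;
static int pps_intcnt;

static void good_interval(void)          /* mirrors pps_inc_freq_interval() */
{
    if (++pps_intcnt >= PPS_INTCOUNT) {
        pps_intcnt = PPS_INTCOUNT;
        if (pps_shift < PPS_INTMAX) {
            pps_shift++;
            pps_intcnt = 0;
        }
    }
}

int main(void)
{
    int i;

    for (i = 1; i <= 12; i++) {
        good_interval();
        printf("after %2d stable intervals: %3d s\n", i, 1 << pps_shift);
    }
    return 0;   /* interval doubles every 4 stable intervals: 4, 8, 16, 32 s */
}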
779 | |||
780 | /* update clock frequency based on MONOTONIC_RAW clock PPS signal | ||
781 | * timestamps | ||
782 | * | ||
783 | * At the end of the calibration interval the difference between the | ||
784 | * first and last MONOTONIC_RAW clock timestamps divided by the length | ||
785 | * of the interval becomes the frequency update. If the interval was | ||
786 | * too long, the data are discarded. | ||
787 | * Returns the difference between old and new frequency values. | ||
788 | */ | ||
789 | static long hardpps_update_freq(struct pps_normtime freq_norm) | ||
790 | { | ||
791 | long delta, delta_mod; | ||
792 | s64 ftemp; | ||
793 | |||
794 | /* check if the frequency interval was too long */ | ||
795 | if (freq_norm.sec > (2 << pps_shift)) { | ||
796 | time_status |= STA_PPSERROR; | ||
797 | pps_errcnt++; | ||
798 | pps_dec_freq_interval(); | ||
799 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | ||
800 | freq_norm.sec); | ||
801 | return 0; | ||
802 | } | ||
803 | |||
804 | /* here the raw frequency offset and wander (stability) are | ||
805 | * calculated. If the wander is less than the wander threshold | ||
806 | * the interval is increased; otherwise it is decreased. | ||
807 | */ | ||
808 | ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, | ||
809 | freq_norm.sec); | ||
810 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | ||
811 | pps_freq = ftemp; | ||
812 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | ||
813 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | ||
814 | time_status |= STA_PPSWANDER; | ||
815 | pps_stbcnt++; | ||
816 | pps_dec_freq_interval(); | ||
817 | } else { /* good sample */ | ||
818 | pps_inc_freq_interval(); | ||
819 | } | ||
820 | |||
821 | /* the stability metric is calculated as the average of recent | ||
822 | * frequency changes, but is used only for performance | ||
823 | * monitoring | ||
824 | */ | ||
825 | delta_mod = delta; | ||
826 | if (delta_mod < 0) | ||
827 | delta_mod = -delta_mod; | ||
828 | pps_stabil += (div_s64(((s64)delta_mod) << | ||
829 | (NTP_SCALE_SHIFT - SHIFT_USEC), | ||
830 | NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; | ||
831 | |||
832 | /* if enabled, the system clock frequency is updated */ | ||
833 | if ((time_status & STA_PPSFREQ) != 0 && | ||
834 | (time_status & STA_FREQHOLD) == 0) { | ||
835 | time_freq = pps_freq; | ||
836 | ntp_update_frequency(); | ||
837 | } | ||
838 | |||
839 | return delta; | ||
840 | } | ||
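Illustrative numbers for the update above: suppose a 16 s interval (pps_shift = 4) is measured by the MONOTONIC_RAW clock as 16 s plus 800 ns. The oscillator then runs 50 ns/s fast, and the new frequency offset comes out as -50 ns/s, scaled by NTP_SCALE_SHIFT (32 in the kernel). A standalone check of the arithmetic:

#include <stdio.h>
#include <stdint.h>

#define NTP_SCALE_SHIFT 32

int main(void)
{
    int64_t sec = 16, nsec = 800;    /* normalized interval: 16 s + 800 ns */
    int64_t ftemp = -((nsec << NTP_SCALE_SHIFT) / sec);  /* as in the code */

    /* arithmetic right shift of the negative scaled value assumed */
    printf("new freq offset: %lld ns/s\n",
           (long long)(ftemp >> NTP_SCALE_SHIFT));       /* prints -50 */
    return 0;
}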
841 | |||
842 | /* correct REALTIME clock phase error against PPS signal */ | ||
843 | static void hardpps_update_phase(long error) | ||
844 | { | ||
845 | long correction = -error; | ||
846 | long jitter; | ||
847 | |||
848 | /* add the sample to the median filter */ | ||
849 | pps_phase_filter_add(correction); | ||
850 | correction = pps_phase_filter_get(&jitter); | ||
851 | |||
852 | /* Nominal jitter is due to PPS signal noise. If it exceeds the | ||
853 | * threshold, the sample is discarded; otherwise, if STA_PPSTIME | ||
854 | * is set, the time offset is updated. | ||
855 | */ | ||
856 | if (jitter > (pps_jitter << PPS_POPCORN)) { | ||
857 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | ||
858 | jitter, (pps_jitter << PPS_POPCORN)); | ||
859 | time_status |= STA_PPSJITTER; | ||
860 | pps_jitcnt++; | ||
861 | } else if (time_status & STA_PPSTIME) { | ||
862 | /* correct the time using the phase offset */ | ||
863 | time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, | ||
864 | NTP_INTERVAL_FREQ); | ||
865 | /* cancel running adjtime() */ | ||
866 | time_adjust = 0; | ||
867 | } | ||
868 | /* update jitter */ | ||
869 | pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; | ||
870 | } | ||
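The final update is an exponential moving average with weight 1/(1 << PPS_INTMIN), i.e. 1/4, and the popcorn test above discards any sample whose jitter exceeds 1 << PPS_POPCORN = 16 times that average. The smoothing step, isolated as a sketch:

/* avg += (sample - avg) / 2^shift; with shift = PPS_INTMIN this is the
 * pps_jitter update above */
static long ewma_update(long avg, long sample, int shift)
{
    return avg + ((sample - avg) >> shift);
}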
871 | |||
872 | /* | ||
873 | * hardpps() - discipline CPU clock oscillator to external PPS signal | ||
874 | * | ||
875 | * This routine is called at each PPS signal arrival in order to | ||
876 | * discipline the CPU clock oscillator to the PPS signal. It takes two | ||
877 | * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former | ||
878 | * is used to correct clock phase error and the latter is used to | ||
879 | * correct the frequency. | ||
880 | * | ||
881 | * This code is based on David Mills's reference nanokernel | ||
882 | * implementation. It was mostly rewritten but keeps the same idea. | ||
883 | */ | ||
884 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | ||
885 | { | ||
886 | struct pps_normtime pts_norm, freq_norm; | ||
887 | unsigned long flags; | ||
888 | |||
889 | pts_norm = pps_normalize_ts(*phase_ts); | ||
890 | |||
891 | write_seqlock_irqsave(&xtime_lock, flags); | ||
892 | |||
893 | /* clear the error bits, they will be set again if needed */ | ||
894 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | ||
895 | |||
896 | /* indicate signal presence */ | ||
897 | time_status |= STA_PPSSIGNAL; | ||
898 | pps_valid = PPS_VALID; | ||
899 | |||
900 | /* when called for the first time, | ||
901 | * just start the frequency interval */ | ||
902 | if (unlikely(pps_fbase.tv_sec == 0)) { | ||
903 | pps_fbase = *raw_ts; | ||
904 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
905 | return; | ||
906 | } | ||
907 | |||
908 | /* ok, now we have a base for frequency calculation */ | ||
909 | freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); | ||
910 | |||
911 | /* check that the signal is in the range | ||
912 | * [1s - MAXFREQ ns, 1s + MAXFREQ ns], otherwise reject it */ | ||
913 | if ((freq_norm.sec == 0) || | ||
914 | (freq_norm.nsec > MAXFREQ * freq_norm.sec) || | ||
915 | (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { | ||
916 | time_status |= STA_PPSJITTER; | ||
917 | /* restart the frequency calibration interval */ | ||
918 | pps_fbase = *raw_ts; | ||
919 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
920 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | ||
921 | return; | ||
922 | } | ||
923 | |||
924 | /* signal is ok */ | ||
925 | |||
926 | /* check if the current frequency interval is finished */ | ||
927 | if (freq_norm.sec >= (1 << pps_shift)) { | ||
928 | pps_calcnt++; | ||
929 | /* restart the frequency calibration interval */ | ||
930 | pps_fbase = *raw_ts; | ||
931 | hardpps_update_freq(freq_norm); | ||
932 | } | ||
933 | |||
934 | hardpps_update_phase(pts_norm.nsec); | ||
935 | |||
936 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
937 | } | ||
938 | EXPORT_SYMBOL(hardpps); | ||
939 | |||
940 | #endif /* CONFIG_NTP_PPS */ | ||
941 | |||
547 | static int __init ntp_tick_adj_setup(char *str) | 942 | static int __init ntp_tick_adj_setup(char *str) |
548 | { | 943 | { |
549 | ntp_tick_adj = simple_strtol(str, NULL, 0); | 944 | ntp_tick_adj = simple_strtol(str, NULL, 0); |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eeef..051bc80a0c43 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu) | |||
49 | */ | 49 | */ |
50 | int tick_is_oneshot_available(void) | 50 | int tick_is_oneshot_available(void) |
51 | { | 51 | { |
52 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 52 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
53 | 53 | ||
54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | 54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); |
55 | } | 55 | } |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e52680a..5cbc101f908b 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | |||
95 | */ | 95 | */ |
96 | int tick_program_event(ktime_t expires, int force) | 96 | int tick_program_event(ktime_t expires, int force) |
97 | { | 97 | { |
98 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 98 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
99 | 99 | ||
100 | return tick_dev_program_event(dev, expires, force); | 100 | return tick_dev_program_event(dev, expires, force); |
101 | } | 101 | } |
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void) | |||
167 | int ret; | 167 | int ret; |
168 | 168 | ||
169 | local_irq_save(flags); | 169 | local_irq_save(flags); |
170 | ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; | 170 | ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; |
171 | local_irq_restore(flags); | 171 | local_irq_restore(flags); |
172 | 172 | ||
173 | return ret; | 173 | return ret; |
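These __get_cpu_var() to __this_cpu_read() conversions (repeated in kernel/timer.c below) read one field of a per-CPU object directly instead of first materializing this CPU's address. A kernel-context sketch of the pattern (not standalone code):

static void example(void)
{
    struct clock_event_device *evt;

    /* old: compute this CPU's copy of tick_cpu_device, then load */
    evt = __get_cpu_var(tick_cpu_device).evtdev;

    /* new: load the field in one step (a single segment-relative
     * instruction on x86) */
    evt = __this_cpu_read(tick_cpu_device.evtdev);
    (void)evt;
}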
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd1..c55ea2433471 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
642 | } | 642 | } |
643 | local_irq_enable(); | 643 | local_irq_enable(); |
644 | 644 | ||
645 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", | 645 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); |
646 | smp_processor_id()); | ||
647 | } | 646 | } |
648 | 647 | ||
649 | /* | 648 | /* |
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void) | |||
795 | } | 794 | } |
796 | 795 | ||
797 | #ifdef CONFIG_NO_HZ | 796 | #ifdef CONFIG_NO_HZ |
798 | if (tick_nohz_enabled) | 797 | if (tick_nohz_enabled) { |
799 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 798 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
799 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
800 | } | ||
800 | #endif | 801 | #endif |
801 | } | 802 | } |
802 | #endif /* HIGH_RES_TIMERS */ | 803 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176cc..a9ae369925ce 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
23 | #include <linux/math64.h> | 23 | #include <linux/math64.h> |
24 | #include <linux/kernel.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * fixed point arithmetic scale factor for skew | 27 | * fixed point arithmetic scale factor for skew |
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, | |||
57 | int index; | 58 | int index; |
58 | int num_samples = sync->num_samples; | 59 | int num_samples = sync->num_samples; |
59 | 60 | ||
60 | if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { | 61 | if (num_samples > ARRAY_SIZE(buffer)) { |
61 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | 62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); |
62 | if (!samples) { | 63 | if (!samples) { |
63 | samples = buffer; | 64 | samples = buffer; |
64 | num_samples = sizeof(buffer)/sizeof(buffer[0]); | 65 | num_samples = ARRAY_SIZE(buffer); |
65 | } | 66 | } |
66 | } else { | 67 | } else { |
67 | samples = buffer; | 68 | samples = buffer; |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f72..d27c7562902c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -32,6 +32,8 @@ struct timekeeper { | |||
32 | cycle_t cycle_interval; | 32 | cycle_t cycle_interval; |
33 | /* Number of clock shifted nano seconds in one NTP interval. */ | 33 | /* Number of clock shifted nano seconds in one NTP interval. */ |
34 | u64 xtime_interval; | 34 | u64 xtime_interval; |
35 | /* shifted nano seconds left over when rounding cycle_interval */ | ||
36 | s64 xtime_remainder; | ||
35 | /* Raw nano seconds accumulated per NTP interval. */ | 37 | /* Raw nano seconds accumulated per NTP interval. */ |
36 | u32 raw_interval; | 38 | u32 raw_interval; |
37 | 39 | ||
@@ -47,7 +49,7 @@ struct timekeeper { | |||
47 | u32 mult; | 49 | u32 mult; |
48 | }; | 50 | }; |
49 | 51 | ||
50 | struct timekeeper timekeeper; | 52 | static struct timekeeper timekeeper; |
51 | 53 | ||
52 | /** | 54 | /** |
53 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 55 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
@@ -62,7 +64,7 @@ struct timekeeper timekeeper; | |||
62 | static void timekeeper_setup_internals(struct clocksource *clock) | 64 | static void timekeeper_setup_internals(struct clocksource *clock) |
63 | { | 65 | { |
64 | cycle_t interval; | 66 | cycle_t interval; |
65 | u64 tmp; | 67 | u64 tmp, ntpinterval; |
66 | 68 | ||
67 | timekeeper.clock = clock; | 69 | timekeeper.clock = clock; |
68 | clock->cycle_last = clock->read(clock); | 70 | clock->cycle_last = clock->read(clock); |
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
70 | /* Do the ns -> cycle conversion first, using original mult */ | 72 | /* Do the ns -> cycle conversion first, using original mult */ |
71 | tmp = NTP_INTERVAL_LENGTH; | 73 | tmp = NTP_INTERVAL_LENGTH; |
72 | tmp <<= clock->shift; | 74 | tmp <<= clock->shift; |
75 | ntpinterval = tmp; | ||
73 | tmp += clock->mult/2; | 76 | tmp += clock->mult/2; |
74 | do_div(tmp, clock->mult); | 77 | do_div(tmp, clock->mult); |
75 | if (tmp == 0) | 78 | if (tmp == 0) |
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
80 | 83 | ||
81 | /* Go back from cycles -> shifted ns */ | 84 | /* Go back from cycles -> shifted ns */ |
82 | timekeeper.xtime_interval = (u64) interval * clock->mult; | 85 | timekeeper.xtime_interval = (u64) interval * clock->mult; |
86 | timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; | ||
83 | timekeeper.raw_interval = | 87 | timekeeper.raw_interval = |
84 | ((u64) interval * clock->mult) >> clock->shift; | 88 | ((u64) interval * clock->mult) >> clock->shift; |
85 | 89 | ||
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time; | |||
160 | /* | 164 | /* |
161 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. | 165 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. |
162 | */ | 166 | */ |
163 | struct timespec raw_time; | 167 | static struct timespec raw_time; |
164 | 168 | ||
165 | /* flag for if timekeeping is suspended */ | 169 | /* flag for if timekeeping is suspended */ |
166 | int __read_mostly timekeeping_suspended; | 170 | int __read_mostly timekeeping_suspended; |
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) | |||
284 | } | 288 | } |
285 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 289 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
286 | 290 | ||
291 | #ifdef CONFIG_NTP_PPS | ||
292 | |||
293 | /** | ||
294 | * getnstime_raw_and_real - get time of day and raw monotonic time in timespec format | ||
295 | * @ts_raw: pointer to the timespec to be set to raw monotonic time | ||
296 | * @ts_real: pointer to the timespec to be set to the time of day | ||
297 | * | ||
298 | * This function reads both the time of day and raw monotonic time at the | ||
299 | * same time atomically and stores the resulting timestamps in timespec | ||
300 | * format. | ||
301 | */ | ||
302 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | ||
303 | { | ||
304 | unsigned long seq; | ||
305 | s64 nsecs_raw, nsecs_real; | ||
306 | |||
307 | WARN_ON_ONCE(timekeeping_suspended); | ||
308 | |||
309 | do { | ||
310 | u32 arch_offset; | ||
311 | |||
312 | seq = read_seqbegin(&xtime_lock); | ||
313 | |||
314 | *ts_raw = raw_time; | ||
315 | *ts_real = xtime; | ||
316 | |||
317 | nsecs_raw = timekeeping_get_ns_raw(); | ||
318 | nsecs_real = timekeeping_get_ns(); | ||
319 | |||
320 | /* If arch requires, add in gettimeoffset() */ | ||
321 | arch_offset = arch_gettimeoffset(); | ||
322 | nsecs_raw += arch_offset; | ||
323 | nsecs_real += arch_offset; | ||
324 | |||
325 | } while (read_seqretry(&xtime_lock, seq)); | ||
326 | |||
327 | timespec_add_ns(ts_raw, nsecs_raw); | ||
328 | timespec_add_ns(ts_real, nsecs_real); | ||
329 | } | ||
330 | EXPORT_SYMBOL(getnstime_raw_and_real); | ||
331 | |||
332 | #endif /* CONFIG_NTP_PPS */ | ||
333 | |||
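A hypothetical consumer ties the two pieces together: a PPS interrupt handler captures both timestamps atomically with getnstime_raw_and_real() and feeds them to hardpps(), phase from REALTIME and frequency from MONOTONIC_RAW. A kernel-context sketch (the handler name and wiring are assumptions; the real binding belongs to the PPS subsystem):

static irqreturn_t pps_pulse_handler(int irq, void *dev_id)
{
    struct timespec ts_raw, ts_real;

    getnstime_raw_and_real(&ts_raw, &ts_real);
    hardpps(&ts_real, &ts_raw);   /* phase_ts, raw_ts */

    return IRQ_HANDLED;
}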
287 | /** | 334 | /** |
288 | * do_gettimeofday - Returns the time of day in a timeval | 335 | * do_gettimeofday - Returns the time of day in a timeval |
289 | * @tv: pointer to the timeval to be set | 336 | * @tv: pointer to the timeval to be set |
@@ -719,7 +766,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
719 | 766 | ||
720 | /* Accumulate error between NTP and clock interval */ | 767 | /* Accumulate error between NTP and clock interval */ |
721 | timekeeper.ntp_error += tick_length << shift; | 768 | timekeeper.ntp_error += tick_length << shift; |
722 | timekeeper.ntp_error -= timekeeper.xtime_interval << | 769 | timekeeper.ntp_error -= |
770 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | ||
723 | (timekeeper.ntp_error_shift + shift); | 771 | (timekeeper.ntp_error_shift + shift); |
724 | 772 | ||
725 | return offset; | 773 | return offset; |
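The arithmetic behind xtime_remainder, as a standalone sketch with made-up numbers: the ideal NTP interval (in shifted nanoseconds) is rounded to a whole number of clocksource cycles and converted back, and whatever that rounding lost or gained used to surface as a spurious, ever-growing ntp_error; carrying the residue in the comparison cancels it.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t ntpinterval = 1000000007ULL;  /* ideal interval, shifted ns */
    uint64_t mult = 3579;                  /* clocksource mult, made up */

    uint64_t cycles = (ntpinterval + mult / 2) / mult;  /* round to cycles */
    uint64_t xtime_interval = cycles * mult;            /* back to shifted ns */
    int64_t remainder = (int64_t)(ntpinterval - xtime_interval);

    printf("interval=%llu remainder=%lld\n",
           (unsigned long long)xtime_interval, (long long)remainder);
    return 0;
}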
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa92..3258455549f4 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) | |||
41 | char symname[KSYM_NAME_LEN]; | 41 | char symname[KSYM_NAME_LEN]; |
42 | 42 | ||
43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) | 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) |
44 | SEQ_printf(m, "<%p>", sym); | 44 | SEQ_printf(m, "<%pK>", sym); |
45 | else | 45 | else |
46 | SEQ_printf(m, "%s", symname); | 46 | SEQ_printf(m, "%s", symname); |
47 | } | 47 | } |
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | |||
79 | { | 79 | { |
80 | struct hrtimer *timer, tmp; | 80 | struct hrtimer *timer, tmp; |
81 | unsigned long next = 0, i; | 81 | unsigned long next = 0, i; |
82 | struct rb_node *curr; | 82 | struct timerqueue_node *curr; |
83 | unsigned long flags; | 83 | unsigned long flags; |
84 | 84 | ||
85 | next_one: | 85 | next_one: |
86 | i = 0; | 86 | i = 0; |
87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); | 87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); |
88 | 88 | ||
89 | curr = base->first; | 89 | curr = timerqueue_getnext(&base->active); |
90 | /* | 90 | /* |
91 | * Crude but we have to do this O(N*N) thing, because | 91 | * Crude but we have to do this O(N*N) thing, because |
92 | * we have to unlock the base when printing: | 92 | * we have to unlock the base when printing: |
93 | */ | 93 | */ |
94 | while (curr && i < next) { | 94 | while (curr && i < next) { |
95 | curr = rb_next(curr); | 95 | curr = timerqueue_iterate_next(curr); |
96 | i++; | 96 | i++; |
97 | } | 97 | } |
98 | 98 | ||
99 | if (curr) { | 99 | if (curr) { |
100 | 100 | ||
101 | timer = rb_entry(curr, struct hrtimer, node); | 101 | timer = container_of(curr, struct hrtimer, node); |
102 | tmp = *timer; | 102 | tmp = *timer; |
103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); | 103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); |
104 | 104 | ||
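With the hrtimer rbtree now wrapped by the timerqueue API, walkers no longer touch rb_node at all; they iterate timerqueue_node entries and recover the embedding hrtimer with container_of(). A kernel-context sketch of the idiom used above (locking elided):

static void walk_active(struct hrtimer_clock_base *base)
{
    struct timerqueue_node *node;

    for (node = timerqueue_getnext(&base->active); node;
         node = timerqueue_iterate_next(node)) {
        struct hrtimer *timer = container_of(node, struct hrtimer, node);
        (void)timer;   /* inspect timer here */
    }
}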
@@ -112,7 +112,7 @@ next_one: | |||
112 | static void | 112 | static void |
113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | 113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) |
114 | { | 114 | { |
115 | SEQ_printf(m, " .base: %p\n", base); | 115 | SEQ_printf(m, " .base: %pK\n", base); |
116 | SEQ_printf(m, " .index: %d\n", | 116 | SEQ_printf(m, " .index: %d\n", |
117 | base->index); | 117 | base->index); |
118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", | 118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", |
diff --git a/kernel/timer.c b/kernel/timer.c index 353b9227c2ec..d6459923d245 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; | |||
88 | EXPORT_SYMBOL(boot_tvec_bases); | 88 | EXPORT_SYMBOL(boot_tvec_bases); |
89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
90 | 90 | ||
91 | /* | ||
92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | ||
93 | * base in timer_list is guaranteed to be zero. Use the LSB to | ||
94 | * indicate whether the timer is deferrable. | ||
95 | * | ||
96 | * A deferrable timer will work normally when the system is busy, but | ||
97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
99 | * subsequent non-deferrable timer. | ||
100 | */ | ||
101 | #define TBASE_DEFERRABLE_FLAG (0x1) | ||
102 | |||
103 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
104 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
105 | { | 93 | { |
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) | |||
113 | 101 | ||
114 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
115 | { | 103 | { |
116 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | | 104 | timer->base = TBASE_MAKE_DEFERRED(timer->base); |
117 | TBASE_DEFERRABLE_FLAG)); | ||
118 | } | 105 | } |
119 | 106 | ||
120 | static inline void | 107 | static inline void |
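TBASE_MAKE_DEFERRED(), now defined alongside the timer declarations, still implements the trick the removed comment described: tvec_base is at least 2-byte aligned, so bit 0 of the base pointer is free to carry the deferrable flag. A standalone sketch of the tagging:

#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL

static void *make_deferred(void *base)
{
    return (void *)((uintptr_t)base | DEFERRABLE_FLAG);
}

static int is_deferrable(void *base)
{
    return ((uintptr_t)base & DEFERRABLE_FLAG) != 0;
}

static void *real_base(void *base)
{
    return (void *)((uintptr_t)base & ~DEFERRABLE_FLAG);
}

int main(void)
{
    static int dummy;             /* stands in for a 2-byte aligned tvec_base */
    void *tagged = make_deferred(&dummy);

    printf("deferrable=%d base_intact=%d\n",
           is_deferrable(tagged), real_base(tagged) == (void *)&dummy);
    return 0;
}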
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) | |||
343 | } | 330 | } |
344 | EXPORT_SYMBOL_GPL(set_timer_slack); | 331 | EXPORT_SYMBOL_GPL(set_timer_slack); |
345 | 332 | ||
346 | |||
347 | static inline void set_running_timer(struct tvec_base *base, | ||
348 | struct timer_list *timer) | ||
349 | { | ||
350 | #ifdef CONFIG_SMP | ||
351 | base->running_timer = timer; | ||
352 | #endif | ||
353 | } | ||
354 | |||
355 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 333 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
356 | { | 334 | { |
357 | unsigned long expires = timer->expires; | 335 | unsigned long expires = timer->expires; |
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer) | |||
936 | } | 914 | } |
937 | EXPORT_SYMBOL(del_timer); | 915 | EXPORT_SYMBOL(del_timer); |
938 | 916 | ||
939 | #ifdef CONFIG_SMP | ||
940 | /** | 917 | /** |
941 | * try_to_del_timer_sync - Try to deactivate a timer | 918 | * try_to_del_timer_sync - Try to deactivate a timer |
942 | * @timer: timer to deactivate | 919 | * @timer: timer to deactivate |
943 | * | 920 | * |
944 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 921 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
945 | * exit the timer is not queued and the handler is not running on any CPU. | 922 | * exit the timer is not queued and the handler is not running on any CPU. |
946 | * | ||
947 | * It must not be called from interrupt contexts. | ||
948 | */ | 923 | */ |
949 | int try_to_del_timer_sync(struct timer_list *timer) | 924 | int try_to_del_timer_sync(struct timer_list *timer) |
950 | { | 925 | { |
@@ -973,6 +948,7 @@ out: | |||
973 | } | 948 | } |
974 | EXPORT_SYMBOL(try_to_del_timer_sync); | 949 | EXPORT_SYMBOL(try_to_del_timer_sync); |
975 | 950 | ||
951 | #ifdef CONFIG_SMP | ||
976 | /** | 952 | /** |
977 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 953 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
978 | * @timer: the timer to be deactivated | 954 | * @timer: the timer to be deactivated |
@@ -1000,7 +976,11 @@ int del_timer_sync(struct timer_list *timer) | |||
1000 | lock_map_release(&timer->lockdep_map); | 976 | lock_map_release(&timer->lockdep_map); |
1001 | local_irq_restore(flags); | 977 | local_irq_restore(flags); |
1002 | #endif | 978 | #endif |
1003 | 979 | /* | |
980 | * don't use it in hardirq context, because it | ||
981 | * could lead to deadlock. | ||
982 | */ | ||
983 | WARN_ON(in_irq()); | ||
1004 | for (;;) { | 984 | for (;;) { |
1005 | int ret = try_to_del_timer_sync(timer); | 985 | int ret = try_to_del_timer_sync(timer); |
1006 | if (ret >= 0) | 986 | if (ret >= 0) |
@@ -1111,7 +1091,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1111 | 1091 | ||
1112 | timer_stats_account_timer(timer); | 1092 | timer_stats_account_timer(timer); |
1113 | 1093 | ||
1114 | set_running_timer(base, timer); | 1094 | base->running_timer = timer; |
1115 | detach_timer(timer, 1); | 1095 | detach_timer(timer, 1); |
1116 | 1096 | ||
1117 | spin_unlock_irq(&base->lock); | 1097 | spin_unlock_irq(&base->lock); |
@@ -1119,7 +1099,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1119 | spin_lock_irq(&base->lock); | 1099 | spin_lock_irq(&base->lock); |
1120 | } | 1100 | } |
1121 | } | 1101 | } |
1122 | set_running_timer(base, NULL); | 1102 | base->running_timer = NULL; |
1123 | spin_unlock_irq(&base->lock); | 1103 | spin_unlock_irq(&base->lock); |
1124 | } | 1104 | } |
1125 | 1105 | ||
@@ -1249,7 +1229,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
1249 | */ | 1229 | */ |
1250 | unsigned long get_next_timer_interrupt(unsigned long now) | 1230 | unsigned long get_next_timer_interrupt(unsigned long now) |
1251 | { | 1231 | { |
1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1232 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1253 | unsigned long expires; | 1233 | unsigned long expires; |
1254 | 1234 | ||
1255 | /* | 1235 | /* |
@@ -1298,7 +1278,7 @@ void update_process_times(int user_tick) | |||
1298 | */ | 1278 | */ |
1299 | static void run_timer_softirq(struct softirq_action *h) | 1279 | static void run_timer_softirq(struct softirq_action *h) |
1300 | { | 1280 | { |
1301 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1281 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1302 | 1282 | ||
1303 | hrtimer_run_pending(); | 1283 | hrtimer_run_pending(); |
1304 | 1284 | ||
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ea37e2ff4164..14674dce77a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -69,6 +69,21 @@ config EVENT_TRACING | |||
69 | select CONTEXT_SWITCH_TRACER | 69 | select CONTEXT_SWITCH_TRACER |
70 | bool | 70 | bool |
71 | 71 | ||
72 | config EVENT_POWER_TRACING_DEPRECATED | ||
73 | depends on EVENT_TRACING | ||
74 | bool "Deprecated power event trace API, to be removed" | ||
75 | default y | ||
76 | help | ||
77 | Provides old power event types: | ||
78 | C-state/idle accounting events: | ||
79 | power:power_start | ||
80 | power:power_end | ||
81 | and old cpufreq accounting event: | ||
82 | power:power_frequency | ||
83 | This is for userspace compatibility | ||
84 | and will vanish after 5 kernel iterations, | ||
85 | namely 2.6.41. | ||
86 | |||
72 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
73 | bool | 88 | bool |
74 | 89 | ||
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b26..761c510a06c5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
52 | endif | 52 | endif |
53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
56 | ifeq ($(CONFIG_TRACING),y) | 56 | ifeq ($(CONFIG_TRACING),y) |
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
58 | endif | 58 | endif |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec0281548..d95721f33702 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) | |||
138 | !blk_tracer_enabled)) | 138 | !blk_tracer_enabled)) |
139 | return; | 139 | return; |
140 | 140 | ||
141 | /* | ||
142 | * If the BLK_TC_NOTIFY action mask isn't set, don't send any note | ||
143 | * message to the trace. | ||
144 | */ | ||
145 | if (!(bt->act_mask & BLK_TC_NOTIFY)) | ||
146 | return; | ||
147 | |||
141 | local_irq_save(flags); | 148 | local_irq_save(flags); |
142 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); | 149 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); |
143 | va_start(args, fmt); | 150 | va_start(args, fmt); |
@@ -758,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
758 | * @q: queue the io is for | 765 | * @q: queue the io is for |
759 | * @bio: the source bio | 766 | * @bio: the source bio |
760 | * @what: the action | 767 | * @what: the action |
768 | * @error: error, if any | ||
761 | * | 769 | * |
762 | * Description: | 770 | * Description: |
763 | * Records an action against a bio. Will log the bio offset + size. | 771 | * Records an action against a bio. Will log the bio offset + size. |
764 | * | 772 | * |
765 | **/ | 773 | **/ |
766 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | 774 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, |
767 | u32 what) | 775 | u32 what, int error) |
768 | { | 776 | { |
769 | struct blk_trace *bt = q->blk_trace; | 777 | struct blk_trace *bt = q->blk_trace; |
770 | 778 | ||
771 | if (likely(!bt)) | 779 | if (likely(!bt)) |
772 | return; | 780 | return; |
773 | 781 | ||
782 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) | ||
783 | error = EIO; | ||
784 | |||
774 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, | 785 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, |
775 | !bio_flagged(bio, BIO_UPTODATE), 0, NULL); | 786 | error, 0, NULL); |
776 | } | 787 | } |
777 | 788 | ||
778 | static void blk_add_trace_bio_bounce(void *ignore, | 789 | static void blk_add_trace_bio_bounce(void *ignore, |
779 | struct request_queue *q, struct bio *bio) | 790 | struct request_queue *q, struct bio *bio) |
780 | { | 791 | { |
781 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); | 792 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
782 | } | 793 | } |
783 | 794 | ||
784 | static void blk_add_trace_bio_complete(void *ignore, | 795 | static void blk_add_trace_bio_complete(void *ignore, |
785 | struct request_queue *q, struct bio *bio) | 796 | struct request_queue *q, struct bio *bio, |
797 | int error) | ||
786 | { | 798 | { |
787 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); | 799 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
788 | } | 800 | } |
789 | 801 | ||
790 | static void blk_add_trace_bio_backmerge(void *ignore, | 802 | static void blk_add_trace_bio_backmerge(void *ignore, |
791 | struct request_queue *q, | 803 | struct request_queue *q, |
792 | struct bio *bio) | 804 | struct bio *bio) |
793 | { | 805 | { |
794 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); | 806 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); |
795 | } | 807 | } |
796 | 808 | ||
797 | static void blk_add_trace_bio_frontmerge(void *ignore, | 809 | static void blk_add_trace_bio_frontmerge(void *ignore, |
798 | struct request_queue *q, | 810 | struct request_queue *q, |
799 | struct bio *bio) | 811 | struct bio *bio) |
800 | { | 812 | { |
801 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); | 813 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); |
802 | } | 814 | } |
803 | 815 | ||
804 | static void blk_add_trace_bio_queue(void *ignore, | 816 | static void blk_add_trace_bio_queue(void *ignore, |
805 | struct request_queue *q, struct bio *bio) | 817 | struct request_queue *q, struct bio *bio) |
806 | { | 818 | { |
807 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE); | 819 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); |
808 | } | 820 | } |
809 | 821 | ||
810 | static void blk_add_trace_getrq(void *ignore, | 822 | static void blk_add_trace_getrq(void *ignore, |
@@ -812,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore, | |||
812 | struct bio *bio, int rw) | 824 | struct bio *bio, int rw) |
813 | { | 825 | { |
814 | if (bio) | 826 | if (bio) |
815 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ); | 827 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); |
816 | else { | 828 | else { |
817 | struct blk_trace *bt = q->blk_trace; | 829 | struct blk_trace *bt = q->blk_trace; |
818 | 830 | ||
@@ -827,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore, | |||
827 | struct bio *bio, int rw) | 839 | struct bio *bio, int rw) |
828 | { | 840 | { |
829 | if (bio) | 841 | if (bio) |
830 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); | 842 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); |
831 | else { | 843 | else { |
832 | struct blk_trace *bt = q->blk_trace; | 844 | struct blk_trace *bt = q->blk_trace; |
833 | 845 | ||
@@ -887,7 +899,7 @@ static void blk_add_trace_split(void *ignore, | |||
887 | } | 899 | } |
888 | 900 | ||
889 | /** | 901 | /** |
890 | * blk_add_trace_remap - Add a trace for a remap operation | 902 | * blk_add_trace_bio_remap - Add a trace for a bio-remap operation |
891 | * @ignore: trace callback data parameter (not used) | 903 | * @ignore: trace callback data parameter (not used) |
892 | * @q: queue the io is for | 904 | * @q: queue the io is for |
893 | * @bio: the source bio | 905 | * @bio: the source bio |
@@ -899,9 +911,9 @@ static void blk_add_trace_split(void *ignore, | |||
899 | * it spans a stripe (or similar). Add a trace for that action. | 911 | * it spans a stripe (or similar). Add a trace for that action. |
900 | * | 912 | * |
901 | **/ | 913 | **/ |
902 | static void blk_add_trace_remap(void *ignore, | 914 | static void blk_add_trace_bio_remap(void *ignore, |
903 | struct request_queue *q, struct bio *bio, | 915 | struct request_queue *q, struct bio *bio, |
904 | dev_t dev, sector_t from) | 916 | dev_t dev, sector_t from) |
905 | { | 917 | { |
906 | struct blk_trace *bt = q->blk_trace; | 918 | struct blk_trace *bt = q->blk_trace; |
907 | struct blk_io_trace_remap r; | 919 | struct blk_io_trace_remap r; |
@@ -1016,7 +1028,7 @@ static void blk_register_tracepoints(void) | |||
1016 | WARN_ON(ret); | 1028 | WARN_ON(ret); |
1017 | ret = register_trace_block_split(blk_add_trace_split, NULL); | 1029 | ret = register_trace_block_split(blk_add_trace_split, NULL); |
1018 | WARN_ON(ret); | 1030 | WARN_ON(ret); |
1019 | ret = register_trace_block_remap(blk_add_trace_remap, NULL); | 1031 | ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1020 | WARN_ON(ret); | 1032 | WARN_ON(ret); |
1021 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1033 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1022 | WARN_ON(ret); | 1034 | WARN_ON(ret); |
@@ -1025,7 +1037,7 @@ static void blk_register_tracepoints(void) | |||
1025 | static void blk_unregister_tracepoints(void) | 1037 | static void blk_unregister_tracepoints(void) |
1026 | { | 1038 | { |
1027 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1039 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1028 | unregister_trace_block_remap(blk_add_trace_remap, NULL); | 1040 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1029 | unregister_trace_block_split(blk_add_trace_split, NULL); | 1041 | unregister_trace_block_split(blk_add_trace_split, NULL); |
1030 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | 1042 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); |
1031 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | 1043 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); |
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a06161..f55fcf61b223 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
@@ -13,5 +13,8 @@ | |||
13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
15 | 15 | ||
16 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); | 16 | #ifdef EVENT_POWER_TRACING_DEPRECATED |
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
18 | #endif | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | ||
17 | 20 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959bad45..dc53ecb80589 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1313 | 1313 | ||
1314 | __this_cpu_inc(user_stack_count); | 1314 | __this_cpu_inc(user_stack_count); |
1315 | 1315 | ||
1316 | |||
1317 | |||
1318 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1316 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
1319 | sizeof(*entry), flags, pc); | 1317 | sizeof(*entry), flags, pc); |
1320 | if (!event) | 1318 | if (!event) |
1321 | return; | 1319 | goto out_drop_count; |
1322 | entry = ring_buffer_event_data(event); | 1320 | entry = ring_buffer_event_data(event); |
1323 | 1321 | ||
1324 | entry->tgid = current->tgid; | 1322 | entry->tgid = current->tgid; |
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1333 | if (!filter_check_discard(call, entry, buffer, event)) | 1331 | if (!filter_check_discard(call, entry, buffer, event)) |
1334 | ring_buffer_unlock_commit(buffer, event); | 1332 | ring_buffer_unlock_commit(buffer, event); |
1335 | 1333 | ||
1334 | out_drop_count: | ||
1336 | __this_cpu_dec(user_stack_count); | 1335 | __this_cpu_dec(user_stack_count); |
1337 | |||
1338 | out: | 1336 | out: |
1339 | preempt_enable(); | 1337 | preempt_enable(); |
1340 | } | 1338 | } |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e6..6cf223764be8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -53,7 +53,7 @@ | |||
53 | */ | 53 | */ |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * Function trace entry - function address and parent function addres: | 56 | * Function trace entry - function address and parent function address: |
57 | */ | 57 | */ |
58 | FTRACE_ENTRY(function, ftrace_entry, | 58 | FTRACE_ENTRY(function, ftrace_entry, |
59 | 59 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670e..19a359d5e6d5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | |||
21 | /* Count the events in use (per event id, not per instance) */ | 21 | /* Count the events in use (per event id, not per instance) */ |
22 | static int total_ref_count; | 22 | static int total_ref_count; |
23 | 23 | ||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | ||
25 | struct perf_event *p_event) | ||
26 | { | ||
27 | /* No tracing, just counting, so no obvious leak */ | ||
28 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | ||
29 | return 0; | ||
30 | |||
31 | /* Some events are ok to be traced by non-root users... */ | ||
32 | if (p_event->attach_state == PERF_ATTACH_TASK) { | ||
33 | if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | /* | ||
38 | * ...otherwise raw tracepoint data can be a severe data leak; | ||
39 | * only allow root to have these. | ||
40 | */ | ||
41 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | ||
42 | return -EPERM; | ||
43 | |||
44 | return 0; | ||
45 | } | ||
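The gate reduces to four cases; a standalone restatement (the names mirror the kernel flags, but the types are simplified for illustration):

enum { ALLOW = 0, DENY = -1 };

static int perm_sketch(int wants_raw, int per_task, int cap_any,
                       int paranoid, int is_admin)
{
    if (!wants_raw)
        return ALLOW;        /* counting only, nothing obvious leaks */
    if (per_task && cap_any)
        return ALLOW;        /* event explicitly opted in for any task */
    if (paranoid && !is_admin)
        return DENY;         /* raw samples can leak kernel data */
    return ALLOW;
}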
46 | |||
24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 47 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 48 | struct perf_event *p_event) |
26 | { | 49 | { |
27 | struct hlist_head __percpu *list; | 50 | struct hlist_head __percpu *list; |
28 | int ret = -ENOMEM; | 51 | int ret; |
29 | int cpu; | 52 | int cpu; |
30 | 53 | ||
54 | ret = perf_trace_event_perm(tp_event, p_event); | ||
55 | if (ret) | ||
56 | return ret; | ||
57 | |||
31 | p_event->tp_event = tp_event; | 58 | p_event->tp_event = tp_event; |
32 | if (tp_event->perf_refcount++ > 0) | 59 | if (tp_event->perf_refcount++ > 0) |
33 | return 0; | 60 | return 0; |
34 | 61 | ||
62 | ret = -ENOMEM; | ||
63 | |||
35 | list = alloc_percpu(struct hlist_head); | 64 | list = alloc_percpu(struct hlist_head); |
36 | if (!list) | 65 | if (!list) |
37 | goto fail; | 66 | goto fail; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab1937..5f499e0438a4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -27,6 +27,12 @@ | |||
27 | 27 | ||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | DEFINE_MUTEX(event_storage_mutex); | ||
31 | EXPORT_SYMBOL_GPL(event_storage_mutex); | ||
32 | |||
33 | char event_storage[EVENT_STORAGE_SIZE]; | ||
34 | EXPORT_SYMBOL_GPL(event_storage); | ||
35 | |||
30 | LIST_HEAD(ftrace_events); | 36 | LIST_HEAD(ftrace_events); |
31 | LIST_HEAD(ftrace_common_fields); | 37 | LIST_HEAD(ftrace_common_fields); |
32 | 38 | ||
@@ -1278,7 +1284,7 @@ trace_create_file_ops(struct module *mod) | |||
1278 | static void trace_module_add_events(struct module *mod) | 1284 | static void trace_module_add_events(struct module *mod) |
1279 | { | 1285 | { |
1280 | struct ftrace_module_file_ops *file_ops = NULL; | 1286 | struct ftrace_module_file_ops *file_ops = NULL; |
1281 | struct ftrace_event_call *call, *start, *end; | 1287 | struct ftrace_event_call **call, **start, **end; |
1282 | 1288 | ||
1283 | start = mod->trace_events; | 1289 | start = mod->trace_events; |
1284 | end = mod->trace_events + mod->num_trace_events; | 1290 | end = mod->trace_events + mod->num_trace_events; |
@@ -1291,7 +1297,7 @@ static void trace_module_add_events(struct module *mod) | |||
1291 | return; | 1297 | return; |
1292 | 1298 | ||
1293 | for_each_event(call, start, end) { | 1299 | for_each_event(call, start, end) { |
1294 | __trace_add_event_call(call, mod, | 1300 | __trace_add_event_call(*call, mod, |
1295 | &file_ops->id, &file_ops->enable, | 1301 | &file_ops->id, &file_ops->enable, |
1296 | &file_ops->filter, &file_ops->format); | 1302 | &file_ops->filter, &file_ops->format); |
1297 | } | 1303 | } |
@@ -1361,8 +1367,8 @@ static struct notifier_block trace_module_nb = { | |||
1361 | .priority = 0, | 1367 | .priority = 0, |
1362 | }; | 1368 | }; |
1363 | 1369 | ||
1364 | extern struct ftrace_event_call __start_ftrace_events[]; | 1370 | extern struct ftrace_event_call *__start_ftrace_events[]; |
1365 | extern struct ftrace_event_call __stop_ftrace_events[]; | 1371 | extern struct ftrace_event_call *__stop_ftrace_events[]; |
1366 | 1372 | ||
1367 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; | 1373 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; |
1368 | 1374 | ||
@@ -1378,7 +1384,7 @@ __setup("trace_event=", setup_trace_event); | |||
1378 | 1384 | ||
1379 | static __init int event_trace_init(void) | 1385 | static __init int event_trace_init(void) |
1380 | { | 1386 | { |
1381 | struct ftrace_event_call *call; | 1387 | struct ftrace_event_call **call; |
1382 | struct dentry *d_tracer; | 1388 | struct dentry *d_tracer; |
1383 | struct dentry *entry; | 1389 | struct dentry *entry; |
1384 | struct dentry *d_events; | 1390 | struct dentry *d_events; |
@@ -1424,7 +1430,7 @@ static __init int event_trace_init(void) | |||
1424 | pr_warning("tracing: Failed to allocate common fields"); | 1430 | pr_warning("tracing: Failed to allocate common fields"); |
1425 | 1431 | ||
1426 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1432 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
1427 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, | 1433 | __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, |
1428 | &ftrace_enable_fops, | 1434 | &ftrace_enable_fops, |
1429 | &ftrace_event_filter_fops, | 1435 | &ftrace_event_filter_fops, |
1430 | &ftrace_event_format_fops); | 1436 | &ftrace_event_format_fops); |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac25..bbeec31e0ae3 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
83 | 83 | ||
84 | #undef __array | 84 | #undef __array |
85 | #define __array(type, item, len) \ | 85 | #define __array(type, item, len) \ |
86 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ | 86 | do { \ |
87 | ret = trace_define_field(event_call, #type "[" #len "]", #item, \ | 87 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ |
88 | mutex_lock(&event_storage_mutex); \ | ||
89 | snprintf(event_storage, sizeof(event_storage), \ | ||
90 | "%s[%d]", #type, len); \ | ||
91 | ret = trace_define_field(event_call, event_storage, #item, \ | ||
88 | offsetof(typeof(field), item), \ | 92 | offsetof(typeof(field), item), \ |
89 | sizeof(field.item), \ | 93 | sizeof(field.item), \ |
90 | is_signed_type(type), FILTER_OTHER); \ | 94 | is_signed_type(type), FILTER_OTHER); \ |
91 | if (ret) \ | 95 | mutex_unlock(&event_storage_mutex); \ |
92 | return ret; | 96 | if (ret) \ |
97 | return ret; \ | ||
98 | } while (0); | ||
93 | 99 | ||
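The reason for the snprintf() dance: the #len operator stringizes the macro argument's token text, not its value, so a symbolic length put the literal symbol name into the field-type string. A standalone demonstration:

#include <stdio.h>

#define LEN (4 + 4)
#define TYPE_STR(type, len) #type "[" #len "]"

int main(void)
{
    char buf[32];

    printf("%s\n", TYPE_STR(unsigned long, LEN));   /* unsigned long[LEN] */
    snprintf(buf, sizeof(buf), "%s[%d]", "unsigned long", LEN);
    printf("%s\n", buf);                            /* unsigned long[8] */
    return 0;
}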
94 | #undef __array_desc | 100 | #undef __array_desc |
95 | #define __array_desc(type, container, item, len) \ | 101 | #define __array_desc(type, container, item, len) \ |
@@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \ | |||
155 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ | 161 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
156 | }; \ | 162 | }; \ |
157 | \ | 163 | \ |
158 | struct ftrace_event_call __used \ | 164 | struct ftrace_event_call __used event_##call = { \ |
159 | __attribute__((__aligned__(4))) \ | ||
160 | __attribute__((section("_ftrace_events"))) event_##call = { \ | ||
161 | .name = #call, \ | 165 | .name = #call, \ |
162 | .event.type = etype, \ | 166 | .event.type = etype, \ |
163 | .class = &event_class_ftrace_##call, \ | 167 | .class = &event_class_ftrace_##call, \ |
164 | .print_fmt = print, \ | 168 | .print_fmt = print, \ |
165 | }; \ | 169 | }; \ |
170 | struct ftrace_event_call __used \ | ||
171 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | ||
166 | 172 | ||
167 | #include "trace_entries.h" | 173 | #include "trace_entries.h" |
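Storing pointers in the _ftrace_events section, rather than the structs themselves, means section iteration no longer depends on the compiler's struct alignment and padding; the boot and module paths above just walk an array of pointers. A kernel-context sketch (do_register() is a hypothetical helper):

extern struct ftrace_event_call *__start_ftrace_events[];
extern struct ftrace_event_call *__stop_ftrace_events[];

static void register_all(void)
{
    struct ftrace_event_call **call;

    for (call = __start_ftrace_events; call < __stop_ftrace_events; call++)
        do_register(*call);   /* hypothetical per-event registration */
}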
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) | |||
453 | * Stubs: | 453 | * Stubs: |
454 | */ | 454 | */ |
455 | 455 | ||
456 | void early_boot_irqs_off(void) | ||
457 | { | ||
458 | } | ||
459 | |||
460 | void early_boot_irqs_on(void) | ||
461 | { | ||
462 | } | ||
463 | |||
464 | void trace_softirqs_on(unsigned long ip) | 456 | void trace_softirqs_on(unsigned long ip) |
465 | { | 457 | { |
466 | } | 458 | } |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b3209..659732eba07c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
558 | static int trace_wakeup_test_thread(void *data) | 558 | static int trace_wakeup_test_thread(void *data) |
559 | { | 559 | { |
560 | /* Make this a RT thread, doesn't need to be too high */ | 560 | /* Make this a RT thread, doesn't need to be too high */ |
561 | struct sched_param param = { .sched_priority = 5 }; | 561 | static const struct sched_param param = { .sched_priority = 5 }; |
562 | struct completion *x = data; | 562 | struct completion *x = data; |
563 | 563 | ||
564 | sched_setscheduler(current, SCHED_FIFO, &param); | 564 | sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb5..5c9fe08d2093 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
25 | 25 | ||
26 | /* All syscall exit events have the same fields */ | ||
27 | static LIST_HEAD(syscall_exit_fields); | ||
28 | |||
29 | static struct list_head * | 26 | static struct list_head * |
30 | syscall_get_enter_fields(struct ftrace_event_call *call) | 27 | syscall_get_enter_fields(struct ftrace_event_call *call) |
31 | { | 28 | { |
@@ -34,50 +31,45 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
34 | return &entry->enter_fields; | 31 | return &entry->enter_fields; |
35 | } | 32 | } |
36 | 33 | ||
37 | static struct list_head * | ||
38 | syscall_get_exit_fields(struct ftrace_event_call *call) | ||
39 | { | ||
40 | return &syscall_exit_fields; | ||
41 | } | ||
42 | |||
43 | struct trace_event_functions enter_syscall_print_funcs = { | 34 | struct trace_event_functions enter_syscall_print_funcs = { |
44 | .trace = print_syscall_enter, | 35 | .trace = print_syscall_enter, |
45 | }; | 36 | }; |
46 | 37 | ||
47 | struct trace_event_functions exit_syscall_print_funcs = { | 38 | struct trace_event_functions exit_syscall_print_funcs = { |
48 | .trace = print_syscall_exit, | 39 | .trace = print_syscall_exit, |
49 | }; | 40 | }; |
50 | 41 | ||
51 | struct ftrace_event_class event_class_syscall_enter = { | 42 | struct ftrace_event_class event_class_syscall_enter = { |
52 | .system = "syscalls", | 43 | .system = "syscalls", |
53 | .reg = syscall_enter_register, | 44 | .reg = syscall_enter_register, |
54 | .define_fields = syscall_enter_define_fields, | 45 | .define_fields = syscall_enter_define_fields, |
55 | .get_fields = syscall_get_enter_fields, | 46 | .get_fields = syscall_get_enter_fields, |
56 | .raw_init = init_syscall_trace, | 47 | .raw_init = init_syscall_trace, |
57 | }; | 48 | }; |
58 | 49 | ||
59 | struct ftrace_event_class event_class_syscall_exit = { | 50 | struct ftrace_event_class event_class_syscall_exit = { |
60 | .system = "syscalls", | 51 | .system = "syscalls", |
61 | .reg = syscall_exit_register, | 52 | .reg = syscall_exit_register, |
62 | .define_fields = syscall_exit_define_fields, | 53 | .define_fields = syscall_exit_define_fields, |
63 | .get_fields = syscall_get_exit_fields, | 54 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), |
64 | .raw_init = init_syscall_trace, | 55 | .raw_init = init_syscall_trace, |
65 | }; | 56 | }; |
66 | 57 | ||
67 | extern unsigned long __start_syscalls_metadata[]; | 58 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
68 | extern unsigned long __stop_syscalls_metadata[]; | 59 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
69 | 60 | ||
70 | static struct syscall_metadata **syscalls_metadata; | 61 | static struct syscall_metadata **syscalls_metadata; |
71 | 62 | ||
72 | static struct syscall_metadata *find_syscall_meta(unsigned long syscall) | 63 | static __init struct syscall_metadata * |
64 | find_syscall_meta(unsigned long syscall) | ||
73 | { | 65 | { |
74 | struct syscall_metadata *start; | 66 | struct syscall_metadata **start; |
75 | struct syscall_metadata *stop; | 67 | struct syscall_metadata **stop; |
76 | char str[KSYM_SYMBOL_LEN]; | 68 | char str[KSYM_SYMBOL_LEN]; |
77 | 69 | ||
78 | 70 | ||
79 | start = (struct syscall_metadata *)__start_syscalls_metadata; | 71 | start = __start_syscalls_metadata; |
80 | stop = (struct syscall_metadata *)__stop_syscalls_metadata; | 72 | stop = __stop_syscalls_metadata; |
81 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); | 73 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); |
82 | 74 | ||
83 | for ( ; start < stop; start++) { | 75 | for ( ; start < stop; start++) { |
@@ -87,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall) | |||
87 | * with "SyS" instead of "sys", leading to an unwanted | 79 | * with "SyS" instead of "sys", leading to an unwanted |
88 | * mismatch. | 80 | * mismatch. |
89 | */ | 81 | */ |
90 | if (start->name && !strcmp(start->name + 3, str + 3)) | 82 | if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) |
91 | return start; | 83 | return *start; |
92 | } | 84 | } |
93 | return NULL; | 85 | return NULL; |
94 | } | 86 | } |
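find_syscall_meta() above now walks an array of pointers and matches names with the first three characters skipped, so an architecture's "SyS_" wrapper symbol still matches its "sys_" metadata. A small sketch of that comparison, with a static table standing in for the linker-provided section bounds:

	#include <stdio.h>
	#include <string.h>

	struct meta { const char *name; };

	static struct meta read_meta  = { "sys_read"  };
	static struct meta write_meta = { "sys_write" };

	/* Stand-in for the __start/__stop_syscalls_metadata section bounds. */
	static struct meta *table[] = { &read_meta, &write_meta };

	static struct meta *find_meta(const char *sym)
	{
		for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
			/*
			 * Skip the first 3 chars on both sides so a "SyS_"
			 * wrapper symbol still matches its "sys_" metadata.
			 */
			if (table[i]->name && !strcmp(table[i]->name + 3, sym + 3))
				return table[i];
		}
		return NULL;
	}

	int main(void)
	{
		struct meta *m = find_meta("SyS_write");
		printf("%s\n", m ? m->name : "(none)");	/* prints sys_write */
		return 0;
	}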
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e95ee7f31d43..68187af4889e 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -27,8 +27,8 @@ | |||
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/jump_label.h> | 28 | #include <linux/jump_label.h> |
29 | 29 | ||
30 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint * const __start___tracepoints_ptrs[]; |
31 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint * const __stop___tracepoints_ptrs[]; |
32 | 32 | ||
33 | /* Set to 1 to enable tracepoint debug output */ | 33 | /* Set to 1 to enable tracepoint debug output */ |
34 | static const int tracepoint_debug; | 34 | static const int tracepoint_debug; |
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
298 | * | 298 | * |
299 | * Updates the probe callback corresponding to a range of tracepoints. | 299 | * Updates the probe callback corresponding to a range of tracepoints. |
300 | */ | 300 | */ |
301 | void | 301 | void tracepoint_update_probe_range(struct tracepoint * const *begin, |
302 | tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | 302 | struct tracepoint * const *end) |
303 | { | 303 | { |
304 | struct tracepoint *iter; | 304 | struct tracepoint * const *iter; |
305 | struct tracepoint_entry *mark_entry; | 305 | struct tracepoint_entry *mark_entry; |
306 | 306 | ||
307 | if (!begin) | 307 | if (!begin) |
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
309 | 309 | ||
310 | mutex_lock(&tracepoints_mutex); | 310 | mutex_lock(&tracepoints_mutex); |
311 | for (iter = begin; iter < end; iter++) { | 311 | for (iter = begin; iter < end; iter++) { |
312 | mark_entry = get_tracepoint(iter->name); | 312 | mark_entry = get_tracepoint((*iter)->name); |
313 | if (mark_entry) { | 313 | if (mark_entry) { |
314 | set_tracepoint(&mark_entry, iter, | 314 | set_tracepoint(&mark_entry, *iter, |
315 | !!mark_entry->refcount); | 315 | !!mark_entry->refcount); |
316 | } else { | 316 | } else { |
317 | disable_tracepoint(iter); | 317 | disable_tracepoint(*iter); |
318 | } | 318 | } |
319 | } | 319 | } |
320 | mutex_unlock(&tracepoints_mutex); | 320 | mutex_unlock(&tracepoints_mutex); |
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
326 | static void tracepoint_update_probes(void) | 326 | static void tracepoint_update_probes(void) |
327 | { | 327 | { |
328 | /* Core kernel tracepoints */ | 328 | /* Core kernel tracepoints */ |
329 | tracepoint_update_probe_range(__start___tracepoints, | 329 | tracepoint_update_probe_range(__start___tracepoints_ptrs, |
330 | __stop___tracepoints); | 330 | __stop___tracepoints_ptrs); |
331 | /* tracepoints in modules. */ | 331 | /* tracepoints in modules. */ |
332 | module_update_tracepoints(); | 332 | module_update_tracepoints(); |
333 | } | 333 | } |
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); | |||
514 | * Will return the first tracepoint in the range if the input tracepoint is | 514 | * Will return the first tracepoint in the range if the input tracepoint is |
515 | * NULL. | 515 | * NULL. |
516 | */ | 516 | */ |
517 | int tracepoint_get_iter_range(struct tracepoint **tracepoint, | 517 | int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, |
518 | struct tracepoint *begin, struct tracepoint *end) | 518 | struct tracepoint * const *begin, struct tracepoint * const *end) |
519 | { | 519 | { |
520 | if (!*tracepoint && begin != end) { | 520 | if (!*tracepoint && begin != end) { |
521 | *tracepoint = begin; | 521 | *tracepoint = begin; |
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) | |||
534 | /* Core kernel tracepoints */ | 534 | /* Core kernel tracepoints */ |
535 | if (!iter->module) { | 535 | if (!iter->module) { |
536 | found = tracepoint_get_iter_range(&iter->tracepoint, | 536 | found = tracepoint_get_iter_range(&iter->tracepoint, |
537 | __start___tracepoints, __stop___tracepoints); | 537 | __start___tracepoints_ptrs, |
538 | __stop___tracepoints_ptrs); | ||
538 | if (found) | 539 | if (found) |
539 | goto end; | 540 | goto end; |
540 | } | 541 | } |
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self, | |||
585 | switch (val) { | 586 | switch (val) { |
586 | case MODULE_STATE_COMING: | 587 | case MODULE_STATE_COMING: |
587 | case MODULE_STATE_GOING: | 588 | case MODULE_STATE_GOING: |
588 | tracepoint_update_probe_range(mod->tracepoints, | 589 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
589 | mod->tracepoints + mod->num_tracepoints); | 590 | mod->tracepoints_ptrs + mod->num_tracepoints); |
590 | break; | 591 | break; |
591 | } | 592 | } |
592 | return 0; | 593 | return 0; |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291a..9da289c34f22 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -12,6 +12,8 @@ | |||
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | 14 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | ||
16 | |||
15 | /* | 17 | /* |
16 | * Create a new user namespace, deriving the creator from the user in the | 18 | * Create a new user namespace, deriving the creator from the user in the |
17 | * passed credentials, and replacing that user with the new root user for the | 19 | * passed credentials, and replacing that user with the new root user for the |
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) | |||
26 | struct user_struct *root_user; | 28 | struct user_struct *root_user; |
27 | int n; | 29 | int n; |
28 | 30 | ||
29 | ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); | 31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); |
30 | if (!ns) | 32 | if (!ns) |
31 | return -ENOMEM; | 33 | return -ENOMEM; |
32 | 34 | ||
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) | |||
38 | /* Alloc new root user. */ | 40 | /* Alloc new root user. */ |
39 | root_user = alloc_uid(ns, 0); | 41 | root_user = alloc_uid(ns, 0); |
40 | if (!root_user) { | 42 | if (!root_user) { |
41 | kfree(ns); | 43 | kmem_cache_free(user_ns_cachep, ns); |
42 | return -ENOMEM; | 44 | return -ENOMEM; |
43 | } | 45 | } |
44 | 46 | ||
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) | |||
71 | struct user_namespace *ns = | 73 | struct user_namespace *ns = |
72 | container_of(work, struct user_namespace, destroyer); | 74 | container_of(work, struct user_namespace, destroyer); |
73 | free_uid(ns->creator); | 75 | free_uid(ns->creator); |
74 | kfree(ns); | 76 | kmem_cache_free(user_ns_cachep, ns); |
75 | } | 77 | } |
76 | 78 | ||
77 | void free_user_ns(struct kref *kref) | 79 | void free_user_ns(struct kref *kref) |
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t | |||
126 | /* No useful relationship so no mapping */ | 128 | /* No useful relationship so no mapping */ |
127 | return overflowgid; | 129 | return overflowgid; |
128 | } | 130 | } |
131 | |||
132 | static __init int user_namespaces_init(void) | ||
133 | { | ||
134 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); | ||
135 | return 0; | ||
136 | } | ||
137 | module_init(user_namespaces_init); | ||
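The user_namespace hunks replace kmalloc()/kfree() with a dedicated slab cache created once at init; KMEM_CACHE(user_namespace, SLAB_PANIC) expands to a kmem_cache_create() call sized for the struct. A toy userspace analogue of what a per-type object cache buys, sketched as a free-list allocator (simplified far beyond the real slab machinery):

	#include <stdio.h>
	#include <stdlib.h>

	/*
	 * A toy per-type object cache: freed objects park on a free list and
	 * are handed straight back on the next allocation, which is the core
	 * idea a kmem_cache provides (minus slabs, ctors, per-CPU magazines).
	 */
	struct obj_cache {
		size_t size;
		void *free_list;	/* first word of a free object = next */
	};

	static struct obj_cache *cache_create(size_t size)
	{
		struct obj_cache *c = malloc(sizeof(*c));

		if (c) {
			c->size = size < sizeof(void *) ? sizeof(void *) : size;
			c->free_list = NULL;
		}
		return c;
	}

	static void *cache_alloc(struct obj_cache *c)
	{
		void *p = c->free_list;

		if (p) {
			c->free_list = *(void **)p;	/* pop a recycled object */
			return p;
		}
		return malloc(c->size);
	}

	static void cache_free(struct obj_cache *c, void *p)
	{
		*(void **)p = c->free_list;		/* push for reuse */
		c->free_list = p;
	}

	struct user_ns { int level; };

	int main(void)
	{
		struct obj_cache *c = cache_create(sizeof(struct user_ns));
		struct user_ns *ns = c ? cache_alloc(c) : NULL;

		if (!ns)
			return 1;
		ns->level = 1;
		printf("ns level %d\n", ns->level);
		cache_free(c, ns);	/* recycled, not returned to malloc */
		return 0;
	}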
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5b082156cd21..18bb15776c57 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
29 | 29 | ||
30 | int watchdog_enabled; | 30 | int watchdog_enabled = 1; |
31 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly softlockup_thresh = 60; |
32 | 32 | ||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | |||
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | static int no_watchdog; | ||
47 | |||
48 | |||
49 | /* boot commands */ | 46 | /* boot commands */ |
50 | /* | 47 | /* |
51 | * Should we panic when a soft-lockup or hard-lockup occurs: | 48 | * Should we panic when a soft-lockup or hard-lockup occurs: |
@@ -57,6 +54,8 @@ static int __init hardlockup_panic_setup(char *str) | |||
57 | { | 54 | { |
58 | if (!strncmp(str, "panic", 5)) | 55 | if (!strncmp(str, "panic", 5)) |
59 | hardlockup_panic = 1; | 56 | hardlockup_panic = 1; |
57 | else if (!strncmp(str, "0", 1)) | ||
58 | watchdog_enabled = 0; | ||
60 | return 1; | 59 | return 1; |
61 | } | 60 | } |
62 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 61 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
@@ -75,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
75 | 74 | ||
76 | static int __init nowatchdog_setup(char *str) | 75 | static int __init nowatchdog_setup(char *str) |
77 | { | 76 | { |
78 | no_watchdog = 1; | 77 | watchdog_enabled = 0; |
79 | return 1; | 78 | return 1; |
80 | } | 79 | } |
81 | __setup("nowatchdog", nowatchdog_setup); | 80 | __setup("nowatchdog", nowatchdog_setup); |
@@ -83,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup); | |||
83 | /* deprecated */ | 82 | /* deprecated */ |
84 | static int __init nosoftlockup_setup(char *str) | 83 | static int __init nosoftlockup_setup(char *str) |
85 | { | 84 | { |
86 | no_watchdog = 1; | 85 | watchdog_enabled = 0; |
87 | return 1; | 86 | return 1; |
88 | } | 87 | } |
89 | __setup("nosoftlockup", nosoftlockup_setup); | 88 | __setup("nosoftlockup", nosoftlockup_setup); |
@@ -116,12 +115,12 @@ static void __touch_watchdog(void) | |||
116 | { | 115 | { |
117 | int this_cpu = smp_processor_id(); | 116 | int this_cpu = smp_processor_id(); |
118 | 117 | ||
119 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | 118 | __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); |
120 | } | 119 | } |
121 | 120 | ||
122 | void touch_softlockup_watchdog(void) | 121 | void touch_softlockup_watchdog(void) |
123 | { | 122 | { |
124 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | 123 | __this_cpu_write(watchdog_touch_ts, 0); |
125 | } | 124 | } |
126 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 125 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
127 | 126 | ||
@@ -165,12 +164,12 @@ void touch_softlockup_watchdog_sync(void) | |||
165 | /* watchdog detector functions */ | 164 | /* watchdog detector functions */ |
166 | static int is_hardlockup(void) | 165 | static int is_hardlockup(void) |
167 | { | 166 | { |
168 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | 167 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
169 | 168 | ||
170 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | 169 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
171 | return 1; | 170 | return 1; |
172 | 171 | ||
173 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | 172 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
174 | return 0; | 173 | return 0; |
175 | } | 174 | } |
176 | #endif | 175 | #endif |
@@ -203,8 +202,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
203 | /* Ensure the watchdog never gets throttled */ | 202 | /* Ensure the watchdog never gets throttled */ |
204 | event->hw.interrupts = 0; | 203 | event->hw.interrupts = 0; |
205 | 204 | ||
206 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | 205 | if (__this_cpu_read(watchdog_nmi_touch) == true) { |
207 | __get_cpu_var(watchdog_nmi_touch) = false; | 206 | __this_cpu_write(watchdog_nmi_touch, false); |
208 | return; | 207 | return; |
209 | } | 208 | } |
210 | 209 | ||
@@ -218,7 +217,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
218 | int this_cpu = smp_processor_id(); | 217 | int this_cpu = smp_processor_id(); |
219 | 218 | ||
220 | /* only print hardlockups once */ | 219 | /* only print hardlockups once */ |
221 | if (__get_cpu_var(hard_watchdog_warn) == true) | 220 | if (__this_cpu_read(hard_watchdog_warn) == true) |
222 | return; | 221 | return; |
223 | 222 | ||
224 | if (hardlockup_panic) | 223 | if (hardlockup_panic) |
@@ -226,16 +225,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
226 | else | 225 | else |
227 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 226 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); |
228 | 227 | ||
229 | __get_cpu_var(hard_watchdog_warn) = true; | 228 | __this_cpu_write(hard_watchdog_warn, true); |
230 | return; | 229 | return; |
231 | } | 230 | } |
232 | 231 | ||
233 | __get_cpu_var(hard_watchdog_warn) = false; | 232 | __this_cpu_write(hard_watchdog_warn, false); |
234 | return; | 233 | return; |
235 | } | 234 | } |
236 | static void watchdog_interrupt_count(void) | 235 | static void watchdog_interrupt_count(void) |
237 | { | 236 | { |
238 | __get_cpu_var(hrtimer_interrupts)++; | 237 | __this_cpu_inc(hrtimer_interrupts); |
239 | } | 238 | } |
240 | #else | 239 | #else |
241 | static inline void watchdog_interrupt_count(void) { return; } | 240 | static inline void watchdog_interrupt_count(void) { return; } |
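The watchdog hunks convert __get_cpu_var() reads and writes into the __this_cpu_read()/__this_cpu_write()/__this_cpu_inc() accessors, which can compile down to single segment-relative instructions on x86. The closest userspace analogue is thread-local storage; a sketch of the is_hardlockup() progress check over a __thread counter (the names mirror the hunk, the threading scaffolding is illustrative):

	#include <pthread.h>
	#include <stdio.h>

	/* Userspace stand-in for a per-CPU counter: one instance per thread. */
	static __thread unsigned long hrtimer_interrupts;
	static __thread unsigned long hrtimer_interrupts_saved;

	/* Analogue of is_hardlockup(): no new "interrupts" since last check? */
	static int is_stuck(void)
	{
		unsigned long hrint = hrtimer_interrupts;	/* this_cpu_read */

		if (hrtimer_interrupts_saved == hrint)
			return 1;

		hrtimer_interrupts_saved = hrint;		/* this_cpu_write */
		return 0;
	}

	static void *worker(void *arg)
	{
		hrtimer_interrupts++;			/* this_cpu_inc */
		printf("stuck? %d\n", is_stuck());	/* 0: counter advanced */
		printf("stuck? %d\n", is_stuck());	/* 1: no progress since */
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, worker, NULL);
		pthread_join(t, NULL);
		return 0;
	}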
@@ -244,7 +243,7 @@ static inline void watchdog_interrupt_count(void) { return; } | |||
244 | /* watchdog kicker functions */ | 243 | /* watchdog kicker functions */ |
245 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 244 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
246 | { | 245 | { |
247 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | 246 | unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); |
248 | struct pt_regs *regs = get_irq_regs(); | 247 | struct pt_regs *regs = get_irq_regs(); |
249 | int duration; | 248 | int duration; |
250 | 249 | ||
@@ -252,18 +251,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
252 | watchdog_interrupt_count(); | 251 | watchdog_interrupt_count(); |
253 | 252 | ||
254 | /* kick the softlockup detector */ | 253 | /* kick the softlockup detector */ |
255 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | 254 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
256 | 255 | ||
257 | /* .. and repeat */ | 256 | /* .. and repeat */ |
258 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 257 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); |
259 | 258 | ||
260 | if (touch_ts == 0) { | 259 | if (touch_ts == 0) { |
261 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | 260 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
262 | /* | 261 | /* |
263 | * If the time stamp was touched atomically | 262 | * If the time stamp was touched atomically |
264 | * make sure the scheduler tick is up to date. | 263 | * make sure the scheduler tick is up to date. |
265 | */ | 264 | */ |
266 | __get_cpu_var(softlockup_touch_sync) = false; | 265 | __this_cpu_write(softlockup_touch_sync, false); |
267 | sched_clock_tick(); | 266 | sched_clock_tick(); |
268 | } | 267 | } |
269 | __touch_watchdog(); | 268 | __touch_watchdog(); |
@@ -279,7 +278,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
279 | duration = is_softlockup(touch_ts); | 278 | duration = is_softlockup(touch_ts); |
280 | if (unlikely(duration)) { | 279 | if (unlikely(duration)) { |
281 | /* only warn once */ | 280 | /* only warn once */ |
282 | if (__get_cpu_var(soft_watchdog_warn) == true) | 281 | if (__this_cpu_read(soft_watchdog_warn) == true) |
283 | return HRTIMER_RESTART; | 282 | return HRTIMER_RESTART; |
284 | 283 | ||
285 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 284 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
@@ -294,9 +293,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
294 | 293 | ||
295 | if (softlockup_panic) | 294 | if (softlockup_panic) |
296 | panic("softlockup: hung tasks"); | 295 | panic("softlockup: hung tasks"); |
297 | __get_cpu_var(soft_watchdog_warn) = true; | 296 | __this_cpu_write(soft_watchdog_warn, true); |
298 | } else | 297 | } else |
299 | __get_cpu_var(soft_watchdog_warn) = false; | 298 | __this_cpu_write(soft_watchdog_warn, false); |
300 | 299 | ||
301 | return HRTIMER_RESTART; | 300 | return HRTIMER_RESTART; |
302 | } | 301 | } |
@@ -307,7 +306,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
307 | */ | 306 | */ |
308 | static int watchdog(void *unused) | 307 | static int watchdog(void *unused) |
309 | { | 308 | { |
310 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 309 | static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
311 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 310 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
312 | 311 | ||
313 | sched_setscheduler(current, SCHED_FIFO, &param); | 312 |
@@ -364,8 +363,14 @@ static int watchdog_nmi_enable(int cpu) | |||
364 | goto out_save; | 363 | goto out_save; |
365 | } | 364 | } |
366 | 365 | ||
367 | printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", | 366 | |
368 | cpu, PTR_ERR(event)); | 367 | /* vary the KERN level based on the returned errno */ |
368 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
369 | printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
370 | else if (PTR_ERR(event) == -ENOENT) | ||
371 | printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); | ||
372 | else | ||
373 | printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); | ||
369 | return PTR_ERR(event); | 374 | return PTR_ERR(event); |
370 | 375 | ||
371 | /* success path */ | 376 | /* success path */ |
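watchdog_nmi_enable() now chooses the printk severity from the errno that perf event creation returned, so expected failures (no local APIC, hardware events disabled) are not logged as errors. The dispatch reduces to a plain if/else over the error code; a trivial standalone rendering, with fprintf standing in for printk:

	#include <errno.h>
	#include <stdio.h>

	/* Pick the severity from the (negative) errno, as the hunk does. */
	static void report_nmi_watchdog_failure(int cpu, long err)
	{
		if (err == -EOPNOTSUPP)
			fprintf(stderr, "info: NMI watchdog disabled (cpu%d): not supported\n", cpu);
		else if (err == -ENOENT)
			fprintf(stderr, "warning: NMI watchdog disabled (cpu%d): hardware events not enabled\n", cpu);
		else
			fprintf(stderr, "error: NMI watchdog disabled (cpu%d): unable to create perf event: %ld\n", cpu, err);
	}

	int main(void)
	{
		report_nmi_watchdog_failure(0, -EOPNOTSUPP);
		return 0;
	}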
@@ -430,9 +435,6 @@ static int watchdog_enable(int cpu) | |||
430 | wake_up_process(p); | 435 | wake_up_process(p); |
431 | } | 436 | } |
432 | 437 | ||
433 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
434 | watchdog_enabled = 1; | ||
435 | |||
436 | return 0; | 438 | return 0; |
437 | } | 439 | } |
438 | 440 | ||
@@ -460,12 +462,16 @@ static void watchdog_disable(int cpu) | |||
460 | static void watchdog_enable_all_cpus(void) | 462 | static void watchdog_enable_all_cpus(void) |
461 | { | 463 | { |
462 | int cpu; | 464 | int cpu; |
463 | int result = 0; | 465 | |
466 | watchdog_enabled = 0; | ||
464 | 467 | ||
465 | for_each_online_cpu(cpu) | 468 | for_each_online_cpu(cpu) |
466 | result += watchdog_enable(cpu); | 469 | if (!watchdog_enable(cpu)) |
470 | /* if any cpu succeeds, watchdog is considered | ||
471 | enabled for the system */ | ||
472 | watchdog_enabled = 1; | ||
467 | 473 | ||
468 | if (result) | 474 | if (!watchdog_enabled) |
469 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | 475 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); |
470 | 476 | ||
471 | } | 477 | } |
@@ -474,9 +480,6 @@ static void watchdog_disable_all_cpus(void) | |||
474 | { | 480 | { |
475 | int cpu; | 481 | int cpu; |
476 | 482 | ||
477 | if (no_watchdog) | ||
478 | return; | ||
479 | |||
480 | for_each_online_cpu(cpu) | 483 | for_each_online_cpu(cpu) |
481 | watchdog_disable(cpu); | 484 | watchdog_disable(cpu); |
482 | 485 | ||
@@ -496,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, | |||
496 | { | 499 | { |
497 | proc_dointvec(table, write, buffer, length, ppos); | 500 | proc_dointvec(table, write, buffer, length, ppos); |
498 | 501 | ||
499 | if (watchdog_enabled) | 502 | if (write) { |
500 | watchdog_enable_all_cpus(); | 503 | if (watchdog_enabled) |
501 | else | 504 | watchdog_enable_all_cpus(); |
502 | watchdog_disable_all_cpus(); | 505 | else |
506 | watchdog_disable_all_cpus(); | ||
507 | } | ||
503 | return 0; | 508 | return 0; |
504 | } | 509 | } |
505 | 510 | ||
@@ -528,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
528 | break; | 533 | break; |
529 | case CPU_ONLINE: | 534 | case CPU_ONLINE: |
530 | case CPU_ONLINE_FROZEN: | 535 | case CPU_ONLINE_FROZEN: |
531 | err = watchdog_enable(hotcpu); | 536 | if (watchdog_enabled) |
537 | err = watchdog_enable(hotcpu); | ||
532 | break; | 538 | break; |
533 | #ifdef CONFIG_HOTPLUG_CPU | 539 | #ifdef CONFIG_HOTPLUG_CPU |
534 | case CPU_UP_CANCELED: | 540 | case CPU_UP_CANCELED: |
@@ -548,20 +554,16 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
548 | .notifier_call = cpu_callback | 554 | .notifier_call = cpu_callback |
549 | }; | 555 | }; |
550 | 556 | ||
551 | static int __init spawn_watchdog_task(void) | 557 | void __init lockup_detector_init(void) |
552 | { | 558 | { |
553 | void *cpu = (void *)(long)smp_processor_id(); | 559 | void *cpu = (void *)(long)smp_processor_id(); |
554 | int err; | 560 | int err; |
555 | 561 | ||
556 | if (no_watchdog) | ||
557 | return 0; | ||
558 | |||
559 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 562 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
560 | WARN_ON(notifier_to_errno(err)); | 563 | WARN_ON(notifier_to_errno(err)); |
561 | 564 | ||
562 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 565 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
563 | register_cpu_notifier(&cpu_nfb); | 566 | register_cpu_notifier(&cpu_nfb); |
564 | 567 | ||
565 | return 0; | 568 | return; |
566 | } | 569 | } |
567 | early_initcall(spawn_watchdog_task); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e785b0f2aea5..ee6578b578ad 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -79,7 +79,9 @@ enum { | |||
79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | 79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ |
80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | 80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ |
81 | 81 | ||
82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | 82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, |
83 | /* call for help after 10ms | ||
84 | (min two ticks) */ | ||
83 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | 85 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ |
84 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ | 86 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */
85 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | 87 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ |
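MAYDAY_INITIAL_TIMEOUT is now clamped in a constant expression so it can never round down below two timer ticks when HZ < 200. The same clamp can be checked in isolation (the HZ value below is only a demo choice):

	#include <stdio.h>

	#define HZ 100	/* demo value; HZ/100 = 1 tick, so the clamp kicks in */

	enum {
		/* call for help after 10ms, but never sooner than 2 ticks */
		MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
	};

	int main(void)
	{
		printf("initial mayday timeout: %d ticks\n",
		       MAYDAY_INITIAL_TIMEOUT);	/* prints 2 */
		return 0;
	}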
@@ -768,7 +770,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
768 | 770 | ||
769 | worker->flags &= ~flags; | 771 | worker->flags &= ~flags; |
770 | 772 | ||
771 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | 773 | /* |
774 | * If transitioning out of NOT_RUNNING, increment nr_running. Note | ||
775 | * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask | ||
776 | * of multiple flags, not a single flag. | ||
777 | */ | ||
772 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 778 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
773 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 779 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
774 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | 780 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); |
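The new worker_clr_flags() comment stresses that WORKER_NOT_RUNNING is a mask covering several flags, so clearing one flag only counts as leaving NOT_RUNNING when no other flag in the mask remains set. A compact sketch of that transition test (the flag names follow the kernel's, the values are illustrative):

	#include <stdio.h>

	enum {
		WORKER_PREP	   = 1 << 0,
		WORKER_ROGUE	   = 1 << 1,
		/* a mask of multiple flags, not a single flag */
		WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE,
	};

	static int nr_running;

	static void clr_flags(unsigned int *worker_flags, unsigned int flags)
	{
		unsigned int oflags = *worker_flags;

		*worker_flags &= ~flags;

		/* only count the transition when the whole mask went clear */
		if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
			if (!(*worker_flags & WORKER_NOT_RUNNING))
				nr_running++;
	}

	int main(void)
	{
		unsigned int w = WORKER_PREP | WORKER_ROGUE;

		clr_flags(&w, WORKER_PREP);	/* ROGUE still set: no transition */
		printf("nr_running = %d\n", nr_running);	/* 0 */
		clr_flags(&w, WORKER_ROGUE);	/* mask fully clear: transition */
		printf("nr_running = %d\n", nr_running);	/* 1 */
		return 0;
	}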
@@ -932,6 +938,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
932 | wake_up_worker(gcwq); | 938 | wake_up_worker(gcwq); |
933 | } | 939 | } |
934 | 940 | ||
941 | /* | ||
942 | * Test whether @work is being queued from another work executing on the | ||
943 | * same workqueue. This is rather expensive and should only be used from | ||
944 | * cold paths. | ||
945 | */ | ||
946 | static bool is_chained_work(struct workqueue_struct *wq) | ||
947 | { | ||
948 | unsigned long flags; | ||
949 | unsigned int cpu; | ||
950 | |||
951 | for_each_gcwq_cpu(cpu) { | ||
952 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
953 | struct worker *worker; | ||
954 | struct hlist_node *pos; | ||
955 | int i; | ||
956 | |||
957 | spin_lock_irqsave(&gcwq->lock, flags); | ||
958 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
959 | if (worker->task != current) | ||
960 | continue; | ||
961 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
962 | /* | ||
963 | * I'm @worker, no locking necessary. See if @work | ||
964 | * is headed to the same workqueue. | ||
965 | */ | ||
966 | return worker->current_cwq->wq == wq; | ||
967 | } | ||
968 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
969 | } | ||
970 | return false; | ||
971 | } | ||
972 | |||
935 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | 973 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
936 | struct work_struct *work) | 974 | struct work_struct *work) |
937 | { | 975 | { |
@@ -943,7 +981,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
943 | 981 | ||
944 | debug_work_activate(work); | 982 | debug_work_activate(work); |
945 | 983 | ||
946 | if (WARN_ON_ONCE(wq->flags & WQ_DYING)) | 984 | /* if dying, only works from the same workqueue are allowed */ |
985 | if (unlikely(wq->flags & WQ_DYING) && | ||
986 | WARN_ON_ONCE(!is_chained_work(wq))) | ||
947 | return; | 987 | return; |
948 | 988 | ||
949 | /* determine gcwq to use */ | 989 | /* determine gcwq to use */ |
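is_chained_work() scans the busy workers for one whose task is current: if the caller is itself a worker executing on @wq, queueing stays legal even after WQ_DYING is set. Reduced to its essence, the test is a thread-identity check; a hedged pthread sketch with the worker bookkeeping collapsed to a single recorded thread id:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct workqueue {
		bool dying;
		pthread_t active_worker;  /* thread running a work item on us */
		bool worker_valid;
	};

	/* Is the caller the worker currently executing on @wq? */
	static bool is_chained_work(struct workqueue *wq)
	{
		return wq->worker_valid &&
		       pthread_equal(wq->active_worker, pthread_self());
	}

	static bool queue_work(struct workqueue *wq, const char *name)
	{
		/* if dying, only work queued from @wq's own worker is allowed */
		if (wq->dying && !is_chained_work(wq)) {
			fprintf(stderr, "rejected %s: wq dying\n", name);
			return false;
		}
		printf("queued %s\n", name);
		return true;
	}

	int main(void)
	{
		struct workqueue wq = { .dying = true };

		queue_work(&wq, "external-work");	/* rejected */

		wq.active_worker = pthread_self();	/* act as @wq's worker */
		wq.worker_valid = true;
		queue_work(&wq, "chained-work");	/* allowed */
		return 0;
	}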
@@ -1806,7 +1846,7 @@ __acquires(&gcwq->lock) | |||
1806 | spin_unlock_irq(&gcwq->lock); | 1846 | spin_unlock_irq(&gcwq->lock); |
1807 | 1847 | ||
1808 | work_clear_pending(work); | 1848 | work_clear_pending(work); |
1809 | lock_map_acquire(&cwq->wq->lockdep_map); | 1849 | lock_map_acquire_read(&cwq->wq->lockdep_map); |
1810 | lock_map_acquire(&lockdep_map); | 1850 | lock_map_acquire(&lockdep_map); |
1811 | trace_workqueue_execute_start(work); | 1851 | trace_workqueue_execute_start(work); |
1812 | f(work); | 1852 | f(work); |
@@ -2009,6 +2049,15 @@ repeat: | |||
2009 | move_linked_works(work, scheduled, &n); | 2049 | move_linked_works(work, scheduled, &n); |
2010 | 2050 | ||
2011 | process_scheduled_works(rescuer); | 2051 | process_scheduled_works(rescuer); |
2052 | |||
2053 | /* | ||
2054 | * Leave this gcwq. If keep_working() is %true, notify a | ||
2055 | * regular worker; otherwise, we end up with 0 concurrency | ||
2056 | * and stalling the execution. | ||
2057 | */ | ||
2058 | if (keep_working(gcwq)) | ||
2059 | wake_up_worker(gcwq); | ||
2060 | |||
2012 | spin_unlock_irq(&gcwq->lock); | 2061 | spin_unlock_irq(&gcwq->lock); |
2013 | } | 2062 | } |
2014 | 2063 | ||
@@ -2350,8 +2399,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
2350 | insert_wq_barrier(cwq, barr, work, worker); | 2399 | insert_wq_barrier(cwq, barr, work, worker); |
2351 | spin_unlock_irq(&gcwq->lock); | 2400 | spin_unlock_irq(&gcwq->lock); |
2352 | 2401 | ||
2353 | lock_map_acquire(&cwq->wq->lockdep_map); | 2402 | /* |
2403 | * If @max_active is 1 or rescuer is in use, flushing another work | ||
2404 | * item on the same workqueue may lead to deadlock. Make sure the | ||
2405 | * flusher is not running on the same workqueue by verifying write | ||
2406 | * access. | ||
2407 | */ | ||
2408 | if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) | ||
2409 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
2410 | else | ||
2411 | lock_map_acquire_read(&cwq->wq->lockdep_map); | ||
2354 | lock_map_release(&cwq->wq->lockdep_map); | 2412 | lock_map_release(&cwq->wq->lockdep_map); |
2413 | |||
2355 | return true; | 2414 | return true; |
2356 | already_gone: | 2415 | already_gone: |
2357 | spin_unlock_irq(&gcwq->lock); | 2416 | spin_unlock_irq(&gcwq->lock); |
@@ -2908,7 +2967,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2908 | */ | 2967 | */ |
2909 | spin_lock(&workqueue_lock); | 2968 | spin_lock(&workqueue_lock); |
2910 | 2969 | ||
2911 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) | 2970 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) |
2912 | for_each_cwq_cpu(cpu, wq) | 2971 | for_each_cwq_cpu(cpu, wq) |
2913 | get_cwq(cpu, wq)->max_active = 0; | 2972 | get_cwq(cpu, wq)->max_active = 0; |
2914 | 2973 | ||
@@ -2936,11 +2995,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
2936 | */ | 2995 | */ |
2937 | void destroy_workqueue(struct workqueue_struct *wq) | 2996 | void destroy_workqueue(struct workqueue_struct *wq) |
2938 | { | 2997 | { |
2998 | unsigned int flush_cnt = 0; | ||
2939 | unsigned int cpu; | 2999 | unsigned int cpu; |
2940 | 3000 | ||
3001 | /* | ||
3002 | * Mark @wq dying and drain all pending works. Once WQ_DYING is | ||
3003 | * set, only chain queueing is allowed. IOW, only currently | ||
3004 | * pending or running work items on @wq can queue further work | ||
3005 | * items on it. @wq is flushed repeatedly until it becomes empty. | ||
3006 | * The number of flushes is determined by the depth of chaining and | ||
3007 | * should be relatively short. Whine if it takes too long. | ||
3008 | */ | ||
2941 | wq->flags |= WQ_DYING; | 3009 | wq->flags |= WQ_DYING; |
3010 | reflush: | ||
2942 | flush_workqueue(wq); | 3011 | flush_workqueue(wq); |
2943 | 3012 | ||
3013 | for_each_cwq_cpu(cpu, wq) { | ||
3014 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3015 | |||
3016 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
3017 | continue; | ||
3018 | |||
3019 | if (++flush_cnt == 10 || | ||
3020 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
3021 | printk(KERN_WARNING "workqueue %s: flush on " | ||
3022 | "destruction isn't complete after %u tries\n", | ||
3023 | wq->name, flush_cnt); | ||
3024 | goto reflush; | ||
3025 | } | ||
3026 | |||
2944 | /* | 3027 | /* |
2945 | * wq list is used to freeze wq, remove from list after | 3028 | * wq list is used to freeze wq, remove from list after |
2946 | * flushing is complete in case freeze races us. | 3029 | * flushing is complete in case freeze races us. |
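destroy_workqueue() now flushes in a loop, because the chained queueing permitted above means a flush can itself requeue work; the loop repeats until every cwq is idle and empty, whining at the thresholds shown. A toy model of that drain loop over a bare counter (the warning condition mirrors the hunk, everything else is simplified):

	#include <stdio.h>

	/* model: each flush drains one round of chained requeueing */
	static unsigned int pending = 3;

	static void flush(void)
	{
		if (pending)
			pending--;
	}

	int main(void)
	{
		unsigned int flush_cnt = 0;

	reflush:
		flush();
		if (pending) {
			if (++flush_cnt == 10 ||
			    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
				fprintf(stderr, "flush on destruction isn't "
					"complete after %u tries\n", flush_cnt);
			goto reflush;
		}
		printf("drained after %u extra flushes\n", flush_cnt);
		return 0;
	}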
@@ -2996,7 +3079,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
2996 | 3079 | ||
2997 | spin_lock_irq(&gcwq->lock); | 3080 | spin_lock_irq(&gcwq->lock); |
2998 | 3081 | ||
2999 | if (!(wq->flags & WQ_FREEZEABLE) || | 3082 | if (!(wq->flags & WQ_FREEZABLE) || |
3000 | !(gcwq->flags & GCWQ_FREEZING)) | 3083 | !(gcwq->flags & GCWQ_FREEZING)) |
3001 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | 3084 | get_cwq(gcwq->cpu, wq)->max_active = max_active; |
3002 | 3085 | ||
@@ -3246,7 +3329,7 @@ static int __cpuinit trustee_thread(void *__gcwq) | |||
3246 | * want to get it over with ASAP - spam rescuers, wake up as | 3329 | * want to get it over with ASAP - spam rescuers, wake up as |
3247 | * many idlers as necessary and create new ones till the | 3330 | * many idlers as necessary and create new ones till the |
3248 | * worklist is empty. Note that if the gcwq is frozen, there | 3331 | * worklist is empty. Note that if the gcwq is frozen, there |
3249 | * may be frozen works in freezeable cwqs. Don't declare | 3332 | * may be frozen works in freezable cwqs. Don't declare |
3250 | * completion while frozen. | 3333 | * completion while frozen. |
3251 | */ | 3334 | */ |
3252 | while (gcwq->nr_workers != gcwq->nr_idle || | 3335 | while (gcwq->nr_workers != gcwq->nr_idle || |
@@ -3504,9 +3587,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
3504 | /** | 3587 | /** |
3505 | * freeze_workqueues_begin - begin freezing workqueues | 3588 | * freeze_workqueues_begin - begin freezing workqueues |
3506 | * | 3589 | * |
3507 | * Start freezing workqueues. After this function returns, all | 3590 | * Start freezing workqueues. After this function returns, all freezable |
3508 | * freezeable workqueues will queue new works to their frozen_works | 3591 | * workqueues will queue new works to their frozen_works list instead of |
3509 | * list instead of gcwq->worklist. | 3592 | * gcwq->worklist. |
3510 | * | 3593 | * |
3511 | * CONTEXT: | 3594 | * CONTEXT: |
3512 | * Grabs and releases workqueue_lock and gcwq->lock's. | 3595 | * Grabs and releases workqueue_lock and gcwq->lock's. |
@@ -3532,7 +3615,7 @@ void freeze_workqueues_begin(void) | |||
3532 | list_for_each_entry(wq, &workqueues, list) { | 3615 | list_for_each_entry(wq, &workqueues, list) { |
3533 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3616 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3534 | 3617 | ||
3535 | if (cwq && wq->flags & WQ_FREEZEABLE) | 3618 | if (cwq && wq->flags & WQ_FREEZABLE) |
3536 | cwq->max_active = 0; | 3619 | cwq->max_active = 0; |
3537 | } | 3620 | } |
3538 | 3621 | ||
@@ -3543,7 +3626,7 @@ void freeze_workqueues_begin(void) | |||
3543 | } | 3626 | } |
3544 | 3627 | ||
3545 | /** | 3628 | /** |
3546 | * freeze_workqueues_busy - are freezeable workqueues still busy? | 3629 | * freeze_workqueues_busy - are freezable workqueues still busy? |
3547 | * | 3630 | * |
3548 | * Check whether freezing is complete. This function must be called | 3631 | * Check whether freezing is complete. This function must be called |
3549 | * between freeze_workqueues_begin() and thaw_workqueues(). | 3632 | * between freeze_workqueues_begin() and thaw_workqueues(). |
@@ -3552,8 +3635,8 @@ void freeze_workqueues_begin(void) | |||
3552 | * Grabs and releases workqueue_lock. | 3635 | * Grabs and releases workqueue_lock. |
3553 | * | 3636 | * |
3554 | * RETURNS: | 3637 | * RETURNS: |
3555 | * %true if some freezeable workqueues are still busy. %false if | 3638 | * %true if some freezable workqueues are still busy. %false if freezing |
3556 | * freezing is complete. | 3639 | * is complete. |
3557 | */ | 3640 | */ |
3558 | bool freeze_workqueues_busy(void) | 3641 | bool freeze_workqueues_busy(void) |
3559 | { | 3642 | { |
@@ -3573,7 +3656,7 @@ bool freeze_workqueues_busy(void) | |||
3573 | list_for_each_entry(wq, &workqueues, list) { | 3656 | list_for_each_entry(wq, &workqueues, list) { |
3574 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3657 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3575 | 3658 | ||
3576 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3659 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
3577 | continue; | 3660 | continue; |
3578 | 3661 | ||
3579 | BUG_ON(cwq->nr_active < 0); | 3662 | BUG_ON(cwq->nr_active < 0); |
@@ -3618,7 +3701,7 @@ void thaw_workqueues(void) | |||
3618 | list_for_each_entry(wq, &workqueues, list) { | 3701 | list_for_each_entry(wq, &workqueues, list) { |
3619 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3702 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3620 | 3703 | ||
3621 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3704 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
3622 | continue; | 3705 | continue; |
3623 | 3706 | ||
3624 | /* restore max_active and repopulate worklist */ | 3707 | /* restore max_active and repopulate worklist */ |