diff options
Diffstat (limited to 'kernel')
60 files changed, 3639 insertions, 1693 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66a416b42c1..51cddc11cd8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -763,6 +763,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
763 | * -> cgroup_mkdir. | 763 | * -> cgroup_mkdir. |
764 | */ | 764 | */ |
765 | 765 | ||
766 | static struct dentry *cgroup_lookup(struct inode *dir, | ||
767 | struct dentry *dentry, struct nameidata *nd); | ||
766 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | 768 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); |
767 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 769 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
768 | static int cgroup_populate_dir(struct cgroup *cgrp); | 770 | static int cgroup_populate_dir(struct cgroup *cgrp); |
@@ -874,25 +876,29 @@ static void cgroup_clear_directory(struct dentry *dentry) | |||
874 | struct list_head *node; | 876 | struct list_head *node; |
875 | 877 | ||
876 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 878 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); |
877 | spin_lock(&dcache_lock); | 879 | spin_lock(&dentry->d_lock); |
878 | node = dentry->d_subdirs.next; | 880 | node = dentry->d_subdirs.next; |
879 | while (node != &dentry->d_subdirs) { | 881 | while (node != &dentry->d_subdirs) { |
880 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 882 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
883 | |||
884 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | ||
881 | list_del_init(node); | 885 | list_del_init(node); |
882 | if (d->d_inode) { | 886 | if (d->d_inode) { |
883 | /* This should never be called on a cgroup | 887 | /* This should never be called on a cgroup |
884 | * directory with child cgroups */ | 888 | * directory with child cgroups */ |
885 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 889 | BUG_ON(d->d_inode->i_mode & S_IFDIR); |
886 | d = dget_locked(d); | 890 | dget_dlock(d); |
887 | spin_unlock(&dcache_lock); | 891 | spin_unlock(&d->d_lock); |
892 | spin_unlock(&dentry->d_lock); | ||
888 | d_delete(d); | 893 | d_delete(d); |
889 | simple_unlink(dentry->d_inode, d); | 894 | simple_unlink(dentry->d_inode, d); |
890 | dput(d); | 895 | dput(d); |
891 | spin_lock(&dcache_lock); | 896 | spin_lock(&dentry->d_lock); |
892 | } | 897 | } else |
898 | spin_unlock(&d->d_lock); | ||
893 | node = dentry->d_subdirs.next; | 899 | node = dentry->d_subdirs.next; |
894 | } | 900 | } |
895 | spin_unlock(&dcache_lock); | 901 | spin_unlock(&dentry->d_lock); |
896 | } | 902 | } |
897 | 903 | ||
898 | /* | 904 | /* |
@@ -900,11 +906,16 @@ static void cgroup_clear_directory(struct dentry *dentry) | |||
900 | */ | 906 | */ |
901 | static void cgroup_d_remove_dir(struct dentry *dentry) | 907 | static void cgroup_d_remove_dir(struct dentry *dentry) |
902 | { | 908 | { |
909 | struct dentry *parent; | ||
910 | |||
903 | cgroup_clear_directory(dentry); | 911 | cgroup_clear_directory(dentry); |
904 | 912 | ||
905 | spin_lock(&dcache_lock); | 913 | parent = dentry->d_parent; |
914 | spin_lock(&parent->d_lock); | ||
915 | spin_lock(&dentry->d_lock); | ||
906 | list_del_init(&dentry->d_u.d_child); | 916 | list_del_init(&dentry->d_u.d_child); |
907 | spin_unlock(&dcache_lock); | 917 | spin_unlock(&dentry->d_lock); |
918 | spin_unlock(&parent->d_lock); | ||
908 | remove_dir(dentry); | 919 | remove_dir(dentry); |
909 | } | 920 | } |
910 | 921 | ||
@@ -2180,7 +2191,7 @@ static const struct file_operations cgroup_file_operations = { | |||
2180 | }; | 2191 | }; |
2181 | 2192 | ||
2182 | static const struct inode_operations cgroup_dir_inode_operations = { | 2193 | static const struct inode_operations cgroup_dir_inode_operations = { |
2183 | .lookup = simple_lookup, | 2194 | .lookup = cgroup_lookup, |
2184 | .mkdir = cgroup_mkdir, | 2195 | .mkdir = cgroup_mkdir, |
2185 | .rmdir = cgroup_rmdir, | 2196 | .rmdir = cgroup_rmdir, |
2186 | .rename = cgroup_rename, | 2197 | .rename = cgroup_rename, |
@@ -2196,13 +2207,29 @@ static inline struct cftype *__file_cft(struct file *file) | |||
2196 | return __d_cft(file->f_dentry); | 2207 | return __d_cft(file->f_dentry); |
2197 | } | 2208 | } |
2198 | 2209 | ||
2199 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2210 | static int cgroup_delete_dentry(const struct dentry *dentry) |
2200 | struct super_block *sb) | 2211 | { |
2212 | return 1; | ||
2213 | } | ||
2214 | |||
2215 | static struct dentry *cgroup_lookup(struct inode *dir, | ||
2216 | struct dentry *dentry, struct nameidata *nd) | ||
2201 | { | 2217 | { |
2202 | static const struct dentry_operations cgroup_dops = { | 2218 | static const struct dentry_operations cgroup_dentry_operations = { |
2219 | .d_delete = cgroup_delete_dentry, | ||
2203 | .d_iput = cgroup_diput, | 2220 | .d_iput = cgroup_diput, |
2204 | }; | 2221 | }; |
2205 | 2222 | ||
2223 | if (dentry->d_name.len > NAME_MAX) | ||
2224 | return ERR_PTR(-ENAMETOOLONG); | ||
2225 | d_set_d_op(dentry, &cgroup_dentry_operations); | ||
2226 | d_add(dentry, NULL); | ||
2227 | return NULL; | ||
2228 | } | ||
2229 | |||
2230 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | ||
2231 | struct super_block *sb) | ||
2232 | { | ||
2206 | struct inode *inode; | 2233 | struct inode *inode; |
2207 | 2234 | ||
2208 | if (!dentry) | 2235 | if (!dentry) |
@@ -2228,7 +2255,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, | |||
2228 | inode->i_size = 0; | 2255 | inode->i_size = 0; |
2229 | inode->i_fop = &cgroup_file_operations; | 2256 | inode->i_fop = &cgroup_file_operations; |
2230 | } | 2257 | } |
2231 | dentry->d_op = &cgroup_dops; | ||
2232 | d_instantiate(dentry, inode); | 2258 | d_instantiate(dentry, inode); |
2233 | dget(dentry); /* Extra count - pin the dentry in core */ | 2259 | dget(dentry); /* Extra count - pin the dentry in core */ |
2234 | return 0; | 2260 | return 0; |
@@ -3638,9 +3664,7 @@ again: | |||
3638 | list_del(&cgrp->sibling); | 3664 | list_del(&cgrp->sibling); |
3639 | cgroup_unlock_hierarchy(cgrp->root); | 3665 | cgroup_unlock_hierarchy(cgrp->root); |
3640 | 3666 | ||
3641 | spin_lock(&cgrp->dentry->d_lock); | ||
3642 | d = dget(cgrp->dentry); | 3667 | d = dget(cgrp->dentry); |
3643 | spin_unlock(&d->d_lock); | ||
3644 | 3668 | ||
3645 | cgroup_d_remove_dir(d); | 3669 | cgroup_d_remove_dir(d); |
3646 | dput(d); | 3670 | dput(d); |
diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f1849..156cc555614 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) | |||
189 | } | 189 | } |
190 | 190 | ||
191 | struct take_cpu_down_param { | 191 | struct take_cpu_down_param { |
192 | struct task_struct *caller; | ||
193 | unsigned long mod; | 192 | unsigned long mod; |
194 | void *hcpu; | 193 | void *hcpu; |
195 | }; | 194 | }; |
@@ -198,7 +197,6 @@ struct take_cpu_down_param { | |||
198 | static int __ref take_cpu_down(void *_param) | 197 | static int __ref take_cpu_down(void *_param) |
199 | { | 198 | { |
200 | struct take_cpu_down_param *param = _param; | 199 | struct take_cpu_down_param *param = _param; |
201 | unsigned int cpu = (unsigned long)param->hcpu; | ||
202 | int err; | 200 | int err; |
203 | 201 | ||
204 | /* Ensure this CPU doesn't handle any more interrupts. */ | 202 | /* Ensure this CPU doesn't handle any more interrupts. */ |
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param) | |||
208 | 206 | ||
209 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 207 | cpu_notify(CPU_DYING | param->mod, param->hcpu); |
210 | 208 | ||
211 | if (task_cpu(param->caller) == cpu) | ||
212 | move_task_off_dead_cpu(cpu, param->caller); | ||
213 | /* Force idle task to run as soon as we yield: it should | ||
214 | immediately notice cpu is offline and die quickly. */ | ||
215 | sched_idle_next(); | ||
216 | return 0; | 209 | return 0; |
217 | } | 210 | } |
218 | 211 | ||
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
223 | void *hcpu = (void *)(long)cpu; | 216 | void *hcpu = (void *)(long)cpu; |
224 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 217 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
225 | struct take_cpu_down_param tcd_param = { | 218 | struct take_cpu_down_param tcd_param = { |
226 | .caller = current, | ||
227 | .mod = mod, | 219 | .mod = mod, |
228 | .hcpu = hcpu, | 220 | .hcpu = hcpu, |
229 | }; | 221 | }; |
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
253 | } | 245 | } |
254 | BUG_ON(cpu_online(cpu)); | 246 | BUG_ON(cpu_online(cpu)); |
255 | 247 | ||
256 | /* Wait for it to sleep (leaving idle task). */ | 248 | /* |
249 | * The migration_call() CPU_DYING callback will have removed all | ||
250 | * runnable tasks from the cpu, there's only the idle task left now | ||
251 | * that the migration thread is done doing the stop_machine thing. | ||
252 | * | ||
253 | * Wait for the stop thread to go away. | ||
254 | */ | ||
257 | while (!idle_cpu(cpu)) | 255 | while (!idle_cpu(cpu)) |
258 | yield(); | 256 | cpu_relax(); |
259 | 257 | ||
260 | /* This actually kills the CPU. */ | 258 | /* This actually kills the CPU. */ |
261 | __cpu_die(cpu); | 259 | __cpu_die(cpu); |
@@ -386,6 +384,14 @@ out: | |||
386 | #ifdef CONFIG_PM_SLEEP_SMP | 384 | #ifdef CONFIG_PM_SLEEP_SMP |
387 | static cpumask_var_t frozen_cpus; | 385 | static cpumask_var_t frozen_cpus; |
388 | 386 | ||
387 | void __weak arch_disable_nonboot_cpus_begin(void) | ||
388 | { | ||
389 | } | ||
390 | |||
391 | void __weak arch_disable_nonboot_cpus_end(void) | ||
392 | { | ||
393 | } | ||
394 | |||
389 | int disable_nonboot_cpus(void) | 395 | int disable_nonboot_cpus(void) |
390 | { | 396 | { |
391 | int cpu, first_cpu, error = 0; | 397 | int cpu, first_cpu, error = 0; |
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void) | |||
397 | * with the userspace trying to use the CPU hotplug at the same time | 403 | * with the userspace trying to use the CPU hotplug at the same time |
398 | */ | 404 | */ |
399 | cpumask_clear(frozen_cpus); | 405 | cpumask_clear(frozen_cpus); |
406 | arch_disable_nonboot_cpus_begin(); | ||
400 | 407 | ||
401 | printk("Disabling non-boot CPUs ...\n"); | 408 | printk("Disabling non-boot CPUs ...\n"); |
402 | for_each_online_cpu(cpu) { | 409 | for_each_online_cpu(cpu) { |
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void) | |||
412 | } | 419 | } |
413 | } | 420 | } |
414 | 421 | ||
422 | arch_disable_nonboot_cpus_end(); | ||
423 | |||
415 | if (!error) { | 424 | if (!error) { |
416 | BUG_ON(num_online_cpus() > 1); | 425 | BUG_ON(num_online_cpus() > 1); |
417 | /* Make sure the CPUs won't be enabled by someone else */ | 426 | /* Make sure the CPUs won't be enabled by someone else */ |
diff --git a/kernel/exit.c b/kernel/exit.c index 21aa7b3001f..676149a4ac5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code) | |||
914 | if (unlikely(!tsk->pid)) | 914 | if (unlikely(!tsk->pid)) |
915 | panic("Attempted to kill the idle task!"); | 915 | panic("Attempted to kill the idle task!"); |
916 | 916 | ||
917 | /* | ||
918 | * If do_exit is called because this process oopsed, it's possible | ||
919 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | ||
920 | * continuing. Amongst other possible reasons, this is to prevent | ||
921 | * mm_release()->clear_child_tid() from writing to a user-controlled | ||
922 | * kernel address. | ||
923 | */ | ||
924 | set_fs(USER_DS); | ||
925 | |||
917 | tracehook_report_exit(&code); | 926 | tracehook_report_exit(&code); |
918 | 927 | ||
919 | validate_creds_for_do_exit(tsk); | 928 | validate_creds_for_do_exit(tsk); |
diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5991b..dc1a8bbcea7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -169,6 +169,7 @@ EXPORT_SYMBOL(free_task); | |||
169 | static inline void free_signal_struct(struct signal_struct *sig) | 169 | static inline void free_signal_struct(struct signal_struct *sig) |
170 | { | 170 | { |
171 | taskstats_tgid_free(sig); | 171 | taskstats_tgid_free(sig); |
172 | sched_autogroup_exit(sig); | ||
172 | kmem_cache_free(signal_cachep, sig); | 173 | kmem_cache_free(signal_cachep, sig); |
173 | } | 174 | } |
174 | 175 | ||
@@ -273,6 +274,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
273 | 274 | ||
274 | setup_thread_stack(tsk, orig); | 275 | setup_thread_stack(tsk, orig); |
275 | clear_user_return_notifier(tsk); | 276 | clear_user_return_notifier(tsk); |
277 | clear_tsk_need_resched(tsk); | ||
276 | stackend = end_of_stack(tsk); | 278 | stackend = end_of_stack(tsk); |
277 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 279 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
278 | 280 | ||
@@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
904 | posix_cpu_timers_init_group(sig); | 906 | posix_cpu_timers_init_group(sig); |
905 | 907 | ||
906 | tty_audit_fork(sig); | 908 | tty_audit_fork(sig); |
909 | sched_autogroup_fork(sig); | ||
907 | 910 | ||
908 | sig->oom_adj = current->signal->oom_adj; | 911 | sig->oom_adj = current->signal->oom_adj; |
909 | sig->oom_score_adj = current->signal->oom_score_adj; | 912 | sig->oom_score_adj = current->signal->oom_score_adj; |
diff --git a/kernel/futex.c b/kernel/futex.c index 40a8777a27d..3019b92e691 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; | |||
69 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 69 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * Futex flags used to encode options to functions and preserve them across | ||
73 | * restarts. | ||
74 | */ | ||
75 | #define FLAGS_SHARED 0x01 | ||
76 | #define FLAGS_CLOCKRT 0x02 | ||
77 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
78 | |||
79 | /* | ||
72 | * Priority Inheritance state: | 80 | * Priority Inheritance state: |
73 | */ | 81 | */ |
74 | struct futex_pi_state { | 82 | struct futex_pi_state { |
@@ -123,6 +131,12 @@ struct futex_q { | |||
123 | u32 bitset; | 131 | u32 bitset; |
124 | }; | 132 | }; |
125 | 133 | ||
134 | static const struct futex_q futex_q_init = { | ||
135 | /* list gets initialized in queue_me()*/ | ||
136 | .key = FUTEX_KEY_INIT, | ||
137 | .bitset = FUTEX_BITSET_MATCH_ANY | ||
138 | }; | ||
139 | |||
126 | /* | 140 | /* |
127 | * Hash buckets are shared by all the futex_keys that hash to the same | 141 | * Hash buckets are shared by all the futex_keys that hash to the same |
128 | * location. Each key may have multiple futex_q structures, one for each task | 142 | * location. Each key may have multiple futex_q structures, one for each task |
@@ -283,8 +297,7 @@ again: | |||
283 | return 0; | 297 | return 0; |
284 | } | 298 | } |
285 | 299 | ||
286 | static inline | 300 | static inline void put_futex_key(union futex_key *key) |
287 | void put_futex_key(int fshared, union futex_key *key) | ||
288 | { | 301 | { |
289 | drop_futex_key_refs(key); | 302 | drop_futex_key_refs(key); |
290 | } | 303 | } |
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | |||
870 | /* | 883 | /* |
871 | * Wake up waiters matching bitset queued on this futex (uaddr). | 884 | * Wake up waiters matching bitset queued on this futex (uaddr). |
872 | */ | 885 | */ |
873 | static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | 886 | static int |
887 | futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | ||
874 | { | 888 | { |
875 | struct futex_hash_bucket *hb; | 889 | struct futex_hash_bucket *hb; |
876 | struct futex_q *this, *next; | 890 | struct futex_q *this, *next; |
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
881 | if (!bitset) | 895 | if (!bitset) |
882 | return -EINVAL; | 896 | return -EINVAL; |
883 | 897 | ||
884 | ret = get_futex_key(uaddr, fshared, &key); | 898 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
885 | if (unlikely(ret != 0)) | 899 | if (unlikely(ret != 0)) |
886 | goto out; | 900 | goto out; |
887 | 901 | ||
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
907 | } | 921 | } |
908 | 922 | ||
909 | spin_unlock(&hb->lock); | 923 | spin_unlock(&hb->lock); |
910 | put_futex_key(fshared, &key); | 924 | put_futex_key(&key); |
911 | out: | 925 | out: |
912 | return ret; | 926 | return ret; |
913 | } | 927 | } |
@@ -917,7 +931,7 @@ out: | |||
917 | * to this virtual address: | 931 | * to this virtual address: |
918 | */ | 932 | */ |
919 | static int | 933 | static int |
920 | futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 934 | futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, |
921 | int nr_wake, int nr_wake2, int op) | 935 | int nr_wake, int nr_wake2, int op) |
922 | { | 936 | { |
923 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 937 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | |||
927 | int ret, op_ret; | 941 | int ret, op_ret; |
928 | 942 | ||
929 | retry: | 943 | retry: |
930 | ret = get_futex_key(uaddr1, fshared, &key1); | 944 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); |
931 | if (unlikely(ret != 0)) | 945 | if (unlikely(ret != 0)) |
932 | goto out; | 946 | goto out; |
933 | ret = get_futex_key(uaddr2, fshared, &key2); | 947 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
934 | if (unlikely(ret != 0)) | 948 | if (unlikely(ret != 0)) |
935 | goto out_put_key1; | 949 | goto out_put_key1; |
936 | 950 | ||
@@ -962,11 +976,11 @@ retry_private: | |||
962 | if (ret) | 976 | if (ret) |
963 | goto out_put_keys; | 977 | goto out_put_keys; |
964 | 978 | ||
965 | if (!fshared) | 979 | if (!(flags & FLAGS_SHARED)) |
966 | goto retry_private; | 980 | goto retry_private; |
967 | 981 | ||
968 | put_futex_key(fshared, &key2); | 982 | put_futex_key(&key2); |
969 | put_futex_key(fshared, &key1); | 983 | put_futex_key(&key1); |
970 | goto retry; | 984 | goto retry; |
971 | } | 985 | } |
972 | 986 | ||
@@ -996,9 +1010,9 @@ retry_private: | |||
996 | 1010 | ||
997 | double_unlock_hb(hb1, hb2); | 1011 | double_unlock_hb(hb1, hb2); |
998 | out_put_keys: | 1012 | out_put_keys: |
999 | put_futex_key(fshared, &key2); | 1013 | put_futex_key(&key2); |
1000 | out_put_key1: | 1014 | out_put_key1: |
1001 | put_futex_key(fshared, &key1); | 1015 | put_futex_key(&key1); |
1002 | out: | 1016 | out: |
1003 | return ret; | 1017 | return ret; |
1004 | } | 1018 | } |
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1133 | /** | 1147 | /** |
1134 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1148 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
1135 | * @uaddr1: source futex user address | 1149 | * @uaddr1: source futex user address |
1136 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED | 1150 | * @flags: futex flags (FLAGS_SHARED, etc.) |
1137 | * @uaddr2: target futex user address | 1151 | * @uaddr2: target futex user address |
1138 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1152 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
1139 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) | 1153 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
1140 | * @cmpval: @uaddr1 expected value (or %NULL) | 1154 | * @cmpval: @uaddr1 expected value (or %NULL) |
1141 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1155 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a |
1142 | * pi futex (pi to pi requeue is not supported) | 1156 | * pi futex (pi to pi requeue is not supported) |
1143 | * | 1157 | * |
1144 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1158 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
1145 | * uaddr2 atomically on behalf of the top waiter. | 1159 | * uaddr2 atomically on behalf of the top waiter. |
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1148 | * >=0 - on success, the number of tasks requeued or woken | 1162 | * >=0 - on success, the number of tasks requeued or woken |
1149 | * <0 - on error | 1163 | * <0 - on error |
1150 | */ | 1164 | */ |
1151 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1165 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
1152 | int nr_wake, int nr_requeue, u32 *cmpval, | 1166 | u32 __user *uaddr2, int nr_wake, int nr_requeue, |
1153 | int requeue_pi) | 1167 | u32 *cmpval, int requeue_pi) |
1154 | { | 1168 | { |
1155 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1169 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1156 | int drop_count = 0, task_count = 0, ret; | 1170 | int drop_count = 0, task_count = 0, ret; |
@@ -1191,10 +1205,10 @@ retry: | |||
1191 | pi_state = NULL; | 1205 | pi_state = NULL; |
1192 | } | 1206 | } |
1193 | 1207 | ||
1194 | ret = get_futex_key(uaddr1, fshared, &key1); | 1208 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); |
1195 | if (unlikely(ret != 0)) | 1209 | if (unlikely(ret != 0)) |
1196 | goto out; | 1210 | goto out; |
1197 | ret = get_futex_key(uaddr2, fshared, &key2); | 1211 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
1198 | if (unlikely(ret != 0)) | 1212 | if (unlikely(ret != 0)) |
1199 | goto out_put_key1; | 1213 | goto out_put_key1; |
1200 | 1214 | ||
@@ -1216,11 +1230,11 @@ retry_private: | |||
1216 | if (ret) | 1230 | if (ret) |
1217 | goto out_put_keys; | 1231 | goto out_put_keys; |
1218 | 1232 | ||
1219 | if (!fshared) | 1233 | if (!(flags & FLAGS_SHARED)) |
1220 | goto retry_private; | 1234 | goto retry_private; |
1221 | 1235 | ||
1222 | put_futex_key(fshared, &key2); | 1236 | put_futex_key(&key2); |
1223 | put_futex_key(fshared, &key1); | 1237 | put_futex_key(&key1); |
1224 | goto retry; | 1238 | goto retry; |
1225 | } | 1239 | } |
1226 | if (curval != *cmpval) { | 1240 | if (curval != *cmpval) { |
@@ -1260,8 +1274,8 @@ retry_private: | |||
1260 | break; | 1274 | break; |
1261 | case -EFAULT: | 1275 | case -EFAULT: |
1262 | double_unlock_hb(hb1, hb2); | 1276 | double_unlock_hb(hb1, hb2); |
1263 | put_futex_key(fshared, &key2); | 1277 | put_futex_key(&key2); |
1264 | put_futex_key(fshared, &key1); | 1278 | put_futex_key(&key1); |
1265 | ret = fault_in_user_writeable(uaddr2); | 1279 | ret = fault_in_user_writeable(uaddr2); |
1266 | if (!ret) | 1280 | if (!ret) |
1267 | goto retry; | 1281 | goto retry; |
@@ -1269,8 +1283,8 @@ retry_private: | |||
1269 | case -EAGAIN: | 1283 | case -EAGAIN: |
1270 | /* The owner was exiting, try again. */ | 1284 | /* The owner was exiting, try again. */ |
1271 | double_unlock_hb(hb1, hb2); | 1285 | double_unlock_hb(hb1, hb2); |
1272 | put_futex_key(fshared, &key2); | 1286 | put_futex_key(&key2); |
1273 | put_futex_key(fshared, &key1); | 1287 | put_futex_key(&key1); |
1274 | cond_resched(); | 1288 | cond_resched(); |
1275 | goto retry; | 1289 | goto retry; |
1276 | default: | 1290 | default: |
@@ -1352,9 +1366,9 @@ out_unlock: | |||
1352 | drop_futex_key_refs(&key1); | 1366 | drop_futex_key_refs(&key1); |
1353 | 1367 | ||
1354 | out_put_keys: | 1368 | out_put_keys: |
1355 | put_futex_key(fshared, &key2); | 1369 | put_futex_key(&key2); |
1356 | out_put_key1: | 1370 | out_put_key1: |
1357 | put_futex_key(fshared, &key1); | 1371 | put_futex_key(&key1); |
1358 | out: | 1372 | out: |
1359 | if (pi_state != NULL) | 1373 | if (pi_state != NULL) |
1360 | free_pi_state(pi_state); | 1374 | free_pi_state(pi_state); |
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1494 | * private futexes. | 1508 | * private futexes. |
1495 | */ | 1509 | */ |
1496 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 1510 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
1497 | struct task_struct *newowner, int fshared) | 1511 | struct task_struct *newowner) |
1498 | { | 1512 | { |
1499 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 1513 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1500 | struct futex_pi_state *pi_state = q->pi_state; | 1514 | struct futex_pi_state *pi_state = q->pi_state; |
@@ -1587,20 +1601,11 @@ handle_fault: | |||
1587 | goto retry; | 1601 | goto retry; |
1588 | } | 1602 | } |
1589 | 1603 | ||
1590 | /* | ||
1591 | * In case we must use restart_block to restart a futex_wait, | ||
1592 | * we encode in the 'flags' shared capability | ||
1593 | */ | ||
1594 | #define FLAGS_SHARED 0x01 | ||
1595 | #define FLAGS_CLOCKRT 0x02 | ||
1596 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
1597 | |||
1598 | static long futex_wait_restart(struct restart_block *restart); | 1604 | static long futex_wait_restart(struct restart_block *restart); |
1599 | 1605 | ||
1600 | /** | 1606 | /** |
1601 | * fixup_owner() - Post lock pi_state and corner case management | 1607 | * fixup_owner() - Post lock pi_state and corner case management |
1602 | * @uaddr: user address of the futex | 1608 | * @uaddr: user address of the futex |
1603 | * @fshared: whether the futex is shared (1) or not (0) | ||
1604 | * @q: futex_q (contains pi_state and access to the rt_mutex) | 1609 | * @q: futex_q (contains pi_state and access to the rt_mutex) |
1605 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) | 1610 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) |
1606 | * | 1611 | * |
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart); | |||
1613 | * 0 - success, lock not taken | 1618 | * 0 - success, lock not taken |
1614 | * <0 - on error (-EFAULT) | 1619 | * <0 - on error (-EFAULT) |
1615 | */ | 1620 | */ |
1616 | static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | 1621 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
1617 | int locked) | ||
1618 | { | 1622 | { |
1619 | struct task_struct *owner; | 1623 | struct task_struct *owner; |
1620 | int ret = 0; | 1624 | int ret = 0; |
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | |||
1625 | * did a lock-steal - fix up the PI-state in that case: | 1629 | * did a lock-steal - fix up the PI-state in that case: |
1626 | */ | 1630 | */ |
1627 | if (q->pi_state->owner != current) | 1631 | if (q->pi_state->owner != current) |
1628 | ret = fixup_pi_state_owner(uaddr, q, current, fshared); | 1632 | ret = fixup_pi_state_owner(uaddr, q, current); |
1629 | goto out; | 1633 | goto out; |
1630 | } | 1634 | } |
1631 | 1635 | ||
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | |||
1652 | * lock. Fix the state up. | 1656 | * lock. Fix the state up. |
1653 | */ | 1657 | */ |
1654 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | 1658 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); |
1655 | ret = fixup_pi_state_owner(uaddr, q, owner, fshared); | 1659 | ret = fixup_pi_state_owner(uaddr, q, owner); |
1656 | goto out; | 1660 | goto out; |
1657 | } | 1661 | } |
1658 | 1662 | ||
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1715 | * futex_wait_setup() - Prepare to wait on a futex | 1719 | * futex_wait_setup() - Prepare to wait on a futex |
1716 | * @uaddr: the futex userspace address | 1720 | * @uaddr: the futex userspace address |
1717 | * @val: the expected value | 1721 | * @val: the expected value |
1718 | * @fshared: whether the futex is shared (1) or not (0) | 1722 | * @flags: futex flags (FLAGS_SHARED, etc.) |
1719 | * @q: the associated futex_q | 1723 | * @q: the associated futex_q |
1720 | * @hb: storage for hash_bucket pointer to be returned to caller | 1724 | * @hb: storage for hash_bucket pointer to be returned to caller |
1721 | * | 1725 | * |
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1728 | * 0 - uaddr contains val and hb has been locked | 1732 | * 0 - uaddr contains val and hb has been locked |
1729 | * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | 1733 | * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
1730 | */ | 1734 | */ |
1731 | static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | 1735 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
1732 | struct futex_q *q, struct futex_hash_bucket **hb) | 1736 | struct futex_q *q, struct futex_hash_bucket **hb) |
1733 | { | 1737 | { |
1734 | u32 uval; | 1738 | u32 uval; |
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | |||
1752 | * rare, but normal. | 1756 | * rare, but normal. |
1753 | */ | 1757 | */ |
1754 | retry: | 1758 | retry: |
1755 | q->key = FUTEX_KEY_INIT; | 1759 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); |
1756 | ret = get_futex_key(uaddr, fshared, &q->key); | ||
1757 | if (unlikely(ret != 0)) | 1760 | if (unlikely(ret != 0)) |
1758 | return ret; | 1761 | return ret; |
1759 | 1762 | ||
@@ -1769,10 +1772,10 @@ retry_private: | |||
1769 | if (ret) | 1772 | if (ret) |
1770 | goto out; | 1773 | goto out; |
1771 | 1774 | ||
1772 | if (!fshared) | 1775 | if (!(flags & FLAGS_SHARED)) |
1773 | goto retry_private; | 1776 | goto retry_private; |
1774 | 1777 | ||
1775 | put_futex_key(fshared, &q->key); | 1778 | put_futex_key(&q->key); |
1776 | goto retry; | 1779 | goto retry; |
1777 | } | 1780 | } |
1778 | 1781 | ||
@@ -1783,32 +1786,29 @@ retry_private: | |||
1783 | 1786 | ||
1784 | out: | 1787 | out: |
1785 | if (ret) | 1788 | if (ret) |
1786 | put_futex_key(fshared, &q->key); | 1789 | put_futex_key(&q->key); |
1787 | return ret; | 1790 | return ret; |
1788 | } | 1791 | } |
1789 | 1792 | ||
1790 | static int futex_wait(u32 __user *uaddr, int fshared, | 1793 | static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, |
1791 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) | 1794 | ktime_t *abs_time, u32 bitset) |
1792 | { | 1795 | { |
1793 | struct hrtimer_sleeper timeout, *to = NULL; | 1796 | struct hrtimer_sleeper timeout, *to = NULL; |
1794 | struct restart_block *restart; | 1797 | struct restart_block *restart; |
1795 | struct futex_hash_bucket *hb; | 1798 | struct futex_hash_bucket *hb; |
1796 | struct futex_q q; | 1799 | struct futex_q q = futex_q_init; |
1797 | int ret; | 1800 | int ret; |
1798 | 1801 | ||
1799 | if (!bitset) | 1802 | if (!bitset) |
1800 | return -EINVAL; | 1803 | return -EINVAL; |
1801 | |||
1802 | q.pi_state = NULL; | ||
1803 | q.bitset = bitset; | 1804 | q.bitset = bitset; |
1804 | q.rt_waiter = NULL; | ||
1805 | q.requeue_pi_key = NULL; | ||
1806 | 1805 | ||
1807 | if (abs_time) { | 1806 | if (abs_time) { |
1808 | to = &timeout; | 1807 | to = &timeout; |
1809 | 1808 | ||
1810 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 1809 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
1811 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1810 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
1811 | HRTIMER_MODE_ABS); | ||
1812 | hrtimer_init_sleeper(to, current); | 1812 | hrtimer_init_sleeper(to, current); |
1813 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 1813 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
1814 | current->timer_slack_ns); | 1814 | current->timer_slack_ns); |
@@ -1819,7 +1819,7 @@ retry: | |||
1819 | * Prepare to wait on uaddr. On success, holds hb lock and increments | 1819 | * Prepare to wait on uaddr. On success, holds hb lock and increments |
1820 | * q.key refs. | 1820 | * q.key refs. |
1821 | */ | 1821 | */ |
1822 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1822 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); |
1823 | if (ret) | 1823 | if (ret) |
1824 | goto out; | 1824 | goto out; |
1825 | 1825 | ||
@@ -1852,12 +1852,7 @@ retry: | |||
1852 | restart->futex.val = val; | 1852 | restart->futex.val = val; |
1853 | restart->futex.time = abs_time->tv64; | 1853 | restart->futex.time = abs_time->tv64; |
1854 | restart->futex.bitset = bitset; | 1854 | restart->futex.bitset = bitset; |
1855 | restart->futex.flags = FLAGS_HAS_TIMEOUT; | 1855 | restart->futex.flags = flags; |
1856 | |||
1857 | if (fshared) | ||
1858 | restart->futex.flags |= FLAGS_SHARED; | ||
1859 | if (clockrt) | ||
1860 | restart->futex.flags |= FLAGS_CLOCKRT; | ||
1861 | 1856 | ||
1862 | ret = -ERESTART_RESTARTBLOCK; | 1857 | ret = -ERESTART_RESTARTBLOCK; |
1863 | 1858 | ||
@@ -1873,7 +1868,6 @@ out: | |||
1873 | static long futex_wait_restart(struct restart_block *restart) | 1868 | static long futex_wait_restart(struct restart_block *restart) |
1874 | { | 1869 | { |
1875 | u32 __user *uaddr = restart->futex.uaddr; | 1870 | u32 __user *uaddr = restart->futex.uaddr; |
1876 | int fshared = 0; | ||
1877 | ktime_t t, *tp = NULL; | 1871 | ktime_t t, *tp = NULL; |
1878 | 1872 | ||
1879 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | 1873 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1881 | tp = &t; | 1875 | tp = &t; |
1882 | } | 1876 | } |
1883 | restart->fn = do_no_restart_syscall; | 1877 | restart->fn = do_no_restart_syscall; |
1884 | if (restart->futex.flags & FLAGS_SHARED) | 1878 | |
1885 | fshared = 1; | 1879 | return (long)futex_wait(uaddr, restart->futex.flags, |
1886 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, | 1880 | restart->futex.val, tp, restart->futex.bitset); |
1887 | restart->futex.bitset, | ||
1888 | restart->futex.flags & FLAGS_CLOCKRT); | ||
1889 | } | 1881 | } |
1890 | 1882 | ||
1891 | 1883 | ||
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1895 | * if there are waiters then it will block, it does PI, etc. (Due to | 1887 | * if there are waiters then it will block, it does PI, etc. (Due to |
1896 | * races the kernel might see a 0 value of the futex too.) | 1888 | * races the kernel might see a 0 value of the futex too.) |
1897 | */ | 1889 | */ |
1898 | static int futex_lock_pi(u32 __user *uaddr, int fshared, | 1890 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, |
1899 | int detect, ktime_t *time, int trylock) | 1891 | ktime_t *time, int trylock) |
1900 | { | 1892 | { |
1901 | struct hrtimer_sleeper timeout, *to = NULL; | 1893 | struct hrtimer_sleeper timeout, *to = NULL; |
1902 | struct futex_hash_bucket *hb; | 1894 | struct futex_hash_bucket *hb; |
1903 | struct futex_q q; | 1895 | struct futex_q q = futex_q_init; |
1904 | int res, ret; | 1896 | int res, ret; |
1905 | 1897 | ||
1906 | if (refill_pi_state_cache()) | 1898 | if (refill_pi_state_cache()) |
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1914 | hrtimer_set_expires(&to->timer, *time); | 1906 | hrtimer_set_expires(&to->timer, *time); |
1915 | } | 1907 | } |
1916 | 1908 | ||
1917 | q.pi_state = NULL; | ||
1918 | q.rt_waiter = NULL; | ||
1919 | q.requeue_pi_key = NULL; | ||
1920 | retry: | 1909 | retry: |
1921 | q.key = FUTEX_KEY_INIT; | 1910 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); |
1922 | ret = get_futex_key(uaddr, fshared, &q.key); | ||
1923 | if (unlikely(ret != 0)) | 1911 | if (unlikely(ret != 0)) |
1924 | goto out; | 1912 | goto out; |
1925 | 1913 | ||
@@ -1941,7 +1929,7 @@ retry_private: | |||
1941 | * exit to complete. | 1929 | * exit to complete. |
1942 | */ | 1930 | */ |
1943 | queue_unlock(&q, hb); | 1931 | queue_unlock(&q, hb); |
1944 | put_futex_key(fshared, &q.key); | 1932 | put_futex_key(&q.key); |
1945 | cond_resched(); | 1933 | cond_resched(); |
1946 | goto retry; | 1934 | goto retry; |
1947 | default: | 1935 | default: |
@@ -1971,7 +1959,7 @@ retry_private: | |||
1971 | * Fixup the pi_state owner and possibly acquire the lock if we | 1959 | * Fixup the pi_state owner and possibly acquire the lock if we |
1972 | * haven't already. | 1960 | * haven't already. |
1973 | */ | 1961 | */ |
1974 | res = fixup_owner(uaddr, fshared, &q, !ret); | 1962 | res = fixup_owner(uaddr, &q, !ret); |
1975 | /* | 1963 | /* |
1976 | * If fixup_owner() returned an error, proprogate that. If it acquired | 1964 | * If fixup_owner() returned an error, proprogate that. If it acquired |
1977 | * the lock, clear our -ETIMEDOUT or -EINTR. | 1965 | * the lock, clear our -ETIMEDOUT or -EINTR. |
@@ -1995,7 +1983,7 @@ out_unlock_put_key: | |||
1995 | queue_unlock(&q, hb); | 1983 | queue_unlock(&q, hb); |
1996 | 1984 | ||
1997 | out_put_key: | 1985 | out_put_key: |
1998 | put_futex_key(fshared, &q.key); | 1986 | put_futex_key(&q.key); |
1999 | out: | 1987 | out: |
2000 | if (to) | 1988 | if (to) |
2001 | destroy_hrtimer_on_stack(&to->timer); | 1989 | destroy_hrtimer_on_stack(&to->timer); |
@@ -2008,10 +1996,10 @@ uaddr_faulted: | |||
2008 | if (ret) | 1996 | if (ret) |
2009 | goto out_put_key; | 1997 | goto out_put_key; |
2010 | 1998 | ||
2011 | if (!fshared) | 1999 | if (!(flags & FLAGS_SHARED)) |
2012 | goto retry_private; | 2000 | goto retry_private; |
2013 | 2001 | ||
2014 | put_futex_key(fshared, &q.key); | 2002 | put_futex_key(&q.key); |
2015 | goto retry; | 2003 | goto retry; |
2016 | } | 2004 | } |
2017 | 2005 | ||
@@ -2020,7 +2008,7 @@ uaddr_faulted: | |||
2020 | * This is the in-kernel slowpath: we look up the PI state (if any), | 2008 | * This is the in-kernel slowpath: we look up the PI state (if any), |
2021 | * and do the rt-mutex unlock. | 2009 | * and do the rt-mutex unlock. |
2022 | */ | 2010 | */ |
2023 | static int futex_unlock_pi(u32 __user *uaddr, int fshared) | 2011 | static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) |
2024 | { | 2012 | { |
2025 | struct futex_hash_bucket *hb; | 2013 | struct futex_hash_bucket *hb; |
2026 | struct futex_q *this, *next; | 2014 | struct futex_q *this, *next; |
@@ -2038,7 +2026,7 @@ retry: | |||
2038 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) | 2026 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) |
2039 | return -EPERM; | 2027 | return -EPERM; |
2040 | 2028 | ||
2041 | ret = get_futex_key(uaddr, fshared, &key); | 2029 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
2042 | if (unlikely(ret != 0)) | 2030 | if (unlikely(ret != 0)) |
2043 | goto out; | 2031 | goto out; |
2044 | 2032 | ||
@@ -2093,14 +2081,14 @@ retry: | |||
2093 | 2081 | ||
2094 | out_unlock: | 2082 | out_unlock: |
2095 | spin_unlock(&hb->lock); | 2083 | spin_unlock(&hb->lock); |
2096 | put_futex_key(fshared, &key); | 2084 | put_futex_key(&key); |
2097 | 2085 | ||
2098 | out: | 2086 | out: |
2099 | return ret; | 2087 | return ret; |
2100 | 2088 | ||
2101 | pi_faulted: | 2089 | pi_faulted: |
2102 | spin_unlock(&hb->lock); | 2090 | spin_unlock(&hb->lock); |
2103 | put_futex_key(fshared, &key); | 2091 | put_futex_key(&key); |
2104 | 2092 | ||
2105 | ret = fault_in_user_writeable(uaddr); | 2093 | ret = fault_in_user_writeable(uaddr); |
2106 | if (!ret) | 2094 | if (!ret) |
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2160 | /** | 2148 | /** |
2161 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | 2149 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 |
2162 | * @uaddr: the futex we initially wait on (non-pi) | 2150 | * @uaddr: the futex we initially wait on (non-pi) |
2163 | * @fshared: whether the futexes are shared (1) or not (0). They must be | 2151 | * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be |
2164 | * the same type, no requeueing from private to shared, etc. | 2152 | * the same type, no requeueing from private to shared, etc. |
2165 | * @val: the expected value of uaddr | 2153 | * @val: the expected value of uaddr |
2166 | * @abs_time: absolute timeout | 2154 | * @abs_time: absolute timeout |
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2198 | * 0 - On success | 2186 | * 0 - On success |
2199 | * <0 - On error | 2187 | * <0 - On error |
2200 | */ | 2188 | */ |
2201 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | 2189 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
2202 | u32 val, ktime_t *abs_time, u32 bitset, | 2190 | u32 val, ktime_t *abs_time, u32 bitset, |
2203 | int clockrt, u32 __user *uaddr2) | 2191 | u32 __user *uaddr2) |
2204 | { | 2192 | { |
2205 | struct hrtimer_sleeper timeout, *to = NULL; | 2193 | struct hrtimer_sleeper timeout, *to = NULL; |
2206 | struct rt_mutex_waiter rt_waiter; | 2194 | struct rt_mutex_waiter rt_waiter; |
2207 | struct rt_mutex *pi_mutex = NULL; | 2195 | struct rt_mutex *pi_mutex = NULL; |
2208 | struct futex_hash_bucket *hb; | 2196 | struct futex_hash_bucket *hb; |
2209 | union futex_key key2; | 2197 | union futex_key key2 = FUTEX_KEY_INIT; |
2210 | struct futex_q q; | 2198 | struct futex_q q = futex_q_init; |
2211 | int res, ret; | 2199 | int res, ret; |
2212 | 2200 | ||
2213 | if (!bitset) | 2201 | if (!bitset) |
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2215 | 2203 | ||
2216 | if (abs_time) { | 2204 | if (abs_time) { |
2217 | to = &timeout; | 2205 | to = &timeout; |
2218 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 2206 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
2219 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 2207 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
2208 | HRTIMER_MODE_ABS); | ||
2220 | hrtimer_init_sleeper(to, current); | 2209 | hrtimer_init_sleeper(to, current); |
2221 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 2210 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
2222 | current->timer_slack_ns); | 2211 | current->timer_slack_ns); |
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2229 | debug_rt_mutex_init_waiter(&rt_waiter); | 2218 | debug_rt_mutex_init_waiter(&rt_waiter); |
2230 | rt_waiter.task = NULL; | 2219 | rt_waiter.task = NULL; |
2231 | 2220 | ||
2232 | key2 = FUTEX_KEY_INIT; | 2221 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
2233 | ret = get_futex_key(uaddr2, fshared, &key2); | ||
2234 | if (unlikely(ret != 0)) | 2222 | if (unlikely(ret != 0)) |
2235 | goto out; | 2223 | goto out; |
2236 | 2224 | ||
2237 | q.pi_state = NULL; | ||
2238 | q.bitset = bitset; | 2225 | q.bitset = bitset; |
2239 | q.rt_waiter = &rt_waiter; | 2226 | q.rt_waiter = &rt_waiter; |
2240 | q.requeue_pi_key = &key2; | 2227 | q.requeue_pi_key = &key2; |
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2243 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | 2230 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref |
2244 | * count. | 2231 | * count. |
2245 | */ | 2232 | */ |
2246 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2233 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); |
2247 | if (ret) | 2234 | if (ret) |
2248 | goto out_key2; | 2235 | goto out_key2; |
2249 | 2236 | ||
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2273 | */ | 2260 | */ |
2274 | if (q.pi_state && (q.pi_state->owner != current)) { | 2261 | if (q.pi_state && (q.pi_state->owner != current)) { |
2275 | spin_lock(q.lock_ptr); | 2262 | spin_lock(q.lock_ptr); |
2276 | ret = fixup_pi_state_owner(uaddr2, &q, current, | 2263 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
2277 | fshared); | ||
2278 | spin_unlock(q.lock_ptr); | 2264 | spin_unlock(q.lock_ptr); |
2279 | } | 2265 | } |
2280 | } else { | 2266 | } else { |
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2293 | * Fixup the pi_state owner and possibly acquire the lock if we | 2279 | * Fixup the pi_state owner and possibly acquire the lock if we |
2294 | * haven't already. | 2280 | * haven't already. |
2295 | */ | 2281 | */ |
2296 | res = fixup_owner(uaddr2, fshared, &q, !ret); | 2282 | res = fixup_owner(uaddr2, &q, !ret); |
2297 | /* | 2283 | /* |
2298 | * If fixup_owner() returned an error, proprogate that. If it | 2284 | * If fixup_owner() returned an error, proprogate that. If it |
2299 | * acquired the lock, clear -ETIMEDOUT or -EINTR. | 2285 | * acquired the lock, clear -ETIMEDOUT or -EINTR. |
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2324 | } | 2310 | } |
2325 | 2311 | ||
2326 | out_put_keys: | 2312 | out_put_keys: |
2327 | put_futex_key(fshared, &q.key); | 2313 | put_futex_key(&q.key); |
2328 | out_key2: | 2314 | out_key2: |
2329 | put_futex_key(fshared, &key2); | 2315 | put_futex_key(&key2); |
2330 | 2316 | ||
2331 | out: | 2317 | out: |
2332 | if (to) { | 2318 | if (to) { |
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr) | |||
2551 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2537 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
2552 | u32 __user *uaddr2, u32 val2, u32 val3) | 2538 | u32 __user *uaddr2, u32 val2, u32 val3) |
2553 | { | 2539 | { |
2554 | int clockrt, ret = -ENOSYS; | 2540 | int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; |
2555 | int cmd = op & FUTEX_CMD_MASK; | 2541 | unsigned int flags = 0; |
2556 | int fshared = 0; | ||
2557 | 2542 | ||
2558 | if (!(op & FUTEX_PRIVATE_FLAG)) | 2543 | if (!(op & FUTEX_PRIVATE_FLAG)) |
2559 | fshared = 1; | 2544 | flags |= FLAGS_SHARED; |
2560 | 2545 | ||
2561 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2546 | if (op & FUTEX_CLOCK_REALTIME) { |
2562 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) | 2547 | flags |= FLAGS_CLOCKRT; |
2563 | return -ENOSYS; | 2548 | if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
2549 | return -ENOSYS; | ||
2550 | } | ||
2564 | 2551 | ||
2565 | switch (cmd) { | 2552 | switch (cmd) { |
2566 | case FUTEX_WAIT: | 2553 | case FUTEX_WAIT: |
2567 | val3 = FUTEX_BITSET_MATCH_ANY; | 2554 | val3 = FUTEX_BITSET_MATCH_ANY; |
2568 | case FUTEX_WAIT_BITSET: | 2555 | case FUTEX_WAIT_BITSET: |
2569 | ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); | 2556 | ret = futex_wait(uaddr, flags, val, timeout, val3); |
2570 | break; | 2557 | break; |
2571 | case FUTEX_WAKE: | 2558 | case FUTEX_WAKE: |
2572 | val3 = FUTEX_BITSET_MATCH_ANY; | 2559 | val3 = FUTEX_BITSET_MATCH_ANY; |
2573 | case FUTEX_WAKE_BITSET: | 2560 | case FUTEX_WAKE_BITSET: |
2574 | ret = futex_wake(uaddr, fshared, val, val3); | 2561 | ret = futex_wake(uaddr, flags, val, val3); |
2575 | break; | 2562 | break; |
2576 | case FUTEX_REQUEUE: | 2563 | case FUTEX_REQUEUE: |
2577 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); | 2564 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); |
2578 | break; | 2565 | break; |
2579 | case FUTEX_CMP_REQUEUE: | 2566 | case FUTEX_CMP_REQUEUE: |
2580 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2567 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); |
2581 | 0); | ||
2582 | break; | 2568 | break; |
2583 | case FUTEX_WAKE_OP: | 2569 | case FUTEX_WAKE_OP: |
2584 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2570 | ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
2585 | break; | 2571 | break; |
2586 | case FUTEX_LOCK_PI: | 2572 | case FUTEX_LOCK_PI: |
2587 | if (futex_cmpxchg_enabled) | 2573 | if (futex_cmpxchg_enabled) |
2588 | ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); | 2574 | ret = futex_lock_pi(uaddr, flags, val, timeout, 0); |
2589 | break; | 2575 | break; |
2590 | case FUTEX_UNLOCK_PI: | 2576 | case FUTEX_UNLOCK_PI: |
2591 | if (futex_cmpxchg_enabled) | 2577 | if (futex_cmpxchg_enabled) |
2592 | ret = futex_unlock_pi(uaddr, fshared); | 2578 | ret = futex_unlock_pi(uaddr, flags); |
2593 | break; | 2579 | break; |
2594 | case FUTEX_TRYLOCK_PI: | 2580 | case FUTEX_TRYLOCK_PI: |
2595 | if (futex_cmpxchg_enabled) | 2581 | if (futex_cmpxchg_enabled) |
2596 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2582 | ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); |
2597 | break; | 2583 | break; |
2598 | case FUTEX_WAIT_REQUEUE_PI: | 2584 | case FUTEX_WAIT_REQUEUE_PI: |
2599 | val3 = FUTEX_BITSET_MATCH_ANY; | 2585 | val3 = FUTEX_BITSET_MATCH_ANY; |
2600 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | 2586 | ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
2601 | clockrt, uaddr2); | 2587 | uaddr2); |
2602 | break; | 2588 | break; |
2603 | case FUTEX_CMP_REQUEUE_PI: | 2589 | case FUTEX_CMP_REQUEUE_PI: |
2604 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2590 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
2605 | 1); | ||
2606 | break; | 2591 | break; |
2607 | default: | 2592 | default: |
2608 | ret = -ENOSYS; | 2593 | ret = -ENOSYS; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6c..f2429fc3438 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
516 | 516 | ||
517 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 517 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
518 | struct hrtimer *timer; | 518 | struct hrtimer *timer; |
519 | struct timerqueue_node *next; | ||
519 | 520 | ||
520 | if (!base->first) | 521 | next = timerqueue_getnext(&base->active); |
522 | if (!next) | ||
521 | continue; | 523 | continue; |
522 | timer = rb_entry(base->first, struct hrtimer, node); | 524 | timer = container_of(next, struct hrtimer, node); |
525 | |||
523 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 526 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
524 | /* | 527 | /* |
525 | * clock_was_set() has changed base->offset so the | 528 | * clock_was_set() has changed base->offset so the |
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); | |||
840 | static int enqueue_hrtimer(struct hrtimer *timer, | 843 | static int enqueue_hrtimer(struct hrtimer *timer, |
841 | struct hrtimer_clock_base *base) | 844 | struct hrtimer_clock_base *base) |
842 | { | 845 | { |
843 | struct rb_node **link = &base->active.rb_node; | ||
844 | struct rb_node *parent = NULL; | ||
845 | struct hrtimer *entry; | ||
846 | int leftmost = 1; | ||
847 | |||
848 | debug_activate(timer); | 846 | debug_activate(timer); |
849 | 847 | ||
850 | /* | 848 | timerqueue_add(&base->active, &timer->node); |
851 | * Find the right place in the rbtree: | ||
852 | */ | ||
853 | while (*link) { | ||
854 | parent = *link; | ||
855 | entry = rb_entry(parent, struct hrtimer, node); | ||
856 | /* | ||
857 | * We dont care about collisions. Nodes with | ||
858 | * the same expiry time stay together. | ||
859 | */ | ||
860 | if (hrtimer_get_expires_tv64(timer) < | ||
861 | hrtimer_get_expires_tv64(entry)) { | ||
862 | link = &(*link)->rb_left; | ||
863 | } else { | ||
864 | link = &(*link)->rb_right; | ||
865 | leftmost = 0; | ||
866 | } | ||
867 | } | ||
868 | |||
869 | /* | ||
870 | * Insert the timer to the rbtree and check whether it | ||
871 | * replaces the first pending timer | ||
872 | */ | ||
873 | if (leftmost) | ||
874 | base->first = &timer->node; | ||
875 | 849 | ||
876 | rb_link_node(&timer->node, parent, link); | ||
877 | rb_insert_color(&timer->node, &base->active); | ||
878 | /* | 850 | /* |
879 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | 851 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the |
880 | * state of a possibly running callback. | 852 | * state of a possibly running callback. |
881 | */ | 853 | */ |
882 | timer->state |= HRTIMER_STATE_ENQUEUED; | 854 | timer->state |= HRTIMER_STATE_ENQUEUED; |
883 | 855 | ||
884 | return leftmost; | 856 | return (&timer->node == base->active.next); |
885 | } | 857 | } |
886 | 858 | ||
887 | /* | 859 | /* |
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
901 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 873 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
902 | goto out; | 874 | goto out; |
903 | 875 | ||
904 | /* | 876 | if (&timer->node == timerqueue_getnext(&base->active)) { |
905 | * Remove the timer from the rbtree and replace the first | ||
906 | * entry pointer if necessary. | ||
907 | */ | ||
908 | if (base->first == &timer->node) { | ||
909 | base->first = rb_next(&timer->node); | ||
910 | #ifdef CONFIG_HIGH_RES_TIMERS | 877 | #ifdef CONFIG_HIGH_RES_TIMERS |
911 | /* Reprogram the clock event device. if enabled */ | 878 | /* Reprogram the clock event device. if enabled */ |
912 | if (reprogram && hrtimer_hres_active()) { | 879 | if (reprogram && hrtimer_hres_active()) { |
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
919 | } | 886 | } |
920 | #endif | 887 | #endif |
921 | } | 888 | } |
922 | rb_erase(&timer->node, &base->active); | 889 | timerqueue_del(&base->active, &timer->node); |
923 | out: | 890 | out: |
924 | timer->state = newstate; | 891 | timer->state = newstate; |
925 | } | 892 | } |
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void) | |||
1128 | if (!hrtimer_hres_active()) { | 1095 | if (!hrtimer_hres_active()) { |
1129 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 1096 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
1130 | struct hrtimer *timer; | 1097 | struct hrtimer *timer; |
1098 | struct timerqueue_node *next; | ||
1131 | 1099 | ||
1132 | if (!base->first) | 1100 | next = timerqueue_getnext(&base->active); |
1101 | if (!next) | ||
1133 | continue; | 1102 | continue; |
1134 | 1103 | ||
1135 | timer = rb_entry(base->first, struct hrtimer, node); | 1104 | timer = container_of(next, struct hrtimer, node); |
1136 | delta.tv64 = hrtimer_get_expires_tv64(timer); | 1105 | delta.tv64 = hrtimer_get_expires_tv64(timer); |
1137 | delta = ktime_sub(delta, base->get_time()); | 1106 | delta = ktime_sub(delta, base->get_time()); |
1138 | if (delta.tv64 < mindelta.tv64) | 1107 | if (delta.tv64 < mindelta.tv64) |
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1162 | 1131 | ||
1163 | timer->base = &cpu_base->clock_base[clock_id]; | 1132 | timer->base = &cpu_base->clock_base[clock_id]; |
1164 | hrtimer_init_timer_hres(timer); | 1133 | hrtimer_init_timer_hres(timer); |
1134 | timerqueue_init(&timer->node); | ||
1165 | 1135 | ||
1166 | #ifdef CONFIG_TIMER_STATS | 1136 | #ifdef CONFIG_TIMER_STATS |
1167 | timer->start_site = NULL; | 1137 | timer->start_site = NULL; |
@@ -1278,14 +1248,14 @@ retry: | |||
1278 | 1248 | ||
1279 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1249 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1280 | ktime_t basenow; | 1250 | ktime_t basenow; |
1281 | struct rb_node *node; | 1251 | struct timerqueue_node *node; |
1282 | 1252 | ||
1283 | basenow = ktime_add(now, base->offset); | 1253 | basenow = ktime_add(now, base->offset); |
1284 | 1254 | ||
1285 | while ((node = base->first)) { | 1255 | while ((node = timerqueue_getnext(&base->active))) { |
1286 | struct hrtimer *timer; | 1256 | struct hrtimer *timer; |
1287 | 1257 | ||
1288 | timer = rb_entry(node, struct hrtimer, node); | 1258 | timer = container_of(node, struct hrtimer, node); |
1289 | 1259 | ||
1290 | /* | 1260 | /* |
1291 | * The immediate goal for using the softexpires is | 1261 | * The immediate goal for using the softexpires is |
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void) | |||
1441 | */ | 1411 | */ |
1442 | void hrtimer_run_queues(void) | 1412 | void hrtimer_run_queues(void) |
1443 | { | 1413 | { |
1444 | struct rb_node *node; | 1414 | struct timerqueue_node *node; |
1445 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1415 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1446 | struct hrtimer_clock_base *base; | 1416 | struct hrtimer_clock_base *base; |
1447 | int index, gettime = 1; | 1417 | int index, gettime = 1; |
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void) | |||
1451 | 1421 | ||
1452 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { | 1422 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
1453 | base = &cpu_base->clock_base[index]; | 1423 | base = &cpu_base->clock_base[index]; |
1454 | 1424 | if (!timerqueue_getnext(&base->active)) | |
1455 | if (!base->first) | ||
1456 | continue; | 1425 | continue; |
1457 | 1426 | ||
1458 | if (gettime) { | 1427 | if (gettime) { |
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void) | |||
1462 | 1431 | ||
1463 | raw_spin_lock(&cpu_base->lock); | 1432 | raw_spin_lock(&cpu_base->lock); |
1464 | 1433 | ||
1465 | while ((node = base->first)) { | 1434 | while ((node = timerqueue_getnext(&base->active))) { |
1466 | struct hrtimer *timer; | 1435 | struct hrtimer *timer; |
1467 | 1436 | ||
1468 | timer = rb_entry(node, struct hrtimer, node); | 1437 | timer = container_of(node, struct hrtimer, node); |
1469 | if (base->softirq_time.tv64 <= | 1438 | if (base->softirq_time.tv64 <= |
1470 | hrtimer_get_expires_tv64(timer)) | 1439 | hrtimer_get_expires_tv64(timer)) |
1471 | break; | 1440 | break; |
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1630 | 1599 | ||
1631 | raw_spin_lock_init(&cpu_base->lock); | 1600 | raw_spin_lock_init(&cpu_base->lock); |
1632 | 1601 | ||
1633 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1602 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1634 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1603 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1604 | timerqueue_init_head(&cpu_base->clock_base[i].active); | ||
1605 | } | ||
1635 | 1606 | ||
1636 | hrtimer_init_hres(cpu_base); | 1607 | hrtimer_init_hres(cpu_base); |
1637 | } | 1608 | } |
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
1642 | struct hrtimer_clock_base *new_base) | 1613 | struct hrtimer_clock_base *new_base) |
1643 | { | 1614 | { |
1644 | struct hrtimer *timer; | 1615 | struct hrtimer *timer; |
1645 | struct rb_node *node; | 1616 | struct timerqueue_node *node; |
1646 | 1617 | ||
1647 | while ((node = rb_first(&old_base->active))) { | 1618 | while ((node = timerqueue_getnext(&old_base->active))) { |
1648 | timer = rb_entry(node, struct hrtimer, node); | 1619 | timer = container_of(node, struct hrtimer, node); |
1649 | BUG_ON(hrtimer_callback_running(timer)); | 1620 | BUG_ON(hrtimer_callback_running(timer)); |
1650 | debug_deactivate(timer); | 1621 | debug_deactivate(timer); |
1651 | 1622 | ||
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2c9120f0afc..086adf25a55 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = { | |||
620 | .read = hw_breakpoint_pmu_read, | 620 | .read = hw_breakpoint_pmu_read, |
621 | }; | 621 | }; |
622 | 622 | ||
623 | static int __init init_hw_breakpoint(void) | 623 | int __init init_hw_breakpoint(void) |
624 | { | 624 | { |
625 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
626 | int cpu, err_cpu; | 626 | int cpu, err_cpu; |
@@ -641,7 +641,7 @@ static int __init init_hw_breakpoint(void) | |||
641 | 641 | ||
642 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
643 | 643 | ||
644 | perf_pmu_register(&perf_breakpoint); | 644 | perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); |
645 | 645 | ||
646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
647 | 647 | ||
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void) | |||
655 | 655 | ||
656 | return -ENOMEM; | 656 | return -ENOMEM; |
657 | } | 657 | } |
658 | core_initcall(init_hw_breakpoint); | ||
659 | 658 | ||
660 | 659 | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f92acc5f95..0caa59f747d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
577 | */ | 577 | */ |
578 | static int irq_thread(void *data) | 578 | static int irq_thread(void *data) |
579 | { | 579 | { |
580 | struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; | 580 | static const struct sched_param param = { |
581 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
582 | }; | ||
581 | struct irqaction *action = data; | 583 | struct irqaction *action = data; |
582 | struct irq_desc *desc = irq_to_desc(action->irq); | 584 | struct irq_desc *desc = irq_to_desc(action->irq); |
583 | int wake, oneshot = desc->status & IRQ_ONESHOT; | 585 | int wake, oneshot = desc->status & IRQ_ONESHOT; |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 01b1d3a8898..6c8a2a9f8a7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) | |||
214 | 214 | ||
215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) | 215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) |
216 | { | 216 | { |
217 | return single_open(file, irq_spurious_proc_show, NULL); | 217 | return single_open(file, irq_spurious_proc_show, PDE(inode)->data); |
218 | } | 218 | } |
219 | 219 | ||
220 | static const struct file_operations irq_spurious_proc_fops = { | 220 | static const struct file_operations irq_spurious_proc_fops = { |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f16763ff848..90f881904bb 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -145,7 +145,9 @@ void irq_work_run(void) | |||
145 | * Clear the BUSY bit and return to the free state if | 145 | * Clear the BUSY bit and return to the free state if |
146 | * no-one else claimed it meanwhile. | 146 | * no-one else claimed it meanwhile. |
147 | */ | 147 | */ |
148 | cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); | 148 | (void)cmpxchg(&entry->next, |
149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
150 | NULL); | ||
149 | } | 151 | } |
150 | } | 152 | } |
151 | EXPORT_SYMBOL_GPL(irq_work_run); | 153 | EXPORT_SYMBOL_GPL(irq_work_run); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9737a76e106..7663e5df0e6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) | |||
354 | return p->pre_handler == aggr_pre_handler; | 354 | return p->pre_handler == aggr_pre_handler; |
355 | } | 355 | } |
356 | 356 | ||
357 | /* Return true(!0) if the kprobe is unused */ | ||
358 | static inline int kprobe_unused(struct kprobe *p) | ||
359 | { | ||
360 | return kprobe_aggrprobe(p) && kprobe_disabled(p) && | ||
361 | list_empty(&p->list); | ||
362 | } | ||
363 | |||
357 | /* | 364 | /* |
358 | * Keep all fields in the kprobe consistent | 365 | * Keep all fields in the kprobe consistent |
359 | */ | 366 | */ |
360 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | 367 | static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) |
361 | { | 368 | { |
362 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | 369 | memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); |
363 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | 370 | memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); |
364 | } | 371 | } |
365 | 372 | ||
366 | #ifdef CONFIG_OPTPROBES | 373 | #ifdef CONFIG_OPTPROBES |
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
384 | } | 391 | } |
385 | } | 392 | } |
386 | 393 | ||
394 | /* Free optimized instructions and optimized_kprobe */ | ||
395 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
396 | { | ||
397 | struct optimized_kprobe *op; | ||
398 | |||
399 | op = container_of(p, struct optimized_kprobe, kp); | ||
400 | arch_remove_optimized_kprobe(op); | ||
401 | arch_remove_kprobe(p); | ||
402 | kfree(op); | ||
403 | } | ||
404 | |||
387 | /* Return true(!0) if the kprobe is ready for optimization. */ | 405 | /* Return true(!0) if the kprobe is ready for optimization. */ |
388 | static inline int kprobe_optready(struct kprobe *p) | 406 | static inline int kprobe_optready(struct kprobe *p) |
389 | { | 407 | { |
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p) | |||
397 | return 0; | 415 | return 0; |
398 | } | 416 | } |
399 | 417 | ||
418 | /* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ | ||
419 | static inline int kprobe_disarmed(struct kprobe *p) | ||
420 | { | ||
421 | struct optimized_kprobe *op; | ||
422 | |||
423 | /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ | ||
424 | if (!kprobe_aggrprobe(p)) | ||
425 | return kprobe_disabled(p); | ||
426 | |||
427 | op = container_of(p, struct optimized_kprobe, kp); | ||
428 | |||
429 | return kprobe_disabled(p) && list_empty(&op->list); | ||
430 | } | ||
431 | |||
432 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ | ||
433 | static int __kprobes kprobe_queued(struct kprobe *p) | ||
434 | { | ||
435 | struct optimized_kprobe *op; | ||
436 | |||
437 | if (kprobe_aggrprobe(p)) { | ||
438 | op = container_of(p, struct optimized_kprobe, kp); | ||
439 | if (!list_empty(&op->list)) | ||
440 | return 1; | ||
441 | } | ||
442 | return 0; | ||
443 | } | ||
444 | |||
400 | /* | 445 | /* |
401 | * Return an optimized kprobe whose optimizing code replaces | 446 | * Return an optimized kprobe whose optimizing code replaces |
402 | * instructions including addr (exclude breakpoint). | 447 | * instructions including addr (exclude breakpoint). |
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | |||
422 | 467 | ||
423 | /* Optimization staging list, protected by kprobe_mutex */ | 468 | /* Optimization staging list, protected by kprobe_mutex */ |
424 | static LIST_HEAD(optimizing_list); | 469 | static LIST_HEAD(optimizing_list); |
470 | static LIST_HEAD(unoptimizing_list); | ||
425 | 471 | ||
426 | static void kprobe_optimizer(struct work_struct *work); | 472 | static void kprobe_optimizer(struct work_struct *work); |
427 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
474 | static DECLARE_COMPLETION(optimizer_comp); | ||
428 | #define OPTIMIZE_DELAY 5 | 475 | #define OPTIMIZE_DELAY 5 |
429 | 476 | ||
430 | /* Kprobe jump optimizer */ | 477 | /* |
431 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 478 | * Optimize (replace a breakpoint with a jump) kprobes listed on |
479 | * optimizing_list. | ||
480 | */ | ||
481 | static __kprobes void do_optimize_kprobes(void) | ||
432 | { | 482 | { |
433 | struct optimized_kprobe *op, *tmp; | 483 | /* Optimization is never done when disarmed */ |
434 | 484 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | |
435 | /* Lock modules while optimizing kprobes */ | 485 | list_empty(&optimizing_list)) |
436 | mutex_lock(&module_mutex); | 486 | return; |
437 | mutex_lock(&kprobe_mutex); | ||
438 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
439 | goto end; | ||
440 | |||
441 | /* | ||
442 | * Wait for quiesence period to ensure all running interrupts | ||
443 | * are done. Because optprobe may modify multiple instructions | ||
444 | * there is a chance that Nth instruction is interrupted. In that | ||
445 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
446 | * instruction. This wait is for avoiding it. | ||
447 | */ | ||
448 | synchronize_sched(); | ||
449 | 487 | ||
450 | /* | 488 | /* |
451 | * The optimization/unoptimization refers online_cpus via | 489 | * The optimization/unoptimization refers online_cpus via |
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
459 | */ | 497 | */ |
460 | get_online_cpus(); | 498 | get_online_cpus(); |
461 | mutex_lock(&text_mutex); | 499 | mutex_lock(&text_mutex); |
462 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | 500 | arch_optimize_kprobes(&optimizing_list); |
463 | WARN_ON(kprobe_disabled(&op->kp)); | 501 | mutex_unlock(&text_mutex); |
464 | if (arch_optimize_kprobe(op) < 0) | 502 | put_online_cpus(); |
465 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 503 | } |
466 | list_del_init(&op->list); | 504 | |
505 | /* | ||
506 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | ||
507 | * if needed) kprobes listed on unoptimizing_list. | ||
508 | */ | ||
509 | static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | ||
510 | { | ||
511 | struct optimized_kprobe *op, *tmp; | ||
512 | |||
513 | /* Unoptimization must be done anytime */ | ||
514 | if (list_empty(&unoptimizing_list)) | ||
515 | return; | ||
516 | |||
517 | /* Ditto to do_optimize_kprobes */ | ||
518 | get_online_cpus(); | ||
519 | mutex_lock(&text_mutex); | ||
520 | arch_unoptimize_kprobes(&unoptimizing_list, free_list); | ||
521 | /* Loop free_list for disarming */ | ||
522 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
523 | /* Disarm probes if marked disabled */ | ||
524 | if (kprobe_disabled(&op->kp)) | ||
525 | arch_disarm_kprobe(&op->kp); | ||
526 | if (kprobe_unused(&op->kp)) { | ||
527 | /* | ||
528 | * Remove unused probes from hash list. After waiting | ||
529 | * for synchronization, these probes are reclaimed. | ||
530 | * (reclaiming is done by do_free_cleaned_kprobes.) | ||
531 | */ | ||
532 | hlist_del_rcu(&op->kp.hlist); | ||
533 | } else | ||
534 | list_del_init(&op->list); | ||
467 | } | 535 | } |
468 | mutex_unlock(&text_mutex); | 536 | mutex_unlock(&text_mutex); |
469 | put_online_cpus(); | 537 | put_online_cpus(); |
470 | end: | 538 | } |
539 | |||
540 | /* Reclaim all kprobes on the free_list */ | ||
541 | static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | ||
542 | { | ||
543 | struct optimized_kprobe *op, *tmp; | ||
544 | |||
545 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
546 | BUG_ON(!kprobe_unused(&op->kp)); | ||
547 | list_del_init(&op->list); | ||
548 | free_aggr_kprobe(&op->kp); | ||
549 | } | ||
550 | } | ||
551 | |||
552 | /* Start optimizer after OPTIMIZE_DELAY passed */ | ||
553 | static __kprobes void kick_kprobe_optimizer(void) | ||
554 | { | ||
555 | if (!delayed_work_pending(&optimizing_work)) | ||
556 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
557 | } | ||
558 | |||
559 | /* Kprobe jump optimizer */ | ||
560 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
561 | { | ||
562 | LIST_HEAD(free_list); | ||
563 | |||
564 | /* Lock modules while optimizing kprobes */ | ||
565 | mutex_lock(&module_mutex); | ||
566 | mutex_lock(&kprobe_mutex); | ||
567 | |||
568 | /* | ||
569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | ||
570 | * kprobes before waiting for quiescence period. | ||
571 | */ | ||
572 | do_unoptimize_kprobes(&free_list); | ||
573 | |||
574 | /* | ||
575 | * Step 2: Wait for quiescence period to ensure all running interrupts | ||
576 | * are done. Because optprobe may modify multiple instructions | ||
577 | * there is a chance that Nth instruction is interrupted. In that | ||
578 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
579 | * instruction. This wait is for avoiding it. | ||
580 | */ | ||
581 | synchronize_sched(); | ||
582 | |||
583 | /* Step 3: Optimize kprobes after quiescence period */ | ||
584 | do_optimize_kprobes(); | ||
585 | |||
586 | /* Step 4: Free cleaned kprobes after quiescence period */ | ||
587 | do_free_cleaned_kprobes(&free_list); | ||
588 | |||
471 | mutex_unlock(&kprobe_mutex); | 589 | mutex_unlock(&kprobe_mutex); |
472 | mutex_unlock(&module_mutex); | 590 | mutex_unlock(&module_mutex); |
591 | |||
592 | /* Step 5: Kick optimizer again if needed */ | ||
593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | ||
594 | kick_kprobe_optimizer(); | ||
595 | else | ||
596 | /* Wake up all waiters */ | ||
597 | complete_all(&optimizer_comp); | ||
598 | } | ||
599 | |||
600 | /* Wait for completing optimization and unoptimization */ | ||
601 | static __kprobes void wait_for_kprobe_optimizer(void) | ||
602 | { | ||
603 | if (delayed_work_pending(&optimizing_work)) | ||
604 | wait_for_completion(&optimizer_comp); | ||
473 | } | 605 | } |
474 | 606 | ||
475 | /* Optimize kprobe if p is ready to be optimized */ | 607 | /* Optimize kprobe if p is ready to be optimized */ |
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) | |||
495 | /* Check if it is already optimized. */ | 627 | /* Check if it is already optimized. */ |
496 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | 628 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) |
497 | return; | 629 | return; |
498 | |||
499 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | 630 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; |
500 | list_add(&op->list, &optimizing_list); | 631 | |
501 | if (!delayed_work_pending(&optimizing_work)) | 632 | if (!list_empty(&op->list)) |
502 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | 633 | /* This is under unoptimizing. Just dequeue the probe */ |
634 | list_del_init(&op->list); | ||
635 | else { | ||
636 | list_add(&op->list, &optimizing_list); | ||
637 | kick_kprobe_optimizer(); | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /* Short cut to direct unoptimizing */ | ||
642 | static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | ||
643 | { | ||
644 | get_online_cpus(); | ||
645 | arch_unoptimize_kprobe(op); | ||
646 | put_online_cpus(); | ||
647 | if (kprobe_disabled(&op->kp)) | ||
648 | arch_disarm_kprobe(&op->kp); | ||
503 | } | 649 | } |
504 | 650 | ||
505 | /* Unoptimize a kprobe if p is optimized */ | 651 | /* Unoptimize a kprobe if p is optimized */ |
506 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | 652 | static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) |
507 | { | 653 | { |
508 | struct optimized_kprobe *op; | 654 | struct optimized_kprobe *op; |
509 | 655 | ||
510 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | 656 | if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) |
511 | op = container_of(p, struct optimized_kprobe, kp); | 657 | return; /* This is not an optprobe nor optimized */ |
512 | if (!list_empty(&op->list)) | 658 | |
513 | /* Dequeue from the optimization queue */ | 659 | op = container_of(p, struct optimized_kprobe, kp); |
660 | if (!kprobe_optimized(p)) { | ||
661 | /* Unoptimized or unoptimizing case */ | ||
662 | if (force && !list_empty(&op->list)) { | ||
663 | /* | ||
664 | * Only if this is unoptimizing kprobe and forced, | ||
665 | * forcibly unoptimize it. (No need to unoptimize | ||
666 | * unoptimized kprobe again :) | ||
667 | */ | ||
514 | list_del_init(&op->list); | 668 | list_del_init(&op->list); |
515 | else | 669 | force_unoptimize_kprobe(op); |
516 | /* Replace jump with break */ | 670 | } |
517 | arch_unoptimize_kprobe(op); | 671 | return; |
518 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 672 | } |
673 | |||
674 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
675 | if (!list_empty(&op->list)) { | ||
676 | /* Dequeue from the optimization queue */ | ||
677 | list_del_init(&op->list); | ||
678 | return; | ||
679 | } | ||
680 | /* Optimized kprobe case */ | ||
681 | if (force) | ||
682 | /* Forcibly update the code: this is a special case */ | ||
683 | force_unoptimize_kprobe(op); | ||
684 | else { | ||
685 | list_add(&op->list, &unoptimizing_list); | ||
686 | kick_kprobe_optimizer(); | ||
519 | } | 687 | } |
520 | } | 688 | } |
521 | 689 | ||
690 | /* Cancel unoptimizing for reusing */ | ||
691 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
692 | { | ||
693 | struct optimized_kprobe *op; | ||
694 | |||
695 | BUG_ON(!kprobe_unused(ap)); | ||
696 | /* | ||
697 | * Unused kprobe MUST be on the way of delayed unoptimizing (means | ||
698 | * there is still a relative jump) and disabled. | ||
699 | */ | ||
700 | op = container_of(ap, struct optimized_kprobe, kp); | ||
701 | if (unlikely(list_empty(&op->list))) | ||
702 | printk(KERN_WARNING "Warning: found a stray unused " | ||
703 | "aggrprobe@%p\n", ap->addr); | ||
704 | /* Enable the probe again */ | ||
705 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
706 | /* Optimize it again (remove from op->list) */ | ||
707 | BUG_ON(!kprobe_optready(ap)); | ||
708 | optimize_kprobe(ap); | ||
709 | } | ||
710 | |||
522 | /* Remove optimized instructions */ | 711 | /* Remove optimized instructions */ |
523 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | 712 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) |
524 | { | 713 | { |
525 | struct optimized_kprobe *op; | 714 | struct optimized_kprobe *op; |
526 | 715 | ||
527 | op = container_of(p, struct optimized_kprobe, kp); | 716 | op = container_of(p, struct optimized_kprobe, kp); |
528 | if (!list_empty(&op->list)) { | 717 | if (!list_empty(&op->list)) |
529 | /* Dequeue from the optimization queue */ | 718 | /* Dequeue from the (un)optimization queue */ |
530 | list_del_init(&op->list); | 719 | list_del_init(&op->list); |
531 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 720 | |
532 | } | 721 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; |
533 | /* Don't unoptimize, because the target code will be freed. */ | 722 | /* Don't touch the code, because it is already freed. */ |
534 | arch_remove_optimized_kprobe(op); | 723 | arch_remove_optimized_kprobe(op); |
535 | } | 724 | } |
536 | 725 | ||
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | |||
543 | arch_prepare_optimized_kprobe(op); | 732 | arch_prepare_optimized_kprobe(op); |
544 | } | 733 | } |
545 | 734 | ||
546 | /* Free optimized instructions and optimized_kprobe */ | ||
547 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
548 | { | ||
549 | struct optimized_kprobe *op; | ||
550 | |||
551 | op = container_of(p, struct optimized_kprobe, kp); | ||
552 | arch_remove_optimized_kprobe(op); | ||
553 | kfree(op); | ||
554 | } | ||
555 | |||
556 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 735 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
557 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 736 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
558 | { | 737 | { |
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
587 | op = container_of(ap, struct optimized_kprobe, kp); | 766 | op = container_of(ap, struct optimized_kprobe, kp); |
588 | if (!arch_prepared_optinsn(&op->optinsn)) { | 767 | if (!arch_prepared_optinsn(&op->optinsn)) { |
589 | /* If failed to setup optimizing, fallback to kprobe */ | 768 | /* If failed to setup optimizing, fallback to kprobe */ |
590 | free_aggr_kprobe(ap); | 769 | arch_remove_optimized_kprobe(op); |
770 | kfree(op); | ||
591 | return; | 771 | return; |
592 | } | 772 | } |
593 | 773 | ||
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
631 | return; | 811 | return; |
632 | 812 | ||
633 | kprobes_allow_optimization = false; | 813 | kprobes_allow_optimization = false; |
634 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
635 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
636 | mutex_lock(&text_mutex); | ||
637 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 814 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
638 | head = &kprobe_table[i]; | 815 | head = &kprobe_table[i]; |
639 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 816 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
640 | if (!kprobe_disabled(p)) | 817 | if (!kprobe_disabled(p)) |
641 | unoptimize_kprobe(p); | 818 | unoptimize_kprobe(p, false); |
642 | } | 819 | } |
643 | } | 820 | } |
644 | 821 | /* Wait for unoptimizing completion */ | |
645 | mutex_unlock(&text_mutex); | 822 | wait_for_kprobe_optimizer(); |
646 | put_online_cpus(); | 823 | printk(KERN_INFO "Kprobes globally unoptimized\n"); |
647 | /* Allow all currently running kprobes to complete */ | ||
648 | synchronize_sched(); | ||
649 | } | 824 | } |
650 | 825 | ||
651 | int sysctl_kprobes_optimization; | 826 | int sysctl_kprobes_optimization; |
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
669 | } | 844 | } |
670 | #endif /* CONFIG_SYSCTL */ | 845 | #endif /* CONFIG_SYSCTL */ |
671 | 846 | ||
847 | /* Put a breakpoint for a probe. Must be called with text_mutex locked */ | ||
672 | static void __kprobes __arm_kprobe(struct kprobe *p) | 848 | static void __kprobes __arm_kprobe(struct kprobe *p) |
673 | { | 849 | { |
674 | struct kprobe *old_p; | 850 | struct kprobe *_p; |
675 | 851 | ||
676 | /* Check collision with other optimized kprobes */ | 852 | /* Check collision with other optimized kprobes */ |
677 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 853 | _p = get_optimized_kprobe((unsigned long)p->addr); |
678 | if (unlikely(old_p)) | 854 | if (unlikely(_p)) |
679 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | 855 | /* Fallback to unoptimized kprobe */ |
856 | unoptimize_kprobe(_p, true); | ||
680 | 857 | ||
681 | arch_arm_kprobe(p); | 858 | arch_arm_kprobe(p); |
682 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | 859 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ |
683 | } | 860 | } |
684 | 861 | ||
685 | static void __kprobes __disarm_kprobe(struct kprobe *p) | 862 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ |
863 | static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) | ||
686 | { | 864 | { |
687 | struct kprobe *old_p; | 865 | struct kprobe *_p; |
688 | 866 | ||
689 | unoptimize_kprobe(p); /* Try to unoptimize */ | 867 | unoptimize_kprobe(p, false); /* Try to unoptimize */ |
690 | arch_disarm_kprobe(p); | ||
691 | 868 | ||
692 | /* If another kprobe was blocked, optimize it. */ | 869 | if (!kprobe_queued(p)) { |
693 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 870 | arch_disarm_kprobe(p); |
694 | if (unlikely(old_p)) | 871 | /* If another kprobe was blocked, optimize it. */ |
695 | optimize_kprobe(old_p); | 872 | _p = get_optimized_kprobe((unsigned long)p->addr); |
873 | if (unlikely(_p) && reopt) | ||
874 | optimize_kprobe(_p); | ||
875 | } | ||
876 | /* TODO: reoptimize others after unoptimized this probe */ | ||
696 | } | 877 | } |
697 | 878 | ||
698 | #else /* !CONFIG_OPTPROBES */ | 879 | #else /* !CONFIG_OPTPROBES */ |
699 | 880 | ||
700 | #define optimize_kprobe(p) do {} while (0) | 881 | #define optimize_kprobe(p) do {} while (0) |
701 | #define unoptimize_kprobe(p) do {} while (0) | 882 | #define unoptimize_kprobe(p, f) do {} while (0) |
702 | #define kill_optimized_kprobe(p) do {} while (0) | 883 | #define kill_optimized_kprobe(p) do {} while (0) |
703 | #define prepare_optimized_kprobe(p) do {} while (0) | 884 | #define prepare_optimized_kprobe(p) do {} while (0) |
704 | #define try_to_optimize_kprobe(p) do {} while (0) | 885 | #define try_to_optimize_kprobe(p) do {} while (0) |
705 | #define __arm_kprobe(p) arch_arm_kprobe(p) | 886 | #define __arm_kprobe(p) arch_arm_kprobe(p) |
706 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | 887 | #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) |
888 | #define kprobe_disarmed(p) kprobe_disabled(p) | ||
889 | #define wait_for_kprobe_optimizer() do {} while (0) | ||
890 | |||
891 | /* There should be no unused kprobes that can be reused without optimization */ | ||
892 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
893 | { | ||
894 | printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); | ||
895 | BUG_ON(kprobe_unused(ap)); | ||
896 | } | ||
707 | 897 | ||
708 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 898 | static __kprobes void free_aggr_kprobe(struct kprobe *p) |
709 | { | 899 | { |
900 | arch_remove_kprobe(p); | ||
710 | kfree(p); | 901 | kfree(p); |
711 | } | 902 | } |
712 | 903 | ||
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
732 | /* Disarm a kprobe with text_mutex */ | 923 | /* Disarm a kprobe with text_mutex */ |
733 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 924 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
734 | { | 925 | { |
735 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | 926 | /* Ditto */ |
736 | mutex_lock(&text_mutex); | 927 | mutex_lock(&text_mutex); |
737 | __disarm_kprobe(kp); | 928 | __disarm_kprobe(kp, true); |
738 | mutex_unlock(&text_mutex); | 929 | mutex_unlock(&text_mutex); |
739 | put_online_cpus(); | ||
740 | } | 930 | } |
741 | 931 | ||
742 | /* | 932 | /* |
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
942 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 1132 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
943 | 1133 | ||
944 | if (p->break_handler || p->post_handler) | 1134 | if (p->break_handler || p->post_handler) |
945 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | 1135 | unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ |
946 | 1136 | ||
947 | if (p->break_handler) { | 1137 | if (p->break_handler) { |
948 | if (ap->break_handler) | 1138 | if (ap->break_handler) |
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
993 | * This is the second or subsequent kprobe at the address - handle | 1183 | * This is the second or subsequent kprobe at the address - handle |
994 | * the intricacies | 1184 | * the intricacies |
995 | */ | 1185 | */ |
996 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 1186 | static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, |
997 | struct kprobe *p) | 1187 | struct kprobe *p) |
998 | { | 1188 | { |
999 | int ret = 0; | 1189 | int ret = 0; |
1000 | struct kprobe *ap = old_p; | 1190 | struct kprobe *ap = orig_p; |
1001 | 1191 | ||
1002 | if (!kprobe_aggrprobe(old_p)) { | 1192 | if (!kprobe_aggrprobe(orig_p)) { |
1003 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
1004 | ap = alloc_aggr_kprobe(old_p); | 1194 | ap = alloc_aggr_kprobe(orig_p); |
1005 | if (!ap) | 1195 | if (!ap) |
1006 | return -ENOMEM; | 1196 | return -ENOMEM; |
1007 | init_aggr_kprobe(ap, old_p); | 1197 | init_aggr_kprobe(ap, orig_p); |
1008 | } | 1198 | } else if (kprobe_unused(ap)) |
1199 | /* This probe is going to die. Rescue it */ | ||
1200 | reuse_unused_kprobe(ap); | ||
1009 | 1201 | ||
1010 | if (kprobe_gone(ap)) { | 1202 | if (kprobe_gone(ap)) { |
1011 | /* | 1203 | /* |
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
1039 | return add_new_kprobe(ap, p); | 1231 | return add_new_kprobe(ap, p); |
1040 | } | 1232 | } |
1041 | 1233 | ||
1042 | /* Try to disable aggr_kprobe, and return 1 if succeeded.*/ | ||
1043 | static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) | ||
1044 | { | ||
1045 | struct kprobe *kp; | ||
1046 | |||
1047 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
1048 | if (!kprobe_disabled(kp)) | ||
1049 | /* | ||
1050 | * There is an active probe on the list. | ||
1051 | * We can't disable aggr_kprobe. | ||
1052 | */ | ||
1053 | return 0; | ||
1054 | } | ||
1055 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1056 | return 1; | ||
1057 | } | ||
1058 | |||
1059 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1234 | static int __kprobes in_kprobes_functions(unsigned long addr) |
1060 | { | 1235 | { |
1061 | struct kprobe_blackpoint *kb; | 1236 | struct kprobe_blackpoint *kb; |
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | |||
1098 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
1099 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) | 1274 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) |
1100 | { | 1275 | { |
1101 | struct kprobe *old_p, *list_p; | 1276 | struct kprobe *ap, *list_p; |
1102 | 1277 | ||
1103 | old_p = get_kprobe(p->addr); | 1278 | ap = get_kprobe(p->addr); |
1104 | if (unlikely(!old_p)) | 1279 | if (unlikely(!ap)) |
1105 | return NULL; | 1280 | return NULL; |
1106 | 1281 | ||
1107 | if (p != old_p) { | 1282 | if (p != ap) { |
1108 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 1283 | list_for_each_entry_rcu(list_p, &ap->list, list) |
1109 | if (list_p == p) | 1284 | if (list_p == p) |
1110 | /* kprobe p is a valid probe */ | 1285 | /* kprobe p is a valid probe */ |
1111 | goto valid; | 1286 | goto valid; |
1112 | return NULL; | 1287 | return NULL; |
1113 | } | 1288 | } |
1114 | valid: | 1289 | valid: |
1115 | return old_p; | 1290 | return ap; |
1116 | } | 1291 | } |
1117 | 1292 | ||
1118 | /* Return error if the kprobe is being re-registered */ | 1293 | /* Return error if the kprobe is being re-registered */ |
1119 | static inline int check_kprobe_rereg(struct kprobe *p) | 1294 | static inline int check_kprobe_rereg(struct kprobe *p) |
1120 | { | 1295 | { |
1121 | int ret = 0; | 1296 | int ret = 0; |
1122 | struct kprobe *old_p; | ||
1123 | 1297 | ||
1124 | mutex_lock(&kprobe_mutex); | 1298 | mutex_lock(&kprobe_mutex); |
1125 | old_p = __get_valid_kprobe(p); | 1299 | if (__get_valid_kprobe(p)) |
1126 | if (old_p) | ||
1127 | ret = -EINVAL; | 1300 | ret = -EINVAL; |
1128 | mutex_unlock(&kprobe_mutex); | 1301 | mutex_unlock(&kprobe_mutex); |
1302 | |||
1129 | return ret; | 1303 | return ret; |
1130 | } | 1304 | } |
1131 | 1305 | ||
@@ -1229,67 +1403,121 @@ fail_with_jump_label: | |||
1229 | } | 1403 | } |
1230 | EXPORT_SYMBOL_GPL(register_kprobe); | 1404 | EXPORT_SYMBOL_GPL(register_kprobe); |
1231 | 1405 | ||
1406 | /* Check if all probes on the aggrprobe are disabled */ | ||
1407 | static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | ||
1408 | { | ||
1409 | struct kprobe *kp; | ||
1410 | |||
1411 | list_for_each_entry_rcu(kp, &ap->list, list) | ||
1412 | if (!kprobe_disabled(kp)) | ||
1413 | /* | ||
1414 | * There is an active probe on the list. | ||
1415 | * We can't disable this ap. | ||
1416 | */ | ||
1417 | return 0; | ||
1418 | |||
1419 | return 1; | ||
1420 | } | ||
1421 | |||
1422 | /* Disable one kprobe: must be called with kprobe_mutex locked */ | ||
1423 | static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | ||
1424 | { | ||
1425 | struct kprobe *orig_p; | ||
1426 | |||
1427 | /* Get an original kprobe for return */ | ||
1428 | orig_p = __get_valid_kprobe(p); | ||
1429 | if (unlikely(orig_p == NULL)) | ||
1430 | return NULL; | ||
1431 | |||
1432 | if (!kprobe_disabled(p)) { | ||
1433 | /* Disable probe if it is a child probe */ | ||
1434 | if (p != orig_p) | ||
1435 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1436 | |||
1437 | /* Try to disarm and disable this/parent probe */ | ||
1438 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | ||
1439 | disarm_kprobe(orig_p); | ||
1440 | orig_p->flags |= KPROBE_FLAG_DISABLED; | ||
1441 | } | ||
1442 | } | ||
1443 | |||
1444 | return orig_p; | ||
1445 | } | ||
1446 | |||
1232 | /* | 1447 | /* |
1233 | * Unregister a kprobe without a scheduler synchronization. | 1448 | * Unregister a kprobe without a scheduler synchronization. |
1234 | */ | 1449 | */ |
1235 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) | 1450 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
1236 | { | 1451 | { |
1237 | struct kprobe *old_p, *list_p; | 1452 | struct kprobe *ap, *list_p; |
1238 | 1453 | ||
1239 | old_p = __get_valid_kprobe(p); | 1454 | /* Disable kprobe. This will disarm it if needed. */ |
1240 | if (old_p == NULL) | 1455 | ap = __disable_kprobe(p); |
1456 | if (ap == NULL) | ||
1241 | return -EINVAL; | 1457 | return -EINVAL; |
1242 | 1458 | ||
1243 | if (old_p == p || | 1459 | if (ap == p) |
1244 | (kprobe_aggrprobe(old_p) && | ||
1245 | list_is_singular(&old_p->list))) { | ||
1246 | /* | 1460 | /* |
1247 | * Only probe on the hash list. Disarm only if kprobes are | 1461 | * This probe is an independent(and non-optimized) kprobe |
1248 | * enabled and not gone - otherwise, the breakpoint would | 1462 | * (not an aggrprobe). Remove from the hash list. |
1249 | * already have been removed. We save on flushing icache. | ||
1250 | */ | 1463 | */ |
1251 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1464 | goto disarmed; |
1252 | disarm_kprobe(old_p); | 1465 | |
1253 | hlist_del_rcu(&old_p->hlist); | 1466 | /* Following process expects this probe is an aggrprobe */ |
1254 | } else { | 1467 | WARN_ON(!kprobe_aggrprobe(ap)); |
1468 | |||
1469 | if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) | ||
1470 | /* | ||
1471 | * !disarmed can happen if the probe is under delayed | ||
1472 | * unoptimizing. | ||
1473 | */ | ||
1474 | goto disarmed; | ||
1475 | else { | ||
1476 | /* If disabling probe has special handlers, update aggrprobe */ | ||
1255 | if (p->break_handler && !kprobe_gone(p)) | 1477 | if (p->break_handler && !kprobe_gone(p)) |
1256 | old_p->break_handler = NULL; | 1478 | ap->break_handler = NULL; |
1257 | if (p->post_handler && !kprobe_gone(p)) { | 1479 | if (p->post_handler && !kprobe_gone(p)) { |
1258 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | 1480 | list_for_each_entry_rcu(list_p, &ap->list, list) { |
1259 | if ((list_p != p) && (list_p->post_handler)) | 1481 | if ((list_p != p) && (list_p->post_handler)) |
1260 | goto noclean; | 1482 | goto noclean; |
1261 | } | 1483 | } |
1262 | old_p->post_handler = NULL; | 1484 | ap->post_handler = NULL; |
1263 | } | 1485 | } |
1264 | noclean: | 1486 | noclean: |
1487 | /* | ||
1488 | * Remove from the aggrprobe: this path will do nothing in | ||
1489 | * __unregister_kprobe_bottom(). | ||
1490 | */ | ||
1265 | list_del_rcu(&p->list); | 1491 | list_del_rcu(&p->list); |
1266 | if (!kprobe_disabled(old_p)) { | 1492 | if (!kprobe_disabled(ap) && !kprobes_all_disarmed) |
1267 | try_to_disable_aggr_kprobe(old_p); | 1493 | /* |
1268 | if (!kprobes_all_disarmed) { | 1494 | * Try to optimize this probe again, because post |
1269 | if (kprobe_disabled(old_p)) | 1495 | * handler may have been changed. |
1270 | disarm_kprobe(old_p); | 1496 | */ |
1271 | else | 1497 | optimize_kprobe(ap); |
1272 | /* Try to optimize this probe again */ | ||
1273 | optimize_kprobe(old_p); | ||
1274 | } | ||
1275 | } | ||
1276 | } | 1498 | } |
1277 | return 0; | 1499 | return 0; |
1500 | |||
1501 | disarmed: | ||
1502 | BUG_ON(!kprobe_disarmed(ap)); | ||
1503 | hlist_del_rcu(&ap->hlist); | ||
1504 | return 0; | ||
1278 | } | 1505 | } |
1279 | 1506 | ||
1280 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 1507 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
1281 | { | 1508 | { |
1282 | struct kprobe *old_p; | 1509 | struct kprobe *ap; |
1283 | 1510 | ||
1284 | if (list_empty(&p->list)) | 1511 | if (list_empty(&p->list)) |
1512 | /* This is an independent kprobe */ | ||
1285 | arch_remove_kprobe(p); | 1513 | arch_remove_kprobe(p); |
1286 | else if (list_is_singular(&p->list)) { | 1514 | else if (list_is_singular(&p->list)) { |
1287 | /* "p" is the last child of an aggr_kprobe */ | 1515 | /* This is the last child of an aggrprobe */ |
1288 | old_p = list_entry(p->list.next, struct kprobe, list); | 1516 | ap = list_entry(p->list.next, struct kprobe, list); |
1289 | list_del(&p->list); | 1517 | list_del(&p->list); |
1290 | arch_remove_kprobe(old_p); | 1518 | free_aggr_kprobe(ap); |
1291 | free_aggr_kprobe(old_p); | ||
1292 | } | 1519 | } |
1520 | /* Otherwise, do nothing. */ | ||
1293 | } | 1521 | } |
1294 | 1522 | ||
1295 | int __kprobes register_kprobes(struct kprobe **kps, int num) | 1523 | int __kprobes register_kprobes(struct kprobe **kps, int num) |
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1607 | int __kprobes disable_kprobe(struct kprobe *kp) | 1835 | int __kprobes disable_kprobe(struct kprobe *kp) |
1608 | { | 1836 | { |
1609 | int ret = 0; | 1837 | int ret = 0; |
1610 | struct kprobe *p; | ||
1611 | 1838 | ||
1612 | mutex_lock(&kprobe_mutex); | 1839 | mutex_lock(&kprobe_mutex); |
1613 | 1840 | ||
1614 | /* Check whether specified probe is valid. */ | 1841 | /* Disable this kprobe */ |
1615 | p = __get_valid_kprobe(kp); | 1842 | if (__disable_kprobe(kp) == NULL) |
1616 | if (unlikely(p == NULL)) { | ||
1617 | ret = -EINVAL; | 1843 | ret = -EINVAL; |
1618 | goto out; | ||
1619 | } | ||
1620 | 1844 | ||
1621 | /* If the probe is already disabled (or gone), just return */ | ||
1622 | if (kprobe_disabled(kp)) | ||
1623 | goto out; | ||
1624 | |||
1625 | kp->flags |= KPROBE_FLAG_DISABLED; | ||
1626 | if (p != kp) | ||
1627 | /* When kp != p, p is always enabled. */ | ||
1628 | try_to_disable_aggr_kprobe(p); | ||
1629 | |||
1630 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
1631 | disarm_kprobe(p); | ||
1632 | out: | ||
1633 | mutex_unlock(&kprobe_mutex); | 1845 | mutex_unlock(&kprobe_mutex); |
1634 | return ret; | 1846 | return ret; |
1635 | } | 1847 | } |
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) | |||
1927 | mutex_lock(&kprobe_mutex); | 2139 | mutex_lock(&kprobe_mutex); |
1928 | 2140 | ||
1929 | /* If kprobes are already disarmed, just return */ | 2141 | /* If kprobes are already disarmed, just return */ |
1930 | if (kprobes_all_disarmed) | 2142 | if (kprobes_all_disarmed) { |
1931 | goto already_disabled; | 2143 | mutex_unlock(&kprobe_mutex); |
2144 | return; | ||
2145 | } | ||
1932 | 2146 | ||
1933 | kprobes_all_disarmed = true; | 2147 | kprobes_all_disarmed = true; |
1934 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2148 | printk(KERN_INFO "Kprobes globally disabled\n"); |
1935 | 2149 | ||
1936 | /* | ||
1937 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
1938 | * because disarming may also unoptimize kprobes. | ||
1939 | */ | ||
1940 | get_online_cpus(); | ||
1941 | mutex_lock(&text_mutex); | 2150 | mutex_lock(&text_mutex); |
1942 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2151 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1943 | head = &kprobe_table[i]; | 2152 | head = &kprobe_table[i]; |
1944 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2153 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1945 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2154 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
1946 | __disarm_kprobe(p); | 2155 | __disarm_kprobe(p, false); |
1947 | } | 2156 | } |
1948 | } | 2157 | } |
1949 | |||
1950 | mutex_unlock(&text_mutex); | 2158 | mutex_unlock(&text_mutex); |
1951 | put_online_cpus(); | ||
1952 | mutex_unlock(&kprobe_mutex); | 2159 | mutex_unlock(&kprobe_mutex); |
1953 | /* Allow all currently running kprobes to complete */ | ||
1954 | synchronize_sched(); | ||
1955 | return; | ||
1956 | 2160 | ||
1957 | already_disabled: | 2161 | /* Wait for disarming all kprobes by optimizer */ |
1958 | mutex_unlock(&kprobe_mutex); | 2162 | wait_for_kprobe_optimizer(); |
1959 | return; | ||
1960 | } | 2163 | } |
1961 | 2164 | ||
1962 | /* | 2165 | /* |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d..c55afba990a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
148 | wait_for_completion(&create.done); | 148 | wait_for_completion(&create.done); |
149 | 149 | ||
150 | if (!IS_ERR(create.result)) { | 150 | if (!IS_ERR(create.result)) { |
151 | struct sched_param param = { .sched_priority = 0 }; | 151 | static const struct sched_param param = { .sched_priority = 0 }; |
152 | va_list args; | 152 | va_list args; |
153 | 153 | ||
154 | va_start(args, namefmt); | 154 | va_start(args, namefmt); |
@@ -265,6 +265,17 @@ int kthreadd(void *unused) | |||
265 | return 0; | 265 | return 0; |
266 | } | 266 | } |
267 | 267 | ||
268 | void __init_kthread_worker(struct kthread_worker *worker, | ||
269 | const char *name, | ||
270 | struct lock_class_key *key) | ||
271 | { | ||
272 | spin_lock_init(&worker->lock); | ||
273 | lockdep_set_class_and_name(&worker->lock, key, name); | ||
274 | INIT_LIST_HEAD(&worker->work_list); | ||
275 | worker->task = NULL; | ||
276 | } | ||
277 | EXPORT_SYMBOL_GPL(__init_kthread_worker); | ||
278 | |||
268 | /** | 279 | /** |
269 | * kthread_worker_fn - kthread function to process kthread_worker | 280 | * kthread_worker_fn - kthread function to process kthread_worker |
270 | * @worker_ptr: pointer to initialized kthread_worker | 281 | * @worker_ptr: pointer to initialized kthread_worker |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d..1969d2fc4b3 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
494 | namelen += 2; | 494 | namelen += 2; |
495 | 495 | ||
496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
497 | char sym[KSYM_SYMBOL_LEN]; | ||
498 | char ip[32]; | 497 | char ip[32]; |
499 | 498 | ||
500 | if (class->contention_point[i] == 0) | 499 | if (class->contention_point[i] == 0) |
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
503 | if (!i) | 502 | if (!i) |
504 | seq_line(m, '-', 40-namelen, namelen); | 503 | seq_line(m, '-', 40-namelen, namelen); |
505 | 504 | ||
506 | sprint_symbol(sym, class->contention_point[i]); | ||
507 | snprintf(ip, sizeof(ip), "[<%p>]", | 505 | snprintf(ip, sizeof(ip), "[<%p>]", |
508 | (void *)class->contention_point[i]); | 506 | (void *)class->contention_point[i]); |
509 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 507 | seq_printf(m, "%40s %14lu %29s %pS\n", |
510 | stats->contention_point[i], | 508 | name, stats->contention_point[i], |
511 | ip, sym); | 509 | ip, (void *)class->contention_point[i]); |
512 | } | 510 | } |
513 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 511 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
514 | char sym[KSYM_SYMBOL_LEN]; | ||
515 | char ip[32]; | 512 | char ip[32]; |
516 | 513 | ||
517 | if (class->contending_point[i] == 0) | 514 | if (class->contending_point[i] == 0) |
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
520 | if (!i) | 517 | if (!i) |
521 | seq_line(m, '-', 40-namelen, namelen); | 518 | seq_line(m, '-', 40-namelen, namelen); |
522 | 519 | ||
523 | sprint_symbol(sym, class->contending_point[i]); | ||
524 | snprintf(ip, sizeof(ip), "[<%p>]", | 520 | snprintf(ip, sizeof(ip), "[<%p>]", |
525 | (void *)class->contending_point[i]); | 521 | (void *)class->contending_point[i]); |
526 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 522 | seq_printf(m, "%40s %14lu %29s %pS\n", |
527 | stats->contending_point[i], | 523 | name, stats->contending_point[i], |
528 | ip, sym); | 524 | ip, (void *)class->contending_point[i]); |
529 | } | 525 | } |
530 | if (i) { | 526 | if (i) { |
531 | seq_puts(m, "\n"); | 527 | seq_puts(m, "\n"); |
diff --git a/kernel/module.c b/kernel/module.c index 437a74a7524..34e00b708fa 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | 58 | #include <linux/jump_label.h> |
59 | #include <linux/pfn.h> | ||
59 | 60 | ||
60 | #define CREATE_TRACE_POINTS | 61 | #define CREATE_TRACE_POINTS |
61 | #include <trace/events/module.h> | 62 | #include <trace/events/module.h> |
@@ -70,6 +71,26 @@ | |||
70 | #define ARCH_SHF_SMALL 0 | 71 | #define ARCH_SHF_SMALL 0 |
71 | #endif | 72 | #endif |
72 | 73 | ||
74 | /* | ||
75 | * Modules' sections will be aligned on page boundaries | ||
76 | * to ensure complete separation of code and data, but | ||
77 | * only when CONFIG_DEBUG_SET_MODULE_RONX=y | ||
78 | */ | ||
79 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
80 | # define debug_align(X) ALIGN(X, PAGE_SIZE) | ||
81 | #else | ||
82 | # define debug_align(X) (X) | ||
83 | #endif | ||
84 | |||
85 | /* | ||
86 | * Given BASE and SIZE this macro calculates the number of pages the | ||
87 | * memory regions occupies | ||
88 | */ | ||
89 | #define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ | ||
90 | (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ | ||
91 | PFN_DOWN((unsigned long)BASE) + 1) \ | ||
92 | : (0UL)) | ||
93 | |||
73 | /* If this is set, the section belongs in the init part of the module */ | 94 | /* If this is set, the section belongs in the init part of the module */ |
74 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) | 95 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) |
75 | 96 | ||
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) | |||
1542 | return 0; | 1563 | return 0; |
1543 | } | 1564 | } |
1544 | 1565 | ||
1566 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
1567 | /* | ||
1568 | * LKM RO/NX protection: protect module's text/ro-data | ||
1569 | * from modification and any data from execution. | ||
1570 | */ | ||
1571 | void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) | ||
1572 | { | ||
1573 | unsigned long begin_pfn = PFN_DOWN((unsigned long)start); | ||
1574 | unsigned long end_pfn = PFN_DOWN((unsigned long)end); | ||
1575 | |||
1576 | if (end_pfn > begin_pfn) | ||
1577 | set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
1578 | } | ||
1579 | |||
1580 | static void set_section_ro_nx(void *base, | ||
1581 | unsigned long text_size, | ||
1582 | unsigned long ro_size, | ||
1583 | unsigned long total_size) | ||
1584 | { | ||
1585 | /* begin and end PFNs of the current subsection */ | ||
1586 | unsigned long begin_pfn; | ||
1587 | unsigned long end_pfn; | ||
1588 | |||
1589 | /* | ||
1590 | * Set RO for module text and RO-data: | ||
1591 | * - Always protect first page. | ||
1592 | * - Do not protect last partial page. | ||
1593 | */ | ||
1594 | if (ro_size > 0) | ||
1595 | set_page_attributes(base, base + ro_size, set_memory_ro); | ||
1596 | |||
1597 | /* | ||
1598 | * Set NX permissions for module data: | ||
1599 | * - Do not protect first partial page. | ||
1600 | * - Always protect last page. | ||
1601 | */ | ||
1602 | if (total_size > text_size) { | ||
1603 | begin_pfn = PFN_UP((unsigned long)base + text_size); | ||
1604 | end_pfn = PFN_UP((unsigned long)base + total_size); | ||
1605 | if (end_pfn > begin_pfn) | ||
1606 | set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
1607 | } | ||
1608 | } | ||
1609 | |||
1610 | /* Setting memory back to RW+NX before releasing it */ | ||
1611 | void unset_section_ro_nx(struct module *mod, void *module_region) | ||
1612 | { | ||
1613 | unsigned long total_pages; | ||
1614 | |||
1615 | if (mod->module_core == module_region) { | ||
1616 | /* Set core as NX+RW */ | ||
1617 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); | ||
1618 | set_memory_nx((unsigned long)mod->module_core, total_pages); | ||
1619 | set_memory_rw((unsigned long)mod->module_core, total_pages); | ||
1620 | |||
1621 | } else if (mod->module_init == module_region) { | ||
1622 | /* Set init as NX+RW */ | ||
1623 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); | ||
1624 | set_memory_nx((unsigned long)mod->module_init, total_pages); | ||
1625 | set_memory_rw((unsigned long)mod->module_init, total_pages); | ||
1626 | } | ||
1627 | } | ||
1628 | |||
1629 | /* Iterate through all modules and set each module's text as RW */ | ||
1630 | void set_all_modules_text_rw() | ||
1631 | { | ||
1632 | struct module *mod; | ||
1633 | |||
1634 | mutex_lock(&module_mutex); | ||
1635 | list_for_each_entry_rcu(mod, &modules, list) { | ||
1636 | if ((mod->module_core) && (mod->core_text_size)) { | ||
1637 | set_page_attributes(mod->module_core, | ||
1638 | mod->module_core + mod->core_text_size, | ||
1639 | set_memory_rw); | ||
1640 | } | ||
1641 | if ((mod->module_init) && (mod->init_text_size)) { | ||
1642 | set_page_attributes(mod->module_init, | ||
1643 | mod->module_init + mod->init_text_size, | ||
1644 | set_memory_rw); | ||
1645 | } | ||
1646 | } | ||
1647 | mutex_unlock(&module_mutex); | ||
1648 | } | ||
1649 | |||
1650 | /* Iterate through all modules and set each module's text as RO */ | ||
1651 | void set_all_modules_text_ro() | ||
1652 | { | ||
1653 | struct module *mod; | ||
1654 | |||
1655 | mutex_lock(&module_mutex); | ||
1656 | list_for_each_entry_rcu(mod, &modules, list) { | ||
1657 | if ((mod->module_core) && (mod->core_text_size)) { | ||
1658 | set_page_attributes(mod->module_core, | ||
1659 | mod->module_core + mod->core_text_size, | ||
1660 | set_memory_ro); | ||
1661 | } | ||
1662 | if ((mod->module_init) && (mod->init_text_size)) { | ||
1663 | set_page_attributes(mod->module_init, | ||
1664 | mod->module_init + mod->init_text_size, | ||
1665 | set_memory_ro); | ||
1666 | } | ||
1667 | } | ||
1668 | mutex_unlock(&module_mutex); | ||
1669 | } | ||
1670 | #else | ||
1671 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | ||
1672 | static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } | ||
1673 | #endif | ||
1674 | |||
1545 | /* Free a module, remove from lists, etc. */ | 1675 | /* Free a module, remove from lists, etc. */ |
1546 | static void free_module(struct module *mod) | 1676 | static void free_module(struct module *mod) |
1547 | { | 1677 | { |
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) | |||
1566 | destroy_params(mod->kp, mod->num_kp); | 1696 | destroy_params(mod->kp, mod->num_kp); |
1567 | 1697 | ||
1568 | /* This may be NULL, but that's OK */ | 1698 | /* This may be NULL, but that's OK */ |
1699 | unset_section_ro_nx(mod, mod->module_init); | ||
1569 | module_free(mod, mod->module_init); | 1700 | module_free(mod, mod->module_init); |
1570 | kfree(mod->args); | 1701 | kfree(mod->args); |
1571 | percpu_modfree(mod); | 1702 | percpu_modfree(mod); |
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) | |||
1574 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1705 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1575 | 1706 | ||
1576 | /* Finally, free the core (containing the module structure) */ | 1707 | /* Finally, free the core (containing the module structure) */ |
1708 | unset_section_ro_nx(mod, mod->module_core); | ||
1577 | module_free(mod, mod->module_core); | 1709 | module_free(mod, mod->module_core); |
1578 | 1710 | ||
1579 | #ifdef CONFIG_MPU | 1711 | #ifdef CONFIG_MPU |
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1777 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 1909 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1778 | DEBUGP("\t%s\n", name); | 1910 | DEBUGP("\t%s\n", name); |
1779 | } | 1911 | } |
1780 | if (m == 0) | 1912 | switch (m) { |
1913 | case 0: /* executable */ | ||
1914 | mod->core_size = debug_align(mod->core_size); | ||
1781 | mod->core_text_size = mod->core_size; | 1915 | mod->core_text_size = mod->core_size; |
1916 | break; | ||
1917 | case 1: /* RO: text and ro-data */ | ||
1918 | mod->core_size = debug_align(mod->core_size); | ||
1919 | mod->core_ro_size = mod->core_size; | ||
1920 | break; | ||
1921 | case 3: /* whole core */ | ||
1922 | mod->core_size = debug_align(mod->core_size); | ||
1923 | break; | ||
1924 | } | ||
1782 | } | 1925 | } |
1783 | 1926 | ||
1784 | DEBUGP("Init section allocation order:\n"); | 1927 | DEBUGP("Init section allocation order:\n"); |
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1796 | | INIT_OFFSET_MASK); | 1939 | | INIT_OFFSET_MASK); |
1797 | DEBUGP("\t%s\n", sname); | 1940 | DEBUGP("\t%s\n", sname); |
1798 | } | 1941 | } |
1799 | if (m == 0) | 1942 | switch (m) { |
1943 | case 0: /* executable */ | ||
1944 | mod->init_size = debug_align(mod->init_size); | ||
1800 | mod->init_text_size = mod->init_size; | 1945 | mod->init_text_size = mod->init_size; |
1946 | break; | ||
1947 | case 1: /* RO: text and ro-data */ | ||
1948 | mod->init_size = debug_align(mod->init_size); | ||
1949 | mod->init_ro_size = mod->init_size; | ||
1950 | break; | ||
1951 | case 3: /* whole init */ | ||
1952 | mod->init_size = debug_align(mod->init_size); | ||
1953 | break; | ||
1954 | } | ||
1801 | } | 1955 | } |
1802 | } | 1956 | } |
1803 | 1957 | ||
@@ -2326,6 +2480,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2326 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | 2480 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * |
2327 | mod->num_trace_events, GFP_KERNEL); | 2481 | mod->num_trace_events, GFP_KERNEL); |
2328 | #endif | 2482 | #endif |
2483 | #ifdef CONFIG_TRACING | ||
2484 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | ||
2485 | sizeof(*mod->trace_bprintk_fmt_start), | ||
2486 | &mod->num_trace_bprintk_fmt); | ||
2487 | /* | ||
2488 | * This section contains pointers to allocated objects in the trace | ||
2489 | * code and not scanning it leads to false positives. | ||
2490 | */ | ||
2491 | kmemleak_scan_area(mod->trace_bprintk_fmt_start, | ||
2492 | sizeof(*mod->trace_bprintk_fmt_start) * | ||
2493 | mod->num_trace_bprintk_fmt, GFP_KERNEL); | ||
2494 | #endif | ||
2329 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | 2495 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD |
2330 | /* sechdrs[0].sh_size is always zero */ | 2496 | /* sechdrs[0].sh_size is always zero */ |
2331 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", | 2497 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", |
@@ -2710,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2710 | blocking_notifier_call_chain(&module_notify_list, | 2876 | blocking_notifier_call_chain(&module_notify_list, |
2711 | MODULE_STATE_COMING, mod); | 2877 | MODULE_STATE_COMING, mod); |
2712 | 2878 | ||
2879 | /* Set RO and NX regions for core */ | ||
2880 | set_section_ro_nx(mod->module_core, | ||
2881 | mod->core_text_size, | ||
2882 | mod->core_ro_size, | ||
2883 | mod->core_size); | ||
2884 | |||
2885 | /* Set RO and NX regions for init */ | ||
2886 | set_section_ro_nx(mod->module_init, | ||
2887 | mod->init_text_size, | ||
2888 | mod->init_ro_size, | ||
2889 | mod->init_size); | ||
2890 | |||
2713 | do_mod_ctors(mod); | 2891 | do_mod_ctors(mod); |
2714 | /* Start the module */ | 2892 | /* Start the module */ |
2715 | if (mod->init != NULL) | 2893 | if (mod->init != NULL) |
@@ -2753,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2753 | mod->symtab = mod->core_symtab; | 2931 | mod->symtab = mod->core_symtab; |
2754 | mod->strtab = mod->core_strtab; | 2932 | mod->strtab = mod->core_strtab; |
2755 | #endif | 2933 | #endif |
2934 | unset_section_ro_nx(mod, mod->module_init); | ||
2756 | module_free(mod, mod->module_init); | 2935 | module_free(mod, mod->module_init); |
2757 | mod->module_init = NULL; | 2936 | mod->module_init = NULL; |
2758 | mod->init_size = 0; | 2937 | mod->init_size = 0; |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502..a5889fb28ec 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
199 | * memory barriers as we'll eventually observe the right | 199 | * memory barriers as we'll eventually observe the right |
200 | * values at the cost of a few extra spins. | 200 | * values at the cost of a few extra spins. |
201 | */ | 201 | */ |
202 | cpu_relax(); | 202 | arch_mutex_cpu_relax(); |
203 | } | 203 | } |
204 | #endif | 204 | #endif |
205 | spin_lock_mutex(&lock->wait_lock, flags); | 205 | spin_lock_mutex(&lock->wait_lock, flags); |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index cb6c0d2af68..11847bf1e8c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/smp.h> | 15 | #include <linux/smp.h> |
16 | #include <linux/idr.h> | ||
16 | #include <linux/file.h> | 17 | #include <linux/file.h> |
17 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
@@ -21,7 +22,9 @@ | |||
21 | #include <linux/dcache.h> | 22 | #include <linux/dcache.h> |
22 | #include <linux/percpu.h> | 23 | #include <linux/percpu.h> |
23 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
25 | #include <linux/reboot.h> | ||
24 | #include <linux/vmstat.h> | 26 | #include <linux/vmstat.h> |
27 | #include <linux/device.h> | ||
25 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
26 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
27 | #include <linux/rculist.h> | 30 | #include <linux/rculist.h> |
@@ -31,6 +34,7 @@ | |||
31 | #include <linux/kernel_stat.h> | 34 | #include <linux/kernel_stat.h> |
32 | #include <linux/perf_event.h> | 35 | #include <linux/perf_event.h> |
33 | #include <linux/ftrace_event.h> | 36 | #include <linux/ftrace_event.h> |
37 | #include <linux/hw_breakpoint.h> | ||
34 | 38 | ||
35 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
36 | 40 | ||
@@ -132,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
132 | } | 136 | } |
133 | } | 137 | } |
134 | 138 | ||
139 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
140 | { | ||
141 | /* | ||
142 | * only top level events have the pid namespace they were created in | ||
143 | */ | ||
144 | if (event->parent) | ||
145 | event = event->parent; | ||
146 | |||
147 | return task_tgid_nr_ns(p, event->ns); | ||
148 | } | ||
149 | |||
150 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
151 | { | ||
152 | /* | ||
153 | * only top level events have the pid namespace they were created in | ||
154 | */ | ||
155 | if (event->parent) | ||
156 | event = event->parent; | ||
157 | |||
158 | return task_pid_nr_ns(p, event->ns); | ||
159 | } | ||
160 | |||
135 | /* | 161 | /* |
136 | * If we inherit events we want to return the parent event id | 162 | * If we inherit events we want to return the parent event id |
137 | * to userspace. | 163 | * to userspace. |
@@ -311,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
311 | ctx->nr_stat++; | 337 | ctx->nr_stat++; |
312 | } | 338 | } |
313 | 339 | ||
340 | /* | ||
341 | * Called at perf_event creation and when events are attached/detached from a | ||
342 | * group. | ||
343 | */ | ||
344 | static void perf_event__read_size(struct perf_event *event) | ||
345 | { | ||
346 | int entry = sizeof(u64); /* value */ | ||
347 | int size = 0; | ||
348 | int nr = 1; | ||
349 | |||
350 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
351 | size += sizeof(u64); | ||
352 | |||
353 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
354 | size += sizeof(u64); | ||
355 | |||
356 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
357 | entry += sizeof(u64); | ||
358 | |||
359 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
360 | nr += event->group_leader->nr_siblings; | ||
361 | size += sizeof(u64); | ||
362 | } | ||
363 | |||
364 | size += entry * nr; | ||
365 | event->read_size = size; | ||
366 | } | ||
367 | |||
368 | static void perf_event__header_size(struct perf_event *event) | ||
369 | { | ||
370 | struct perf_sample_data *data; | ||
371 | u64 sample_type = event->attr.sample_type; | ||
372 | u16 size = 0; | ||
373 | |||
374 | perf_event__read_size(event); | ||
375 | |||
376 | if (sample_type & PERF_SAMPLE_IP) | ||
377 | size += sizeof(data->ip); | ||
378 | |||
379 | if (sample_type & PERF_SAMPLE_ADDR) | ||
380 | size += sizeof(data->addr); | ||
381 | |||
382 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
383 | size += sizeof(data->period); | ||
384 | |||
385 | if (sample_type & PERF_SAMPLE_READ) | ||
386 | size += event->read_size; | ||
387 | |||
388 | event->header_size = size; | ||
389 | } | ||
390 | |||
391 | static void perf_event__id_header_size(struct perf_event *event) | ||
392 | { | ||
393 | struct perf_sample_data *data; | ||
394 | u64 sample_type = event->attr.sample_type; | ||
395 | u16 size = 0; | ||
396 | |||
397 | if (sample_type & PERF_SAMPLE_TID) | ||
398 | size += sizeof(data->tid_entry); | ||
399 | |||
400 | if (sample_type & PERF_SAMPLE_TIME) | ||
401 | size += sizeof(data->time); | ||
402 | |||
403 | if (sample_type & PERF_SAMPLE_ID) | ||
404 | size += sizeof(data->id); | ||
405 | |||
406 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
407 | size += sizeof(data->stream_id); | ||
408 | |||
409 | if (sample_type & PERF_SAMPLE_CPU) | ||
410 | size += sizeof(data->cpu_entry); | ||
411 | |||
412 | event->id_header_size = size; | ||
413 | } | ||
414 | |||
314 | static void perf_group_attach(struct perf_event *event) | 415 | static void perf_group_attach(struct perf_event *event) |
315 | { | 416 | { |
316 | struct perf_event *group_leader = event->group_leader; | 417 | struct perf_event *group_leader = event->group_leader, *pos; |
317 | 418 | ||
318 | /* | 419 | /* |
319 | * We can have double attach due to group movement in perf_event_open. | 420 | * We can have double attach due to group movement in perf_event_open. |
@@ -332,6 +433,11 @@ static void perf_group_attach(struct perf_event *event) | |||
332 | 433 | ||
333 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 434 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
334 | group_leader->nr_siblings++; | 435 | group_leader->nr_siblings++; |
436 | |||
437 | perf_event__header_size(group_leader); | ||
438 | |||
439 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | ||
440 | perf_event__header_size(pos); | ||
335 | } | 441 | } |
336 | 442 | ||
337 | /* | 443 | /* |
@@ -390,7 +496,7 @@ static void perf_group_detach(struct perf_event *event) | |||
390 | if (event->group_leader != event) { | 496 | if (event->group_leader != event) { |
391 | list_del_init(&event->group_entry); | 497 | list_del_init(&event->group_entry); |
392 | event->group_leader->nr_siblings--; | 498 | event->group_leader->nr_siblings--; |
393 | return; | 499 | goto out; |
394 | } | 500 | } |
395 | 501 | ||
396 | if (!list_empty(&event->group_entry)) | 502 | if (!list_empty(&event->group_entry)) |
@@ -409,6 +515,12 @@ static void perf_group_detach(struct perf_event *event) | |||
409 | /* Inherit group flags from the previous leader */ | 515 | /* Inherit group flags from the previous leader */ |
410 | sibling->group_flags = event->group_flags; | 516 | sibling->group_flags = event->group_flags; |
411 | } | 517 | } |
518 | |||
519 | out: | ||
520 | perf_event__header_size(event->group_leader); | ||
521 | |||
522 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | ||
523 | perf_event__header_size(tmp); | ||
412 | } | 524 | } |
413 | 525 | ||
414 | static inline int | 526 | static inline int |
@@ -1072,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1072 | /* | 1184 | /* |
1073 | * not supported on inherited events | 1185 | * not supported on inherited events |
1074 | */ | 1186 | */ |
1075 | if (event->attr.inherit) | 1187 | if (event->attr.inherit || !is_sampling_event(event)) |
1076 | return -EINVAL; | 1188 | return -EINVAL; |
1077 | 1189 | ||
1078 | atomic_add(refresh, &event->event_limit); | 1190 | atomic_add(refresh, &event->event_limit); |
@@ -1286,8 +1398,6 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
1286 | { | 1398 | { |
1287 | int ctxn; | 1399 | int ctxn; |
1288 | 1400 | ||
1289 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
1290 | |||
1291 | for_each_task_context_nr(ctxn) | 1401 | for_each_task_context_nr(ctxn) |
1292 | perf_event_context_sched_out(task, ctxn, next); | 1402 | perf_event_context_sched_out(task, ctxn, next); |
1293 | } | 1403 | } |
@@ -1621,8 +1731,12 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1621 | { | 1731 | { |
1622 | raw_spin_lock(&ctx->lock); | 1732 | raw_spin_lock(&ctx->lock); |
1623 | 1733 | ||
1624 | /* Rotate the first entry last of non-pinned groups */ | 1734 | /* |
1625 | list_rotate_left(&ctx->flexible_groups); | 1735 | * Rotate the first entry last of non-pinned groups. Rotation might be |
1736 | * disabled by the inheritance code. | ||
1737 | */ | ||
1738 | if (!ctx->rotate_disable) | ||
1739 | list_rotate_left(&ctx->flexible_groups); | ||
1626 | 1740 | ||
1627 | raw_spin_unlock(&ctx->lock); | 1741 | raw_spin_unlock(&ctx->lock); |
1628 | } | 1742 | } |
@@ -2234,11 +2348,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
2234 | raw_spin_unlock_irq(&ctx->lock); | 2348 | raw_spin_unlock_irq(&ctx->lock); |
2235 | mutex_unlock(&ctx->mutex); | 2349 | mutex_unlock(&ctx->mutex); |
2236 | 2350 | ||
2237 | mutex_lock(&event->owner->perf_event_mutex); | ||
2238 | list_del_init(&event->owner_entry); | ||
2239 | mutex_unlock(&event->owner->perf_event_mutex); | ||
2240 | put_task_struct(event->owner); | ||
2241 | |||
2242 | free_event(event); | 2351 | free_event(event); |
2243 | 2352 | ||
2244 | return 0; | 2353 | return 0; |
@@ -2251,35 +2360,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
2251 | static int perf_release(struct inode *inode, struct file *file) | 2360 | static int perf_release(struct inode *inode, struct file *file) |
2252 | { | 2361 | { |
2253 | struct perf_event *event = file->private_data; | 2362 | struct perf_event *event = file->private_data; |
2363 | struct task_struct *owner; | ||
2254 | 2364 | ||
2255 | file->private_data = NULL; | 2365 | file->private_data = NULL; |
2256 | 2366 | ||
2257 | return perf_event_release_kernel(event); | 2367 | rcu_read_lock(); |
2258 | } | 2368 | owner = ACCESS_ONCE(event->owner); |
2259 | 2369 | /* | |
2260 | static int perf_event_read_size(struct perf_event *event) | 2370 | * Matches the smp_wmb() in perf_event_exit_task(). If we observe |
2261 | { | 2371 | * !owner it means the list deletion is complete and we can indeed |
2262 | int entry = sizeof(u64); /* value */ | 2372 | * free this event, otherwise we need to serialize on |
2263 | int size = 0; | 2373 | * owner->perf_event_mutex. |
2264 | int nr = 1; | 2374 | */ |
2265 | 2375 | smp_read_barrier_depends(); | |
2266 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 2376 | if (owner) { |
2267 | size += sizeof(u64); | 2377 | /* |
2268 | 2378 | * Since delayed_put_task_struct() also drops the last | |
2269 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 2379 | * task reference we can safely take a new reference |
2270 | size += sizeof(u64); | 2380 | * while holding the rcu_read_lock(). |
2271 | 2381 | */ | |
2272 | if (event->attr.read_format & PERF_FORMAT_ID) | 2382 | get_task_struct(owner); |
2273 | entry += sizeof(u64); | ||
2274 | |||
2275 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
2276 | nr += event->group_leader->nr_siblings; | ||
2277 | size += sizeof(u64); | ||
2278 | } | 2383 | } |
2384 | rcu_read_unlock(); | ||
2279 | 2385 | ||
2280 | size += entry * nr; | 2386 | if (owner) { |
2387 | mutex_lock(&owner->perf_event_mutex); | ||
2388 | /* | ||
2389 | * We have to re-check the event->owner field, if it is cleared | ||
2390 | * we raced with perf_event_exit_task(), acquiring the mutex | ||
2391 | * ensured they're done, and we can proceed with freeing the | ||
2392 | * event. | ||
2393 | */ | ||
2394 | if (event->owner) | ||
2395 | list_del_init(&event->owner_entry); | ||
2396 | mutex_unlock(&owner->perf_event_mutex); | ||
2397 | put_task_struct(owner); | ||
2398 | } | ||
2281 | 2399 | ||
2282 | return size; | 2400 | return perf_event_release_kernel(event); |
2283 | } | 2401 | } |
2284 | 2402 | ||
2285 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 2403 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
@@ -2396,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | |||
2396 | if (event->state == PERF_EVENT_STATE_ERROR) | 2514 | if (event->state == PERF_EVENT_STATE_ERROR) |
2397 | return 0; | 2515 | return 0; |
2398 | 2516 | ||
2399 | if (count < perf_event_read_size(event)) | 2517 | if (count < event->read_size) |
2400 | return -ENOSPC; | 2518 | return -ENOSPC; |
2401 | 2519 | ||
2402 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2520 | WARN_ON_ONCE(event->ctx->parent_ctx); |
@@ -2482,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
2482 | int ret = 0; | 2600 | int ret = 0; |
2483 | u64 value; | 2601 | u64 value; |
2484 | 2602 | ||
2485 | if (!event->attr.sample_period) | 2603 | if (!is_sampling_event(event)) |
2486 | return -EINVAL; | 2604 | return -EINVAL; |
2487 | 2605 | ||
2488 | if (copy_from_user(&value, arg, sizeof(value))) | 2606 | if (copy_from_user(&value, arg, sizeof(value))) |
@@ -3273,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
3273 | } while (len); | 3391 | } while (len); |
3274 | } | 3392 | } |
3275 | 3393 | ||
3394 | static void __perf_event_header__init_id(struct perf_event_header *header, | ||
3395 | struct perf_sample_data *data, | ||
3396 | struct perf_event *event) | ||
3397 | { | ||
3398 | u64 sample_type = event->attr.sample_type; | ||
3399 | |||
3400 | data->type = sample_type; | ||
3401 | header->size += event->id_header_size; | ||
3402 | |||
3403 | if (sample_type & PERF_SAMPLE_TID) { | ||
3404 | /* namespace issues */ | ||
3405 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3406 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3407 | } | ||
3408 | |||
3409 | if (sample_type & PERF_SAMPLE_TIME) | ||
3410 | data->time = perf_clock(); | ||
3411 | |||
3412 | if (sample_type & PERF_SAMPLE_ID) | ||
3413 | data->id = primary_event_id(event); | ||
3414 | |||
3415 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
3416 | data->stream_id = event->id; | ||
3417 | |||
3418 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3419 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3420 | data->cpu_entry.reserved = 0; | ||
3421 | } | ||
3422 | } | ||
3423 | |||
3424 | static void perf_event_header__init_id(struct perf_event_header *header, | ||
3425 | struct perf_sample_data *data, | ||
3426 | struct perf_event *event) | ||
3427 | { | ||
3428 | if (event->attr.sample_id_all) | ||
3429 | __perf_event_header__init_id(header, data, event); | ||
3430 | } | ||
3431 | |||
3432 | static void __perf_event__output_id_sample(struct perf_output_handle *handle, | ||
3433 | struct perf_sample_data *data) | ||
3434 | { | ||
3435 | u64 sample_type = data->type; | ||
3436 | |||
3437 | if (sample_type & PERF_SAMPLE_TID) | ||
3438 | perf_output_put(handle, data->tid_entry); | ||
3439 | |||
3440 | if (sample_type & PERF_SAMPLE_TIME) | ||
3441 | perf_output_put(handle, data->time); | ||
3442 | |||
3443 | if (sample_type & PERF_SAMPLE_ID) | ||
3444 | perf_output_put(handle, data->id); | ||
3445 | |||
3446 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
3447 | perf_output_put(handle, data->stream_id); | ||
3448 | |||
3449 | if (sample_type & PERF_SAMPLE_CPU) | ||
3450 | perf_output_put(handle, data->cpu_entry); | ||
3451 | } | ||
3452 | |||
3453 | static void perf_event__output_id_sample(struct perf_event *event, | ||
3454 | struct perf_output_handle *handle, | ||
3455 | struct perf_sample_data *sample) | ||
3456 | { | ||
3457 | if (event->attr.sample_id_all) | ||
3458 | __perf_event__output_id_sample(handle, sample); | ||
3459 | } | ||
3460 | |||
3276 | int perf_output_begin(struct perf_output_handle *handle, | 3461 | int perf_output_begin(struct perf_output_handle *handle, |
3277 | struct perf_event *event, unsigned int size, | 3462 | struct perf_event *event, unsigned int size, |
3278 | int nmi, int sample) | 3463 | int nmi, int sample) |
@@ -3280,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3280 | struct perf_buffer *buffer; | 3465 | struct perf_buffer *buffer; |
3281 | unsigned long tail, offset, head; | 3466 | unsigned long tail, offset, head; |
3282 | int have_lost; | 3467 | int have_lost; |
3468 | struct perf_sample_data sample_data; | ||
3283 | struct { | 3469 | struct { |
3284 | struct perf_event_header header; | 3470 | struct perf_event_header header; |
3285 | u64 id; | 3471 | u64 id; |
@@ -3306,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3306 | goto out; | 3492 | goto out; |
3307 | 3493 | ||
3308 | have_lost = local_read(&buffer->lost); | 3494 | have_lost = local_read(&buffer->lost); |
3309 | if (have_lost) | 3495 | if (have_lost) { |
3310 | size += sizeof(lost_event); | 3496 | lost_event.header.size = sizeof(lost_event); |
3497 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
3498 | event); | ||
3499 | size += lost_event.header.size; | ||
3500 | } | ||
3311 | 3501 | ||
3312 | perf_output_get_handle(handle); | 3502 | perf_output_get_handle(handle); |
3313 | 3503 | ||
@@ -3338,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3338 | if (have_lost) { | 3528 | if (have_lost) { |
3339 | lost_event.header.type = PERF_RECORD_LOST; | 3529 | lost_event.header.type = PERF_RECORD_LOST; |
3340 | lost_event.header.misc = 0; | 3530 | lost_event.header.misc = 0; |
3341 | lost_event.header.size = sizeof(lost_event); | ||
3342 | lost_event.id = event->id; | 3531 | lost_event.id = event->id; |
3343 | lost_event.lost = local_xchg(&buffer->lost, 0); | 3532 | lost_event.lost = local_xchg(&buffer->lost, 0); |
3344 | 3533 | ||
3345 | perf_output_put(handle, lost_event); | 3534 | perf_output_put(handle, lost_event); |
3535 | perf_event__output_id_sample(event, handle, &sample_data); | ||
3346 | } | 3536 | } |
3347 | 3537 | ||
3348 | return 0; | 3538 | return 0; |
@@ -3375,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle) | |||
3375 | rcu_read_unlock(); | 3565 | rcu_read_unlock(); |
3376 | } | 3566 | } |
3377 | 3567 | ||
3378 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
3379 | { | ||
3380 | /* | ||
3381 | * only top level events have the pid namespace they were created in | ||
3382 | */ | ||
3383 | if (event->parent) | ||
3384 | event = event->parent; | ||
3385 | |||
3386 | return task_tgid_nr_ns(p, event->ns); | ||
3387 | } | ||
3388 | |||
3389 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
3390 | { | ||
3391 | /* | ||
3392 | * only top level events have the pid namespace they were created in | ||
3393 | */ | ||
3394 | if (event->parent) | ||
3395 | event = event->parent; | ||
3396 | |||
3397 | return task_pid_nr_ns(p, event->ns); | ||
3398 | } | ||
3399 | |||
3400 | static void perf_output_read_one(struct perf_output_handle *handle, | 3568 | static void perf_output_read_one(struct perf_output_handle *handle, |
3401 | struct perf_event *event, | 3569 | struct perf_event *event, |
3402 | u64 enabled, u64 running) | 3570 | u64 enabled, u64 running) |
@@ -3571,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
3571 | { | 3739 | { |
3572 | u64 sample_type = event->attr.sample_type; | 3740 | u64 sample_type = event->attr.sample_type; |
3573 | 3741 | ||
3574 | data->type = sample_type; | ||
3575 | |||
3576 | header->type = PERF_RECORD_SAMPLE; | 3742 | header->type = PERF_RECORD_SAMPLE; |
3577 | header->size = sizeof(*header); | 3743 | header->size = sizeof(*header) + event->header_size; |
3578 | 3744 | ||
3579 | header->misc = 0; | 3745 | header->misc = 0; |
3580 | header->misc |= perf_misc_flags(regs); | 3746 | header->misc |= perf_misc_flags(regs); |
3581 | 3747 | ||
3582 | if (sample_type & PERF_SAMPLE_IP) { | 3748 | __perf_event_header__init_id(header, data, event); |
3583 | data->ip = perf_instruction_pointer(regs); | ||
3584 | |||
3585 | header->size += sizeof(data->ip); | ||
3586 | } | ||
3587 | |||
3588 | if (sample_type & PERF_SAMPLE_TID) { | ||
3589 | /* namespace issues */ | ||
3590 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3591 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3592 | |||
3593 | header->size += sizeof(data->tid_entry); | ||
3594 | } | ||
3595 | |||
3596 | if (sample_type & PERF_SAMPLE_TIME) { | ||
3597 | data->time = perf_clock(); | ||
3598 | |||
3599 | header->size += sizeof(data->time); | ||
3600 | } | ||
3601 | |||
3602 | if (sample_type & PERF_SAMPLE_ADDR) | ||
3603 | header->size += sizeof(data->addr); | ||
3604 | |||
3605 | if (sample_type & PERF_SAMPLE_ID) { | ||
3606 | data->id = primary_event_id(event); | ||
3607 | |||
3608 | header->size += sizeof(data->id); | ||
3609 | } | ||
3610 | |||
3611 | if (sample_type & PERF_SAMPLE_STREAM_ID) { | ||
3612 | data->stream_id = event->id; | ||
3613 | |||
3614 | header->size += sizeof(data->stream_id); | ||
3615 | } | ||
3616 | |||
3617 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3618 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3619 | data->cpu_entry.reserved = 0; | ||
3620 | |||
3621 | header->size += sizeof(data->cpu_entry); | ||
3622 | } | ||
3623 | |||
3624 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
3625 | header->size += sizeof(data->period); | ||
3626 | 3749 | ||
3627 | if (sample_type & PERF_SAMPLE_READ) | 3750 | if (sample_type & PERF_SAMPLE_IP) |
3628 | header->size += perf_event_read_size(event); | 3751 | data->ip = perf_instruction_pointer(regs); |
3629 | 3752 | ||
3630 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 3753 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
3631 | int size = 1; | 3754 | int size = 1; |
@@ -3690,23 +3813,26 @@ perf_event_read_event(struct perf_event *event, | |||
3690 | struct task_struct *task) | 3813 | struct task_struct *task) |
3691 | { | 3814 | { |
3692 | struct perf_output_handle handle; | 3815 | struct perf_output_handle handle; |
3816 | struct perf_sample_data sample; | ||
3693 | struct perf_read_event read_event = { | 3817 | struct perf_read_event read_event = { |
3694 | .header = { | 3818 | .header = { |
3695 | .type = PERF_RECORD_READ, | 3819 | .type = PERF_RECORD_READ, |
3696 | .misc = 0, | 3820 | .misc = 0, |
3697 | .size = sizeof(read_event) + perf_event_read_size(event), | 3821 | .size = sizeof(read_event) + event->read_size, |
3698 | }, | 3822 | }, |
3699 | .pid = perf_event_pid(event, task), | 3823 | .pid = perf_event_pid(event, task), |
3700 | .tid = perf_event_tid(event, task), | 3824 | .tid = perf_event_tid(event, task), |
3701 | }; | 3825 | }; |
3702 | int ret; | 3826 | int ret; |
3703 | 3827 | ||
3828 | perf_event_header__init_id(&read_event.header, &sample, event); | ||
3704 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 3829 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); |
3705 | if (ret) | 3830 | if (ret) |
3706 | return; | 3831 | return; |
3707 | 3832 | ||
3708 | perf_output_put(&handle, read_event); | 3833 | perf_output_put(&handle, read_event); |
3709 | perf_output_read(&handle, event); | 3834 | perf_output_read(&handle, event); |
3835 | perf_event__output_id_sample(event, &handle, &sample); | ||
3710 | 3836 | ||
3711 | perf_output_end(&handle); | 3837 | perf_output_end(&handle); |
3712 | } | 3838 | } |
@@ -3736,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event, | |||
3736 | struct perf_task_event *task_event) | 3862 | struct perf_task_event *task_event) |
3737 | { | 3863 | { |
3738 | struct perf_output_handle handle; | 3864 | struct perf_output_handle handle; |
3865 | struct perf_sample_data sample; | ||
3739 | struct task_struct *task = task_event->task; | 3866 | struct task_struct *task = task_event->task; |
3740 | int size, ret; | 3867 | int ret, size = task_event->event_id.header.size; |
3741 | 3868 | ||
3742 | size = task_event->event_id.header.size; | 3869 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
3743 | ret = perf_output_begin(&handle, event, size, 0, 0); | ||
3744 | 3870 | ||
3871 | ret = perf_output_begin(&handle, event, | ||
3872 | task_event->event_id.header.size, 0, 0); | ||
3745 | if (ret) | 3873 | if (ret) |
3746 | return; | 3874 | goto out; |
3747 | 3875 | ||
3748 | task_event->event_id.pid = perf_event_pid(event, task); | 3876 | task_event->event_id.pid = perf_event_pid(event, task); |
3749 | task_event->event_id.ppid = perf_event_pid(event, current); | 3877 | task_event->event_id.ppid = perf_event_pid(event, current); |
@@ -3753,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event, | |||
3753 | 3881 | ||
3754 | perf_output_put(&handle, task_event->event_id); | 3882 | perf_output_put(&handle, task_event->event_id); |
3755 | 3883 | ||
3884 | perf_event__output_id_sample(event, &handle, &sample); | ||
3885 | |||
3756 | perf_output_end(&handle); | 3886 | perf_output_end(&handle); |
3887 | out: | ||
3888 | task_event->event_id.header.size = size; | ||
3757 | } | 3889 | } |
3758 | 3890 | ||
3759 | static int perf_event_task_match(struct perf_event *event) | 3891 | static int perf_event_task_match(struct perf_event *event) |
@@ -3792,6 +3924,8 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
3792 | rcu_read_lock(); | 3924 | rcu_read_lock(); |
3793 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 3925 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3794 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 3926 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3927 | if (cpuctx->active_pmu != pmu) | ||
3928 | goto next; | ||
3795 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3929 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3796 | 3930 | ||
3797 | ctx = task_event->task_ctx; | 3931 | ctx = task_event->task_ctx; |
@@ -3866,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3866 | struct perf_comm_event *comm_event) | 4000 | struct perf_comm_event *comm_event) |
3867 | { | 4001 | { |
3868 | struct perf_output_handle handle; | 4002 | struct perf_output_handle handle; |
4003 | struct perf_sample_data sample; | ||
3869 | int size = comm_event->event_id.header.size; | 4004 | int size = comm_event->event_id.header.size; |
3870 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4005 | int ret; |
4006 | |||
4007 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | ||
4008 | ret = perf_output_begin(&handle, event, | ||
4009 | comm_event->event_id.header.size, 0, 0); | ||
3871 | 4010 | ||
3872 | if (ret) | 4011 | if (ret) |
3873 | return; | 4012 | goto out; |
3874 | 4013 | ||
3875 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); | 4014 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); |
3876 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4015 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
@@ -3878,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3878 | perf_output_put(&handle, comm_event->event_id); | 4017 | perf_output_put(&handle, comm_event->event_id); |
3879 | perf_output_copy(&handle, comm_event->comm, | 4018 | perf_output_copy(&handle, comm_event->comm, |
3880 | comm_event->comm_size); | 4019 | comm_event->comm_size); |
4020 | |||
4021 | perf_event__output_id_sample(event, &handle, &sample); | ||
4022 | |||
3881 | perf_output_end(&handle); | 4023 | perf_output_end(&handle); |
4024 | out: | ||
4025 | comm_event->event_id.header.size = size; | ||
3882 | } | 4026 | } |
3883 | 4027 | ||
3884 | static int perf_event_comm_match(struct perf_event *event) | 4028 | static int perf_event_comm_match(struct perf_event *event) |
@@ -3923,10 +4067,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3923 | comm_event->comm_size = size; | 4067 | comm_event->comm_size = size; |
3924 | 4068 | ||
3925 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4069 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3926 | |||
3927 | rcu_read_lock(); | 4070 | rcu_read_lock(); |
3928 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4071 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3929 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4072 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4073 | if (cpuctx->active_pmu != pmu) | ||
4074 | goto next; | ||
3930 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 4075 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3931 | 4076 | ||
3932 | ctxn = pmu->task_ctx_nr; | 4077 | ctxn = pmu->task_ctx_nr; |
@@ -4002,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4002 | struct perf_mmap_event *mmap_event) | 4147 | struct perf_mmap_event *mmap_event) |
4003 | { | 4148 | { |
4004 | struct perf_output_handle handle; | 4149 | struct perf_output_handle handle; |
4150 | struct perf_sample_data sample; | ||
4005 | int size = mmap_event->event_id.header.size; | 4151 | int size = mmap_event->event_id.header.size; |
4006 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4152 | int ret; |
4007 | 4153 | ||
4154 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | ||
4155 | ret = perf_output_begin(&handle, event, | ||
4156 | mmap_event->event_id.header.size, 0, 0); | ||
4008 | if (ret) | 4157 | if (ret) |
4009 | return; | 4158 | goto out; |
4010 | 4159 | ||
4011 | mmap_event->event_id.pid = perf_event_pid(event, current); | 4160 | mmap_event->event_id.pid = perf_event_pid(event, current); |
4012 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4161 | mmap_event->event_id.tid = perf_event_tid(event, current); |
@@ -4014,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4014 | perf_output_put(&handle, mmap_event->event_id); | 4163 | perf_output_put(&handle, mmap_event->event_id); |
4015 | perf_output_copy(&handle, mmap_event->file_name, | 4164 | perf_output_copy(&handle, mmap_event->file_name, |
4016 | mmap_event->file_size); | 4165 | mmap_event->file_size); |
4166 | |||
4167 | perf_event__output_id_sample(event, &handle, &sample); | ||
4168 | |||
4017 | perf_output_end(&handle); | 4169 | perf_output_end(&handle); |
4170 | out: | ||
4171 | mmap_event->event_id.header.size = size; | ||
4018 | } | 4172 | } |
4019 | 4173 | ||
4020 | static int perf_event_mmap_match(struct perf_event *event, | 4174 | static int perf_event_mmap_match(struct perf_event *event, |
@@ -4112,6 +4266,8 @@ got_name: | |||
4112 | rcu_read_lock(); | 4266 | rcu_read_lock(); |
4113 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4267 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
4114 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4268 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4269 | if (cpuctx->active_pmu != pmu) | ||
4270 | goto next; | ||
4115 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, | 4271 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
4116 | vma->vm_flags & VM_EXEC); | 4272 | vma->vm_flags & VM_EXEC); |
4117 | 4273 | ||
@@ -4167,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
4167 | static void perf_log_throttle(struct perf_event *event, int enable) | 4323 | static void perf_log_throttle(struct perf_event *event, int enable) |
4168 | { | 4324 | { |
4169 | struct perf_output_handle handle; | 4325 | struct perf_output_handle handle; |
4326 | struct perf_sample_data sample; | ||
4170 | int ret; | 4327 | int ret; |
4171 | 4328 | ||
4172 | struct { | 4329 | struct { |
@@ -4188,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
4188 | if (enable) | 4345 | if (enable) |
4189 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; | 4346 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; |
4190 | 4347 | ||
4191 | ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); | 4348 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
4349 | |||
4350 | ret = perf_output_begin(&handle, event, | ||
4351 | throttle_event.header.size, 1, 0); | ||
4192 | if (ret) | 4352 | if (ret) |
4193 | return; | 4353 | return; |
4194 | 4354 | ||
4195 | perf_output_put(&handle, throttle_event); | 4355 | perf_output_put(&handle, throttle_event); |
4356 | perf_event__output_id_sample(event, &handle, &sample); | ||
4196 | perf_output_end(&handle); | 4357 | perf_output_end(&handle); |
4197 | } | 4358 | } |
4198 | 4359 | ||
@@ -4208,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4208 | struct hw_perf_event *hwc = &event->hw; | 4369 | struct hw_perf_event *hwc = &event->hw; |
4209 | int ret = 0; | 4370 | int ret = 0; |
4210 | 4371 | ||
4372 | /* | ||
4373 | * Non-sampling counters might still use the PMI to fold short | ||
4374 | * hardware counters, ignore those. | ||
4375 | */ | ||
4376 | if (unlikely(!is_sampling_event(event))) | ||
4377 | return 0; | ||
4378 | |||
4211 | if (!throttle) { | 4379 | if (!throttle) { |
4212 | hwc->interrupts++; | 4380 | hwc->interrupts++; |
4213 | } else { | 4381 | } else { |
@@ -4353,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
4353 | if (!regs) | 4521 | if (!regs) |
4354 | return; | 4522 | return; |
4355 | 4523 | ||
4356 | if (!hwc->sample_period) | 4524 | if (!is_sampling_event(event)) |
4357 | return; | 4525 | return; |
4358 | 4526 | ||
4359 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4527 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
@@ -4516,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
4516 | struct hw_perf_event *hwc = &event->hw; | 4684 | struct hw_perf_event *hwc = &event->hw; |
4517 | struct hlist_head *head; | 4685 | struct hlist_head *head; |
4518 | 4686 | ||
4519 | if (hwc->sample_period) { | 4687 | if (is_sampling_event(event)) { |
4520 | hwc->last_period = hwc->sample_period; | 4688 | hwc->last_period = hwc->sample_period; |
4521 | perf_swevent_set_period(event); | 4689 | perf_swevent_set_period(event); |
4522 | } | 4690 | } |
@@ -4681,7 +4849,7 @@ static int perf_swevent_init(struct perf_event *event) | |||
4681 | break; | 4849 | break; |
4682 | } | 4850 | } |
4683 | 4851 | ||
4684 | if (event_id > PERF_COUNT_SW_MAX) | 4852 | if (event_id >= PERF_COUNT_SW_MAX) |
4685 | return -ENOENT; | 4853 | return -ENOENT; |
4686 | 4854 | ||
4687 | if (!event->parent) { | 4855 | if (!event->parent) { |
@@ -4773,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event) | |||
4773 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 4941 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
4774 | return -ENOENT; | 4942 | return -ENOENT; |
4775 | 4943 | ||
4776 | /* | ||
4777 | * Raw tracepoint data is a severe data leak, only allow root to | ||
4778 | * have these. | ||
4779 | */ | ||
4780 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | ||
4781 | perf_paranoid_tracepoint_raw() && | ||
4782 | !capable(CAP_SYS_ADMIN)) | ||
4783 | return -EPERM; | ||
4784 | |||
4785 | err = perf_trace_init(event); | 4944 | err = perf_trace_init(event); |
4786 | if (err) | 4945 | if (err) |
4787 | return err; | 4946 | return err; |
@@ -4804,7 +4963,7 @@ static struct pmu perf_tracepoint = { | |||
4804 | 4963 | ||
4805 | static inline void perf_tp_register(void) | 4964 | static inline void perf_tp_register(void) |
4806 | { | 4965 | { |
4807 | perf_pmu_register(&perf_tracepoint); | 4966 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); |
4808 | } | 4967 | } |
4809 | 4968 | ||
4810 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4969 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4894,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
4894 | static void perf_swevent_start_hrtimer(struct perf_event *event) | 5053 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4895 | { | 5054 | { |
4896 | struct hw_perf_event *hwc = &event->hw; | 5055 | struct hw_perf_event *hwc = &event->hw; |
5056 | s64 period; | ||
5057 | |||
5058 | if (!is_sampling_event(event)) | ||
5059 | return; | ||
4897 | 5060 | ||
4898 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 5061 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4899 | hwc->hrtimer.function = perf_swevent_hrtimer; | 5062 | hwc->hrtimer.function = perf_swevent_hrtimer; |
4900 | if (hwc->sample_period) { | ||
4901 | s64 period = local64_read(&hwc->period_left); | ||
4902 | 5063 | ||
4903 | if (period) { | 5064 | period = local64_read(&hwc->period_left); |
4904 | if (period < 0) | 5065 | if (period) { |
4905 | period = 10000; | 5066 | if (period < 0) |
5067 | period = 10000; | ||
4906 | 5068 | ||
4907 | local64_set(&hwc->period_left, 0); | 5069 | local64_set(&hwc->period_left, 0); |
4908 | } else { | 5070 | } else { |
4909 | period = max_t(u64, 10000, hwc->sample_period); | 5071 | period = max_t(u64, 10000, hwc->sample_period); |
4910 | } | 5072 | } |
4911 | __hrtimer_start_range_ns(&hwc->hrtimer, | 5073 | __hrtimer_start_range_ns(&hwc->hrtimer, |
4912 | ns_to_ktime(period), 0, | 5074 | ns_to_ktime(period), 0, |
4913 | HRTIMER_MODE_REL_PINNED, 0); | 5075 | HRTIMER_MODE_REL_PINNED, 0); |
4914 | } | ||
4915 | } | 5076 | } |
4916 | 5077 | ||
4917 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | 5078 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4918 | { | 5079 | { |
4919 | struct hw_perf_event *hwc = &event->hw; | 5080 | struct hw_perf_event *hwc = &event->hw; |
4920 | 5081 | ||
4921 | if (hwc->sample_period) { | 5082 | if (is_sampling_event(event)) { |
4922 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | 5083 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); |
4923 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | 5084 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); |
4924 | 5085 | ||
@@ -5113,25 +5274,94 @@ static void *find_pmu_context(int ctxn) | |||
5113 | return NULL; | 5274 | return NULL; |
5114 | } | 5275 | } |
5115 | 5276 | ||
5116 | static void free_pmu_context(void * __percpu cpu_context) | 5277 | static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) |
5117 | { | 5278 | { |
5118 | struct pmu *pmu; | 5279 | int cpu; |
5280 | |||
5281 | for_each_possible_cpu(cpu) { | ||
5282 | struct perf_cpu_context *cpuctx; | ||
5283 | |||
5284 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5285 | |||
5286 | if (cpuctx->active_pmu == old_pmu) | ||
5287 | cpuctx->active_pmu = pmu; | ||
5288 | } | ||
5289 | } | ||
5290 | |||
5291 | static void free_pmu_context(struct pmu *pmu) | ||
5292 | { | ||
5293 | struct pmu *i; | ||
5119 | 5294 | ||
5120 | mutex_lock(&pmus_lock); | 5295 | mutex_lock(&pmus_lock); |
5121 | /* | 5296 | /* |
5122 | * Like a real lame refcount. | 5297 | * Like a real lame refcount. |
5123 | */ | 5298 | */ |
5124 | list_for_each_entry(pmu, &pmus, entry) { | 5299 | list_for_each_entry(i, &pmus, entry) { |
5125 | if (pmu->pmu_cpu_context == cpu_context) | 5300 | if (i->pmu_cpu_context == pmu->pmu_cpu_context) { |
5301 | update_pmu_context(i, pmu); | ||
5126 | goto out; | 5302 | goto out; |
5303 | } | ||
5127 | } | 5304 | } |
5128 | 5305 | ||
5129 | free_percpu(cpu_context); | 5306 | free_percpu(pmu->pmu_cpu_context); |
5130 | out: | 5307 | out: |
5131 | mutex_unlock(&pmus_lock); | 5308 | mutex_unlock(&pmus_lock); |
5132 | } | 5309 | } |
5310 | static struct idr pmu_idr; | ||
5311 | |||
5312 | static ssize_t | ||
5313 | type_show(struct device *dev, struct device_attribute *attr, char *page) | ||
5314 | { | ||
5315 | struct pmu *pmu = dev_get_drvdata(dev); | ||
5316 | |||
5317 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | ||
5318 | } | ||
5319 | |||
5320 | static struct device_attribute pmu_dev_attrs[] = { | ||
5321 | __ATTR_RO(type), | ||
5322 | __ATTR_NULL, | ||
5323 | }; | ||
5324 | |||
5325 | static int pmu_bus_running; | ||
5326 | static struct bus_type pmu_bus = { | ||
5327 | .name = "event_source", | ||
5328 | .dev_attrs = pmu_dev_attrs, | ||
5329 | }; | ||
5330 | |||
5331 | static void pmu_dev_release(struct device *dev) | ||
5332 | { | ||
5333 | kfree(dev); | ||
5334 | } | ||
5335 | |||
5336 | static int pmu_dev_alloc(struct pmu *pmu) | ||
5337 | { | ||
5338 | int ret = -ENOMEM; | ||
5339 | |||
5340 | pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); | ||
5341 | if (!pmu->dev) | ||
5342 | goto out; | ||
5343 | |||
5344 | device_initialize(pmu->dev); | ||
5345 | ret = dev_set_name(pmu->dev, "%s", pmu->name); | ||
5346 | if (ret) | ||
5347 | goto free_dev; | ||
5348 | |||
5349 | dev_set_drvdata(pmu->dev, pmu); | ||
5350 | pmu->dev->bus = &pmu_bus; | ||
5351 | pmu->dev->release = pmu_dev_release; | ||
5352 | ret = device_add(pmu->dev); | ||
5353 | if (ret) | ||
5354 | goto free_dev; | ||
5355 | |||
5356 | out: | ||
5357 | return ret; | ||
5358 | |||
5359 | free_dev: | ||
5360 | put_device(pmu->dev); | ||
5361 | goto out; | ||
5362 | } | ||
5133 | 5363 | ||
5134 | int perf_pmu_register(struct pmu *pmu) | 5364 | int perf_pmu_register(struct pmu *pmu, char *name, int type) |
5135 | { | 5365 | { |
5136 | int cpu, ret; | 5366 | int cpu, ret; |
5137 | 5367 | ||
@@ -5141,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu) | |||
5141 | if (!pmu->pmu_disable_count) | 5371 | if (!pmu->pmu_disable_count) |
5142 | goto unlock; | 5372 | goto unlock; |
5143 | 5373 | ||
5374 | pmu->type = -1; | ||
5375 | if (!name) | ||
5376 | goto skip_type; | ||
5377 | pmu->name = name; | ||
5378 | |||
5379 | if (type < 0) { | ||
5380 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | ||
5381 | if (!err) | ||
5382 | goto free_pdc; | ||
5383 | |||
5384 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
5385 | if (err) { | ||
5386 | ret = err; | ||
5387 | goto free_pdc; | ||
5388 | } | ||
5389 | } | ||
5390 | pmu->type = type; | ||
5391 | |||
5392 | if (pmu_bus_running) { | ||
5393 | ret = pmu_dev_alloc(pmu); | ||
5394 | if (ret) | ||
5395 | goto free_idr; | ||
5396 | } | ||
5397 | |||
5398 | skip_type: | ||
5144 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | 5399 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); |
5145 | if (pmu->pmu_cpu_context) | 5400 | if (pmu->pmu_cpu_context) |
5146 | goto got_cpu_context; | 5401 | goto got_cpu_context; |
5147 | 5402 | ||
5148 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | 5403 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); |
5149 | if (!pmu->pmu_cpu_context) | 5404 | if (!pmu->pmu_cpu_context) |
5150 | goto free_pdc; | 5405 | goto free_dev; |
5151 | 5406 | ||
5152 | for_each_possible_cpu(cpu) { | 5407 | for_each_possible_cpu(cpu) { |
5153 | struct perf_cpu_context *cpuctx; | 5408 | struct perf_cpu_context *cpuctx; |
@@ -5158,6 +5413,7 @@ int perf_pmu_register(struct pmu *pmu) | |||
5158 | cpuctx->ctx.pmu = pmu; | 5413 | cpuctx->ctx.pmu = pmu; |
5159 | cpuctx->jiffies_interval = 1; | 5414 | cpuctx->jiffies_interval = 1; |
5160 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 5415 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
5416 | cpuctx->active_pmu = pmu; | ||
5161 | } | 5417 | } |
5162 | 5418 | ||
5163 | got_cpu_context: | 5419 | got_cpu_context: |
@@ -5190,6 +5446,14 @@ unlock: | |||
5190 | 5446 | ||
5191 | return ret; | 5447 | return ret; |
5192 | 5448 | ||
5449 | free_dev: | ||
5450 | device_del(pmu->dev); | ||
5451 | put_device(pmu->dev); | ||
5452 | |||
5453 | free_idr: | ||
5454 | if (pmu->type >= PERF_TYPE_MAX) | ||
5455 | idr_remove(&pmu_idr, pmu->type); | ||
5456 | |||
5193 | free_pdc: | 5457 | free_pdc: |
5194 | free_percpu(pmu->pmu_disable_count); | 5458 | free_percpu(pmu->pmu_disable_count); |
5195 | goto unlock; | 5459 | goto unlock; |
@@ -5209,7 +5473,11 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
5209 | synchronize_rcu(); | 5473 | synchronize_rcu(); |
5210 | 5474 | ||
5211 | free_percpu(pmu->pmu_disable_count); | 5475 | free_percpu(pmu->pmu_disable_count); |
5212 | free_pmu_context(pmu->pmu_cpu_context); | 5476 | if (pmu->type >= PERF_TYPE_MAX) |
5477 | idr_remove(&pmu_idr, pmu->type); | ||
5478 | device_del(pmu->dev); | ||
5479 | put_device(pmu->dev); | ||
5480 | free_pmu_context(pmu); | ||
5213 | } | 5481 | } |
5214 | 5482 | ||
5215 | struct pmu *perf_init_event(struct perf_event *event) | 5483 | struct pmu *perf_init_event(struct perf_event *event) |
@@ -5218,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5218 | int idx; | 5486 | int idx; |
5219 | 5487 | ||
5220 | idx = srcu_read_lock(&pmus_srcu); | 5488 | idx = srcu_read_lock(&pmus_srcu); |
5489 | |||
5490 | rcu_read_lock(); | ||
5491 | pmu = idr_find(&pmu_idr, event->attr.type); | ||
5492 | rcu_read_unlock(); | ||
5493 | if (pmu) | ||
5494 | goto unlock; | ||
5495 | |||
5221 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 5496 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5222 | int ret = pmu->event_init(event); | 5497 | int ret = pmu->event_init(event); |
5223 | if (!ret) | 5498 | if (!ret) |
@@ -5677,12 +5952,18 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5677 | mutex_unlock(&ctx->mutex); | 5952 | mutex_unlock(&ctx->mutex); |
5678 | 5953 | ||
5679 | event->owner = current; | 5954 | event->owner = current; |
5680 | get_task_struct(current); | 5955 | |
5681 | mutex_lock(&current->perf_event_mutex); | 5956 | mutex_lock(&current->perf_event_mutex); |
5682 | list_add_tail(&event->owner_entry, &current->perf_event_list); | 5957 | list_add_tail(&event->owner_entry, &current->perf_event_list); |
5683 | mutex_unlock(&current->perf_event_mutex); | 5958 | mutex_unlock(&current->perf_event_mutex); |
5684 | 5959 | ||
5685 | /* | 5960 | /* |
5961 | * Precalculate sample_data sizes | ||
5962 | */ | ||
5963 | perf_event__header_size(event); | ||
5964 | perf_event__id_header_size(event); | ||
5965 | |||
5966 | /* | ||
5686 | * Drop the reference on the group_event after placing the | 5967 | * Drop the reference on the group_event after placing the |
5687 | * new event on the sibling_list. This ensures destruction | 5968 | * new event on the sibling_list. This ensures destruction |
5688 | * of the group leader will find the pointer to itself in | 5969 | * of the group leader will find the pointer to itself in |
@@ -5745,12 +6026,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
5745 | ++ctx->generation; | 6026 | ++ctx->generation; |
5746 | mutex_unlock(&ctx->mutex); | 6027 | mutex_unlock(&ctx->mutex); |
5747 | 6028 | ||
5748 | event->owner = current; | ||
5749 | get_task_struct(current); | ||
5750 | mutex_lock(&current->perf_event_mutex); | ||
5751 | list_add_tail(&event->owner_entry, &current->perf_event_list); | ||
5752 | mutex_unlock(&current->perf_event_mutex); | ||
5753 | |||
5754 | return event; | 6029 | return event; |
5755 | 6030 | ||
5756 | err_free: | 6031 | err_free: |
@@ -5901,8 +6176,24 @@ again: | |||
5901 | */ | 6176 | */ |
5902 | void perf_event_exit_task(struct task_struct *child) | 6177 | void perf_event_exit_task(struct task_struct *child) |
5903 | { | 6178 | { |
6179 | struct perf_event *event, *tmp; | ||
5904 | int ctxn; | 6180 | int ctxn; |
5905 | 6181 | ||
6182 | mutex_lock(&child->perf_event_mutex); | ||
6183 | list_for_each_entry_safe(event, tmp, &child->perf_event_list, | ||
6184 | owner_entry) { | ||
6185 | list_del_init(&event->owner_entry); | ||
6186 | |||
6187 | /* | ||
6188 | * Ensure the list deletion is visible before we clear | ||
6189 | * the owner, closes a race against perf_release() where | ||
6190 | * we need to serialize on the owner->perf_event_mutex. | ||
6191 | */ | ||
6192 | smp_wmb(); | ||
6193 | event->owner = NULL; | ||
6194 | } | ||
6195 | mutex_unlock(&child->perf_event_mutex); | ||
6196 | |||
5906 | for_each_task_context_nr(ctxn) | 6197 | for_each_task_context_nr(ctxn) |
5907 | perf_event_exit_task_context(child, ctxn); | 6198 | perf_event_exit_task_context(child, ctxn); |
5908 | } | 6199 | } |
@@ -6025,6 +6316,12 @@ inherit_event(struct perf_event *parent_event, | |||
6025 | child_event->overflow_handler = parent_event->overflow_handler; | 6316 | child_event->overflow_handler = parent_event->overflow_handler; |
6026 | 6317 | ||
6027 | /* | 6318 | /* |
6319 | * Precalculate sample_data sizes | ||
6320 | */ | ||
6321 | perf_event__header_size(child_event); | ||
6322 | perf_event__id_header_size(child_event); | ||
6323 | |||
6324 | /* | ||
6028 | * Link it up in the child's context: | 6325 | * Link it up in the child's context: |
6029 | */ | 6326 | */ |
6030 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | 6327 | raw_spin_lock_irqsave(&child_ctx->lock, flags); |
@@ -6122,6 +6419,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6122 | struct perf_event *event; | 6419 | struct perf_event *event; |
6123 | struct task_struct *parent = current; | 6420 | struct task_struct *parent = current; |
6124 | int inherited_all = 1; | 6421 | int inherited_all = 1; |
6422 | unsigned long flags; | ||
6125 | int ret = 0; | 6423 | int ret = 0; |
6126 | 6424 | ||
6127 | child->perf_event_ctxp[ctxn] = NULL; | 6425 | child->perf_event_ctxp[ctxn] = NULL; |
@@ -6162,6 +6460,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6162 | break; | 6460 | break; |
6163 | } | 6461 | } |
6164 | 6462 | ||
6463 | /* | ||
6464 | * We can't hold ctx->lock when iterating the ->flexible_group list due | ||
6465 | * to allocations, but we need to prevent rotation because | ||
6466 | * rotate_ctx() will change the list from interrupt context. | ||
6467 | */ | ||
6468 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
6469 | parent_ctx->rotate_disable = 1; | ||
6470 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6471 | |||
6165 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6472 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
6166 | ret = inherit_task_group(event, parent, parent_ctx, | 6473 | ret = inherit_task_group(event, parent, parent_ctx, |
6167 | child, ctxn, &inherited_all); | 6474 | child, ctxn, &inherited_all); |
@@ -6169,6 +6476,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6169 | break; | 6476 | break; |
6170 | } | 6477 | } |
6171 | 6478 | ||
6479 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
6480 | parent_ctx->rotate_disable = 0; | ||
6481 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6482 | |||
6172 | child_ctx = child->perf_event_ctxp[ctxn]; | 6483 | child_ctx = child->perf_event_ctxp[ctxn]; |
6173 | 6484 | ||
6174 | if (child_ctx && inherited_all) { | 6485 | if (child_ctx && inherited_all) { |
@@ -6241,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) | |||
6241 | mutex_unlock(&swhash->hlist_mutex); | 6552 | mutex_unlock(&swhash->hlist_mutex); |
6242 | } | 6553 | } |
6243 | 6554 | ||
6244 | #ifdef CONFIG_HOTPLUG_CPU | 6555 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
6245 | static void perf_pmu_rotate_stop(struct pmu *pmu) | 6556 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
6246 | { | 6557 | { |
6247 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 6558 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
@@ -6295,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu) | |||
6295 | static inline void perf_event_exit_cpu(int cpu) { } | 6606 | static inline void perf_event_exit_cpu(int cpu) { } |
6296 | #endif | 6607 | #endif |
6297 | 6608 | ||
6609 | static int | ||
6610 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | ||
6611 | { | ||
6612 | int cpu; | ||
6613 | |||
6614 | for_each_online_cpu(cpu) | ||
6615 | perf_event_exit_cpu(cpu); | ||
6616 | |||
6617 | return NOTIFY_OK; | ||
6618 | } | ||
6619 | |||
6620 | /* | ||
6621 | * Run the perf reboot notifier at the very last possible moment so that | ||
6622 | * the generic watchdog code runs as long as possible. | ||
6623 | */ | ||
6624 | static struct notifier_block perf_reboot_notifier = { | ||
6625 | .notifier_call = perf_reboot, | ||
6626 | .priority = INT_MIN, | ||
6627 | }; | ||
6628 | |||
6298 | static int __cpuinit | 6629 | static int __cpuinit |
6299 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | 6630 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
6300 | { | 6631 | { |
@@ -6321,11 +6652,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
6321 | 6652 | ||
6322 | void __init perf_event_init(void) | 6653 | void __init perf_event_init(void) |
6323 | { | 6654 | { |
6655 | int ret; | ||
6656 | |||
6657 | idr_init(&pmu_idr); | ||
6658 | |||
6324 | perf_event_init_all_cpus(); | 6659 | perf_event_init_all_cpus(); |
6325 | init_srcu_struct(&pmus_srcu); | 6660 | init_srcu_struct(&pmus_srcu); |
6326 | perf_pmu_register(&perf_swevent); | 6661 | perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); |
6327 | perf_pmu_register(&perf_cpu_clock); | 6662 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
6328 | perf_pmu_register(&perf_task_clock); | 6663 | perf_pmu_register(&perf_task_clock, NULL, -1); |
6329 | perf_tp_register(); | 6664 | perf_tp_register(); |
6330 | perf_cpu_notifier(perf_cpu_notify); | 6665 | perf_cpu_notifier(perf_cpu_notify); |
6666 | register_reboot_notifier(&perf_reboot_notifier); | ||
6667 | |||
6668 | ret = init_hw_breakpoint(); | ||
6669 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | ||
6670 | } | ||
6671 | |||
6672 | static int __init perf_event_sysfs_init(void) | ||
6673 | { | ||
6674 | struct pmu *pmu; | ||
6675 | int ret; | ||
6676 | |||
6677 | mutex_lock(&pmus_lock); | ||
6678 | |||
6679 | ret = bus_register(&pmu_bus); | ||
6680 | if (ret) | ||
6681 | goto unlock; | ||
6682 | |||
6683 | list_for_each_entry(pmu, &pmus, entry) { | ||
6684 | if (!pmu->name || pmu->type < 0) | ||
6685 | continue; | ||
6686 | |||
6687 | ret = pmu_dev_alloc(pmu); | ||
6688 | WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); | ||
6689 | } | ||
6690 | pmu_bus_running = 1; | ||
6691 | ret = 0; | ||
6692 | |||
6693 | unlock: | ||
6694 | mutex_unlock(&pmus_lock); | ||
6695 | |||
6696 | return ret; | ||
6331 | } | 6697 | } |
6698 | device_initcall(perf_event_sysfs_init); | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 6842eeba587..05bb7173850 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock) | |||
37 | if (pid == 0) | 37 | if (pid == 0) |
38 | return 0; | 38 | return 0; |
39 | 39 | ||
40 | read_lock(&tasklist_lock); | 40 | rcu_read_lock(); |
41 | p = find_task_by_vpid(pid); | 41 | p = find_task_by_vpid(pid); |
42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? | 42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? |
43 | same_thread_group(p, current) : thread_group_leader(p))) { | 43 | same_thread_group(p, current) : has_group_leader_pid(p))) { |
44 | error = -EINVAL; | 44 | error = -EINVAL; |
45 | } | 45 | } |
46 | read_unlock(&tasklist_lock); | 46 | rcu_read_unlock(); |
47 | 47 | ||
48 | return error; | 48 | return error; |
49 | } | 49 | } |
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
390 | 390 | ||
391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | 391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); |
392 | 392 | ||
393 | read_lock(&tasklist_lock); | 393 | rcu_read_lock(); |
394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { | 394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { |
395 | if (pid == 0) { | 395 | if (pid == 0) { |
396 | p = current; | 396 | p = current; |
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
404 | p = current->group_leader; | 404 | p = current->group_leader; |
405 | } else { | 405 | } else { |
406 | p = find_task_by_vpid(pid); | 406 | p = find_task_by_vpid(pid); |
407 | if (p && !thread_group_leader(p)) | 407 | if (p && !has_group_leader_pid(p)) |
408 | p = NULL; | 408 | p = NULL; |
409 | } | 409 | } |
410 | } | 410 | } |
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
414 | } else { | 414 | } else { |
415 | ret = -EINVAL; | 415 | ret = -EINVAL; |
416 | } | 416 | } |
417 | read_unlock(&tasklist_lock); | 417 | rcu_read_unlock(); |
418 | 418 | ||
419 | return ret; | 419 | return ret; |
420 | } | 420 | } |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736..93bd2eb2bc5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); | |||
145 | 145 | ||
146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); | 146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
147 | 147 | ||
148 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 148 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); |
149 | |||
150 | #define lock_timer(tid, flags) \ | ||
151 | ({ struct k_itimer *__timr; \ | ||
152 | __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ | ||
153 | __timr; \ | ||
154 | }) | ||
149 | 155 | ||
150 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | 156 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) |
151 | { | 157 | { |
@@ -619,7 +625,7 @@ out: | |||
619 | * the find to the timer lock. To avoid a dead lock, the timer id MUST | 625 | * the find to the timer lock. To avoid a dead lock, the timer id MUST |
620 | * be release with out holding the timer lock. | 626 | * be release with out holding the timer lock. |
621 | */ | 627 | */ |
622 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) | 628 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) |
623 | { | 629 | { |
624 | struct k_itimer *timr; | 630 | struct k_itimer *timr; |
625 | /* | 631 | /* |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0..048d0b51483 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -327,7 +327,6 @@ static int create_image(int platform_mode) | |||
327 | int hibernation_snapshot(int platform_mode) | 327 | int hibernation_snapshot(int platform_mode) |
328 | { | 328 | { |
329 | int error; | 329 | int error; |
330 | gfp_t saved_mask; | ||
331 | 330 | ||
332 | error = platform_begin(platform_mode); | 331 | error = platform_begin(platform_mode); |
333 | if (error) | 332 | if (error) |
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode) | |||
339 | goto Close; | 338 | goto Close; |
340 | 339 | ||
341 | suspend_console(); | 340 | suspend_console(); |
342 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 341 | pm_restrict_gfp_mask(); |
343 | error = dpm_suspend_start(PMSG_FREEZE); | 342 | error = dpm_suspend_start(PMSG_FREEZE); |
344 | if (error) | 343 | if (error) |
345 | goto Recover_platform; | 344 | goto Recover_platform; |
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode) | |||
348 | goto Recover_platform; | 347 | goto Recover_platform; |
349 | 348 | ||
350 | error = create_image(platform_mode); | 349 | error = create_image(platform_mode); |
351 | /* Control returns here after successful restore */ | 350 | /* |
351 | * Control returns here (1) after the image has been created or the | ||
352 | * image creation has failed and (2) after a successful restore. | ||
353 | */ | ||
352 | 354 | ||
353 | Resume_devices: | 355 | Resume_devices: |
354 | /* We may need to release the preallocated image pages here. */ | 356 | /* We may need to release the preallocated image pages here. */ |
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode) | |||
357 | 359 | ||
358 | dpm_resume_end(in_suspend ? | 360 | dpm_resume_end(in_suspend ? |
359 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 361 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
360 | set_gfp_allowed_mask(saved_mask); | 362 | |
363 | if (error || !in_suspend) | ||
364 | pm_restore_gfp_mask(); | ||
365 | |||
361 | resume_console(); | 366 | resume_console(); |
362 | Close: | 367 | Close: |
363 | platform_end(platform_mode); | 368 | platform_end(platform_mode); |
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode) | |||
452 | int hibernation_restore(int platform_mode) | 457 | int hibernation_restore(int platform_mode) |
453 | { | 458 | { |
454 | int error; | 459 | int error; |
455 | gfp_t saved_mask; | ||
456 | 460 | ||
457 | pm_prepare_console(); | 461 | pm_prepare_console(); |
458 | suspend_console(); | 462 | suspend_console(); |
459 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 463 | pm_restrict_gfp_mask(); |
460 | error = dpm_suspend_start(PMSG_QUIESCE); | 464 | error = dpm_suspend_start(PMSG_QUIESCE); |
461 | if (!error) { | 465 | if (!error) { |
462 | error = resume_target_kernel(platform_mode); | 466 | error = resume_target_kernel(platform_mode); |
463 | dpm_resume_end(PMSG_RECOVER); | 467 | dpm_resume_end(PMSG_RECOVER); |
464 | } | 468 | } |
465 | set_gfp_allowed_mask(saved_mask); | 469 | pm_restore_gfp_mask(); |
466 | resume_console(); | 470 | resume_console(); |
467 | pm_restore_console(); | 471 | pm_restore_console(); |
468 | return error; | 472 | return error; |
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode) | |||
476 | int hibernation_platform_enter(void) | 480 | int hibernation_platform_enter(void) |
477 | { | 481 | { |
478 | int error; | 482 | int error; |
479 | gfp_t saved_mask; | ||
480 | 483 | ||
481 | if (!hibernation_ops) | 484 | if (!hibernation_ops) |
482 | return -ENOSYS; | 485 | return -ENOSYS; |
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void) | |||
492 | 495 | ||
493 | entering_platform_hibernation = true; | 496 | entering_platform_hibernation = true; |
494 | suspend_console(); | 497 | suspend_console(); |
495 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
496 | error = dpm_suspend_start(PMSG_HIBERNATE); | 498 | error = dpm_suspend_start(PMSG_HIBERNATE); |
497 | if (error) { | 499 | if (error) { |
498 | if (hibernation_ops->recover) | 500 | if (hibernation_ops->recover) |
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void) | |||
536 | Resume_devices: | 538 | Resume_devices: |
537 | entering_platform_hibernation = false; | 539 | entering_platform_hibernation = false; |
538 | dpm_resume_end(PMSG_RESTORE); | 540 | dpm_resume_end(PMSG_RESTORE); |
539 | set_gfp_allowed_mask(saved_mask); | ||
540 | resume_console(); | 541 | resume_console(); |
541 | 542 | ||
542 | Close: | 543 | Close: |
@@ -646,6 +647,7 @@ int hibernate(void) | |||
646 | swsusp_free(); | 647 | swsusp_free(); |
647 | if (!error) | 648 | if (!error) |
648 | power_down(); | 649 | power_down(); |
650 | pm_restore_gfp_mask(); | ||
649 | } else { | 651 | } else { |
650 | pr_debug("PM: Image restored successfully.\n"); | 652 | pr_debug("PM: Image restored successfully.\n"); |
651 | } | 653 | } |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee47..031d5e3a619 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/suspend.h> | 24 | #include <linux/suspend.h> |
25 | #include <trace/events/power.h> | ||
25 | 26 | ||
26 | #include "power.h" | 27 | #include "power.h" |
27 | 28 | ||
@@ -197,18 +198,18 @@ static int suspend_enter(suspend_state_t state) | |||
197 | int suspend_devices_and_enter(suspend_state_t state) | 198 | int suspend_devices_and_enter(suspend_state_t state) |
198 | { | 199 | { |
199 | int error; | 200 | int error; |
200 | gfp_t saved_mask; | ||
201 | 201 | ||
202 | if (!suspend_ops) | 202 | if (!suspend_ops) |
203 | return -ENOSYS; | 203 | return -ENOSYS; |
204 | 204 | ||
205 | trace_machine_suspend(state); | ||
205 | if (suspend_ops->begin) { | 206 | if (suspend_ops->begin) { |
206 | error = suspend_ops->begin(state); | 207 | error = suspend_ops->begin(state); |
207 | if (error) | 208 | if (error) |
208 | goto Close; | 209 | goto Close; |
209 | } | 210 | } |
210 | suspend_console(); | 211 | suspend_console(); |
211 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 212 | pm_restrict_gfp_mask(); |
212 | suspend_test_start(); | 213 | suspend_test_start(); |
213 | error = dpm_suspend_start(PMSG_SUSPEND); | 214 | error = dpm_suspend_start(PMSG_SUSPEND); |
214 | if (error) { | 215 | if (error) { |
@@ -225,11 +226,12 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
225 | suspend_test_start(); | 226 | suspend_test_start(); |
226 | dpm_resume_end(PMSG_RESUME); | 227 | dpm_resume_end(PMSG_RESUME); |
227 | suspend_test_finish("resume devices"); | 228 | suspend_test_finish("resume devices"); |
228 | set_gfp_allowed_mask(saved_mask); | 229 | pm_restore_gfp_mask(); |
229 | resume_console(); | 230 | resume_console(); |
230 | Close: | 231 | Close: |
231 | if (suspend_ops->end) | 232 | if (suspend_ops->end) |
232 | suspend_ops->end(); | 233 | suspend_ops->end(); |
234 | trace_machine_suspend(PWR_EVENT_EXIT); | ||
233 | return error; | 235 | return error; |
234 | 236 | ||
235 | Recover_platform: | 237 | Recover_platform: |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf9..8c7e4832b9b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | ||
9 | * | 10 | * |
10 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
11 | * | 12 | * |
@@ -29,7 +30,7 @@ | |||
29 | 30 | ||
30 | #include "power.h" | 31 | #include "power.h" |
31 | 32 | ||
32 | #define HIBERNATE_SIG "LINHIB0001" | 33 | #define HIBERNATE_SIG "S1SUSPEND" |
33 | 34 | ||
34 | /* | 35 | /* |
35 | * The swap map is a data structure used for keeping track of each page | 36 | * The swap map is a data structure used for keeping track of each page |
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
753 | { | 754 | { |
754 | unsigned int m; | 755 | unsigned int m; |
755 | int error = 0; | 756 | int error = 0; |
757 | struct bio *bio; | ||
756 | struct timeval start; | 758 | struct timeval start; |
757 | struct timeval stop; | 759 | struct timeval stop; |
758 | unsigned nr_pages; | 760 | unsigned nr_pages; |
759 | size_t off, unc_len, cmp_len; | 761 | size_t i, off, unc_len, cmp_len; |
760 | unsigned char *unc, *cmp, *page; | 762 | unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; |
761 | 763 | ||
762 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 764 | for (i = 0; i < LZO_CMP_PAGES; i++) { |
763 | if (!page) { | 765 | page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
764 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 766 | if (!page[i]) { |
765 | return -ENOMEM; | 767 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
768 | |||
769 | while (i) | ||
770 | free_page((unsigned long)page[--i]); | ||
771 | |||
772 | return -ENOMEM; | ||
773 | } | ||
766 | } | 774 | } |
767 | 775 | ||
768 | unc = vmalloc(LZO_UNC_SIZE); | 776 | unc = vmalloc(LZO_UNC_SIZE); |
769 | if (!unc) { | 777 | if (!unc) { |
770 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 778 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); |
771 | free_page((unsigned long)page); | 779 | |
780 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
781 | free_page((unsigned long)page[i]); | ||
782 | |||
772 | return -ENOMEM; | 783 | return -ENOMEM; |
773 | } | 784 | } |
774 | 785 | ||
775 | cmp = vmalloc(LZO_CMP_SIZE); | 786 | cmp = vmalloc(LZO_CMP_SIZE); |
776 | if (!cmp) { | 787 | if (!cmp) { |
777 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 788 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); |
789 | |||
778 | vfree(unc); | 790 | vfree(unc); |
779 | free_page((unsigned long)page); | 791 | for (i = 0; i < LZO_CMP_PAGES; i++) |
792 | free_page((unsigned long)page[i]); | ||
793 | |||
780 | return -ENOMEM; | 794 | return -ENOMEM; |
781 | } | 795 | } |
782 | 796 | ||
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
787 | if (!m) | 801 | if (!m) |
788 | m = 1; | 802 | m = 1; |
789 | nr_pages = 0; | 803 | nr_pages = 0; |
804 | bio = NULL; | ||
790 | do_gettimeofday(&start); | 805 | do_gettimeofday(&start); |
791 | 806 | ||
792 | error = snapshot_write_next(snapshot); | 807 | error = snapshot_write_next(snapshot); |
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
794 | goto out_finish; | 809 | goto out_finish; |
795 | 810 | ||
796 | for (;;) { | 811 | for (;;) { |
797 | error = swap_read_page(handle, page, NULL); /* sync */ | 812 | error = swap_read_page(handle, page[0], NULL); /* sync */ |
798 | if (error) | 813 | if (error) |
799 | break; | 814 | break; |
800 | 815 | ||
801 | cmp_len = *(size_t *)page; | 816 | cmp_len = *(size_t *)page[0]; |
802 | if (unlikely(!cmp_len || | 817 | if (unlikely(!cmp_len || |
803 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | 818 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { |
804 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 819 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); |
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
806 | break; | 821 | break; |
807 | } | 822 | } |
808 | 823 | ||
809 | memcpy(cmp, page, PAGE_SIZE); | 824 | for (off = PAGE_SIZE, i = 1; |
810 | for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | 825 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { |
811 | error = swap_read_page(handle, page, NULL); /* sync */ | 826 | error = swap_read_page(handle, page[i], &bio); |
812 | if (error) | 827 | if (error) |
813 | goto out_finish; | 828 | goto out_finish; |
829 | } | ||
814 | 830 | ||
815 | memcpy(cmp + off, page, PAGE_SIZE); | 831 | error = hib_wait_on_bio_chain(&bio); /* need all data now */ |
832 | if (error) | ||
833 | goto out_finish; | ||
834 | |||
835 | for (off = 0, i = 0; | ||
836 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
837 | memcpy(cmp + off, page[i], PAGE_SIZE); | ||
816 | } | 838 | } |
817 | 839 | ||
818 | unc_len = LZO_UNC_SIZE; | 840 | unc_len = LZO_UNC_SIZE; |
@@ -857,7 +879,8 @@ out_finish: | |||
857 | 879 | ||
858 | vfree(cmp); | 880 | vfree(cmp); |
859 | vfree(unc); | 881 | vfree(unc); |
860 | free_page((unsigned long)page); | 882 | for (i = 0; i < LZO_CMP_PAGES; i++) |
883 | free_page((unsigned long)page[i]); | ||
861 | 884 | ||
862 | return error; | 885 | return error; |
863 | } | 886 | } |
diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877c..c36c3b9e8a8 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
137 | free_all_swap_pages(data->swap); | 137 | free_all_swap_pages(data->swap); |
138 | if (data->frozen) | 138 | if (data->frozen) |
139 | thaw_processes(); | 139 | thaw_processes(); |
140 | pm_notifier_call_chain(data->mode == O_WRONLY ? | 140 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
142 | atomic_inc(&snapshot_device_available); | 142 | atomic_inc(&snapshot_device_available); |
143 | 143 | ||
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
263 | case SNAPSHOT_UNFREEZE: | 263 | case SNAPSHOT_UNFREEZE: |
264 | if (!data->frozen || data->ready) | 264 | if (!data->frozen || data->ready) |
265 | break; | 265 | break; |
266 | pm_restore_gfp_mask(); | ||
266 | thaw_processes(); | 267 | thaw_processes(); |
267 | usermodehelper_enable(); | 268 | usermodehelper_enable(); |
268 | data->frozen = 0; | 269 | data->frozen = 0; |
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
275 | error = -EPERM; | 276 | error = -EPERM; |
276 | break; | 277 | break; |
277 | } | 278 | } |
279 | pm_restore_gfp_mask(); | ||
278 | error = hibernation_snapshot(data->platform_support); | 280 | error = hibernation_snapshot(data->platform_support); |
279 | if (!error) | 281 | if (!error) |
280 | error = put_user(in_suspend, (int __user *)arg); | 282 | error = put_user(in_suspend, (int __user *)arg); |
diff --git a/kernel/printk.c b/kernel/printk.c index 9a2264fc42c..4642a5c439e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -43,12 +43,6 @@ | |||
43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
44 | 44 | ||
45 | /* | 45 | /* |
46 | * for_each_console() allows you to iterate on each console | ||
47 | */ | ||
48 | #define for_each_console(con) \ | ||
49 | for (con = console_drivers; con != NULL; con = con->next) | ||
50 | |||
51 | /* | ||
52 | * Architectures can override it: | 46 | * Architectures can override it: |
53 | */ | 47 | */ |
54 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | 48 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) |
@@ -1074,21 +1068,23 @@ static DEFINE_PER_CPU(int, printk_pending); | |||
1074 | 1068 | ||
1075 | void printk_tick(void) | 1069 | void printk_tick(void) |
1076 | { | 1070 | { |
1077 | if (__get_cpu_var(printk_pending)) { | 1071 | if (__this_cpu_read(printk_pending)) { |
1078 | __get_cpu_var(printk_pending) = 0; | 1072 | __this_cpu_write(printk_pending, 0); |
1079 | wake_up_interruptible(&log_wait); | 1073 | wake_up_interruptible(&log_wait); |
1080 | } | 1074 | } |
1081 | } | 1075 | } |
1082 | 1076 | ||
1083 | int printk_needs_cpu(int cpu) | 1077 | int printk_needs_cpu(int cpu) |
1084 | { | 1078 | { |
1085 | return per_cpu(printk_pending, cpu); | 1079 | if (cpu_is_offline(cpu)) |
1080 | printk_tick(); | ||
1081 | return __this_cpu_read(printk_pending); | ||
1086 | } | 1082 | } |
1087 | 1083 | ||
1088 | void wake_up_klogd(void) | 1084 | void wake_up_klogd(void) |
1089 | { | 1085 | { |
1090 | if (waitqueue_active(&log_wait)) | 1086 | if (waitqueue_active(&log_wait)) |
1091 | __raw_get_cpu_var(printk_pending) = 1; | 1087 | this_cpu_write(printk_pending, 1); |
1092 | } | 1088 | } |
1093 | 1089 | ||
1094 | /** | 1090 | /** |
@@ -1357,6 +1353,7 @@ void register_console(struct console *newcon) | |||
1357 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1353 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1358 | } | 1354 | } |
1359 | release_console_sem(); | 1355 | release_console_sem(); |
1356 | console_sysfs_notify(); | ||
1360 | 1357 | ||
1361 | /* | 1358 | /* |
1362 | * By unregistering the bootconsoles after we enable the real console | 1359 | * By unregistering the bootconsoles after we enable the real console |
@@ -1415,6 +1412,7 @@ int unregister_console(struct console *console) | |||
1415 | console_drivers->flags |= CON_CONSDEV; | 1412 | console_drivers->flags |= CON_CONSDEV; |
1416 | 1413 | ||
1417 | release_console_sem(); | 1414 | release_console_sem(); |
1415 | console_sysfs_notify(); | ||
1418 | return res; | 1416 | return res; |
1419 | } | 1417 | } |
1420 | EXPORT_SYMBOL(unregister_console); | 1418 | EXPORT_SYMBOL(unregister_console); |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d806735342a..03449372474 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -36,31 +36,16 @@ | |||
36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
38 | 38 | ||
39 | /* Global control variables for rcupdate callback mechanism. */ | 39 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ |
40 | struct rcu_ctrlblk { | 40 | static struct task_struct *rcu_kthread_task; |
41 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | 41 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); |
42 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | 42 | static unsigned long have_rcu_kthread_work; |
43 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | 43 | static void invoke_rcu_kthread(void); |
44 | }; | ||
45 | |||
46 | /* Definition for rcupdate control block. */ | ||
47 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
48 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
49 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
50 | }; | ||
51 | |||
52 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
53 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
54 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
55 | }; | ||
56 | |||
57 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
58 | int rcu_scheduler_active __read_mostly; | ||
59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
61 | 44 | ||
62 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
63 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 46 | struct rcu_ctrlblk; |
47 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
48 | static int rcu_kthread(void *arg); | ||
64 | static void __call_rcu(struct rcu_head *head, | 49 | static void __call_rcu(struct rcu_head *head, |
65 | void (*func)(struct rcu_head *rcu), | 50 | void (*func)(struct rcu_head *rcu), |
66 | struct rcu_ctrlblk *rcp); | 51 | struct rcu_ctrlblk *rcp); |
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu) | |||
123 | { | 108 | { |
124 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
125 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
126 | raise_softirq(RCU_SOFTIRQ); | 111 | invoke_rcu_kthread(); |
127 | } | 112 | } |
128 | 113 | ||
129 | /* | 114 | /* |
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu) | |||
132 | void rcu_bh_qs(int cpu) | 117 | void rcu_bh_qs(int cpu) |
133 | { | 118 | { |
134 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
135 | raise_softirq(RCU_SOFTIRQ); | 120 | invoke_rcu_kthread(); |
136 | } | 121 | } |
137 | 122 | ||
138 | /* | 123 | /* |
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user) | |||
152 | } | 137 | } |
153 | 138 | ||
154 | /* | 139 | /* |
155 | * Helper function for rcu_process_callbacks() that operates on the | 140 | * Invoke the RCU callbacks on the specified rcu_ctrlblk structure |
156 | * specified rcu_ctrlkblk structure. | 141 | * whose grace period has elapsed. |
157 | */ | 142 | */ |
158 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 143 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
159 | { | 144 | { |
160 | struct rcu_head *next, *list; | 145 | struct rcu_head *next, *list; |
161 | unsigned long flags; | 146 | unsigned long flags; |
147 | RCU_TRACE(int cb_count = 0); | ||
162 | 148 | ||
163 | /* If no RCU callbacks ready to invoke, just return. */ | 149 | /* If no RCU callbacks ready to invoke, just return. */ |
164 | if (&rcp->rcucblist == rcp->donetail) | 150 | if (&rcp->rcucblist == rcp->donetail) |
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
180 | next = list->next; | 166 | next = list->next; |
181 | prefetch(next); | 167 | prefetch(next); |
182 | debug_rcu_head_unqueue(list); | 168 | debug_rcu_head_unqueue(list); |
169 | local_bh_disable(); | ||
183 | list->func(list); | 170 | list->func(list); |
171 | local_bh_enable(); | ||
184 | list = next; | 172 | list = next; |
173 | RCU_TRACE(cb_count++); | ||
185 | } | 174 | } |
175 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | ||
186 | } | 176 | } |
187 | 177 | ||
188 | /* | 178 | /* |
189 | * Invoke any callbacks whose grace period has completed. | 179 | * This kthread invokes RCU callbacks whose grace periods have |
180 | * elapsed. It is awakened as needed, and takes the place of the | ||
181 | * RCU_SOFTIRQ that was used previously for this purpose. | ||
182 | * This is a kthread, but it is never stopped, at least not until | ||
183 | * the system goes down. | ||
190 | */ | 184 | */ |
191 | static void rcu_process_callbacks(struct softirq_action *unused) | 185 | static int rcu_kthread(void *arg) |
192 | { | 186 | { |
193 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 187 | unsigned long work; |
194 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 188 | unsigned long morework; |
195 | rcu_preempt_process_callbacks(); | 189 | unsigned long flags; |
190 | |||
191 | for (;;) { | ||
192 | wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); | ||
193 | morework = rcu_boost(); | ||
194 | local_irq_save(flags); | ||
195 | work = have_rcu_kthread_work; | ||
196 | have_rcu_kthread_work = morework; | ||
197 | local_irq_restore(flags); | ||
198 | if (work) { | ||
199 | rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
200 | rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
201 | rcu_preempt_process_callbacks(); | ||
202 | } | ||
203 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
204 | } | ||
205 | |||
206 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
211 | * or to boost readers. | ||
212 | */ | ||
213 | static void invoke_rcu_kthread(void) | ||
214 | { | ||
215 | unsigned long flags; | ||
216 | |||
217 | local_irq_save(flags); | ||
218 | have_rcu_kthread_work = 1; | ||
219 | wake_up(&rcu_kthread_wq); | ||
220 | local_irq_restore(flags); | ||
196 | } | 221 | } |
197 | 222 | ||
198 | /* | 223 | /* |
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head, | |||
230 | local_irq_save(flags); | 255 | local_irq_save(flags); |
231 | *rcp->curtail = head; | 256 | *rcp->curtail = head; |
232 | rcp->curtail = &head->next; | 257 | rcp->curtail = &head->next; |
258 | RCU_TRACE(rcp->qlen++); | ||
233 | local_irq_restore(flags); | 259 | local_irq_restore(flags); |
234 | } | 260 | } |
235 | 261 | ||
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void) | |||
282 | } | 308 | } |
283 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 309 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
284 | 310 | ||
285 | void __init rcu_init(void) | 311 | /* |
312 | * Spawn the kthread that invokes RCU callbacks. | ||
313 | */ | ||
314 | static int __init rcu_spawn_kthreads(void) | ||
286 | { | 315 | { |
287 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 316 | struct sched_param sp; |
317 | |||
318 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
319 | sp.sched_priority = RCU_BOOST_PRIO; | ||
320 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
321 | return 0; | ||
288 | } | 322 | } |
323 | early_initcall(rcu_spawn_kthreads); | ||
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 6ceca4f745f..015abaea962 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -22,6 +22,40 @@ | |||
22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/kthread.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/seq_file.h> | ||
28 | |||
29 | #ifdef CONFIG_RCU_TRACE | ||
30 | #define RCU_TRACE(stmt) stmt | ||
31 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
32 | #define RCU_TRACE(stmt) | ||
33 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
34 | |||
35 | /* Global control variables for rcupdate callback mechanism. */ | ||
36 | struct rcu_ctrlblk { | ||
37 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
38 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
39 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
40 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | ||
41 | }; | ||
42 | |||
43 | /* Definition for rcupdate control block. */ | ||
44 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
45 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
46 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
47 | }; | ||
48 | |||
49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
52 | }; | ||
53 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
55 | int rcu_scheduler_active __read_mostly; | ||
56 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
57 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
58 | |||
25 | #ifdef CONFIG_TINY_PREEMPT_RCU | 59 | #ifdef CONFIG_TINY_PREEMPT_RCU |
26 | 60 | ||
27 | #include <linux/delay.h> | 61 | #include <linux/delay.h> |
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk { | |||
46 | struct list_head *gp_tasks; | 80 | struct list_head *gp_tasks; |
47 | /* Pointer to the first task blocking the */ | 81 | /* Pointer to the first task blocking the */ |
48 | /* current grace period, or NULL if there */ | 82 | /* current grace period, or NULL if there */ |
49 | /* is not such task. */ | 83 | /* is no such task. */ |
50 | struct list_head *exp_tasks; | 84 | struct list_head *exp_tasks; |
51 | /* Pointer to first task blocking the */ | 85 | /* Pointer to first task blocking the */ |
52 | /* current expedited grace period, or NULL */ | 86 | /* current expedited grace period, or NULL */ |
53 | /* if there is no such task. If there */ | 87 | /* if there is no such task. If there */ |
54 | /* is no current expedited grace period, */ | 88 | /* is no current expedited grace period, */ |
55 | /* then there cannot be any such task. */ | 89 | /* then there cannot be any such task. */ |
90 | #ifdef CONFIG_RCU_BOOST | ||
91 | struct list_head *boost_tasks; | ||
92 | /* Pointer to first task that needs to be */ | ||
93 | /* priority-boosted, or NULL if no priority */ | ||
94 | /* boosting is needed. If there is no */ | ||
95 | /* current or expedited grace period, there */ | ||
96 | /* can be no such task. */ | ||
97 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
56 | u8 gpnum; /* Current grace period. */ | 98 | u8 gpnum; /* Current grace period. */ |
57 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | 99 | u8 gpcpu; /* Last grace period blocked by the CPU. */ |
58 | u8 completed; /* Last grace period completed. */ | 100 | u8 completed; /* Last grace period completed. */ |
59 | /* If all three are equal, RCU is idle. */ | 101 | /* If all three are equal, RCU is idle. */ |
102 | #ifdef CONFIG_RCU_BOOST | ||
103 | s8 boosted_this_gp; /* Has boosting already happened? */ | ||
104 | unsigned long boost_time; /* When to start boosting (jiffies) */ | ||
105 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
106 | #ifdef CONFIG_RCU_TRACE | ||
107 | unsigned long n_grace_periods; | ||
108 | #ifdef CONFIG_RCU_BOOST | ||
109 | unsigned long n_tasks_boosted; | ||
110 | unsigned long n_exp_boosts; | ||
111 | unsigned long n_normal_boosts; | ||
112 | unsigned long n_normal_balk_blkd_tasks; | ||
113 | unsigned long n_normal_balk_gp_tasks; | ||
114 | unsigned long n_normal_balk_boost_tasks; | ||
115 | unsigned long n_normal_balk_boosted; | ||
116 | unsigned long n_normal_balk_notyet; | ||
117 | unsigned long n_normal_balk_nos; | ||
118 | unsigned long n_exp_balk_blkd_tasks; | ||
119 | unsigned long n_exp_balk_nos; | ||
120 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
121 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
60 | }; | 122 | }; |
61 | 123 | ||
62 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | 124 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { |
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void) | |||
122 | } | 184 | } |
123 | 185 | ||
124 | /* | 186 | /* |
187 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | ||
188 | * returning NULL if at the end of the list. | ||
189 | */ | ||
190 | static struct list_head *rcu_next_node_entry(struct task_struct *t) | ||
191 | { | ||
192 | struct list_head *np; | ||
193 | |||
194 | np = t->rcu_node_entry.next; | ||
195 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
196 | np = NULL; | ||
197 | return np; | ||
198 | } | ||
199 | |||
200 | #ifdef CONFIG_RCU_TRACE | ||
201 | |||
202 | #ifdef CONFIG_RCU_BOOST | ||
203 | static void rcu_initiate_boost_trace(void); | ||
204 | static void rcu_initiate_exp_boost_trace(void); | ||
205 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
206 | |||
207 | /* | ||
208 | * Dump additional statistics for TINY_PREEMPT_RCU. | ||
209 | */ | ||
210 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
211 | { | ||
212 | seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", | ||
213 | rcu_preempt_ctrlblk.rcb.qlen, | ||
214 | rcu_preempt_ctrlblk.n_grace_periods, | ||
215 | rcu_preempt_ctrlblk.gpnum, | ||
216 | rcu_preempt_ctrlblk.gpcpu, | ||
217 | rcu_preempt_ctrlblk.completed, | ||
218 | "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], | ||
219 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | ||
220 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | ||
221 | #ifdef CONFIG_RCU_BOOST | ||
222 | seq_printf(m, " ttb=%c btg=", | ||
223 | "B."[!rcu_preempt_ctrlblk.boost_tasks]); | ||
224 | switch (rcu_preempt_ctrlblk.boosted_this_gp) { | ||
225 | case -1: | ||
226 | seq_puts(m, "exp"); | ||
227 | break; | ||
228 | case 0: | ||
229 | seq_puts(m, "no"); | ||
230 | break; | ||
231 | case 1: | ||
232 | seq_puts(m, "begun"); | ||
233 | break; | ||
234 | case 2: | ||
235 | seq_puts(m, "done"); | ||
236 | break; | ||
237 | default: | ||
238 | seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | ||
239 | } | ||
240 | seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
241 | rcu_preempt_ctrlblk.n_tasks_boosted, | ||
242 | rcu_preempt_ctrlblk.n_exp_boosts, | ||
243 | rcu_preempt_ctrlblk.n_normal_boosts, | ||
244 | (int)(jiffies & 0xffff), | ||
245 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | ||
246 | seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | ||
247 | "normal balk", | ||
248 | rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | ||
249 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | ||
250 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | ||
251 | rcu_preempt_ctrlblk.n_normal_balk_boosted, | ||
252 | rcu_preempt_ctrlblk.n_normal_balk_notyet, | ||
253 | rcu_preempt_ctrlblk.n_normal_balk_nos); | ||
254 | seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | ||
255 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | ||
256 | rcu_preempt_ctrlblk.n_exp_balk_nos); | ||
257 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
258 | } | ||
259 | |||
260 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
261 | |||
262 | #ifdef CONFIG_RCU_BOOST | ||
263 | |||
264 | #include "rtmutex_common.h" | ||
265 | |||
266 | /* | ||
267 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | ||
268 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | ||
269 | */ | ||
270 | static int rcu_boost(void) | ||
271 | { | ||
272 | unsigned long flags; | ||
273 | struct rt_mutex mtx; | ||
274 | struct list_head *np; | ||
275 | struct task_struct *t; | ||
276 | |||
277 | if (rcu_preempt_ctrlblk.boost_tasks == NULL) | ||
278 | return 0; /* Nothing to boost. */ | ||
279 | raw_local_irq_save(flags); | ||
280 | rcu_preempt_ctrlblk.boosted_this_gp++; | ||
281 | t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | ||
282 | rcu_node_entry); | ||
283 | np = rcu_next_node_entry(t); | ||
284 | rt_mutex_init_proxy_locked(&mtx, t); | ||
285 | t->rcu_boost_mutex = &mtx; | ||
286 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
287 | raw_local_irq_restore(flags); | ||
288 | rt_mutex_lock(&mtx); | ||
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
290 | rcu_preempt_ctrlblk.boosted_this_gp++; | ||
291 | rt_mutex_unlock(&mtx); | ||
292 | return rcu_preempt_ctrlblk.boost_tasks != NULL; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * Check to see if it is now time to start boosting RCU readers blocking | ||
297 | * the current grace period, and, if so, tell the rcu_kthread_task to | ||
298 | * start boosting them. If there is an expedited boost in progress, | ||
299 | * we wait for it to complete. | ||
300 | * | ||
301 | * If there are no blocked readers blocking the current grace period, | ||
302 | * return 0 to let the caller know, otherwise return 1. Note that this | ||
303 | * return value is independent of whether or not boosting was done. | ||
304 | */ | ||
305 | static int rcu_initiate_boost(void) | ||
306 | { | ||
307 | if (!rcu_preempt_blocked_readers_cgp()) { | ||
308 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | ||
309 | return 0; | ||
310 | } | ||
311 | if (rcu_preempt_ctrlblk.gp_tasks != NULL && | ||
312 | rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
313 | rcu_preempt_ctrlblk.boosted_this_gp == 0 && | ||
314 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | ||
315 | rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | ||
316 | invoke_rcu_kthread(); | ||
317 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
318 | } else | ||
319 | RCU_TRACE(rcu_initiate_boost_trace()); | ||
320 | return 1; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Initiate boosting for an expedited grace period. | ||
325 | */ | ||
326 | static void rcu_initiate_expedited_boost(void) | ||
327 | { | ||
328 | unsigned long flags; | ||
329 | |||
330 | raw_local_irq_save(flags); | ||
331 | if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | ||
332 | rcu_preempt_ctrlblk.boost_tasks = | ||
333 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
334 | rcu_preempt_ctrlblk.boosted_this_gp = -1; | ||
335 | invoke_rcu_kthread(); | ||
336 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
337 | } else | ||
338 | RCU_TRACE(rcu_initiate_exp_boost_trace()); | ||
339 | raw_local_irq_restore(flags); | ||
340 | } | ||
341 | |||
342 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); | ||
343 | |||
344 | /* | ||
345 | * Do priority-boost accounting for the start of a new grace period. | ||
346 | */ | ||
347 | static void rcu_preempt_boost_start_gp(void) | ||
348 | { | ||
349 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
350 | if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | ||
351 | rcu_preempt_ctrlblk.boosted_this_gp = 0; | ||
352 | } | ||
353 | |||
354 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
355 | |||
356 | /* | ||
357 | * If there is no RCU priority boosting, we don't boost. | ||
358 | */ | ||
359 | static int rcu_boost(void) | ||
360 | { | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * If there is no RCU priority boosting, we don't initiate boosting, | ||
366 | * but we do indicate whether there are blocked readers blocking the | ||
367 | * current grace period. | ||
368 | */ | ||
369 | static int rcu_initiate_boost(void) | ||
370 | { | ||
371 | return rcu_preempt_blocked_readers_cgp(); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * If there is no RCU priority boosting, we don't initiate expedited boosting. | ||
376 | */ | ||
377 | static void rcu_initiate_expedited_boost(void) | ||
378 | { | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * If there is no RCU priority boosting, nothing to do at grace-period start. | ||
383 | */ | ||
384 | static void rcu_preempt_boost_start_gp(void) | ||
385 | { | ||
386 | } | ||
387 | |||
388 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
389 | |||
390 | /* | ||
125 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 391 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
126 | * that this just means that the task currently running on the CPU is | 392 | * that this just means that the task currently running on the CPU is |
127 | * in a quiescent state. There might be any number of tasks blocked | 393 | * in a quiescent state. There might be any number of tasks blocked |
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void) | |||
148 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | 414 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; |
149 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 415 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
150 | 416 | ||
417 | /* If there is no GP then there is nothing more to do. */ | ||
418 | if (!rcu_preempt_gp_in_progress()) | ||
419 | return; | ||
151 | /* | 420 | /* |
152 | * If there is no GP, or if blocked readers are still blocking GP, | 421 | * Check up on boosting. If there are no readers blocking the |
153 | * then there is nothing more to do. | 422 | * current grace period, leave. |
154 | */ | 423 | */ |
155 | if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | 424 | if (rcu_initiate_boost()) |
156 | return; | 425 | return; |
157 | 426 | ||
158 | /* Advance callbacks. */ | 427 | /* Advance callbacks. */ |
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void) | |||
164 | if (!rcu_preempt_blocked_readers_any()) | 433 | if (!rcu_preempt_blocked_readers_any()) |
165 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | 434 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; |
166 | 435 | ||
167 | /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | 436 | /* If there are done callbacks, cause them to be invoked. */ |
168 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | 437 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) |
169 | raise_softirq(RCU_SOFTIRQ); | 438 | invoke_rcu_kthread(); |
170 | } | 439 | } |
171 | 440 | ||
172 | /* | 441 | /* |
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void) | |||
178 | 447 | ||
179 | /* Official start of GP. */ | 448 | /* Official start of GP. */ |
180 | rcu_preempt_ctrlblk.gpnum++; | 449 | rcu_preempt_ctrlblk.gpnum++; |
450 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | ||
181 | 451 | ||
182 | /* Any blocked RCU readers block new GP. */ | 452 | /* Any blocked RCU readers block new GP. */ |
183 | if (rcu_preempt_blocked_readers_any()) | 453 | if (rcu_preempt_blocked_readers_any()) |
184 | rcu_preempt_ctrlblk.gp_tasks = | 454 | rcu_preempt_ctrlblk.gp_tasks = |
185 | rcu_preempt_ctrlblk.blkd_tasks.next; | 455 | rcu_preempt_ctrlblk.blkd_tasks.next; |
186 | 456 | ||
457 | /* Set up for RCU priority boosting. */ | ||
458 | rcu_preempt_boost_start_gp(); | ||
459 | |||
187 | /* If there is no running reader, CPU is done with GP. */ | 460 | /* If there is no running reader, CPU is done with GP. */ |
188 | if (!rcu_preempt_running_reader()) | 461 | if (!rcu_preempt_running_reader()) |
189 | rcu_preempt_cpu_qs(); | 462 | rcu_preempt_cpu_qs(); |
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
304 | */ | 577 | */ |
305 | empty = !rcu_preempt_blocked_readers_cgp(); | 578 | empty = !rcu_preempt_blocked_readers_cgp(); |
306 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | 579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
307 | np = t->rcu_node_entry.next; | 580 | np = rcu_next_node_entry(t); |
308 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
309 | np = NULL; | ||
310 | list_del(&t->rcu_node_entry); | 581 | list_del(&t->rcu_node_entry); |
311 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | 582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
312 | rcu_preempt_ctrlblk.gp_tasks = np; | 583 | rcu_preempt_ctrlblk.gp_tasks = np; |
313 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | 584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
314 | rcu_preempt_ctrlblk.exp_tasks = np; | 585 | rcu_preempt_ctrlblk.exp_tasks = np; |
586 | #ifdef CONFIG_RCU_BOOST | ||
587 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | ||
588 | rcu_preempt_ctrlblk.boost_tasks = np; | ||
589 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
315 | INIT_LIST_HEAD(&t->rcu_node_entry); | 590 | INIT_LIST_HEAD(&t->rcu_node_entry); |
316 | 591 | ||
317 | /* | 592 | /* |
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
331 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | 606 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) |
332 | rcu_report_exp_done(); | 607 | rcu_report_exp_done(); |
333 | } | 608 | } |
609 | #ifdef CONFIG_RCU_BOOST | ||
610 | /* Unboost self if was boosted. */ | ||
611 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
612 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | ||
613 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
614 | t->rcu_boost_mutex = NULL; | ||
615 | } | ||
616 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
334 | local_irq_restore(flags); | 617 | local_irq_restore(flags); |
335 | } | 618 | } |
336 | 619 | ||
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void) | |||
374 | rcu_preempt_cpu_qs(); | 657 | rcu_preempt_cpu_qs(); |
375 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | 658 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != |
376 | rcu_preempt_ctrlblk.rcb.donetail) | 659 | rcu_preempt_ctrlblk.rcb.donetail) |
377 | raise_softirq(RCU_SOFTIRQ); | 660 | invoke_rcu_kthread(); |
378 | if (rcu_preempt_gp_in_progress() && | 661 | if (rcu_preempt_gp_in_progress() && |
379 | rcu_cpu_blocking_cur_gp() && | 662 | rcu_cpu_blocking_cur_gp() && |
380 | rcu_preempt_running_reader()) | 663 | rcu_preempt_running_reader()) |
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void) | |||
383 | 666 | ||
384 | /* | 667 | /* |
385 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | 668 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to |
386 | * update, so this is invoked from __rcu_process_callbacks() to | 669 | * update, so this is invoked from rcu_process_callbacks() to |
387 | * handle that case. Of course, it is invoked for all flavors of | 670 | * handle that case. Of course, it is invoked for all flavors of |
388 | * RCU, but RCU callbacks can appear only on one of the lists, and | 671 | * RCU, but RCU callbacks can appear only on one of the lists, and |
389 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | 672 | * neither ->nexttail nor ->donetail can possibly be NULL, so there |
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | |||
400 | */ | 683 | */ |
401 | static void rcu_preempt_process_callbacks(void) | 684 | static void rcu_preempt_process_callbacks(void) |
402 | { | 685 | { |
403 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | 686 | rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); |
404 | } | 687 | } |
405 | 688 | ||
406 | /* | 689 | /* |
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
417 | local_irq_save(flags); | 700 | local_irq_save(flags); |
418 | *rcu_preempt_ctrlblk.nexttail = head; | 701 | *rcu_preempt_ctrlblk.nexttail = head; |
419 | rcu_preempt_ctrlblk.nexttail = &head->next; | 702 | rcu_preempt_ctrlblk.nexttail = &head->next; |
703 | RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); | ||
420 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | 704 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ |
421 | local_irq_restore(flags); | 705 | local_irq_restore(flags); |
422 | } | 706 | } |
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void) | |||
532 | 816 | ||
533 | /* Wait for tail of ->blkd_tasks list to drain. */ | 817 | /* Wait for tail of ->blkd_tasks list to drain. */ |
534 | if (rcu_preempted_readers_exp()) | 818 | if (rcu_preempted_readers_exp()) |
819 | rcu_initiate_expedited_boost(); | ||
535 | wait_event(sync_rcu_preempt_exp_wq, | 820 | wait_event(sync_rcu_preempt_exp_wq, |
536 | !rcu_preempted_readers_exp()); | 821 | !rcu_preempted_readers_exp()); |
537 | 822 | ||
@@ -572,6 +857,27 @@ void exit_rcu(void) | |||
572 | 857 | ||
573 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 858 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
574 | 859 | ||
860 | #ifdef CONFIG_RCU_TRACE | ||
861 | |||
862 | /* | ||
863 | * Because preemptible RCU does not exist, it is not necessary to | ||
864 | * dump out its statistics. | ||
865 | */ | ||
866 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
867 | { | ||
868 | } | ||
869 | |||
870 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
871 | |||
872 | /* | ||
873 | * Because preemptible RCU does not exist, it is never necessary to | ||
874 | * boost preempted RCU readers. | ||
875 | */ | ||
876 | static int rcu_boost(void) | ||
877 | { | ||
878 | return 0; | ||
879 | } | ||
880 | |||
575 | /* | 881 | /* |
576 | * Because preemptible RCU does not exist, it never has any callbacks | 882 | * Because preemptible RCU does not exist, it never has any callbacks |
577 | * to check. | 883 | * to check. |
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void) | |||
599 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | 905 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ |
600 | 906 | ||
601 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 907 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
602 | |||
603 | #include <linux/kernel_stat.h> | 908 | #include <linux/kernel_stat.h> |
604 | 909 | ||
605 | /* | 910 | /* |
606 | * During boot, we forgive RCU lockdep issues. After this function is | 911 | * During boot, we forgive RCU lockdep issues. After this function is |
607 | * invoked, we start taking RCU lockdep issues seriously. | 912 | * invoked, we start taking RCU lockdep issues seriously. |
608 | */ | 913 | */ |
609 | void rcu_scheduler_starting(void) | 914 | void __init rcu_scheduler_starting(void) |
610 | { | 915 | { |
611 | WARN_ON(nr_context_switches() > 0); | 916 | WARN_ON(nr_context_switches() > 0); |
612 | rcu_scheduler_active = 1; | 917 | rcu_scheduler_active = 1; |
613 | } | 918 | } |
614 | 919 | ||
615 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 920 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
921 | |||
922 | #ifdef CONFIG_RCU_BOOST | ||
923 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
924 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
925 | #define RCU_BOOST_PRIO 1 | ||
926 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
927 | |||
928 | #ifdef CONFIG_RCU_TRACE | ||
929 | |||
930 | #ifdef CONFIG_RCU_BOOST | ||
931 | |||
932 | static void rcu_initiate_boost_trace(void) | ||
933 | { | ||
934 | if (rcu_preempt_ctrlblk.gp_tasks == NULL) | ||
935 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | ||
936 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | ||
937 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | ||
938 | else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | ||
939 | rcu_preempt_ctrlblk.n_normal_balk_boosted++; | ||
940 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | ||
941 | rcu_preempt_ctrlblk.n_normal_balk_notyet++; | ||
942 | else | ||
943 | rcu_preempt_ctrlblk.n_normal_balk_nos++; | ||
944 | } | ||
945 | |||
946 | static void rcu_initiate_exp_boost_trace(void) | ||
947 | { | ||
948 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
949 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | ||
950 | else | ||
951 | rcu_preempt_ctrlblk.n_exp_balk_nos++; | ||
952 | } | ||
953 | |||
954 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
955 | |||
956 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | ||
957 | { | ||
958 | unsigned long flags; | ||
959 | |||
960 | raw_local_irq_save(flags); | ||
961 | rcp->qlen -= n; | ||
962 | raw_local_irq_restore(flags); | ||
963 | } | ||
964 | |||
965 | /* | ||
966 | * Dump statistics for TINY_RCU, such as they are. | ||
967 | */ | ||
968 | static int show_tiny_stats(struct seq_file *m, void *unused) | ||
969 | { | ||
970 | show_tiny_preempt_stats(m); | ||
971 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | ||
972 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | ||
973 | return 0; | ||
974 | } | ||
975 | |||
976 | static int show_tiny_stats_open(struct inode *inode, struct file *file) | ||
977 | { | ||
978 | return single_open(file, show_tiny_stats, NULL); | ||
979 | } | ||
980 | |||
981 | static const struct file_operations show_tiny_stats_fops = { | ||
982 | .owner = THIS_MODULE, | ||
983 | .open = show_tiny_stats_open, | ||
984 | .read = seq_read, | ||
985 | .llseek = seq_lseek, | ||
986 | .release = single_release, | ||
987 | }; | ||
988 | |||
989 | static struct dentry *rcudir; | ||
990 | |||
991 | static int __init rcutiny_trace_init(void) | ||
992 | { | ||
993 | struct dentry *retval; | ||
994 | |||
995 | rcudir = debugfs_create_dir("rcu", NULL); | ||
996 | if (!rcudir) | ||
997 | goto free_out; | ||
998 | retval = debugfs_create_file("rcudata", 0444, rcudir, | ||
999 | NULL, &show_tiny_stats_fops); | ||
1000 | if (!retval) | ||
1001 | goto free_out; | ||
1002 | return 0; | ||
1003 | free_out: | ||
1004 | debugfs_remove_recursive(rcudir); | ||
1005 | return 1; | ||
1006 | } | ||
1007 | |||
1008 | static void __exit rcutiny_trace_cleanup(void) | ||
1009 | { | ||
1010 | debugfs_remove_recursive(rcudir); | ||
1011 | } | ||
1012 | |||
1013 | module_init(rcutiny_trace_init); | ||
1014 | module_exit(rcutiny_trace_cleanup); | ||
1015 | |||
1016 | MODULE_AUTHOR("Paul E. McKenney"); | ||
1017 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | ||
1018 | MODULE_LICENSE("GPL"); | ||
1019 | |||
1020 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9d8e8fb2515..89613f97ff2 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
50 | #include <linux/sched.h> | ||
50 | 51 | ||
51 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 65 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 66 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
68 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
69 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
70 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 71 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
68 | 72 | ||
69 | module_param(nreaders, int, 0444); | 73 | module_param(nreaders, int, 0444); |
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444); | |||
88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 92 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
89 | module_param(fqs_stutter, int, 0444); | 93 | module_param(fqs_stutter, int, 0444); |
90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 94 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
95 | module_param(test_boost, int, 0444); | ||
96 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | ||
97 | module_param(test_boost_interval, int, 0444); | ||
98 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
99 | module_param(test_boost_duration, int, 0444); | ||
100 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | ||
91 | module_param(torture_type, charp, 0444); | 101 | module_param(torture_type, charp, 0444); |
92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 102 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
93 | 103 | ||
@@ -109,6 +119,7 @@ static struct task_struct *stats_task; | |||
109 | static struct task_struct *shuffler_task; | 119 | static struct task_struct *shuffler_task; |
110 | static struct task_struct *stutter_task; | 120 | static struct task_struct *stutter_task; |
111 | static struct task_struct *fqs_task; | 121 | static struct task_struct *fqs_task; |
122 | static struct task_struct *boost_tasks[NR_CPUS]; | ||
112 | 123 | ||
113 | #define RCU_TORTURE_PIPE_LEN 10 | 124 | #define RCU_TORTURE_PIPE_LEN 10 |
114 | 125 | ||
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
134 | static atomic_t n_rcu_torture_free; | 145 | static atomic_t n_rcu_torture_free; |
135 | static atomic_t n_rcu_torture_mberror; | 146 | static atomic_t n_rcu_torture_mberror; |
136 | static atomic_t n_rcu_torture_error; | 147 | static atomic_t n_rcu_torture_error; |
148 | static long n_rcu_torture_boost_ktrerror; | ||
149 | static long n_rcu_torture_boost_rterror; | ||
150 | static long n_rcu_torture_boost_allocerror; | ||
151 | static long n_rcu_torture_boost_afferror; | ||
152 | static long n_rcu_torture_boost_failure; | ||
153 | static long n_rcu_torture_boosts; | ||
137 | static long n_rcu_torture_timers; | 154 | static long n_rcu_torture_timers; |
138 | static struct list_head rcu_torture_removed; | 155 | static struct list_head rcu_torture_removed; |
139 | static cpumask_var_t shuffle_tmp_mask; | 156 | static cpumask_var_t shuffle_tmp_mask; |
@@ -147,6 +164,16 @@ static int stutter_pause_test; | |||
147 | #endif | 164 | #endif |
148 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 165 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
149 | 166 | ||
167 | #ifdef CONFIG_RCU_BOOST | ||
168 | #define rcu_can_boost() 1 | ||
169 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
170 | #define rcu_can_boost() 0 | ||
171 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
172 | |||
173 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | ||
174 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | ||
175 | /* and boost task create/destroy. */ | ||
176 | |||
150 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 177 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
151 | 178 | ||
152 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ | 179 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ |
@@ -277,6 +304,7 @@ struct rcu_torture_ops { | |||
277 | void (*fqs)(void); | 304 | void (*fqs)(void); |
278 | int (*stats)(char *page); | 305 | int (*stats)(char *page); |
279 | int irq_capable; | 306 | int irq_capable; |
307 | int can_boost; | ||
280 | char *name; | 308 | char *name; |
281 | }; | 309 | }; |
282 | 310 | ||
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
366 | .fqs = rcu_force_quiescent_state, | 394 | .fqs = rcu_force_quiescent_state, |
367 | .stats = NULL, | 395 | .stats = NULL, |
368 | .irq_capable = 1, | 396 | .irq_capable = 1, |
397 | .can_boost = rcu_can_boost(), | ||
369 | .name = "rcu" | 398 | .name = "rcu" |
370 | }; | 399 | }; |
371 | 400 | ||
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
408 | .fqs = rcu_force_quiescent_state, | 437 | .fqs = rcu_force_quiescent_state, |
409 | .stats = NULL, | 438 | .stats = NULL, |
410 | .irq_capable = 1, | 439 | .irq_capable = 1, |
440 | .can_boost = rcu_can_boost(), | ||
411 | .name = "rcu_sync" | 441 | .name = "rcu_sync" |
412 | }; | 442 | }; |
413 | 443 | ||
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
424 | .fqs = rcu_force_quiescent_state, | 454 | .fqs = rcu_force_quiescent_state, |
425 | .stats = NULL, | 455 | .stats = NULL, |
426 | .irq_capable = 1, | 456 | .irq_capable = 1, |
457 | .can_boost = rcu_can_boost(), | ||
427 | .name = "rcu_expedited" | 458 | .name = "rcu_expedited" |
428 | }; | 459 | }; |
429 | 460 | ||
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
684 | }; | 715 | }; |
685 | 716 | ||
686 | /* | 717 | /* |
718 | * RCU torture priority-boost testing. Runs one real-time thread per | ||
719 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | ||
720 | * spinning waiting for them to be invoked. If a given callback takes | ||
721 | * too long to be invoked, we assume that priority inversion has occurred. | ||
722 | */ | ||
723 | |||
724 | struct rcu_boost_inflight { | ||
725 | struct rcu_head rcu; | ||
726 | int inflight; | ||
727 | }; | ||
728 | |||
729 | static void rcu_torture_boost_cb(struct rcu_head *head) | ||
730 | { | ||
731 | struct rcu_boost_inflight *rbip = | ||
732 | container_of(head, struct rcu_boost_inflight, rcu); | ||
733 | |||
734 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | ||
735 | rbip->inflight = 0; | ||
736 | } | ||
737 | |||
738 | static int rcu_torture_boost(void *arg) | ||
739 | { | ||
740 | unsigned long call_rcu_time; | ||
741 | unsigned long endtime; | ||
742 | unsigned long oldstarttime; | ||
743 | struct rcu_boost_inflight rbi = { .inflight = 0 }; | ||
744 | struct sched_param sp; | ||
745 | |||
746 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | ||
747 | |||
748 | /* Set real-time priority. */ | ||
749 | sp.sched_priority = 1; | ||
750 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | ||
751 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | ||
752 | n_rcu_torture_boost_rterror++; | ||
753 | } | ||
754 | |||
755 | /* Each pass through the following loop does one boost-test cycle. */ | ||
756 | do { | ||
757 | /* Wait for the next test interval. */ | ||
758 | oldstarttime = boost_starttime; | ||
759 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | ||
760 | schedule_timeout_uninterruptible(1); | ||
761 | rcu_stutter_wait("rcu_torture_boost"); | ||
762 | if (kthread_should_stop() || | ||
763 | fullstop != FULLSTOP_DONTSTOP) | ||
764 | goto checkwait; | ||
765 | } | ||
766 | |||
767 | /* Do one boost-test interval. */ | ||
768 | endtime = oldstarttime + test_boost_duration * HZ; | ||
769 | call_rcu_time = jiffies; | ||
770 | while (jiffies - endtime > ULONG_MAX / 2) { | ||
771 | /* If we don't have a callback in flight, post one. */ | ||
772 | if (!rbi.inflight) { | ||
773 | smp_mb(); /* RCU core before ->inflight = 1. */ | ||
774 | rbi.inflight = 1; | ||
775 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); | ||
776 | if (jiffies - call_rcu_time > | ||
777 | test_boost_duration * HZ - HZ / 2) { | ||
778 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | ||
779 | n_rcu_torture_boost_failure++; | ||
780 | } | ||
781 | call_rcu_time = jiffies; | ||
782 | } | ||
783 | cond_resched(); | ||
784 | rcu_stutter_wait("rcu_torture_boost"); | ||
785 | if (kthread_should_stop() || | ||
786 | fullstop != FULLSTOP_DONTSTOP) | ||
787 | goto checkwait; | ||
788 | } | ||
789 | |||
790 | /* | ||
791 | * Set the start time of the next test interval. | ||
792 | * Yes, this is vulnerable to long delays, but such | ||
793 | * delays simply cause a false negative for the next | ||
794 | * interval. Besides, we are running at RT priority, | ||
795 | * so delays should be relatively rare. | ||
796 | */ | ||
797 | while (oldstarttime == boost_starttime) { | ||
798 | if (mutex_trylock(&boost_mutex)) { | ||
799 | boost_starttime = jiffies + | ||
800 | test_boost_interval * HZ; | ||
801 | n_rcu_torture_boosts++; | ||
802 | mutex_unlock(&boost_mutex); | ||
803 | break; | ||
804 | } | ||
805 | schedule_timeout_uninterruptible(1); | ||
806 | } | ||
807 | |||
808 | /* Go do the stutter. */ | ||
809 | checkwait: rcu_stutter_wait("rcu_torture_boost"); | ||
810 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
811 | |||
812 | /* Clean up and exit. */ | ||
813 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | ||
814 | rcutorture_shutdown_absorb("rcu_torture_boost"); | ||
815 | while (!kthread_should_stop() || rbi.inflight) | ||
816 | schedule_timeout_uninterruptible(1); | ||
817 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | ||
818 | return 0; | ||
819 | } | ||
820 | |||
821 | /* | ||
687 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 822 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
688 | * bursts of calls to force_quiescent_state(), increasing the probability | 823 | * bursts of calls to force_quiescent_state(), increasing the probability |
689 | * of occurrence of some important types of race conditions. | 824 | * of occurrence of some important types of race conditions. |
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page) | |||
933 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1068 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
934 | cnt += sprintf(&page[cnt], | 1069 | cnt += sprintf(&page[cnt], |
935 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1070 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
936 | "rtmbe: %d nt: %ld", | 1071 | "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " |
1072 | "rtbf: %ld rtb: %ld nt: %ld", | ||
937 | rcu_torture_current, | 1073 | rcu_torture_current, |
938 | rcu_torture_current_version, | 1074 | rcu_torture_current_version, |
939 | list_empty(&rcu_torture_freelist), | 1075 | list_empty(&rcu_torture_freelist), |
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page) | |||
941 | atomic_read(&n_rcu_torture_alloc_fail), | 1077 | atomic_read(&n_rcu_torture_alloc_fail), |
942 | atomic_read(&n_rcu_torture_free), | 1078 | atomic_read(&n_rcu_torture_free), |
943 | atomic_read(&n_rcu_torture_mberror), | 1079 | atomic_read(&n_rcu_torture_mberror), |
1080 | n_rcu_torture_boost_ktrerror, | ||
1081 | n_rcu_torture_boost_rterror, | ||
1082 | n_rcu_torture_boost_allocerror, | ||
1083 | n_rcu_torture_boost_afferror, | ||
1084 | n_rcu_torture_boost_failure, | ||
1085 | n_rcu_torture_boosts, | ||
944 | n_rcu_torture_timers); | 1086 | n_rcu_torture_timers); |
945 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 1087 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1088 | n_rcu_torture_boost_ktrerror != 0 || | ||
1089 | n_rcu_torture_boost_rterror != 0 || | ||
1090 | n_rcu_torture_boost_allocerror != 0 || | ||
1091 | n_rcu_torture_boost_afferror != 0 || | ||
1092 | n_rcu_torture_boost_failure != 0) | ||
946 | cnt += sprintf(&page[cnt], " !!!"); | 1093 | cnt += sprintf(&page[cnt], " !!!"); |
947 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1094 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
948 | if (i > 1) { | 1095 | if (i > 1) { |
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg) | |||
1094 | } | 1241 | } |
1095 | 1242 | ||
1096 | static inline void | 1243 | static inline void |
1097 | rcu_torture_print_module_parms(char *tag) | 1244 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
1098 | { | 1245 | { |
1099 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1246 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1100 | "--- %s: nreaders=%d nfakewriters=%d " | 1247 | "--- %s: nreaders=%d nfakewriters=%d " |
1101 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1248 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1102 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1249 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1103 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | 1250 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1251 | "test_boost=%d/%d test_boost_interval=%d " | ||
1252 | "test_boost_duration=%d\n", | ||
1104 | torture_type, tag, nrealreaders, nfakewriters, | 1253 | torture_type, tag, nrealreaders, nfakewriters, |
1105 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1254 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1106 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); | 1255 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1256 | test_boost, cur_ops->can_boost, | ||
1257 | test_boost_interval, test_boost_duration); | ||
1107 | } | 1258 | } |
1108 | 1259 | ||
1109 | static struct notifier_block rcutorture_nb = { | 1260 | static struct notifier_block rcutorture_shutdown_nb = { |
1110 | .notifier_call = rcutorture_shutdown_notify, | 1261 | .notifier_call = rcutorture_shutdown_notify, |
1111 | }; | 1262 | }; |
1112 | 1263 | ||
1264 | static void rcutorture_booster_cleanup(int cpu) | ||
1265 | { | ||
1266 | struct task_struct *t; | ||
1267 | |||
1268 | if (boost_tasks[cpu] == NULL) | ||
1269 | return; | ||
1270 | mutex_lock(&boost_mutex); | ||
1271 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | ||
1272 | t = boost_tasks[cpu]; | ||
1273 | boost_tasks[cpu] = NULL; | ||
1274 | mutex_unlock(&boost_mutex); | ||
1275 | |||
1276 | /* This must be outside of the mutex, otherwise deadlock! */ | ||
1277 | kthread_stop(t); | ||
1278 | } | ||
1279 | |||
1280 | static int rcutorture_booster_init(int cpu) | ||
1281 | { | ||
1282 | int retval; | ||
1283 | |||
1284 | if (boost_tasks[cpu] != NULL) | ||
1285 | return 0; /* Already created, nothing more to do. */ | ||
1286 | |||
1287 | /* Don't allow time recalculation while creating a new task. */ | ||
1288 | mutex_lock(&boost_mutex); | ||
1289 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | ||
1290 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | ||
1291 | "rcu_torture_boost"); | ||
1292 | if (IS_ERR(boost_tasks[cpu])) { | ||
1293 | retval = PTR_ERR(boost_tasks[cpu]); | ||
1294 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | ||
1295 | n_rcu_torture_boost_ktrerror++; | ||
1296 | boost_tasks[cpu] = NULL; | ||
1297 | mutex_unlock(&boost_mutex); | ||
1298 | return retval; | ||
1299 | } | ||
1300 | kthread_bind(boost_tasks[cpu], cpu); | ||
1301 | wake_up_process(boost_tasks[cpu]); | ||
1302 | mutex_unlock(&boost_mutex); | ||
1303 | return 0; | ||
1304 | } | ||
1305 | |||
1306 | static int rcutorture_cpu_notify(struct notifier_block *self, | ||
1307 | unsigned long action, void *hcpu) | ||
1308 | { | ||
1309 | long cpu = (long)hcpu; | ||
1310 | |||
1311 | switch (action) { | ||
1312 | case CPU_ONLINE: | ||
1313 | case CPU_DOWN_FAILED: | ||
1314 | (void)rcutorture_booster_init(cpu); | ||
1315 | break; | ||
1316 | case CPU_DOWN_PREPARE: | ||
1317 | rcutorture_booster_cleanup(cpu); | ||
1318 | break; | ||
1319 | default: | ||
1320 | break; | ||
1321 | } | ||
1322 | return NOTIFY_OK; | ||
1323 | } | ||
1324 | |||
1325 | static struct notifier_block rcutorture_cpu_nb = { | ||
1326 | .notifier_call = rcutorture_cpu_notify, | ||
1327 | }; | ||
1328 | |||
1113 | static void | 1329 | static void |
1114 | rcu_torture_cleanup(void) | 1330 | rcu_torture_cleanup(void) |
1115 | { | 1331 | { |
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void) | |||
1127 | } | 1343 | } |
1128 | fullstop = FULLSTOP_RMMOD; | 1344 | fullstop = FULLSTOP_RMMOD; |
1129 | mutex_unlock(&fullstop_mutex); | 1345 | mutex_unlock(&fullstop_mutex); |
1130 | unregister_reboot_notifier(&rcutorture_nb); | 1346 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1131 | if (stutter_task) { | 1347 | if (stutter_task) { |
1132 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1348 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
1133 | kthread_stop(stutter_task); | 1349 | kthread_stop(stutter_task); |
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void) | |||
1184 | kthread_stop(fqs_task); | 1400 | kthread_stop(fqs_task); |
1185 | } | 1401 | } |
1186 | fqs_task = NULL; | 1402 | fqs_task = NULL; |
1403 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
1404 | test_boost == 2) { | ||
1405 | unregister_cpu_notifier(&rcutorture_cpu_nb); | ||
1406 | for_each_possible_cpu(i) | ||
1407 | rcutorture_booster_cleanup(i); | ||
1408 | } | ||
1187 | 1409 | ||
1188 | /* Wait for all RCU callbacks to fire. */ | 1410 | /* Wait for all RCU callbacks to fire. */ |
1189 | 1411 | ||
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void) | |||
1195 | if (cur_ops->cleanup) | 1417 | if (cur_ops->cleanup) |
1196 | cur_ops->cleanup(); | 1418 | cur_ops->cleanup(); |
1197 | if (atomic_read(&n_rcu_torture_error)) | 1419 | if (atomic_read(&n_rcu_torture_error)) |
1198 | rcu_torture_print_module_parms("End of test: FAILURE"); | 1420 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1199 | else | 1421 | else |
1200 | rcu_torture_print_module_parms("End of test: SUCCESS"); | 1422 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1201 | } | 1423 | } |
1202 | 1424 | ||
1203 | static int __init | 1425 | static int __init |
@@ -1242,7 +1464,7 @@ rcu_torture_init(void) | |||
1242 | nrealreaders = nreaders; | 1464 | nrealreaders = nreaders; |
1243 | else | 1465 | else |
1244 | nrealreaders = 2 * num_online_cpus(); | 1466 | nrealreaders = 2 * num_online_cpus(); |
1245 | rcu_torture_print_module_parms("Start of test"); | 1467 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
1246 | fullstop = FULLSTOP_DONTSTOP; | 1468 | fullstop = FULLSTOP_DONTSTOP; |
1247 | 1469 | ||
1248 | /* Set up the freelist. */ | 1470 | /* Set up the freelist. */ |
@@ -1263,6 +1485,12 @@ rcu_torture_init(void) | |||
1263 | atomic_set(&n_rcu_torture_free, 0); | 1485 | atomic_set(&n_rcu_torture_free, 0); |
1264 | atomic_set(&n_rcu_torture_mberror, 0); | 1486 | atomic_set(&n_rcu_torture_mberror, 0); |
1265 | atomic_set(&n_rcu_torture_error, 0); | 1487 | atomic_set(&n_rcu_torture_error, 0); |
1488 | n_rcu_torture_boost_ktrerror = 0; | ||
1489 | n_rcu_torture_boost_rterror = 0; | ||
1490 | n_rcu_torture_boost_allocerror = 0; | ||
1491 | n_rcu_torture_boost_afferror = 0; | ||
1492 | n_rcu_torture_boost_failure = 0; | ||
1493 | n_rcu_torture_boosts = 0; | ||
1266 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1494 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1267 | atomic_set(&rcu_torture_wcount[i], 0); | 1495 | atomic_set(&rcu_torture_wcount[i], 0); |
1268 | for_each_possible_cpu(cpu) { | 1496 | for_each_possible_cpu(cpu) { |
@@ -1376,7 +1604,27 @@ rcu_torture_init(void) | |||
1376 | goto unwind; | 1604 | goto unwind; |
1377 | } | 1605 | } |
1378 | } | 1606 | } |
1379 | register_reboot_notifier(&rcutorture_nb); | 1607 | if (test_boost_interval < 1) |
1608 | test_boost_interval = 1; | ||
1609 | if (test_boost_duration < 2) | ||
1610 | test_boost_duration = 2; | ||
1611 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
1612 | test_boost == 2) { | ||
1613 | int retval; | ||
1614 | |||
1615 | boost_starttime = jiffies + test_boost_interval * HZ; | ||
1616 | register_cpu_notifier(&rcutorture_cpu_nb); | ||
1617 | for_each_possible_cpu(i) { | ||
1618 | if (cpu_is_offline(i)) | ||
1619 | continue; /* Heuristic: CPU can go offline. */ | ||
1620 | retval = rcutorture_booster_init(i); | ||
1621 | if (retval < 0) { | ||
1622 | firsterr = retval; | ||
1623 | goto unwind; | ||
1624 | } | ||
1625 | } | ||
1626 | } | ||
1627 | register_reboot_notifier(&rcutorture_shutdown_nb); | ||
1380 | mutex_unlock(&fullstop_mutex); | 1628 | mutex_unlock(&fullstop_mutex); |
1381 | return 0; | 1629 | return 0; |
1382 | 1630 | ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c4798..d0ddfea6579 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
67 | .gpnum = -300, \ | 67 | .gpnum = -300, \ |
68 | .completed = -300, \ | 68 | .completed = -300, \ |
69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ | 69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ |
70 | .orphan_cbs_list = NULL, \ | ||
71 | .orphan_cbs_tail = &structname.orphan_cbs_list, \ | ||
72 | .orphan_qlen = 0, \ | ||
73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ | 70 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ |
74 | .n_force_qs = 0, \ | 71 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 72 | .n_force_qs_ngp = 0, \ |
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void) | |||
620 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 617 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
621 | { | 618 | { |
622 | if (rdp->gpnum != rnp->gpnum) { | 619 | if (rdp->gpnum != rnp->gpnum) { |
623 | rdp->qs_pending = 1; | 620 | /* |
624 | rdp->passed_quiesc = 0; | 621 | * If the current grace period is waiting for this CPU, |
622 | * set up to detect a quiescent state, otherwise don't | ||
623 | * go looking for one. | ||
624 | */ | ||
625 | rdp->gpnum = rnp->gpnum; | 625 | rdp->gpnum = rnp->gpnum; |
626 | if (rnp->qsmask & rdp->grpmask) { | ||
627 | rdp->qs_pending = 1; | ||
628 | rdp->passed_quiesc = 0; | ||
629 | } else | ||
630 | rdp->qs_pending = 0; | ||
626 | } | 631 | } |
627 | } | 632 | } |
628 | 633 | ||
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
681 | 686 | ||
682 | /* Remember that we saw this grace-period completion. */ | 687 | /* Remember that we saw this grace-period completion. */ |
683 | rdp->completed = rnp->completed; | 688 | rdp->completed = rnp->completed; |
689 | |||
690 | /* | ||
691 | * If we were in an extended quiescent state, we may have | ||
692 | * missed some grace periods that other CPUs handled on | ||
693 | * our behalf. Catch up with this state to avoid noting | ||
694 | * spurious new grace periods. If another grace period | ||
695 | * has started, then rnp->gpnum will have advanced, so | ||
696 | * we will detect this later on. | ||
697 | */ | ||
698 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | ||
699 | rdp->gpnum = rdp->completed; | ||
700 | |||
701 | /* | ||
702 | * If RCU does not need a quiescent state from this CPU, | ||
703 | * then make sure that this CPU doesn't go looking for one. | ||
704 | */ | ||
705 | if ((rnp->qsmask & rdp->grpmask) == 0) | ||
706 | rdp->qs_pending = 0; | ||
684 | } | 707 | } |
685 | } | 708 | } |
686 | 709 | ||
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
984 | #ifdef CONFIG_HOTPLUG_CPU | 1007 | #ifdef CONFIG_HOTPLUG_CPU |
985 | 1008 | ||
986 | /* | 1009 | /* |
987 | * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the | 1010 | * Move a dying CPU's RCU callbacks to an online CPU's callback list. |
988 | * specified flavor of RCU. The callbacks will be adopted by the next | 1011 | * Synchronization is not required because this function executes |
989 | * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever | 1012 | * in stop_machine() context. |
990 | * comes first. Because this is invoked from the CPU_DYING notifier, | ||
991 | * irqs are already disabled. | ||
992 | */ | 1013 | */ |
993 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1014 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
994 | { | 1015 | { |
995 | int i; | 1016 | int i; |
1017 | /* current DYING CPU is cleared in the cpu_online_mask */ | ||
1018 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
996 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1019 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1020 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
997 | 1021 | ||
998 | if (rdp->nxtlist == NULL) | 1022 | if (rdp->nxtlist == NULL) |
999 | return; /* irqs disabled, so comparison is stable. */ | 1023 | return; /* irqs disabled, so comparison is stable. */ |
1000 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1024 | |
1001 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 1025 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; |
1002 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 1026 | receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; |
1027 | receive_rdp->qlen += rdp->qlen; | ||
1028 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1029 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1030 | |||
1003 | rdp->nxtlist = NULL; | 1031 | rdp->nxtlist = NULL; |
1004 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1032 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1005 | rdp->nxttail[i] = &rdp->nxtlist; | 1033 | rdp->nxttail[i] = &rdp->nxtlist; |
1006 | rsp->orphan_qlen += rdp->qlen; | ||
1007 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1008 | rdp->qlen = 0; | 1034 | rdp->qlen = 0; |
1009 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | ||
1010 | } | ||
1011 | |||
1012 | /* | ||
1013 | * Adopt previously orphaned RCU callbacks. | ||
1014 | */ | ||
1015 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1016 | { | ||
1017 | unsigned long flags; | ||
1018 | struct rcu_data *rdp; | ||
1019 | |||
1020 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
1021 | rdp = this_cpu_ptr(rsp->rda); | ||
1022 | if (rsp->orphan_cbs_list == NULL) { | ||
1023 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
1024 | return; | ||
1025 | } | ||
1026 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | ||
1027 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | ||
1028 | rdp->qlen += rsp->orphan_qlen; | ||
1029 | rdp->n_cbs_adopted += rsp->orphan_qlen; | ||
1030 | rsp->orphan_cbs_list = NULL; | ||
1031 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | ||
1032 | rsp->orphan_qlen = 0; | ||
1033 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
1034 | } | 1035 | } |
1035 | 1036 | ||
1036 | /* | 1037 | /* |
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1082 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1083 | rcu_report_exp_rnp(rsp, rnp); | 1084 | rcu_report_exp_rnp(rsp, rnp); |
1084 | |||
1085 | rcu_adopt_orphan_cbs(rsp); | ||
1086 | } | 1085 | } |
1087 | 1086 | ||
1088 | /* | 1087 | /* |
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu) | |||
1100 | 1099 | ||
1101 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1100 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1102 | 1101 | ||
1103 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1102 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
1104 | { | ||
1105 | } | ||
1106 | |||
1107 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1108 | { | 1103 | { |
1109 | } | 1104 | } |
1110 | 1105 | ||
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1440 | */ | 1435 | */ |
1441 | local_irq_save(flags); | 1436 | local_irq_save(flags); |
1442 | rdp = this_cpu_ptr(rsp->rda); | 1437 | rdp = this_cpu_ptr(rsp->rda); |
1443 | rcu_process_gp_end(rsp, rdp); | ||
1444 | check_for_new_grace_period(rsp, rdp); | ||
1445 | 1438 | ||
1446 | /* Add the callback to our list. */ | 1439 | /* Add the callback to our list. */ |
1447 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1448 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1449 | 1442 | ||
1450 | /* Start a new grace period if one not already started. */ | ||
1451 | if (!rcu_gp_in_progress(rsp)) { | ||
1452 | unsigned long nestflag; | ||
1453 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1454 | |||
1455 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1456 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | ||
1457 | } | ||
1458 | |||
1459 | /* | 1443 | /* |
1460 | * Force the grace period if too many callbacks or too long waiting. | 1444 | * Force the grace period if too many callbacks or too long waiting. |
1461 | * Enforce hysteresis, and don't invoke force_quiescent_state() | 1445 | * Enforce hysteresis, and don't invoke force_quiescent_state() |
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1464 | * is the only one waiting for a grace period to complete. | 1448 | * is the only one waiting for a grace period to complete. |
1465 | */ | 1449 | */ |
1466 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
1467 | rdp->blimit = LONG_MAX; | 1451 | |
1468 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 1452 | /* Are we ignoring a completed grace period? */ |
1469 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 1453 | rcu_process_gp_end(rsp, rdp); |
1470 | force_quiescent_state(rsp, 0); | 1454 | check_for_new_grace_period(rsp, rdp); |
1471 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1455 | |
1472 | rdp->qlen_last_fqs_check = rdp->qlen; | 1456 | /* Start a new grace period if one not already started. */ |
1457 | if (!rcu_gp_in_progress(rsp)) { | ||
1458 | unsigned long nestflag; | ||
1459 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1460 | |||
1461 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1462 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | ||
1463 | } else { | ||
1464 | /* Give the grace period a kick. */ | ||
1465 | rdp->blimit = LONG_MAX; | ||
1466 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
1467 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
1468 | force_quiescent_state(rsp, 0); | ||
1469 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
1470 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
1471 | } | ||
1473 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | 1472 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1474 | force_quiescent_state(rsp, 1); | 1473 | force_quiescent_state(rsp, 1); |
1475 | local_irq_restore(flags); | 1474 | local_irq_restore(flags); |
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
1699 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 1698 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU |
1700 | * might complete its grace period before all of the other CPUs | 1699 | * might complete its grace period before all of the other CPUs |
1701 | * did their increment, causing this function to return too | 1700 | * did their increment, causing this function to return too |
1702 | * early. | 1701 | * early. Note that on_each_cpu() disables irqs, which prevents |
1702 | * any CPUs from coming online or going offline until each online | ||
1703 | * CPU has queued its RCU-barrier callback. | ||
1703 | */ | 1704 | */ |
1704 | atomic_set(&rcu_barrier_cpu_count, 1); | 1705 | atomic_set(&rcu_barrier_cpu_count, 1); |
1705 | preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ | ||
1706 | rcu_adopt_orphan_cbs(rsp); | ||
1707 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 1706 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); |
1708 | preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ | ||
1709 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 1707 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
1710 | complete(&rcu_barrier_completion); | 1708 | complete(&rcu_barrier_completion); |
1711 | wait_for_completion(&rcu_barrier_completion); | 1709 | wait_for_completion(&rcu_barrier_completion); |
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1831 | case CPU_DYING: | 1829 | case CPU_DYING: |
1832 | case CPU_DYING_FROZEN: | 1830 | case CPU_DYING_FROZEN: |
1833 | /* | 1831 | /* |
1834 | * preempt_disable() in _rcu_barrier() prevents stop_machine(), | 1832 | * The whole machine is "stopped" except this CPU, so we can |
1835 | * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" | 1833 | * touch any data without introducing corruption. We send the |
1836 | * returns, all online cpus have queued rcu_barrier_func(). | 1834 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
1837 | * The dying CPU clears its cpu_online_mask bit and | ||
1838 | * moves all of its RCU callbacks to ->orphan_cbs_list | ||
1839 | * in the context of stop_machine(), so subsequent calls | ||
1840 | * to _rcu_barrier() will adopt these callbacks and only | ||
1841 | * then queue rcu_barrier_func() on all remaining CPUs. | ||
1842 | */ | 1835 | */ |
1843 | rcu_send_cbs_to_orphanage(&rcu_bh_state); | 1836 | rcu_send_cbs_to_online(&rcu_bh_state); |
1844 | rcu_send_cbs_to_orphanage(&rcu_sched_state); | 1837 | rcu_send_cbs_to_online(&rcu_sched_state); |
1845 | rcu_preempt_send_cbs_to_orphanage(); | 1838 | rcu_preempt_send_cbs_to_online(); |
1846 | break; | 1839 | break; |
1847 | case CPU_DEAD: | 1840 | case CPU_DEAD: |
1848 | case CPU_DEAD_FROZEN: | 1841 | case CPU_DEAD_FROZEN: |
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1880 | { | 1873 | { |
1881 | int i; | 1874 | int i; |
1882 | 1875 | ||
1883 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) | 1876 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
1884 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 1877 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
1878 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | ||
1885 | } | 1879 | } |
1886 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 1880 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
1887 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 1881 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 91d4170c5c1..e8f057e44e3 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -31,46 +31,51 @@ | |||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. |
33 | * In theory, it should be possible to add more levels straightforwardly. | 33 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this has not been tested, so there is probably some | 34 | * In practice, this did work well going from three levels to four. |
35 | * bug somewhere. | 35 | * Of course, your mileage may vary. |
36 | */ | 36 | */ |
37 | #define MAX_RCU_LVLS 4 | 37 | #define MAX_RCU_LVLS 4 |
38 | #define RCU_FANOUT (CONFIG_RCU_FANOUT) | 38 | #if CONFIG_RCU_FANOUT > 16 |
39 | #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) | 39 | #define RCU_FANOUT_LEAF 16 |
40 | #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) | 40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ |
41 | #define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) | 41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) |
42 | 42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | |
43 | #if NR_CPUS <= RCU_FANOUT | 43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) |
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | ||
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | ||
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | ||
47 | |||
48 | #if NR_CPUS <= RCU_FANOUT_1 | ||
44 | # define NUM_RCU_LVLS 1 | 49 | # define NUM_RCU_LVLS 1 |
45 | # define NUM_RCU_LVL_0 1 | 50 | # define NUM_RCU_LVL_0 1 |
46 | # define NUM_RCU_LVL_1 (NR_CPUS) | 51 | # define NUM_RCU_LVL_1 (NR_CPUS) |
47 | # define NUM_RCU_LVL_2 0 | 52 | # define NUM_RCU_LVL_2 0 |
48 | # define NUM_RCU_LVL_3 0 | 53 | # define NUM_RCU_LVL_3 0 |
49 | # define NUM_RCU_LVL_4 0 | 54 | # define NUM_RCU_LVL_4 0 |
50 | #elif NR_CPUS <= RCU_FANOUT_SQ | 55 | #elif NR_CPUS <= RCU_FANOUT_2 |
51 | # define NUM_RCU_LVLS 2 | 56 | # define NUM_RCU_LVLS 2 |
52 | # define NUM_RCU_LVL_0 1 | 57 | # define NUM_RCU_LVL_0 1 |
53 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 58 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
54 | # define NUM_RCU_LVL_2 (NR_CPUS) | 59 | # define NUM_RCU_LVL_2 (NR_CPUS) |
55 | # define NUM_RCU_LVL_3 0 | 60 | # define NUM_RCU_LVL_3 0 |
56 | # define NUM_RCU_LVL_4 0 | 61 | # define NUM_RCU_LVL_4 0 |
57 | #elif NR_CPUS <= RCU_FANOUT_CUBE | 62 | #elif NR_CPUS <= RCU_FANOUT_3 |
58 | # define NUM_RCU_LVLS 3 | 63 | # define NUM_RCU_LVLS 3 |
59 | # define NUM_RCU_LVL_0 1 | 64 | # define NUM_RCU_LVL_0 1 |
60 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 65 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
61 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 66 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
62 | # define NUM_RCU_LVL_3 NR_CPUS | 67 | # define NUM_RCU_LVL_3 (NR_CPUS) |
63 | # define NUM_RCU_LVL_4 0 | 68 | # define NUM_RCU_LVL_4 0 |
64 | #elif NR_CPUS <= RCU_FANOUT_FOURTH | 69 | #elif NR_CPUS <= RCU_FANOUT_4 |
65 | # define NUM_RCU_LVLS 4 | 70 | # define NUM_RCU_LVLS 4 |
66 | # define NUM_RCU_LVL_0 1 | 71 | # define NUM_RCU_LVL_0 1 |
67 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) | 72 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
68 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 73 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
69 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 74 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
70 | # define NUM_RCU_LVL_4 NR_CPUS | 75 | # define NUM_RCU_LVL_4 (NR_CPUS) |
71 | #else | 76 | #else |
72 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | 77 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
73 | #endif /* #if (NR_CPUS) <= RCU_FANOUT */ | 78 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ |
74 | 79 | ||
75 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | 80 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
76 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | 81 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
@@ -203,8 +208,8 @@ struct rcu_data { | |||
203 | long qlen_last_fqs_check; | 208 | long qlen_last_fqs_check; |
204 | /* qlen at last check for QS forcing */ | 209 | /* qlen at last check for QS forcing */ |
205 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 210 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
206 | unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | 211 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ |
207 | unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | 212 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ |
208 | unsigned long n_force_qs_snap; | 213 | unsigned long n_force_qs_snap; |
209 | /* did other CPU force QS recently? */ | 214 | /* did other CPU force QS recently? */ |
210 | long blimit; /* Upper limit on a processed batch */ | 215 | long blimit; /* Upper limit on a processed batch */ |
@@ -309,15 +314,7 @@ struct rcu_state { | |||
309 | /* End of fields guarded by root rcu_node's lock. */ | 314 | /* End of fields guarded by root rcu_node's lock. */ |
310 | 315 | ||
311 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 316 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
312 | /* starting new GP. Also */ | 317 | /* starting new GP. */ |
313 | /* protects the following */ | ||
314 | /* orphan_cbs fields. */ | ||
315 | struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ | ||
316 | /* orphaned by all CPUs in */ | ||
317 | /* a given leaf rcu_node */ | ||
318 | /* going offline. */ | ||
319 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | ||
320 | long orphan_qlen; /* Number of orphaned cbs. */ | ||
321 | raw_spinlock_t fqslock; /* Only one task forcing */ | 318 | raw_spinlock_t fqslock; /* Only one task forcing */ |
322 | /* quiescent states. */ | 319 | /* quiescent states. */ |
323 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 320 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | |||
390 | static int rcu_preempt_pending(int cpu); | 387 | static int rcu_preempt_pending(int cpu); |
391 | static int rcu_preempt_needs_cpu(int cpu); | 388 | static int rcu_preempt_needs_cpu(int cpu); |
392 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 389 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
393 | static void rcu_preempt_send_cbs_to_orphanage(void); | 390 | static void rcu_preempt_send_cbs_to_online(void); |
394 | static void __init __rcu_init_preempt(void); | 391 | static void __init __rcu_init_preempt(void); |
395 | static void rcu_needs_cpu_flush(void); | 392 | static void rcu_needs_cpu_flush(void); |
396 | 393 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71a4147473f..a3638710dc6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -25,6 +25,7 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/stop_machine.h> | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * Check the RCU kernel configuration parameters and print informative | 31 | * Check the RCU kernel configuration parameters and print informative |
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
773 | } | 774 | } |
774 | 775 | ||
775 | /* | 776 | /* |
776 | * Move preemptable RCU's callbacks to ->orphan_cbs_list. | 777 | * Move preemptable RCU's callbacks from dying CPU to other online CPU. |
777 | */ | 778 | */ |
778 | static void rcu_preempt_send_cbs_to_orphanage(void) | 779 | static void rcu_preempt_send_cbs_to_online(void) |
779 | { | 780 | { |
780 | rcu_send_cbs_to_orphanage(&rcu_preempt_state); | 781 | rcu_send_cbs_to_online(&rcu_preempt_state); |
781 | } | 782 | } |
782 | 783 | ||
783 | /* | 784 | /* |
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
1001 | /* | 1002 | /* |
1002 | * Because there is no preemptable RCU, there are no callbacks to move. | 1003 | * Because there is no preemptable RCU, there are no callbacks to move. |
1003 | */ | 1004 | */ |
1004 | static void rcu_preempt_send_cbs_to_orphanage(void) | 1005 | static void rcu_preempt_send_cbs_to_online(void) |
1005 | { | 1006 | { |
1006 | } | 1007 | } |
1007 | 1008 | ||
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void) | |||
1014 | 1015 | ||
1015 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1016 | 1017 | ||
1018 | #ifndef CONFIG_SMP | ||
1019 | |||
1020 | void synchronize_sched_expedited(void) | ||
1021 | { | ||
1022 | cond_resched(); | ||
1023 | } | ||
1024 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1025 | |||
1026 | #else /* #ifndef CONFIG_SMP */ | ||
1027 | |||
1028 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
1029 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
1030 | |||
1031 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
1032 | { | ||
1033 | /* | ||
1034 | * There must be a full memory barrier on each affected CPU | ||
1035 | * between the time that try_stop_cpus() is called and the | ||
1036 | * time that it returns. | ||
1037 | * | ||
1038 | * In the current initial implementation of cpu_stop, the | ||
1039 | * above condition is already met when the control reaches | ||
1040 | * this point and the following smp_mb() is not strictly | ||
1041 | * necessary. Do smp_mb() anyway for documentation and | ||
1042 | * robustness against future implementation changes. | ||
1043 | */ | ||
1044 | smp_mb(); /* See above comment block. */ | ||
1045 | return 0; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
1050 | * approach to force grace period to end quickly. This consumes | ||
1051 | * significant time on all CPUs, and is thus not recommended for | ||
1052 | * any sort of common-case code. | ||
1053 | * | ||
1054 | * Note that it is illegal to call this function while holding any | ||
1055 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
1056 | * observe this restriction will result in deadlock. | ||
1057 | * | ||
1058 | * This implementation can be thought of as an application of ticket | ||
1059 | * locking to RCU, with sync_sched_expedited_started and | ||
1060 | * sync_sched_expedited_done taking on the roles of the halves | ||
1061 | * of the ticket-lock word. Each task atomically increments | ||
1062 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
1063 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
1064 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
1065 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
1066 | * update sync_sched_expedited_done to match our snapshot -- but | ||
1067 | * only if someone else has not already advanced past our snapshot. | ||
1068 | * | ||
1069 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
1070 | * of sync_sched_expedited_done. If it has advanced past our | ||
1071 | * initial snapshot, then someone else must have forced a grace period | ||
1072 | * some time after we took our snapshot. In this case, our work is | ||
1073 | * done for us, and we can simply return. Otherwise, we try again, | ||
1074 | * but keep our initial snapshot for purposes of checking for someone | ||
1075 | * doing our work for us. | ||
1076 | * | ||
1077 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
1078 | */ | ||
1079 | void synchronize_sched_expedited(void) | ||
1080 | { | ||
1081 | int firstsnap, s, snap, trycount = 0; | ||
1082 | |||
1083 | /* Note that atomic_inc_return() implies full memory barrier. */ | ||
1084 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | ||
1085 | get_online_cpus(); | ||
1086 | |||
1087 | /* | ||
1088 | * Each pass through the following loop attempts to force a | ||
1089 | * context switch on each CPU. | ||
1090 | */ | ||
1091 | while (try_stop_cpus(cpu_online_mask, | ||
1092 | synchronize_sched_expedited_cpu_stop, | ||
1093 | NULL) == -EAGAIN) { | ||
1094 | put_online_cpus(); | ||
1095 | |||
1096 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
1097 | if (trycount++ < 10) | ||
1098 | udelay(trycount * num_online_cpus()); | ||
1099 | else { | ||
1100 | synchronize_sched(); | ||
1101 | return; | ||
1102 | } | ||
1103 | |||
1104 | /* Check to see if someone else did our work for us. */ | ||
1105 | s = atomic_read(&sync_sched_expedited_done); | ||
1106 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | ||
1107 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1108 | return; | ||
1109 | } | ||
1110 | |||
1111 | /* | ||
1112 | * Refetching sync_sched_expedited_started allows later | ||
1113 | * callers to piggyback on our grace period. We subtract | ||
1114 | * 1 to get the same token that the last incrementer got. | ||
1115 | * We retry after they started, so our grace period works | ||
1116 | * for them, and they started after our first try, so their | ||
1117 | * grace period works for us. | ||
1118 | */ | ||
1119 | get_online_cpus(); | ||
1120 | snap = atomic_read(&sync_sched_expedited_started) - 1; | ||
1121 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
1122 | } | ||
1123 | |||
1124 | /* | ||
1125 | * Everyone up to our most recent fetch is covered by our grace | ||
1126 | * period. Update the counter, but only if our work is still | ||
1127 | * relevant -- which it won't be if someone who started later | ||
1128 | * than we did beat us to the punch. | ||
1129 | */ | ||
1130 | do { | ||
1131 | s = atomic_read(&sync_sched_expedited_done); | ||
1132 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | ||
1133 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1134 | break; | ||
1135 | } | ||
1136 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | ||
1137 | |||
1138 | put_online_cpus(); | ||
1139 | } | ||
1140 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1141 | |||
1142 | #endif /* #else #ifndef CONFIG_SMP */ | ||
1143 | |||
1017 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | 1144 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
1018 | 1145 | ||
1019 | /* | 1146 | /* |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d15430b9d12..c8e97853b97 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
166 | 166 | ||
167 | gpnum = rsp->gpnum; | 167 | gpnum = rsp->gpnum; |
168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
169 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 169 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
170 | rsp->completed, gpnum, rsp->signaled, | 170 | rsp->completed, gpnum, rsp->signaled, |
171 | (long)(rsp->jiffies_force_qs - jiffies), | 171 | (long)(rsp->jiffies_force_qs - jiffies), |
172 | (int)(jiffies & 0xffff), | 172 | (int)(jiffies & 0xffff), |
173 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 173 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
174 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 174 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
175 | rsp->n_force_qs_lh, rsp->orphan_qlen); | 175 | rsp->n_force_qs_lh); |
176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
177 | if (rnp->level != level) { | 177 | if (rnp->level != level) { |
178 | seq_puts(m, "\n"); | 178 | seq_puts(m, "\n"); |
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = { | |||
300 | 300 | ||
301 | static struct dentry *rcudir; | 301 | static struct dentry *rcudir; |
302 | 302 | ||
303 | static int __init rcuclassic_trace_init(void) | 303 | static int __init rcutree_trace_init(void) |
304 | { | 304 | { |
305 | struct dentry *retval; | 305 | struct dentry *retval; |
306 | 306 | ||
@@ -337,14 +337,14 @@ free_out: | |||
337 | return 1; | 337 | return 1; |
338 | } | 338 | } |
339 | 339 | ||
340 | static void __exit rcuclassic_trace_cleanup(void) | 340 | static void __exit rcutree_trace_cleanup(void) |
341 | { | 341 | { |
342 | debugfs_remove_recursive(rcudir); | 342 | debugfs_remove_recursive(rcudir); |
343 | } | 343 | } |
344 | 344 | ||
345 | 345 | ||
346 | module_init(rcuclassic_trace_init); | 346 | module_init(rcutree_trace_init); |
347 | module_exit(rcuclassic_trace_cleanup); | 347 | module_exit(rcutree_trace_cleanup); |
348 | 348 | ||
349 | MODULE_AUTHOR("Paul E. McKenney"); | 349 | MODULE_AUTHOR("Paul E. McKenney"); |
350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); | 350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); |
diff --git a/kernel/resource.c b/kernel/resource.c index 9fad33efd0d..798e2fae2a0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource); | |||
40 | 40 | ||
41 | static DEFINE_RWLOCK(resource_lock); | 41 | static DEFINE_RWLOCK(resource_lock); |
42 | 42 | ||
43 | /* | ||
44 | * By default, we allocate free space bottom-up. The architecture can request | ||
45 | * top-down by clearing this flag. The user can override the architecture's | ||
46 | * choice with the "resource_alloc_from_bottom" kernel boot option, but that | ||
47 | * should only be a debugging tool. | ||
48 | */ | ||
49 | int resource_alloc_from_bottom = 1; | ||
50 | |||
51 | static __init int setup_alloc_from_bottom(char *s) | ||
52 | { | ||
53 | printk(KERN_INFO | ||
54 | "resource: allocating from bottom-up; please report a bug\n"); | ||
55 | resource_alloc_from_bottom = 1; | ||
56 | return 0; | ||
57 | } | ||
58 | early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); | ||
59 | |||
60 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
61 | { | 44 | { |
62 | struct resource *p = v; | 45 | struct resource *p = v; |
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn) | |||
374 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
375 | } | 358 | } |
376 | 359 | ||
360 | void __weak arch_remove_reservations(struct resource *avail) | ||
361 | { | ||
362 | } | ||
363 | |||
377 | static resource_size_t simple_align_resource(void *data, | 364 | static resource_size_t simple_align_resource(void *data, |
378 | const struct resource *avail, | 365 | const struct resource *avail, |
379 | resource_size_t size, | 366 | resource_size_t size, |
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2) | |||
397 | } | 384 | } |
398 | 385 | ||
399 | /* | 386 | /* |
400 | * Find the resource before "child" in the sibling list of "root" children. | ||
401 | */ | ||
402 | static struct resource *find_sibling_prev(struct resource *root, struct resource *child) | ||
403 | { | ||
404 | struct resource *this; | ||
405 | |||
406 | for (this = root->child; this; this = this->sibling) | ||
407 | if (this->sibling == child) | ||
408 | return this; | ||
409 | |||
410 | return NULL; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Find empty slot in the resource tree given range and alignment. | 387 | * Find empty slot in the resource tree given range and alignment. |
415 | * This version allocates from the end of the root resource first. | ||
416 | */ | ||
417 | static int find_resource_from_top(struct resource *root, struct resource *new, | ||
418 | resource_size_t size, resource_size_t min, | ||
419 | resource_size_t max, resource_size_t align, | ||
420 | resource_size_t (*alignf)(void *, | ||
421 | const struct resource *, | ||
422 | resource_size_t, | ||
423 | resource_size_t), | ||
424 | void *alignf_data) | ||
425 | { | ||
426 | struct resource *this; | ||
427 | struct resource tmp, avail, alloc; | ||
428 | |||
429 | tmp.start = root->end; | ||
430 | tmp.end = root->end; | ||
431 | |||
432 | this = find_sibling_prev(root, NULL); | ||
433 | for (;;) { | ||
434 | if (this) { | ||
435 | if (this->end < root->end) | ||
436 | tmp.start = this->end + 1; | ||
437 | } else | ||
438 | tmp.start = root->start; | ||
439 | |||
440 | resource_clip(&tmp, min, max); | ||
441 | |||
442 | /* Check for overflow after ALIGN() */ | ||
443 | avail = *new; | ||
444 | avail.start = ALIGN(tmp.start, align); | ||
445 | avail.end = tmp.end; | ||
446 | if (avail.start >= tmp.start) { | ||
447 | alloc.start = alignf(alignf_data, &avail, size, align); | ||
448 | alloc.end = alloc.start + size - 1; | ||
449 | if (resource_contains(&avail, &alloc)) { | ||
450 | new->start = alloc.start; | ||
451 | new->end = alloc.end; | ||
452 | return 0; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | if (!this || this->start == root->start) | ||
457 | break; | ||
458 | |||
459 | tmp.end = this->start - 1; | ||
460 | this = find_sibling_prev(root, this); | ||
461 | } | ||
462 | return -EBUSY; | ||
463 | } | ||
464 | |||
465 | /* | ||
466 | * Find empty slot in the resource tree given range and alignment. | ||
467 | * This version allocates from the beginning of the root resource first. | ||
468 | */ | 388 | */ |
469 | static int find_resource(struct resource *root, struct resource *new, | 389 | static int find_resource(struct resource *root, struct resource *new, |
470 | resource_size_t size, resource_size_t min, | 390 | resource_size_t size, resource_size_t min, |
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new, | |||
478 | struct resource *this = root->child; | 398 | struct resource *this = root->child; |
479 | struct resource tmp = *new, avail, alloc; | 399 | struct resource tmp = *new, avail, alloc; |
480 | 400 | ||
401 | tmp.flags = new->flags; | ||
481 | tmp.start = root->start; | 402 | tmp.start = root->start; |
482 | /* | 403 | /* |
483 | * Skip past an allocated resource that starts at 0, since the | 404 | * Skip past an allocated resource that starts at 0, since the assignment |
484 | * assignment of this->start - 1 to tmp->end below would cause an | 405 | * of this->start - 1 to tmp->end below would cause an underflow. |
485 | * underflow. | ||
486 | */ | 406 | */ |
487 | if (this && this->start == 0) { | 407 | if (this && this->start == 0) { |
488 | tmp.start = this->end + 1; | 408 | tmp.start = this->end + 1; |
489 | this = this->sibling; | 409 | this = this->sibling; |
490 | } | 410 | } |
491 | for (;;) { | 411 | for(;;) { |
492 | if (this) | 412 | if (this) |
493 | tmp.end = this->start - 1; | 413 | tmp.end = this->start - 1; |
494 | else | 414 | else |
495 | tmp.end = root->end; | 415 | tmp.end = root->end; |
496 | 416 | ||
497 | resource_clip(&tmp, min, max); | 417 | resource_clip(&tmp, min, max); |
418 | arch_remove_reservations(&tmp); | ||
498 | 419 | ||
499 | /* Check for overflow after ALIGN() */ | 420 | /* Check for overflow after ALIGN() */ |
500 | avail = *new; | 421 | avail = *new; |
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new, | |||
509 | return 0; | 430 | return 0; |
510 | } | 431 | } |
511 | } | 432 | } |
512 | |||
513 | if (!this) | 433 | if (!this) |
514 | break; | 434 | break; |
515 | |||
516 | tmp.start = this->end + 1; | 435 | tmp.start = this->end + 1; |
517 | this = this->sibling; | 436 | this = this->sibling; |
518 | } | 437 | } |
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
545 | alignf = simple_align_resource; | 464 | alignf = simple_align_resource; |
546 | 465 | ||
547 | write_lock(&resource_lock); | 466 | write_lock(&resource_lock); |
548 | if (resource_alloc_from_bottom) | 467 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); |
549 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
550 | else | ||
551 | err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); | ||
552 | if (err >= 0 && __request_resource(root, new)) | 468 | if (err >= 0 && __request_resource(root, new)) |
553 | err = -EBUSY; | 469 | err = -EBUSY; |
554 | write_unlock(&resource_lock); | 470 | write_unlock(&resource_lock); |
diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac..a0eb0941fa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -75,9 +75,11 @@ | |||
75 | 75 | ||
76 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
78 | #include <asm/mutex.h> | ||
78 | 79 | ||
79 | #include "sched_cpupri.h" | 80 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 81 | #include "workqueue_sched.h" |
82 | #include "sched_autogroup.h" | ||
81 | 83 | ||
82 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
83 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
@@ -253,6 +255,8 @@ struct task_group { | |||
253 | /* runqueue "owned" by this group on each cpu */ | 255 | /* runqueue "owned" by this group on each cpu */ |
254 | struct cfs_rq **cfs_rq; | 256 | struct cfs_rq **cfs_rq; |
255 | unsigned long shares; | 257 | unsigned long shares; |
258 | |||
259 | atomic_t load_weight; | ||
256 | #endif | 260 | #endif |
257 | 261 | ||
258 | #ifdef CONFIG_RT_GROUP_SCHED | 262 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -268,25 +272,18 @@ struct task_group { | |||
268 | struct task_group *parent; | 272 | struct task_group *parent; |
269 | struct list_head siblings; | 273 | struct list_head siblings; |
270 | struct list_head children; | 274 | struct list_head children; |
271 | }; | ||
272 | 275 | ||
273 | #define root_task_group init_task_group | 276 | #ifdef CONFIG_SCHED_AUTOGROUP |
277 | struct autogroup *autogroup; | ||
278 | #endif | ||
279 | }; | ||
274 | 280 | ||
275 | /* task_group_lock serializes add/remove of task groups and also changes to | 281 | /* task_group_lock serializes the addition/removal of task groups */ |
276 | * a task group's cpu shares. | ||
277 | */ | ||
278 | static DEFINE_SPINLOCK(task_group_lock); | 282 | static DEFINE_SPINLOCK(task_group_lock); |
279 | 283 | ||
280 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
281 | 285 | ||
282 | #ifdef CONFIG_SMP | 286 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
283 | static int root_task_group_empty(void) | ||
284 | { | ||
285 | return list_empty(&root_task_group.children); | ||
286 | } | ||
287 | #endif | ||
288 | |||
289 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
290 | 287 | ||
291 | /* | 288 | /* |
292 | * A weight of 0 or 1 can cause arithmetics problems. | 289 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -299,13 +296,13 @@ static int root_task_group_empty(void) | |||
299 | #define MIN_SHARES 2 | 296 | #define MIN_SHARES 2 |
300 | #define MAX_SHARES (1UL << 18) | 297 | #define MAX_SHARES (1UL << 18) |
301 | 298 | ||
302 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 299 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
303 | #endif | 300 | #endif |
304 | 301 | ||
305 | /* Default task group. | 302 | /* Default task group. |
306 | * Every task in system belong to this group at bootup. | 303 | * Every task in system belong to this group at bootup. |
307 | */ | 304 | */ |
308 | struct task_group init_task_group; | 305 | struct task_group root_task_group; |
309 | 306 | ||
310 | #endif /* CONFIG_CGROUP_SCHED */ | 307 | #endif /* CONFIG_CGROUP_SCHED */ |
311 | 308 | ||
@@ -342,6 +339,7 @@ struct cfs_rq { | |||
342 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 339 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
343 | * list is used during load balance. | 340 | * list is used during load balance. |
344 | */ | 341 | */ |
342 | int on_list; | ||
345 | struct list_head leaf_cfs_rq_list; | 343 | struct list_head leaf_cfs_rq_list; |
346 | struct task_group *tg; /* group that "owns" this runqueue */ | 344 | struct task_group *tg; /* group that "owns" this runqueue */ |
347 | 345 | ||
@@ -360,14 +358,17 @@ struct cfs_rq { | |||
360 | unsigned long h_load; | 358 | unsigned long h_load; |
361 | 359 | ||
362 | /* | 360 | /* |
363 | * this cpu's part of tg->shares | 361 | * Maintaining per-cpu shares distribution for group scheduling |
362 | * | ||
363 | * load_stamp is the last time we updated the load average | ||
364 | * load_last is the last time we updated the load average and saw load | ||
365 | * load_unacc_exec_time is currently unaccounted execution time | ||
364 | */ | 366 | */ |
365 | unsigned long shares; | 367 | u64 load_avg; |
368 | u64 load_period; | ||
369 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
366 | 370 | ||
367 | /* | 371 | unsigned long load_contribution; |
368 | * load.weight at the time we set shares | ||
369 | */ | ||
370 | unsigned long rq_weight; | ||
371 | #endif | 372 | #endif |
372 | #endif | 373 | #endif |
373 | }; | 374 | }; |
@@ -605,11 +606,14 @@ static inline int cpu_of(struct rq *rq) | |||
605 | */ | 606 | */ |
606 | static inline struct task_group *task_group(struct task_struct *p) | 607 | static inline struct task_group *task_group(struct task_struct *p) |
607 | { | 608 | { |
609 | struct task_group *tg; | ||
608 | struct cgroup_subsys_state *css; | 610 | struct cgroup_subsys_state *css; |
609 | 611 | ||
610 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
611 | lockdep_is_held(&task_rq(p)->lock)); | 613 | lockdep_is_held(&task_rq(p)->lock)); |
612 | return container_of(css, struct task_group, css); | 614 | tg = container_of(css, struct task_group, css); |
615 | |||
616 | return autogroup_task_group(p, tg); | ||
613 | } | 617 | } |
614 | 618 | ||
615 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 619 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -636,22 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
636 | 640 | ||
637 | #endif /* CONFIG_CGROUP_SCHED */ | 641 | #endif /* CONFIG_CGROUP_SCHED */ |
638 | 642 | ||
639 | static u64 irq_time_cpu(int cpu); | 643 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
640 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
641 | 644 | ||
642 | inline void update_rq_clock(struct rq *rq) | 645 | static void update_rq_clock(struct rq *rq) |
643 | { | 646 | { |
644 | if (!rq->skip_clock_update) { | 647 | s64 delta; |
645 | int cpu = cpu_of(rq); | ||
646 | u64 irq_time; | ||
647 | 648 | ||
648 | rq->clock = sched_clock_cpu(cpu); | 649 | if (rq->skip_clock_update) |
649 | irq_time = irq_time_cpu(cpu); | 650 | return; |
650 | if (rq->clock - irq_time > rq->clock_task) | ||
651 | rq->clock_task = rq->clock - irq_time; | ||
652 | 651 | ||
653 | sched_irq_time_avg_update(rq, irq_time); | 652 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
654 | } | 653 | rq->clock += delta; |
654 | update_rq_clock_task(rq, delta); | ||
655 | } | 655 | } |
656 | 656 | ||
657 | /* | 657 | /* |
@@ -741,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
741 | buf[cnt] = 0; | 741 | buf[cnt] = 0; |
742 | cmp = strstrip(buf); | 742 | cmp = strstrip(buf); |
743 | 743 | ||
744 | if (strncmp(buf, "NO_", 3) == 0) { | 744 | if (strncmp(cmp, "NO_", 3) == 0) { |
745 | neg = 1; | 745 | neg = 1; |
746 | cmp += 3; | 746 | cmp += 3; |
747 | } | 747 | } |
@@ -797,20 +797,6 @@ late_initcall(sched_init_debug); | |||
797 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 797 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
798 | 798 | ||
799 | /* | 799 | /* |
800 | * ratelimit for updating the group shares. | ||
801 | * default: 0.25ms | ||
802 | */ | ||
803 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
804 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
805 | |||
806 | /* | ||
807 | * Inject some fuzzyness into changing the per-cpu group shares | ||
808 | * this avoids remote rq-locks at the expense of fairness. | ||
809 | * default: 4 | ||
810 | */ | ||
811 | unsigned int sysctl_sched_shares_thresh = 4; | ||
812 | |||
813 | /* | ||
814 | * period over which we average the RT time consumption, measured | 800 | * period over which we average the RT time consumption, measured |
815 | * in ms. | 801 | * in ms. |
816 | * | 802 | * |
@@ -1359,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1359 | lw->inv_weight = 0; | 1345 | lw->inv_weight = 0; |
1360 | } | 1346 | } |
1361 | 1347 | ||
1348 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1349 | { | ||
1350 | lw->weight = w; | ||
1351 | lw->inv_weight = 0; | ||
1352 | } | ||
1353 | |||
1362 | /* | 1354 | /* |
1363 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1355 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1364 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1356 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1547,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1547 | 1539 | ||
1548 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1549 | 1541 | ||
1550 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1551 | |||
1552 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1553 | |||
1554 | /* | ||
1555 | * Calculate and set the cpu's group shares. | ||
1556 | */ | ||
1557 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1558 | unsigned long sd_shares, | ||
1559 | unsigned long sd_rq_weight, | ||
1560 | unsigned long *usd_rq_weight) | ||
1561 | { | ||
1562 | unsigned long shares, rq_weight; | ||
1563 | int boost = 0; | ||
1564 | |||
1565 | rq_weight = usd_rq_weight[cpu]; | ||
1566 | if (!rq_weight) { | ||
1567 | boost = 1; | ||
1568 | rq_weight = NICE_0_LOAD; | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * \Sum_j shares_j * rq_weight_i | ||
1573 | * shares_i = ----------------------------- | ||
1574 | * \Sum_j rq_weight_j | ||
1575 | */ | ||
1576 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1577 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1578 | |||
1579 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1580 | sysctl_sched_shares_thresh) { | ||
1581 | struct rq *rq = cpu_rq(cpu); | ||
1582 | unsigned long flags; | ||
1583 | |||
1584 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1585 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1586 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1587 | __set_se_shares(tg->se[cpu], shares); | ||
1588 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1589 | } | ||
1590 | } | ||
1591 | |||
1592 | /* | ||
1593 | * Re-compute the task group their per cpu shares over the given domain. | ||
1594 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1595 | * parent group depends on the shares of its child groups. | ||
1596 | */ | ||
1597 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1598 | { | ||
1599 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1600 | unsigned long *usd_rq_weight; | ||
1601 | struct sched_domain *sd = data; | ||
1602 | unsigned long flags; | ||
1603 | int i; | ||
1604 | |||
1605 | if (!tg->se[0]) | ||
1606 | return 0; | ||
1607 | |||
1608 | local_irq_save(flags); | ||
1609 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1610 | |||
1611 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1612 | weight = tg->cfs_rq[i]->load.weight; | ||
1613 | usd_rq_weight[i] = weight; | ||
1614 | |||
1615 | rq_weight += weight; | ||
1616 | /* | ||
1617 | * If there are currently no tasks on the cpu pretend there | ||
1618 | * is one of average load so that when a new task gets to | ||
1619 | * run here it will not get delayed by group starvation. | ||
1620 | */ | ||
1621 | if (!weight) | ||
1622 | weight = NICE_0_LOAD; | ||
1623 | |||
1624 | sum_weight += weight; | ||
1625 | shares += tg->cfs_rq[i]->shares; | ||
1626 | } | ||
1627 | |||
1628 | if (!rq_weight) | ||
1629 | rq_weight = sum_weight; | ||
1630 | |||
1631 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1632 | shares = tg->shares; | ||
1633 | |||
1634 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1635 | shares = tg->shares; | ||
1636 | |||
1637 | for_each_cpu(i, sched_domain_span(sd)) | ||
1638 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1639 | |||
1640 | local_irq_restore(flags); | ||
1641 | |||
1642 | return 0; | ||
1643 | } | ||
1644 | |||
1645 | /* | 1542 | /* |
1646 | * Compute the cpu's hierarchical load factor for each task group. | 1543 | * Compute the cpu's hierarchical load factor for each task group. |
1647 | * This needs to be done in a top-down fashion because the load of a child | 1544 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1656,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1656 | load = cpu_rq(cpu)->load.weight; | 1553 | load = cpu_rq(cpu)->load.weight; |
1657 | } else { | 1554 | } else { |
1658 | load = tg->parent->cfs_rq[cpu]->h_load; | 1555 | load = tg->parent->cfs_rq[cpu]->h_load; |
1659 | load *= tg->cfs_rq[cpu]->shares; | 1556 | load *= tg->se[cpu]->load.weight; |
1660 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1557 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1661 | } | 1558 | } |
1662 | 1559 | ||
@@ -1665,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1665 | return 0; | 1562 | return 0; |
1666 | } | 1563 | } |
1667 | 1564 | ||
1668 | static void update_shares(struct sched_domain *sd) | ||
1669 | { | ||
1670 | s64 elapsed; | ||
1671 | u64 now; | ||
1672 | |||
1673 | if (root_task_group_empty()) | ||
1674 | return; | ||
1675 | |||
1676 | now = local_clock(); | ||
1677 | elapsed = now - sd->last_update; | ||
1678 | |||
1679 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1680 | sd->last_update = now; | ||
1681 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1682 | } | ||
1683 | } | ||
1684 | |||
1685 | static void update_h_load(long cpu) | 1565 | static void update_h_load(long cpu) |
1686 | { | 1566 | { |
1687 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1567 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1688 | } | 1568 | } |
1689 | 1569 | ||
1690 | #else | ||
1691 | |||
1692 | static inline void update_shares(struct sched_domain *sd) | ||
1693 | { | ||
1694 | } | ||
1695 | |||
1696 | #endif | 1570 | #endif |
1697 | 1571 | ||
1698 | #ifdef CONFIG_PREEMPT | 1572 | #ifdef CONFIG_PREEMPT |
@@ -1814,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1814 | 1688 | ||
1815 | #endif | 1689 | #endif |
1816 | 1690 | ||
1817 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1818 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1819 | { | ||
1820 | #ifdef CONFIG_SMP | ||
1821 | cfs_rq->shares = shares; | ||
1822 | #endif | ||
1823 | } | ||
1824 | #endif | ||
1825 | |||
1826 | static void calc_load_account_idle(struct rq *this_rq); | 1691 | static void calc_load_account_idle(struct rq *this_rq); |
1827 | static void update_sysctl(void); | 1692 | static void update_sysctl(void); |
1828 | static int get_update_sysctl_factor(void); | 1693 | static int get_update_sysctl_factor(void); |
@@ -1924,10 +1789,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1924 | * They are read and saved off onto struct rq in update_rq_clock(). | 1789 | * They are read and saved off onto struct rq in update_rq_clock(). |
1925 | * This may result in other CPU reading this CPU's irq time and can | 1790 | * This may result in other CPU reading this CPU's irq time and can |
1926 | * race with irq/account_system_vtime on this CPU. We would either get old | 1791 | * race with irq/account_system_vtime on this CPU. We would either get old |
1927 | * or new value (or semi updated value on 32 bit) with a side effect of | 1792 | * or new value with a side effect of accounting a slice of irq time to wrong |
1928 | * accounting a slice of irq time to wrong task when irq is in progress | 1793 | * task when irq is in progress while we read rq->clock. That is a worthy |
1929 | * while we read rq->clock. That is a worthy compromise in place of having | 1794 | * compromise in place of having locks on each irq in account_system_time. |
1930 | * locks on each irq in account_system_time. | ||
1931 | */ | 1795 | */ |
1932 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1796 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
1933 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1797 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
@@ -1945,19 +1809,58 @@ void disable_sched_clock_irqtime(void) | |||
1945 | sched_clock_irqtime = 0; | 1809 | sched_clock_irqtime = 0; |
1946 | } | 1810 | } |
1947 | 1811 | ||
1948 | static u64 irq_time_cpu(int cpu) | 1812 | #ifndef CONFIG_64BIT |
1813 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1814 | |||
1815 | static inline void irq_time_write_begin(void) | ||
1949 | { | 1816 | { |
1950 | if (!sched_clock_irqtime) | 1817 | __this_cpu_inc(irq_time_seq.sequence); |
1951 | return 0; | 1818 | smp_wmb(); |
1819 | } | ||
1820 | |||
1821 | static inline void irq_time_write_end(void) | ||
1822 | { | ||
1823 | smp_wmb(); | ||
1824 | __this_cpu_inc(irq_time_seq.sequence); | ||
1825 | } | ||
1826 | |||
1827 | static inline u64 irq_time_read(int cpu) | ||
1828 | { | ||
1829 | u64 irq_time; | ||
1830 | unsigned seq; | ||
1831 | |||
1832 | do { | ||
1833 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1834 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1835 | per_cpu(cpu_hardirq_time, cpu); | ||
1836 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1837 | |||
1838 | return irq_time; | ||
1839 | } | ||
1840 | #else /* CONFIG_64BIT */ | ||
1841 | static inline void irq_time_write_begin(void) | ||
1842 | { | ||
1843 | } | ||
1952 | 1844 | ||
1845 | static inline void irq_time_write_end(void) | ||
1846 | { | ||
1847 | } | ||
1848 | |||
1849 | static inline u64 irq_time_read(int cpu) | ||
1850 | { | ||
1953 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1851 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
1954 | } | 1852 | } |
1853 | #endif /* CONFIG_64BIT */ | ||
1955 | 1854 | ||
1855 | /* | ||
1856 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1857 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1858 | */ | ||
1956 | void account_system_vtime(struct task_struct *curr) | 1859 | void account_system_vtime(struct task_struct *curr) |
1957 | { | 1860 | { |
1958 | unsigned long flags; | 1861 | unsigned long flags; |
1862 | s64 delta; | ||
1959 | int cpu; | 1863 | int cpu; |
1960 | u64 now, delta; | ||
1961 | 1864 | ||
1962 | if (!sched_clock_irqtime) | 1865 | if (!sched_clock_irqtime) |
1963 | return; | 1866 | return; |
@@ -1965,9 +1868,10 @@ void account_system_vtime(struct task_struct *curr) | |||
1965 | local_irq_save(flags); | 1868 | local_irq_save(flags); |
1966 | 1869 | ||
1967 | cpu = smp_processor_id(); | 1870 | cpu = smp_processor_id(); |
1968 | now = sched_clock_cpu(cpu); | 1871 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
1969 | delta = now - per_cpu(irq_start_time, cpu); | 1872 | __this_cpu_add(irq_start_time, delta); |
1970 | per_cpu(irq_start_time, cpu) = now; | 1873 | |
1874 | irq_time_write_begin(); | ||
1971 | /* | 1875 | /* |
1972 | * We do not account for softirq time from ksoftirqd here. | 1876 | * We do not account for softirq time from ksoftirqd here. |
1973 | * We want to continue accounting softirq time to ksoftirqd thread | 1877 | * We want to continue accounting softirq time to ksoftirqd thread |
@@ -1975,37 +1879,60 @@ void account_system_vtime(struct task_struct *curr) | |||
1975 | * that do not consume any time, but still wants to run. | 1879 | * that do not consume any time, but still wants to run. |
1976 | */ | 1880 | */ |
1977 | if (hardirq_count()) | 1881 | if (hardirq_count()) |
1978 | per_cpu(cpu_hardirq_time, cpu) += delta; | 1882 | __this_cpu_add(cpu_hardirq_time, delta); |
1979 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1883 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) |
1980 | per_cpu(cpu_softirq_time, cpu) += delta; | 1884 | __this_cpu_add(cpu_softirq_time, delta); |
1981 | 1885 | ||
1886 | irq_time_write_end(); | ||
1982 | local_irq_restore(flags); | 1887 | local_irq_restore(flags); |
1983 | } | 1888 | } |
1984 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1889 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1985 | 1890 | ||
1986 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | 1891 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1987 | { | 1892 | { |
1988 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | 1893 | s64 irq_delta; |
1989 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | 1894 | |
1990 | rq->prev_irq_time = curr_irq_time; | 1895 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
1991 | sched_rt_avg_update(rq, delta_irq); | 1896 | |
1992 | } | 1897 | /* |
1898 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1899 | * this case when a previous update_rq_clock() happened inside a | ||
1900 | * {soft,}irq region. | ||
1901 | * | ||
1902 | * When this happens, we stop ->clock_task and only update the | ||
1903 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1904 | * update will consume the rest. This ensures ->clock_task is | ||
1905 | * monotonic. | ||
1906 | * | ||
1907 | * It does however cause some slight miss-attribution of {soft,}irq | ||
1908 | * time, a more accurate solution would be to update the irq_time using | ||
1909 | * the current rq->clock timestamp, except that would require using | ||
1910 | * atomic ops. | ||
1911 | */ | ||
1912 | if (irq_delta > delta) | ||
1913 | irq_delta = delta; | ||
1914 | |||
1915 | rq->prev_irq_time += irq_delta; | ||
1916 | delta -= irq_delta; | ||
1917 | rq->clock_task += delta; | ||
1918 | |||
1919 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
1920 | sched_rt_avg_update(rq, irq_delta); | ||
1993 | } | 1921 | } |
1994 | 1922 | ||
1995 | #else | 1923 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1996 | 1924 | ||
1997 | static u64 irq_time_cpu(int cpu) | 1925 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1998 | { | 1926 | { |
1999 | return 0; | 1927 | rq->clock_task += delta; |
2000 | } | 1928 | } |
2001 | 1929 | ||
2002 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | 1930 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2003 | |||
2004 | #endif | ||
2005 | 1931 | ||
2006 | #include "sched_idletask.c" | 1932 | #include "sched_idletask.c" |
2007 | #include "sched_fair.c" | 1933 | #include "sched_fair.c" |
2008 | #include "sched_rt.c" | 1934 | #include "sched_rt.c" |
1935 | #include "sched_autogroup.c" | ||
2009 | #include "sched_stoptask.c" | 1936 | #include "sched_stoptask.c" |
2010 | #ifdef CONFIG_SCHED_DEBUG | 1937 | #ifdef CONFIG_SCHED_DEBUG |
2011 | # include "sched_debug.c" | 1938 | # include "sched_debug.c" |
@@ -2129,7 +2056,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2129 | * A queue event has occurred, and we're going to schedule. In | 2056 | * A queue event has occurred, and we're going to schedule. In |
2130 | * this case, we can save a useless back to back clock update. | 2057 | * this case, we can save a useless back to back clock update. |
2131 | */ | 2058 | */ |
2132 | if (test_tsk_need_resched(rq->curr)) | 2059 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) |
2133 | rq->skip_clock_update = 1; | 2060 | rq->skip_clock_update = 1; |
2134 | } | 2061 | } |
2135 | 2062 | ||
@@ -2198,10 +2125,8 @@ static int migration_cpu_stop(void *data); | |||
2198 | * The task's runqueue lock must be held. | 2125 | * The task's runqueue lock must be held. |
2199 | * Returns true if you have to wait for migration thread. | 2126 | * Returns true if you have to wait for migration thread. |
2200 | */ | 2127 | */ |
2201 | static bool migrate_task(struct task_struct *p, int dest_cpu) | 2128 | static bool migrate_task(struct task_struct *p, struct rq *rq) |
2202 | { | 2129 | { |
2203 | struct rq *rq = task_rq(p); | ||
2204 | |||
2205 | /* | 2130 | /* |
2206 | * If the task is not on a runqueue (and not running), then | 2131 | * If the task is not on a runqueue (and not running), then |
2207 | * the next wake-up will properly place the task. | 2132 | * the next wake-up will properly place the task. |
@@ -2381,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2381 | return dest_cpu; | 2306 | return dest_cpu; |
2382 | 2307 | ||
2383 | /* No more Mr. Nice Guy. */ | 2308 | /* No more Mr. Nice Guy. */ |
2384 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2309 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2385 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2310 | /* |
2386 | /* | 2311 | * Don't tell them about moving exiting tasks or |
2387 | * Don't tell them about moving exiting tasks or | 2312 | * kernel threads (both mm NULL), since they never |
2388 | * kernel threads (both mm NULL), since they never | 2313 | * leave kernel. |
2389 | * leave kernel. | 2314 | */ |
2390 | */ | 2315 | if (p->mm && printk_ratelimit()) { |
2391 | if (p->mm && printk_ratelimit()) { | 2316 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2392 | printk(KERN_INFO "process %d (%s) no " | 2317 | task_pid_nr(p), p->comm, cpu); |
2393 | "longer affine to cpu%d\n", | ||
2394 | task_pid_nr(p), p->comm, cpu); | ||
2395 | } | ||
2396 | } | 2318 | } |
2397 | 2319 | ||
2398 | return dest_cpu; | 2320 | return dest_cpu; |
@@ -2728,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2728 | /* Want to start with kernel preemption disabled. */ | 2650 | /* Want to start with kernel preemption disabled. */ |
2729 | task_thread_info(p)->preempt_count = 1; | 2651 | task_thread_info(p)->preempt_count = 1; |
2730 | #endif | 2652 | #endif |
2653 | #ifdef CONFIG_SMP | ||
2731 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2654 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2655 | #endif | ||
2732 | 2656 | ||
2733 | put_cpu(); | 2657 | put_cpu(); |
2734 | } | 2658 | } |
@@ -3119,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3119 | return delta; | 3043 | return delta; |
3120 | } | 3044 | } |
3121 | 3045 | ||
3046 | static unsigned long | ||
3047 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3048 | { | ||
3049 | load *= exp; | ||
3050 | load += active * (FIXED_1 - exp); | ||
3051 | load += 1UL << (FSHIFT - 1); | ||
3052 | return load >> FSHIFT; | ||
3053 | } | ||
3054 | |||
3122 | #ifdef CONFIG_NO_HZ | 3055 | #ifdef CONFIG_NO_HZ |
3123 | /* | 3056 | /* |
3124 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3057 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3148,6 +3081,128 @@ static long calc_load_fold_idle(void) | |||
3148 | 3081 | ||
3149 | return delta; | 3082 | return delta; |
3150 | } | 3083 | } |
3084 | |||
3085 | /** | ||
3086 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3087 | * | ||
3088 | * @x: base of the power | ||
3089 | * @frac_bits: fractional bits of @x | ||
3090 | * @n: power to raise @x to. | ||
3091 | * | ||
3092 | * By exploiting the relation between the definition of the natural power | ||
3093 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3094 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3095 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3096 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3097 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3098 | * vector. | ||
3099 | */ | ||
3100 | static unsigned long | ||
3101 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3102 | { | ||
3103 | unsigned long result = 1UL << frac_bits; | ||
3104 | |||
3105 | if (n) for (;;) { | ||
3106 | if (n & 1) { | ||
3107 | result *= x; | ||
3108 | result += 1UL << (frac_bits - 1); | ||
3109 | result >>= frac_bits; | ||
3110 | } | ||
3111 | n >>= 1; | ||
3112 | if (!n) | ||
3113 | break; | ||
3114 | x *= x; | ||
3115 | x += 1UL << (frac_bits - 1); | ||
3116 | x >>= frac_bits; | ||
3117 | } | ||
3118 | |||
3119 | return result; | ||
3120 | } | ||
3121 | |||
3122 | /* | ||
3123 | * a1 = a0 * e + a * (1 - e) | ||
3124 | * | ||
3125 | * a2 = a1 * e + a * (1 - e) | ||
3126 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3127 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3128 | * | ||
3129 | * a3 = a2 * e + a * (1 - e) | ||
3130 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3131 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3132 | * | ||
3133 | * ... | ||
3134 | * | ||
3135 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3136 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3137 | * = a0 * e^n + a * (1 - e^n) | ||
3138 | * | ||
3139 | * [1] application of the geometric series: | ||
3140 | * | ||
3141 | * n 1 - x^(n+1) | ||
3142 | * S_n := \Sum x^i = ------------- | ||
3143 | * i=0 1 - x | ||
3144 | */ | ||
3145 | static unsigned long | ||
3146 | calc_load_n(unsigned long load, unsigned long exp, | ||
3147 | unsigned long active, unsigned int n) | ||
3148 | { | ||
3149 | |||
3150 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3151 | } | ||
3152 | |||
3153 | /* | ||
3154 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3155 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3156 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3157 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3158 | * | ||
3159 | * Once we've updated the global active value, we need to apply the exponential | ||
3160 | * weights adjusted to the number of cycles missed. | ||
3161 | */ | ||
3162 | static void calc_global_nohz(unsigned long ticks) | ||
3163 | { | ||
3164 | long delta, active, n; | ||
3165 | |||
3166 | if (time_before(jiffies, calc_load_update)) | ||
3167 | return; | ||
3168 | |||
3169 | /* | ||
3170 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3171 | * any pending idle changes, the respective CPUs might have | ||
3172 | * missed the tick driven calc_load_account_active() update | ||
3173 | * due to NO_HZ. | ||
3174 | */ | ||
3175 | delta = calc_load_fold_idle(); | ||
3176 | if (delta) | ||
3177 | atomic_long_add(delta, &calc_load_tasks); | ||
3178 | |||
3179 | /* | ||
3180 | * If we were idle for multiple load cycles, apply them. | ||
3181 | */ | ||
3182 | if (ticks >= LOAD_FREQ) { | ||
3183 | n = ticks / LOAD_FREQ; | ||
3184 | |||
3185 | active = atomic_long_read(&calc_load_tasks); | ||
3186 | active = active > 0 ? active * FIXED_1 : 0; | ||
3187 | |||
3188 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3189 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3190 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3191 | |||
3192 | calc_load_update += n * LOAD_FREQ; | ||
3193 | } | ||
3194 | |||
3195 | /* | ||
3196 | * Its possible the remainder of the above division also crosses | ||
3197 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3198 | * which comes after this will take care of that. | ||
3199 | * | ||
3200 | * Consider us being 11 ticks before a cycle completion, and us | ||
3201 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3202 | * age us 4 cycles, and the test in calc_global_load() will | ||
3203 | * pick up the final one. | ||
3204 | */ | ||
3205 | } | ||
3151 | #else | 3206 | #else |
3152 | static void calc_load_account_idle(struct rq *this_rq) | 3207 | static void calc_load_account_idle(struct rq *this_rq) |
3153 | { | 3208 | { |
@@ -3157,6 +3212,10 @@ static inline long calc_load_fold_idle(void) | |||
3157 | { | 3212 | { |
3158 | return 0; | 3213 | return 0; |
3159 | } | 3214 | } |
3215 | |||
3216 | static void calc_global_nohz(unsigned long ticks) | ||
3217 | { | ||
3218 | } | ||
3160 | #endif | 3219 | #endif |
3161 | 3220 | ||
3162 | /** | 3221 | /** |
@@ -3174,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3174 | loads[2] = (avenrun[2] + offset) << shift; | 3233 | loads[2] = (avenrun[2] + offset) << shift; |
3175 | } | 3234 | } |
3176 | 3235 | ||
3177 | static unsigned long | ||
3178 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3179 | { | ||
3180 | load *= exp; | ||
3181 | load += active * (FIXED_1 - exp); | ||
3182 | return load >> FSHIFT; | ||
3183 | } | ||
3184 | |||
3185 | /* | 3236 | /* |
3186 | * calc_load - update the avenrun load estimates 10 ticks after the | 3237 | * calc_load - update the avenrun load estimates 10 ticks after the |
3187 | * CPUs have updated calc_load_tasks. | 3238 | * CPUs have updated calc_load_tasks. |
3188 | */ | 3239 | */ |
3189 | void calc_global_load(void) | 3240 | void calc_global_load(unsigned long ticks) |
3190 | { | 3241 | { |
3191 | unsigned long upd = calc_load_update + 10; | ||
3192 | long active; | 3242 | long active; |
3193 | 3243 | ||
3194 | if (time_before(jiffies, upd)) | 3244 | calc_global_nohz(ticks); |
3245 | |||
3246 | if (time_before(jiffies, calc_load_update + 10)) | ||
3195 | return; | 3247 | return; |
3196 | 3248 | ||
3197 | active = atomic_long_read(&calc_load_tasks); | 3249 | active = atomic_long_read(&calc_load_tasks); |
@@ -3364,7 +3416,7 @@ void sched_exec(void) | |||
3364 | * select_task_rq() can race against ->cpus_allowed | 3416 | * select_task_rq() can race against ->cpus_allowed |
3365 | */ | 3417 | */ |
3366 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | 3418 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && |
3367 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | 3419 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { |
3368 | struct migration_arg arg = { p, dest_cpu }; | 3420 | struct migration_arg arg = { p, dest_cpu }; |
3369 | 3421 | ||
3370 | task_rq_unlock(rq, &flags); | 3422 | task_rq_unlock(rq, &flags); |
@@ -3845,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
3845 | { | 3897 | { |
3846 | if (prev->se.on_rq) | 3898 | if (prev->se.on_rq) |
3847 | update_rq_clock(rq); | 3899 | update_rq_clock(rq); |
3848 | rq->skip_clock_update = 0; | ||
3849 | prev->sched_class->put_prev_task(rq, prev); | 3900 | prev->sched_class->put_prev_task(rq, prev); |
3850 | } | 3901 | } |
3851 | 3902 | ||
@@ -3903,7 +3954,6 @@ need_resched_nonpreemptible: | |||
3903 | hrtick_clear(rq); | 3954 | hrtick_clear(rq); |
3904 | 3955 | ||
3905 | raw_spin_lock_irq(&rq->lock); | 3956 | raw_spin_lock_irq(&rq->lock); |
3906 | clear_tsk_need_resched(prev); | ||
3907 | 3957 | ||
3908 | switch_count = &prev->nivcsw; | 3958 | switch_count = &prev->nivcsw; |
3909 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3959 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -3935,6 +3985,8 @@ need_resched_nonpreemptible: | |||
3935 | 3985 | ||
3936 | put_prev_task(rq, prev); | 3986 | put_prev_task(rq, prev); |
3937 | next = pick_next_task(rq); | 3987 | next = pick_next_task(rq); |
3988 | clear_tsk_need_resched(prev); | ||
3989 | rq->skip_clock_update = 0; | ||
3938 | 3990 | ||
3939 | if (likely(prev != next)) { | 3991 | if (likely(prev != next)) { |
3940 | sched_info_switch(prev, next); | 3992 | sched_info_switch(prev, next); |
@@ -4029,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
4029 | if (task_thread_info(rq->curr) != owner || need_resched()) | 4081 | if (task_thread_info(rq->curr) != owner || need_resched()) |
4030 | return 0; | 4082 | return 0; |
4031 | 4083 | ||
4032 | cpu_relax(); | 4084 | arch_mutex_cpu_relax(); |
4033 | } | 4085 | } |
4034 | 4086 | ||
4035 | return 1; | 4087 | return 1; |
@@ -4341,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4341 | * This waits for either a completion of a specific task to be signaled or for a | 4393 | * This waits for either a completion of a specific task to be signaled or for a |
4342 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4394 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4343 | */ | 4395 | */ |
4344 | unsigned long __sched | 4396 | long __sched |
4345 | wait_for_completion_interruptible_timeout(struct completion *x, | 4397 | wait_for_completion_interruptible_timeout(struct completion *x, |
4346 | unsigned long timeout) | 4398 | unsigned long timeout) |
4347 | { | 4399 | { |
@@ -4374,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4374 | * signaled or for a specified timeout to expire. It can be | 4426 | * signaled or for a specified timeout to expire. It can be |
4375 | * interrupted by a kill signal. The timeout is in jiffies. | 4427 | * interrupted by a kill signal. The timeout is in jiffies. |
4376 | */ | 4428 | */ |
4377 | unsigned long __sched | 4429 | long __sched |
4378 | wait_for_completion_killable_timeout(struct completion *x, | 4430 | wait_for_completion_killable_timeout(struct completion *x, |
4379 | unsigned long timeout) | 4431 | unsigned long timeout) |
4380 | { | 4432 | { |
@@ -4716,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4716 | } | 4768 | } |
4717 | 4769 | ||
4718 | static int __sched_setscheduler(struct task_struct *p, int policy, | 4770 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4719 | struct sched_param *param, bool user) | 4771 | const struct sched_param *param, bool user) |
4720 | { | 4772 | { |
4721 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4773 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4722 | unsigned long flags; | 4774 | unsigned long flags; |
@@ -4871,7 +4923,7 @@ recheck: | |||
4871 | * NOTE that the task may be already dead. | 4923 | * NOTE that the task may be already dead. |
4872 | */ | 4924 | */ |
4873 | int sched_setscheduler(struct task_struct *p, int policy, | 4925 | int sched_setscheduler(struct task_struct *p, int policy, |
4874 | struct sched_param *param) | 4926 | const struct sched_param *param) |
4875 | { | 4927 | { |
4876 | return __sched_setscheduler(p, policy, param, true); | 4928 | return __sched_setscheduler(p, policy, param, true); |
4877 | } | 4929 | } |
@@ -4889,7 +4941,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
4889 | * but our caller might not have that capability. | 4941 | * but our caller might not have that capability. |
4890 | */ | 4942 | */ |
4891 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 4943 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4892 | struct sched_param *param) | 4944 | const struct sched_param *param) |
4893 | { | 4945 | { |
4894 | return __sched_setscheduler(p, policy, param, false); | 4946 | return __sched_setscheduler(p, policy, param, false); |
4895 | } | 4947 | } |
@@ -5405,7 +5457,7 @@ void sched_show_task(struct task_struct *p) | |||
5405 | unsigned state; | 5457 | unsigned state; |
5406 | 5458 | ||
5407 | state = p->state ? __ffs(p->state) + 1 : 0; | 5459 | state = p->state ? __ffs(p->state) + 1 : 0; |
5408 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5460 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5409 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5461 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5410 | #if BITS_PER_LONG == 32 | 5462 | #if BITS_PER_LONG == 32 |
5411 | if (state == TASK_RUNNING) | 5463 | if (state == TASK_RUNNING) |
@@ -5569,7 +5621,6 @@ static void update_sysctl(void) | |||
5569 | SET_SYSCTL(sched_min_granularity); | 5621 | SET_SYSCTL(sched_min_granularity); |
5570 | SET_SYSCTL(sched_latency); | 5622 | SET_SYSCTL(sched_latency); |
5571 | SET_SYSCTL(sched_wakeup_granularity); | 5623 | SET_SYSCTL(sched_wakeup_granularity); |
5572 | SET_SYSCTL(sched_shares_ratelimit); | ||
5573 | #undef SET_SYSCTL | 5624 | #undef SET_SYSCTL |
5574 | } | 5625 | } |
5575 | 5626 | ||
@@ -5645,7 +5696,7 @@ again: | |||
5645 | goto out; | 5696 | goto out; |
5646 | 5697 | ||
5647 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5698 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5648 | if (migrate_task(p, dest_cpu)) { | 5699 | if (migrate_task(p, rq)) { |
5649 | struct migration_arg arg = { p, dest_cpu }; | 5700 | struct migration_arg arg = { p, dest_cpu }; |
5650 | /* Need help from migration thread: drop lock and wait. */ | 5701 | /* Need help from migration thread: drop lock and wait. */ |
5651 | task_rq_unlock(rq, &flags); | 5702 | task_rq_unlock(rq, &flags); |
@@ -5727,29 +5778,20 @@ static int migration_cpu_stop(void *data) | |||
5727 | } | 5778 | } |
5728 | 5779 | ||
5729 | #ifdef CONFIG_HOTPLUG_CPU | 5780 | #ifdef CONFIG_HOTPLUG_CPU |
5781 | |||
5730 | /* | 5782 | /* |
5731 | * Figure out where task on dead CPU should go, use force if necessary. | 5783 | * Ensures that the idle task is using init_mm right before its cpu goes |
5784 | * offline. | ||
5732 | */ | 5785 | */ |
5733 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5786 | void idle_task_exit(void) |
5734 | { | 5787 | { |
5735 | struct rq *rq = cpu_rq(dead_cpu); | 5788 | struct mm_struct *mm = current->active_mm; |
5736 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5737 | unsigned long flags; | ||
5738 | 5789 | ||
5739 | local_irq_save(flags); | 5790 | BUG_ON(cpu_online(smp_processor_id())); |
5740 | 5791 | ||
5741 | raw_spin_lock(&rq->lock); | 5792 | if (mm != &init_mm) |
5742 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 5793 | switch_mm(mm, &init_mm, current); |
5743 | if (needs_cpu) | 5794 | mmdrop(mm); |
5744 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5745 | raw_spin_unlock(&rq->lock); | ||
5746 | /* | ||
5747 | * It can only fail if we race with set_cpus_allowed(), | ||
5748 | * in the racer should migrate the task anyway. | ||
5749 | */ | ||
5750 | if (needs_cpu) | ||
5751 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5752 | local_irq_restore(flags); | ||
5753 | } | 5795 | } |
5754 | 5796 | ||
5755 | /* | 5797 | /* |
@@ -5762,128 +5804,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5762 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5804 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5763 | { | 5805 | { |
5764 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 5806 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5765 | unsigned long flags; | ||
5766 | 5807 | ||
5767 | local_irq_save(flags); | ||
5768 | double_rq_lock(rq_src, rq_dest); | ||
5769 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5808 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5770 | rq_src->nr_uninterruptible = 0; | 5809 | rq_src->nr_uninterruptible = 0; |
5771 | double_rq_unlock(rq_src, rq_dest); | ||
5772 | local_irq_restore(flags); | ||
5773 | } | ||
5774 | |||
5775 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5776 | static void migrate_live_tasks(int src_cpu) | ||
5777 | { | ||
5778 | struct task_struct *p, *t; | ||
5779 | |||
5780 | read_lock(&tasklist_lock); | ||
5781 | |||
5782 | do_each_thread(t, p) { | ||
5783 | if (p == current) | ||
5784 | continue; | ||
5785 | |||
5786 | if (task_cpu(p) == src_cpu) | ||
5787 | move_task_off_dead_cpu(src_cpu, p); | ||
5788 | } while_each_thread(t, p); | ||
5789 | |||
5790 | read_unlock(&tasklist_lock); | ||
5791 | } | 5810 | } |
5792 | 5811 | ||
5793 | /* | 5812 | /* |
5794 | * Schedules idle task to be the next runnable task on current CPU. | 5813 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5795 | * It does so by boosting its priority to highest possible. | ||
5796 | * Used by CPU offline code. | ||
5797 | */ | 5814 | */ |
5798 | void sched_idle_next(void) | 5815 | static void calc_global_load_remove(struct rq *rq) |
5799 | { | 5816 | { |
5800 | int this_cpu = smp_processor_id(); | 5817 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5801 | struct rq *rq = cpu_rq(this_cpu); | 5818 | rq->calc_load_active = 0; |
5802 | struct task_struct *p = rq->idle; | ||
5803 | unsigned long flags; | ||
5804 | |||
5805 | /* cpu has to be offline */ | ||
5806 | BUG_ON(cpu_online(this_cpu)); | ||
5807 | |||
5808 | /* | ||
5809 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5810 | * and interrupts disabled on the current cpu. | ||
5811 | */ | ||
5812 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5813 | |||
5814 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5815 | |||
5816 | activate_task(rq, p, 0); | ||
5817 | |||
5818 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5819 | } | 5819 | } |
5820 | 5820 | ||
5821 | /* | 5821 | /* |
5822 | * Ensures that the idle task is using init_mm right before its cpu goes | 5822 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
5823 | * offline. | 5823 | * try_to_wake_up()->select_task_rq(). |
5824 | * | ||
5825 | * Called with rq->lock held even though we'er in stop_machine() and | ||
5826 | * there's no concurrency possible, we hold the required locks anyway | ||
5827 | * because of lock validation efforts. | ||
5824 | */ | 5828 | */ |
5825 | void idle_task_exit(void) | 5829 | static void migrate_tasks(unsigned int dead_cpu) |
5826 | { | ||
5827 | struct mm_struct *mm = current->active_mm; | ||
5828 | |||
5829 | BUG_ON(cpu_online(smp_processor_id())); | ||
5830 | |||
5831 | if (mm != &init_mm) | ||
5832 | switch_mm(mm, &init_mm, current); | ||
5833 | mmdrop(mm); | ||
5834 | } | ||
5835 | |||
5836 | /* called under rq->lock with disabled interrupts */ | ||
5837 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
5838 | { | 5830 | { |
5839 | struct rq *rq = cpu_rq(dead_cpu); | 5831 | struct rq *rq = cpu_rq(dead_cpu); |
5840 | 5832 | struct task_struct *next, *stop = rq->stop; | |
5841 | /* Must be exiting, otherwise would be on tasklist. */ | 5833 | int dest_cpu; |
5842 | BUG_ON(!p->exit_state); | ||
5843 | |||
5844 | /* Cannot have done final schedule yet: would have vanished. */ | ||
5845 | BUG_ON(p->state == TASK_DEAD); | ||
5846 | |||
5847 | get_task_struct(p); | ||
5848 | 5834 | ||
5849 | /* | 5835 | /* |
5850 | * Drop lock around migration; if someone else moves it, | 5836 | * Fudge the rq selection such that the below task selection loop |
5851 | * that's OK. No task can be added to this CPU, so iteration is | 5837 | * doesn't get stuck on the currently eligible stop task. |
5852 | * fine. | 5838 | * |
5839 | * We're currently inside stop_machine() and the rq is either stuck | ||
5840 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
5841 | * either way we should never end up calling schedule() until we're | ||
5842 | * done here. | ||
5853 | */ | 5843 | */ |
5854 | raw_spin_unlock_irq(&rq->lock); | 5844 | rq->stop = NULL; |
5855 | move_task_off_dead_cpu(dead_cpu, p); | ||
5856 | raw_spin_lock_irq(&rq->lock); | ||
5857 | |||
5858 | put_task_struct(p); | ||
5859 | } | ||
5860 | |||
5861 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
5862 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
5863 | { | ||
5864 | struct rq *rq = cpu_rq(dead_cpu); | ||
5865 | struct task_struct *next; | ||
5866 | 5845 | ||
5867 | for ( ; ; ) { | 5846 | for ( ; ; ) { |
5868 | if (!rq->nr_running) | 5847 | /* |
5848 | * There's this thread running, bail when that's the only | ||
5849 | * remaining thread. | ||
5850 | */ | ||
5851 | if (rq->nr_running == 1) | ||
5869 | break; | 5852 | break; |
5853 | |||
5870 | next = pick_next_task(rq); | 5854 | next = pick_next_task(rq); |
5871 | if (!next) | 5855 | BUG_ON(!next); |
5872 | break; | ||
5873 | next->sched_class->put_prev_task(rq, next); | 5856 | next->sched_class->put_prev_task(rq, next); |
5874 | migrate_dead(dead_cpu, next); | ||
5875 | 5857 | ||
5858 | /* Find suitable destination for @next, with force if needed. */ | ||
5859 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
5860 | raw_spin_unlock(&rq->lock); | ||
5861 | |||
5862 | __migrate_task(next, dead_cpu, dest_cpu); | ||
5863 | |||
5864 | raw_spin_lock(&rq->lock); | ||
5876 | } | 5865 | } |
5877 | } | ||
5878 | 5866 | ||
5879 | /* | 5867 | rq->stop = stop; |
5880 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5881 | */ | ||
5882 | static void calc_global_load_remove(struct rq *rq) | ||
5883 | { | ||
5884 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
5885 | rq->calc_load_active = 0; | ||
5886 | } | 5868 | } |
5869 | |||
5887 | #endif /* CONFIG_HOTPLUG_CPU */ | 5870 | #endif /* CONFIG_HOTPLUG_CPU */ |
5888 | 5871 | ||
5889 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5872 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -6093,15 +6076,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6093 | unsigned long flags; | 6076 | unsigned long flags; |
6094 | struct rq *rq = cpu_rq(cpu); | 6077 | struct rq *rq = cpu_rq(cpu); |
6095 | 6078 | ||
6096 | switch (action) { | 6079 | switch (action & ~CPU_TASKS_FROZEN) { |
6097 | 6080 | ||
6098 | case CPU_UP_PREPARE: | 6081 | case CPU_UP_PREPARE: |
6099 | case CPU_UP_PREPARE_FROZEN: | ||
6100 | rq->calc_load_update = calc_load_update; | 6082 | rq->calc_load_update = calc_load_update; |
6101 | break; | 6083 | break; |
6102 | 6084 | ||
6103 | case CPU_ONLINE: | 6085 | case CPU_ONLINE: |
6104 | case CPU_ONLINE_FROZEN: | ||
6105 | /* Update our root-domain */ | 6086 | /* Update our root-domain */ |
6106 | raw_spin_lock_irqsave(&rq->lock, flags); | 6087 | raw_spin_lock_irqsave(&rq->lock, flags); |
6107 | if (rq->rd) { | 6088 | if (rq->rd) { |
@@ -6113,30 +6094,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6113 | break; | 6094 | break; |
6114 | 6095 | ||
6115 | #ifdef CONFIG_HOTPLUG_CPU | 6096 | #ifdef CONFIG_HOTPLUG_CPU |
6116 | case CPU_DEAD: | ||
6117 | case CPU_DEAD_FROZEN: | ||
6118 | migrate_live_tasks(cpu); | ||
6119 | /* Idle task back to normal (off runqueue, low prio) */ | ||
6120 | raw_spin_lock_irq(&rq->lock); | ||
6121 | deactivate_task(rq, rq->idle, 0); | ||
6122 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
6123 | rq->idle->sched_class = &idle_sched_class; | ||
6124 | migrate_dead_tasks(cpu); | ||
6125 | raw_spin_unlock_irq(&rq->lock); | ||
6126 | migrate_nr_uninterruptible(rq); | ||
6127 | BUG_ON(rq->nr_running != 0); | ||
6128 | calc_global_load_remove(rq); | ||
6129 | break; | ||
6130 | |||
6131 | case CPU_DYING: | 6097 | case CPU_DYING: |
6132 | case CPU_DYING_FROZEN: | ||
6133 | /* Update our root-domain */ | 6098 | /* Update our root-domain */ |
6134 | raw_spin_lock_irqsave(&rq->lock, flags); | 6099 | raw_spin_lock_irqsave(&rq->lock, flags); |
6135 | if (rq->rd) { | 6100 | if (rq->rd) { |
6136 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6101 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6137 | set_rq_offline(rq); | 6102 | set_rq_offline(rq); |
6138 | } | 6103 | } |
6104 | migrate_tasks(cpu); | ||
6105 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
6139 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6106 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6107 | |||
6108 | migrate_nr_uninterruptible(rq); | ||
6109 | calc_global_load_remove(rq); | ||
6140 | break; | 6110 | break; |
6141 | #endif | 6111 | #endif |
6142 | } | 6112 | } |
@@ -7867,18 +7837,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7867 | 7837 | ||
7868 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7838 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7869 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 7839 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7870 | struct sched_entity *se, int cpu, int add, | 7840 | struct sched_entity *se, int cpu, |
7871 | struct sched_entity *parent) | 7841 | struct sched_entity *parent) |
7872 | { | 7842 | { |
7873 | struct rq *rq = cpu_rq(cpu); | 7843 | struct rq *rq = cpu_rq(cpu); |
7874 | tg->cfs_rq[cpu] = cfs_rq; | 7844 | tg->cfs_rq[cpu] = cfs_rq; |
7875 | init_cfs_rq(cfs_rq, rq); | 7845 | init_cfs_rq(cfs_rq, rq); |
7876 | cfs_rq->tg = tg; | 7846 | cfs_rq->tg = tg; |
7877 | if (add) | ||
7878 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7879 | 7847 | ||
7880 | tg->se[cpu] = se; | 7848 | tg->se[cpu] = se; |
7881 | /* se could be NULL for init_task_group */ | 7849 | /* se could be NULL for root_task_group */ |
7882 | if (!se) | 7850 | if (!se) |
7883 | return; | 7851 | return; |
7884 | 7852 | ||
@@ -7888,15 +7856,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7888 | se->cfs_rq = parent->my_q; | 7856 | se->cfs_rq = parent->my_q; |
7889 | 7857 | ||
7890 | se->my_q = cfs_rq; | 7858 | se->my_q = cfs_rq; |
7891 | se->load.weight = tg->shares; | 7859 | update_load_set(&se->load, 0); |
7892 | se->load.inv_weight = 0; | ||
7893 | se->parent = parent; | 7860 | se->parent = parent; |
7894 | } | 7861 | } |
7895 | #endif | 7862 | #endif |
7896 | 7863 | ||
7897 | #ifdef CONFIG_RT_GROUP_SCHED | 7864 | #ifdef CONFIG_RT_GROUP_SCHED |
7898 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 7865 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7899 | struct sched_rt_entity *rt_se, int cpu, int add, | 7866 | struct sched_rt_entity *rt_se, int cpu, |
7900 | struct sched_rt_entity *parent) | 7867 | struct sched_rt_entity *parent) |
7901 | { | 7868 | { |
7902 | struct rq *rq = cpu_rq(cpu); | 7869 | struct rq *rq = cpu_rq(cpu); |
@@ -7905,8 +7872,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7905 | init_rt_rq(rt_rq, rq); | 7872 | init_rt_rq(rt_rq, rq); |
7906 | rt_rq->tg = tg; | 7873 | rt_rq->tg = tg; |
7907 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7874 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7908 | if (add) | ||
7909 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7910 | 7875 | ||
7911 | tg->rt_se[cpu] = rt_se; | 7876 | tg->rt_se[cpu] = rt_se; |
7912 | if (!rt_se) | 7877 | if (!rt_se) |
@@ -7941,18 +7906,18 @@ void __init sched_init(void) | |||
7941 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7906 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7942 | 7907 | ||
7943 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7908 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7944 | init_task_group.se = (struct sched_entity **)ptr; | 7909 | root_task_group.se = (struct sched_entity **)ptr; |
7945 | ptr += nr_cpu_ids * sizeof(void **); | 7910 | ptr += nr_cpu_ids * sizeof(void **); |
7946 | 7911 | ||
7947 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7912 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7948 | ptr += nr_cpu_ids * sizeof(void **); | 7913 | ptr += nr_cpu_ids * sizeof(void **); |
7949 | 7914 | ||
7950 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7915 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7951 | #ifdef CONFIG_RT_GROUP_SCHED | 7916 | #ifdef CONFIG_RT_GROUP_SCHED |
7952 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7917 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7953 | ptr += nr_cpu_ids * sizeof(void **); | 7918 | ptr += nr_cpu_ids * sizeof(void **); |
7954 | 7919 | ||
7955 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7920 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7956 | ptr += nr_cpu_ids * sizeof(void **); | 7921 | ptr += nr_cpu_ids * sizeof(void **); |
7957 | 7922 | ||
7958 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7923 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -7972,20 +7937,16 @@ void __init sched_init(void) | |||
7972 | global_rt_period(), global_rt_runtime()); | 7937 | global_rt_period(), global_rt_runtime()); |
7973 | 7938 | ||
7974 | #ifdef CONFIG_RT_GROUP_SCHED | 7939 | #ifdef CONFIG_RT_GROUP_SCHED |
7975 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7940 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7976 | global_rt_period(), global_rt_runtime()); | 7941 | global_rt_period(), global_rt_runtime()); |
7977 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7942 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7978 | 7943 | ||
7979 | #ifdef CONFIG_CGROUP_SCHED | 7944 | #ifdef CONFIG_CGROUP_SCHED |
7980 | list_add(&init_task_group.list, &task_groups); | 7945 | list_add(&root_task_group.list, &task_groups); |
7981 | INIT_LIST_HEAD(&init_task_group.children); | 7946 | INIT_LIST_HEAD(&root_task_group.children); |
7982 | 7947 | autogroup_init(&init_task); | |
7983 | #endif /* CONFIG_CGROUP_SCHED */ | 7948 | #endif /* CONFIG_CGROUP_SCHED */ |
7984 | 7949 | ||
7985 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7986 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7987 | __alignof__(unsigned long)); | ||
7988 | #endif | ||
7989 | for_each_possible_cpu(i) { | 7950 | for_each_possible_cpu(i) { |
7990 | struct rq *rq; | 7951 | struct rq *rq; |
7991 | 7952 | ||
@@ -7997,38 +7958,34 @@ void __init sched_init(void) | |||
7997 | init_cfs_rq(&rq->cfs, rq); | 7958 | init_cfs_rq(&rq->cfs, rq); |
7998 | init_rt_rq(&rq->rt, rq); | 7959 | init_rt_rq(&rq->rt, rq); |
7999 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7960 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8000 | init_task_group.shares = init_task_group_load; | 7961 | root_task_group.shares = root_task_group_load; |
8001 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7962 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
8002 | #ifdef CONFIG_CGROUP_SCHED | ||
8003 | /* | 7963 | /* |
8004 | * How much cpu bandwidth does init_task_group get? | 7964 | * How much cpu bandwidth does root_task_group get? |
8005 | * | 7965 | * |
8006 | * In case of task-groups formed thr' the cgroup filesystem, it | 7966 | * In case of task-groups formed thr' the cgroup filesystem, it |
8007 | * gets 100% of the cpu resources in the system. This overall | 7967 | * gets 100% of the cpu resources in the system. This overall |
8008 | * system cpu resource is divided among the tasks of | 7968 | * system cpu resource is divided among the tasks of |
8009 | * init_task_group and its child task-groups in a fair manner, | 7969 | * root_task_group and its child task-groups in a fair manner, |
8010 | * based on each entity's (task or task-group's) weight | 7970 | * based on each entity's (task or task-group's) weight |
8011 | * (se->load.weight). | 7971 | * (se->load.weight). |
8012 | * | 7972 | * |
8013 | * In other words, if init_task_group has 10 tasks of weight | 7973 | * In other words, if root_task_group has 10 tasks of weight |
8014 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 7974 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
8015 | * then A0's share of the cpu resource is: | 7975 | * then A0's share of the cpu resource is: |
8016 | * | 7976 | * |
8017 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 7977 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
8018 | * | 7978 | * |
8019 | * We achieve this by letting init_task_group's tasks sit | 7979 | * We achieve this by letting root_task_group's tasks sit |
8020 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7980 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8021 | */ | 7981 | */ |
8022 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7982 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8023 | #endif | ||
8024 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7983 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8025 | 7984 | ||
8026 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 7985 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
8027 | #ifdef CONFIG_RT_GROUP_SCHED | 7986 | #ifdef CONFIG_RT_GROUP_SCHED |
8028 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7987 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
8029 | #ifdef CONFIG_CGROUP_SCHED | 7988 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
8030 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
8031 | #endif | ||
8032 | #endif | 7989 | #endif |
8033 | 7990 | ||
8034 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7991 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -8108,8 +8065,6 @@ void __init sched_init(void) | |||
8108 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8065 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8109 | #endif /* SMP */ | 8066 | #endif /* SMP */ |
8110 | 8067 | ||
8111 | perf_event_init(); | ||
8112 | |||
8113 | scheduler_running = 1; | 8068 | scheduler_running = 1; |
8114 | } | 8069 | } |
8115 | 8070 | ||
@@ -8303,7 +8258,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8303 | if (!se) | 8258 | if (!se) |
8304 | goto err_free_rq; | 8259 | goto err_free_rq; |
8305 | 8260 | ||
8306 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8261 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8307 | } | 8262 | } |
8308 | 8263 | ||
8309 | return 1; | 8264 | return 1; |
@@ -8314,15 +8269,21 @@ err: | |||
8314 | return 0; | 8269 | return 0; |
8315 | } | 8270 | } |
8316 | 8271 | ||
8317 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8318 | { | ||
8319 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8320 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8321 | } | ||
8322 | |||
8323 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8272 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8324 | { | 8273 | { |
8325 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8274 | struct rq *rq = cpu_rq(cpu); |
8275 | unsigned long flags; | ||
8276 | |||
8277 | /* | ||
8278 | * Only empty task groups can be destroyed; so we can speculatively | ||
8279 | * check on_list without danger of it being re-added. | ||
8280 | */ | ||
8281 | if (!tg->cfs_rq[cpu]->on_list) | ||
8282 | return; | ||
8283 | |||
8284 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8285 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8286 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8326 | } | 8287 | } |
8327 | #else /* !CONFG_FAIR_GROUP_SCHED */ | 8288 | #else /* !CONFG_FAIR_GROUP_SCHED */ |
8328 | static inline void free_fair_sched_group(struct task_group *tg) | 8289 | static inline void free_fair_sched_group(struct task_group *tg) |
@@ -8335,10 +8296,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8335 | return 1; | 8296 | return 1; |
8336 | } | 8297 | } |
8337 | 8298 | ||
8338 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8339 | { | ||
8340 | } | ||
8341 | |||
8342 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8299 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8343 | { | 8300 | { |
8344 | } | 8301 | } |
@@ -8393,7 +8350,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8393 | if (!rt_se) | 8350 | if (!rt_se) |
8394 | goto err_free_rq; | 8351 | goto err_free_rq; |
8395 | 8352 | ||
8396 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8353 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8397 | } | 8354 | } |
8398 | 8355 | ||
8399 | return 1; | 8356 | return 1; |
@@ -8403,17 +8360,6 @@ err_free_rq: | |||
8403 | err: | 8360 | err: |
8404 | return 0; | 8361 | return 0; |
8405 | } | 8362 | } |
8406 | |||
8407 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8408 | { | ||
8409 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8410 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8411 | } | ||
8412 | |||
8413 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8414 | { | ||
8415 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8416 | } | ||
8417 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8363 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8418 | static inline void free_rt_sched_group(struct task_group *tg) | 8364 | static inline void free_rt_sched_group(struct task_group *tg) |
8419 | { | 8365 | { |
@@ -8424,14 +8370,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8424 | { | 8370 | { |
8425 | return 1; | 8371 | return 1; |
8426 | } | 8372 | } |
8427 | |||
8428 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8429 | { | ||
8430 | } | ||
8431 | |||
8432 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8433 | { | ||
8434 | } | ||
8435 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8373 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8436 | 8374 | ||
8437 | #ifdef CONFIG_CGROUP_SCHED | 8375 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8439,6 +8377,7 @@ static void free_sched_group(struct task_group *tg) | |||
8439 | { | 8377 | { |
8440 | free_fair_sched_group(tg); | 8378 | free_fair_sched_group(tg); |
8441 | free_rt_sched_group(tg); | 8379 | free_rt_sched_group(tg); |
8380 | autogroup_free(tg); | ||
8442 | kfree(tg); | 8381 | kfree(tg); |
8443 | } | 8382 | } |
8444 | 8383 | ||
@@ -8447,7 +8386,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8447 | { | 8386 | { |
8448 | struct task_group *tg; | 8387 | struct task_group *tg; |
8449 | unsigned long flags; | 8388 | unsigned long flags; |
8450 | int i; | ||
8451 | 8389 | ||
8452 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8390 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8453 | if (!tg) | 8391 | if (!tg) |
@@ -8460,10 +8398,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8460 | goto err; | 8398 | goto err; |
8461 | 8399 | ||
8462 | spin_lock_irqsave(&task_group_lock, flags); | 8400 | spin_lock_irqsave(&task_group_lock, flags); |
8463 | for_each_possible_cpu(i) { | ||
8464 | register_fair_sched_group(tg, i); | ||
8465 | register_rt_sched_group(tg, i); | ||
8466 | } | ||
8467 | list_add_rcu(&tg->list, &task_groups); | 8401 | list_add_rcu(&tg->list, &task_groups); |
8468 | 8402 | ||
8469 | WARN_ON(!parent); /* root should already exist */ | 8403 | WARN_ON(!parent); /* root should already exist */ |
@@ -8493,11 +8427,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8493 | unsigned long flags; | 8427 | unsigned long flags; |
8494 | int i; | 8428 | int i; |
8495 | 8429 | ||
8496 | spin_lock_irqsave(&task_group_lock, flags); | 8430 | /* end participation in shares distribution */ |
8497 | for_each_possible_cpu(i) { | 8431 | for_each_possible_cpu(i) |
8498 | unregister_fair_sched_group(tg, i); | 8432 | unregister_fair_sched_group(tg, i); |
8499 | unregister_rt_sched_group(tg, i); | 8433 | |
8500 | } | 8434 | spin_lock_irqsave(&task_group_lock, flags); |
8501 | list_del_rcu(&tg->list); | 8435 | list_del_rcu(&tg->list); |
8502 | list_del_rcu(&tg->siblings); | 8436 | list_del_rcu(&tg->siblings); |
8503 | spin_unlock_irqrestore(&task_group_lock, flags); | 8437 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8544,33 +8478,6 @@ void sched_move_task(struct task_struct *tsk) | |||
8544 | #endif /* CONFIG_CGROUP_SCHED */ | 8478 | #endif /* CONFIG_CGROUP_SCHED */ |
8545 | 8479 | ||
8546 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8480 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8547 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8548 | { | ||
8549 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8550 | int on_rq; | ||
8551 | |||
8552 | on_rq = se->on_rq; | ||
8553 | if (on_rq) | ||
8554 | dequeue_entity(cfs_rq, se, 0); | ||
8555 | |||
8556 | se->load.weight = shares; | ||
8557 | se->load.inv_weight = 0; | ||
8558 | |||
8559 | if (on_rq) | ||
8560 | enqueue_entity(cfs_rq, se, 0); | ||
8561 | } | ||
8562 | |||
8563 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8564 | { | ||
8565 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8566 | struct rq *rq = cfs_rq->rq; | ||
8567 | unsigned long flags; | ||
8568 | |||
8569 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8570 | __set_se_shares(se, shares); | ||
8571 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8572 | } | ||
8573 | |||
8574 | static DEFINE_MUTEX(shares_mutex); | 8481 | static DEFINE_MUTEX(shares_mutex); |
8575 | 8482 | ||
8576 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8483 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
@@ -8593,37 +8500,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8593 | if (tg->shares == shares) | 8500 | if (tg->shares == shares) |
8594 | goto done; | 8501 | goto done; |
8595 | 8502 | ||
8596 | spin_lock_irqsave(&task_group_lock, flags); | ||
8597 | for_each_possible_cpu(i) | ||
8598 | unregister_fair_sched_group(tg, i); | ||
8599 | list_del_rcu(&tg->siblings); | ||
8600 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8601 | |||
8602 | /* wait for any ongoing reference to this group to finish */ | ||
8603 | synchronize_sched(); | ||
8604 | |||
8605 | /* | ||
8606 | * Now we are free to modify the group's share on each cpu | ||
8607 | * w/o tripping rebalance_share or load_balance_fair. | ||
8608 | */ | ||
8609 | tg->shares = shares; | 8503 | tg->shares = shares; |
8610 | for_each_possible_cpu(i) { | 8504 | for_each_possible_cpu(i) { |
8611 | /* | 8505 | struct rq *rq = cpu_rq(i); |
8612 | * force a rebalance | 8506 | struct sched_entity *se; |
8613 | */ | 8507 | |
8614 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8508 | se = tg->se[i]; |
8615 | set_se_shares(tg->se[i], shares); | 8509 | /* Propagate contribution to hierarchy */ |
8510 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8511 | for_each_sched_entity(se) | ||
8512 | update_cfs_shares(group_cfs_rq(se), 0); | ||
8513 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8616 | } | 8514 | } |
8617 | 8515 | ||
8618 | /* | ||
8619 | * Enable load balance activity on this group, by inserting it back on | ||
8620 | * each cpu's rq->leaf_cfs_rq_list. | ||
8621 | */ | ||
8622 | spin_lock_irqsave(&task_group_lock, flags); | ||
8623 | for_each_possible_cpu(i) | ||
8624 | register_fair_sched_group(tg, i); | ||
8625 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8626 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8627 | done: | 8516 | done: |
8628 | mutex_unlock(&shares_mutex); | 8517 | mutex_unlock(&shares_mutex); |
8629 | return 0; | 8518 | return 0; |
@@ -8922,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8922 | 8811 | ||
8923 | if (!cgrp->parent) { | 8812 | if (!cgrp->parent) { |
8924 | /* This is early initialization for the top cgroup */ | 8813 | /* This is early initialization for the top cgroup */ |
8925 | return &init_task_group.css; | 8814 | return &root_task_group.css; |
8926 | } | 8815 | } |
8927 | 8816 | ||
8928 | parent = cgroup_tg(cgrp->parent); | 8817 | parent = cgroup_tg(cgrp->parent); |
@@ -9349,72 +9238,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9349 | }; | 9238 | }; |
9350 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9239 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9351 | 9240 | ||
9352 | #ifndef CONFIG_SMP | ||
9353 | |||
9354 | void synchronize_sched_expedited(void) | ||
9355 | { | ||
9356 | barrier(); | ||
9357 | } | ||
9358 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9359 | |||
9360 | #else /* #ifndef CONFIG_SMP */ | ||
9361 | |||
9362 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9363 | |||
9364 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9365 | { | ||
9366 | /* | ||
9367 | * There must be a full memory barrier on each affected CPU | ||
9368 | * between the time that try_stop_cpus() is called and the | ||
9369 | * time that it returns. | ||
9370 | * | ||
9371 | * In the current initial implementation of cpu_stop, the | ||
9372 | * above condition is already met when the control reaches | ||
9373 | * this point and the following smp_mb() is not strictly | ||
9374 | * necessary. Do smp_mb() anyway for documentation and | ||
9375 | * robustness against future implementation changes. | ||
9376 | */ | ||
9377 | smp_mb(); /* See above comment block. */ | ||
9378 | return 0; | ||
9379 | } | ||
9380 | |||
9381 | /* | ||
9382 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9383 | * approach to force grace period to end quickly. This consumes | ||
9384 | * significant time on all CPUs, and is thus not recommended for | ||
9385 | * any sort of common-case code. | ||
9386 | * | ||
9387 | * Note that it is illegal to call this function while holding any | ||
9388 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9389 | * observe this restriction will result in deadlock. | ||
9390 | */ | ||
9391 | void synchronize_sched_expedited(void) | ||
9392 | { | ||
9393 | int snap, trycount = 0; | ||
9394 | |||
9395 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9396 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9397 | get_online_cpus(); | ||
9398 | while (try_stop_cpus(cpu_online_mask, | ||
9399 | synchronize_sched_expedited_cpu_stop, | ||
9400 | NULL) == -EAGAIN) { | ||
9401 | put_online_cpus(); | ||
9402 | if (trycount++ < 10) | ||
9403 | udelay(trycount * num_online_cpus()); | ||
9404 | else { | ||
9405 | synchronize_sched(); | ||
9406 | return; | ||
9407 | } | ||
9408 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9409 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9410 | return; | ||
9411 | } | ||
9412 | get_online_cpus(); | ||
9413 | } | ||
9414 | atomic_inc(&synchronize_sched_expedited_count); | ||
9415 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9416 | put_online_cpus(); | ||
9417 | } | ||
9418 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9419 | |||
9420 | #endif /* #else #ifndef CONFIG_SMP */ | ||
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 00000000000..32a723b8f84 --- /dev/null +++ b/kernel/sched_autogroup.c | |||
@@ -0,0 +1,238 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | #include <linux/proc_fs.h> | ||
4 | #include <linux/seq_file.h> | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/utsname.h> | ||
7 | |||
8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | ||
9 | static struct autogroup autogroup_default; | ||
10 | static atomic_t autogroup_seq_nr; | ||
11 | |||
12 | static void __init autogroup_init(struct task_struct *init_task) | ||
13 | { | ||
14 | autogroup_default.tg = &root_task_group; | ||
15 | root_task_group.autogroup = &autogroup_default; | ||
16 | kref_init(&autogroup_default.kref); | ||
17 | init_rwsem(&autogroup_default.lock); | ||
18 | init_task->signal->autogroup = &autogroup_default; | ||
19 | } | ||
20 | |||
21 | static inline void autogroup_free(struct task_group *tg) | ||
22 | { | ||
23 | kfree(tg->autogroup); | ||
24 | } | ||
25 | |||
26 | static inline void autogroup_destroy(struct kref *kref) | ||
27 | { | ||
28 | struct autogroup *ag = container_of(kref, struct autogroup, kref); | ||
29 | |||
30 | sched_destroy_group(ag->tg); | ||
31 | } | ||
32 | |||
33 | static inline void autogroup_kref_put(struct autogroup *ag) | ||
34 | { | ||
35 | kref_put(&ag->kref, autogroup_destroy); | ||
36 | } | ||
37 | |||
38 | static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) | ||
39 | { | ||
40 | kref_get(&ag->kref); | ||
41 | return ag; | ||
42 | } | ||
43 | |||
44 | static inline struct autogroup *autogroup_task_get(struct task_struct *p) | ||
45 | { | ||
46 | struct autogroup *ag; | ||
47 | unsigned long flags; | ||
48 | |||
49 | if (!lock_task_sighand(p, &flags)) | ||
50 | return autogroup_kref_get(&autogroup_default); | ||
51 | |||
52 | ag = autogroup_kref_get(p->signal->autogroup); | ||
53 | unlock_task_sighand(p, &flags); | ||
54 | |||
55 | return ag; | ||
56 | } | ||
57 | |||
58 | static inline struct autogroup *autogroup_create(void) | ||
59 | { | ||
60 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | ||
61 | struct task_group *tg; | ||
62 | |||
63 | if (!ag) | ||
64 | goto out_fail; | ||
65 | |||
66 | tg = sched_create_group(&root_task_group); | ||
67 | |||
68 | if (IS_ERR(tg)) | ||
69 | goto out_free; | ||
70 | |||
71 | kref_init(&ag->kref); | ||
72 | init_rwsem(&ag->lock); | ||
73 | ag->id = atomic_inc_return(&autogroup_seq_nr); | ||
74 | ag->tg = tg; | ||
75 | tg->autogroup = ag; | ||
76 | |||
77 | return ag; | ||
78 | |||
79 | out_free: | ||
80 | kfree(ag); | ||
81 | out_fail: | ||
82 | if (printk_ratelimit()) { | ||
83 | printk(KERN_WARNING "autogroup_create: %s failure.\n", | ||
84 | ag ? "sched_create_group()" : "kmalloc()"); | ||
85 | } | ||
86 | |||
87 | return autogroup_kref_get(&autogroup_default); | ||
88 | } | ||
89 | |||
90 | static inline bool | ||
91 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
92 | { | ||
93 | if (tg != &root_task_group) | ||
94 | return false; | ||
95 | |||
96 | if (p->sched_class != &fair_sched_class) | ||
97 | return false; | ||
98 | |||
99 | /* | ||
100 | * We can only assume the task group can't go away on us if | ||
101 | * autogroup_move_group() can see us on ->thread_group list. | ||
102 | */ | ||
103 | if (p->flags & PF_EXITING) | ||
104 | return false; | ||
105 | |||
106 | return true; | ||
107 | } | ||
108 | |||
109 | static inline struct task_group * | ||
110 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
111 | { | ||
112 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
113 | |||
114 | if (enabled && task_wants_autogroup(p, tg)) | ||
115 | return p->signal->autogroup->tg; | ||
116 | |||
117 | return tg; | ||
118 | } | ||
119 | |||
120 | static void | ||
121 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | ||
122 | { | ||
123 | struct autogroup *prev; | ||
124 | struct task_struct *t; | ||
125 | unsigned long flags; | ||
126 | |||
127 | BUG_ON(!lock_task_sighand(p, &flags)); | ||
128 | |||
129 | prev = p->signal->autogroup; | ||
130 | if (prev == ag) { | ||
131 | unlock_task_sighand(p, &flags); | ||
132 | return; | ||
133 | } | ||
134 | |||
135 | p->signal->autogroup = autogroup_kref_get(ag); | ||
136 | |||
137 | t = p; | ||
138 | do { | ||
139 | sched_move_task(t); | ||
140 | } while_each_thread(p, t); | ||
141 | |||
142 | unlock_task_sighand(p, &flags); | ||
143 | autogroup_kref_put(prev); | ||
144 | } | ||
145 | |||
146 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | ||
147 | void sched_autogroup_create_attach(struct task_struct *p) | ||
148 | { | ||
149 | struct autogroup *ag = autogroup_create(); | ||
150 | |||
151 | autogroup_move_group(p, ag); | ||
152 | /* drop extra refrence added by autogroup_create() */ | ||
153 | autogroup_kref_put(ag); | ||
154 | } | ||
155 | EXPORT_SYMBOL(sched_autogroup_create_attach); | ||
156 | |||
157 | /* Cannot be called under siglock. Currently has no users */ | ||
158 | void sched_autogroup_detach(struct task_struct *p) | ||
159 | { | ||
160 | autogroup_move_group(p, &autogroup_default); | ||
161 | } | ||
162 | EXPORT_SYMBOL(sched_autogroup_detach); | ||
163 | |||
164 | void sched_autogroup_fork(struct signal_struct *sig) | ||
165 | { | ||
166 | sig->autogroup = autogroup_task_get(current); | ||
167 | } | ||
168 | |||
169 | void sched_autogroup_exit(struct signal_struct *sig) | ||
170 | { | ||
171 | autogroup_kref_put(sig->autogroup); | ||
172 | } | ||
173 | |||
174 | static int __init setup_autogroup(char *str) | ||
175 | { | ||
176 | sysctl_sched_autogroup_enabled = 0; | ||
177 | |||
178 | return 1; | ||
179 | } | ||
180 | |||
181 | __setup("noautogroup", setup_autogroup); | ||
182 | |||
183 | #ifdef CONFIG_PROC_FS | ||
184 | |||
185 | int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | ||
186 | { | ||
187 | static unsigned long next = INITIAL_JIFFIES; | ||
188 | struct autogroup *ag; | ||
189 | int err; | ||
190 | |||
191 | if (*nice < -20 || *nice > 19) | ||
192 | return -EINVAL; | ||
193 | |||
194 | err = security_task_setnice(current, *nice); | ||
195 | if (err) | ||
196 | return err; | ||
197 | |||
198 | if (*nice < 0 && !can_nice(current, *nice)) | ||
199 | return -EPERM; | ||
200 | |||
201 | /* this is a heavy operation taking global locks.. */ | ||
202 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | ||
203 | return -EAGAIN; | ||
204 | |||
205 | next = HZ / 10 + jiffies; | ||
206 | ag = autogroup_task_get(p); | ||
207 | |||
208 | down_write(&ag->lock); | ||
209 | err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); | ||
210 | if (!err) | ||
211 | ag->nice = *nice; | ||
212 | up_write(&ag->lock); | ||
213 | |||
214 | autogroup_kref_put(ag); | ||
215 | |||
216 | return err; | ||
217 | } | ||
218 | |||
219 | void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | ||
220 | { | ||
221 | struct autogroup *ag = autogroup_task_get(p); | ||
222 | |||
223 | down_read(&ag->lock); | ||
224 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); | ||
225 | up_read(&ag->lock); | ||
226 | |||
227 | autogroup_kref_put(ag); | ||
228 | } | ||
229 | #endif /* CONFIG_PROC_FS */ | ||
230 | |||
231 | #ifdef CONFIG_SCHED_DEBUG | ||
232 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
233 | { | ||
234 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | ||
235 | } | ||
236 | #endif /* CONFIG_SCHED_DEBUG */ | ||
237 | |||
238 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 00000000000..5358e241cb2 --- /dev/null +++ b/kernel/sched_autogroup.h | |||
@@ -0,0 +1,32 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | struct autogroup { | ||
4 | struct kref kref; | ||
5 | struct task_group *tg; | ||
6 | struct rw_semaphore lock; | ||
7 | unsigned long id; | ||
8 | int nice; | ||
9 | }; | ||
10 | |||
11 | static inline struct task_group * | ||
12 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | ||
13 | |||
14 | #else /* !CONFIG_SCHED_AUTOGROUP */ | ||
15 | |||
16 | static inline void autogroup_init(struct task_struct *init_task) { } | ||
17 | static inline void autogroup_free(struct task_group *tg) { } | ||
18 | |||
19 | static inline struct task_group * | ||
20 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
21 | { | ||
22 | return tg; | ||
23 | } | ||
24 | |||
25 | #ifdef CONFIG_SCHED_DEBUG | ||
26 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
27 | { | ||
28 | return 0; | ||
29 | } | ||
30 | #endif | ||
31 | |||
32 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb..9d8af0b3fb6 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
79 | } | 79 | } |
80 | EXPORT_SYMBOL_GPL(sched_clock); | 80 | EXPORT_SYMBOL_GPL(sched_clock); |
81 | 81 | ||
82 | static __read_mostly int sched_clock_running; | 82 | __read_mostly int sched_clock_running; |
83 | 83 | ||
84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
85 | __read_mostly int sched_clock_stable; | 85 | __read_mostly int sched_clock_stable; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9..1dfae3d014b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | 54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) |
55 | 55 | ||
56 | #ifdef CONFIG_FAIR_GROUP_SCHED | 56 | #ifdef CONFIG_FAIR_GROUP_SCHED |
57 | static void print_cfs_group_stats(struct seq_file *m, int cpu, | 57 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
58 | struct task_group *tg) | ||
59 | { | 58 | { |
60 | struct sched_entity *se = tg->se[cpu]; | 59 | struct sched_entity *se = tg->se[cpu]; |
61 | if (!se) | 60 | if (!se) |
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
110 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 109 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
111 | #endif | 110 | #endif |
112 | 111 | ||
113 | #ifdef CONFIG_CGROUP_SCHED | ||
114 | { | ||
115 | char path[64]; | ||
116 | |||
117 | rcu_read_lock(); | ||
118 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
119 | rcu_read_unlock(); | ||
120 | SEQ_printf(m, " %s", path); | ||
121 | } | ||
122 | #endif | ||
123 | SEQ_printf(m, "\n"); | 112 | SEQ_printf(m, "\n"); |
124 | } | 113 | } |
125 | 114 | ||
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
147 | read_unlock_irqrestore(&tasklist_lock, flags); | 136 | read_unlock_irqrestore(&tasklist_lock, flags); |
148 | } | 137 | } |
149 | 138 | ||
150 | #if defined(CONFIG_CGROUP_SCHED) && \ | ||
151 | (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) | ||
152 | static void task_group_path(struct task_group *tg, char *buf, int buflen) | ||
153 | { | ||
154 | /* may be NULL if the underlying cgroup isn't fully-created yet */ | ||
155 | if (!tg->css.cgroup) { | ||
156 | buf[0] = '\0'; | ||
157 | return; | ||
158 | } | ||
159 | cgroup_path(tg->css.cgroup, buf, buflen); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 139 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
164 | { | 140 | { |
165 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, | 141 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
168 | struct sched_entity *last; | 144 | struct sched_entity *last; |
169 | unsigned long flags; | 145 | unsigned long flags; |
170 | 146 | ||
171 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
172 | char path[128]; | ||
173 | struct task_group *tg = cfs_rq->tg; | ||
174 | |||
175 | task_group_path(tg, path, sizeof(path)); | ||
176 | |||
177 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
178 | #else | ||
179 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | 147 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); |
180 | #endif | ||
181 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 148 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
182 | SPLIT_NS(cfs_rq->exec_clock)); | 149 | SPLIT_NS(cfs_rq->exec_clock)); |
183 | 150 | ||
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | spread0 = min_vruntime - rq0_min_vruntime; | 169 | spread0 = min_vruntime - rq0_min_vruntime; |
203 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | 170 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", |
204 | SPLIT_NS(spread0)); | 171 | SPLIT_NS(spread0)); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
207 | |||
208 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 172 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
209 | cfs_rq->nr_spread_over); | 173 | cfs_rq->nr_spread_over); |
174 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
175 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
210 | #ifdef CONFIG_FAIR_GROUP_SCHED | 176 | #ifdef CONFIG_FAIR_GROUP_SCHED |
211 | #ifdef CONFIG_SMP | 177 | #ifdef CONFIG_SMP |
212 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | 178 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", |
179 | SPLIT_NS(cfs_rq->load_avg)); | ||
180 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | ||
181 | SPLIT_NS(cfs_rq->load_period)); | ||
182 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | ||
183 | cfs_rq->load_contribution); | ||
184 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | ||
185 | atomic_read(&cfs_rq->tg->load_weight)); | ||
213 | #endif | 186 | #endif |
187 | |||
214 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 188 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
215 | #endif | 189 | #endif |
216 | } | 190 | } |
217 | 191 | ||
218 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | 192 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) |
219 | { | 193 | { |
220 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) | ||
221 | char path[128]; | ||
222 | struct task_group *tg = rt_rq->tg; | ||
223 | |||
224 | task_group_path(tg, path, sizeof(path)); | ||
225 | |||
226 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); | ||
227 | #else | ||
228 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | 194 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); |
229 | #endif | ||
230 | |||
231 | 195 | ||
232 | #define P(x) \ | 196 | #define P(x) \ |
233 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 197 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
243 | #undef P | 207 | #undef P |
244 | } | 208 | } |
245 | 209 | ||
210 | extern __read_mostly int sched_clock_running; | ||
211 | |||
246 | static void print_cpu(struct seq_file *m, int cpu) | 212 | static void print_cpu(struct seq_file *m, int cpu) |
247 | { | 213 | { |
248 | struct rq *rq = cpu_rq(cpu); | 214 | struct rq *rq = cpu_rq(cpu); |
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = { | |||
314 | 280 | ||
315 | static int sched_debug_show(struct seq_file *m, void *v) | 281 | static int sched_debug_show(struct seq_file *m, void *v) |
316 | { | 282 | { |
317 | u64 now = ktime_to_ns(ktime_get()); | 283 | u64 ktime, sched_clk, cpu_clk; |
284 | unsigned long flags; | ||
318 | int cpu; | 285 | int cpu; |
319 | 286 | ||
320 | SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", | 287 | local_irq_save(flags); |
288 | ktime = ktime_to_ns(ktime_get()); | ||
289 | sched_clk = sched_clock(); | ||
290 | cpu_clk = local_clock(); | ||
291 | local_irq_restore(flags); | ||
292 | |||
293 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | ||
321 | init_utsname()->release, | 294 | init_utsname()->release, |
322 | (int)strcspn(init_utsname()->version, " "), | 295 | (int)strcspn(init_utsname()->version, " "), |
323 | init_utsname()->version); | 296 | init_utsname()->version); |
324 | 297 | ||
325 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); | 298 | #define P(x) \ |
299 | SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) | ||
300 | #define PN(x) \ | ||
301 | SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
302 | PN(ktime); | ||
303 | PN(sched_clk); | ||
304 | PN(cpu_clk); | ||
305 | P(jiffies); | ||
306 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
307 | P(sched_clock_stable); | ||
308 | #endif | ||
309 | #undef PN | ||
310 | #undef P | ||
311 | |||
312 | SEQ_printf(m, "\n"); | ||
313 | SEQ_printf(m, "sysctl_sched\n"); | ||
326 | 314 | ||
327 | #define P(x) \ | 315 | #define P(x) \ |
328 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | 316 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) |
329 | #define PN(x) \ | 317 | #define PN(x) \ |
330 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | 318 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) |
331 | P(jiffies); | ||
332 | PN(sysctl_sched_latency); | 319 | PN(sysctl_sched_latency); |
333 | PN(sysctl_sched_min_granularity); | 320 | PN(sysctl_sched_min_granularity); |
334 | PN(sysctl_sched_wakeup_granularity); | 321 | PN(sysctl_sched_wakeup_granularity); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 52ab113d8bb..c62ebae65cf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | |||
89 | 89 | ||
90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
91 | 91 | ||
92 | /* | ||
93 | * The exponential sliding window over which load is averaged for shares | ||
94 | * distribution. | ||
95 | * (default: 10msec) | ||
96 | */ | ||
97 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | ||
98 | |||
92 | static const struct sched_class fair_sched_class; | 99 | static const struct sched_class fair_sched_class; |
93 | 100 | ||
94 | /************************************************************** | 101 | /************************************************************** |
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
143 | return cfs_rq->tg->cfs_rq[this_cpu]; | 150 | return cfs_rq->tg->cfs_rq[this_cpu]; |
144 | } | 151 | } |
145 | 152 | ||
153 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
154 | { | ||
155 | if (!cfs_rq->on_list) { | ||
156 | /* | ||
157 | * Ensure we either appear before our parent (if already | ||
158 | * enqueued) or force our parent to appear after us when it is | ||
159 | * enqueued. The fact that we always enqueue bottom-up | ||
160 | * reduces this to two cases. | ||
161 | */ | ||
162 | if (cfs_rq->tg->parent && | ||
163 | cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { | ||
164 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
165 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
166 | } else { | ||
167 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
168 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
169 | } | ||
170 | |||
171 | cfs_rq->on_list = 1; | ||
172 | } | ||
173 | } | ||
174 | |||
175 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
176 | { | ||
177 | if (cfs_rq->on_list) { | ||
178 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
179 | cfs_rq->on_list = 0; | ||
180 | } | ||
181 | } | ||
182 | |||
146 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 183 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
147 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 184 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
148 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 185 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
246 | return &cpu_rq(this_cpu)->cfs; | 283 | return &cpu_rq(this_cpu)->cfs; |
247 | } | 284 | } |
248 | 285 | ||
286 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
287 | { | ||
288 | } | ||
289 | |||
290 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
291 | { | ||
292 | } | ||
293 | |||
249 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 294 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
250 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 295 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
251 | 296 | ||
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
417 | WRT_SYSCTL(sched_min_granularity); | 462 | WRT_SYSCTL(sched_min_granularity); |
418 | WRT_SYSCTL(sched_latency); | 463 | WRT_SYSCTL(sched_latency); |
419 | WRT_SYSCTL(sched_wakeup_granularity); | 464 | WRT_SYSCTL(sched_wakeup_granularity); |
420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
421 | #undef WRT_SYSCTL | 465 | #undef WRT_SYSCTL |
422 | 466 | ||
423 | return 0; | 467 | return 0; |
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
495 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 539 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
496 | } | 540 | } |
497 | 541 | ||
542 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
543 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); | ||
544 | |||
498 | /* | 545 | /* |
499 | * Update the current task's runtime statistics. Skip current tasks that | 546 | * Update the current task's runtime statistics. Skip current tasks that |
500 | * are not in our scheduling class. | 547 | * are not in our scheduling class. |
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
514 | 561 | ||
515 | curr->vruntime += delta_exec_weighted; | 562 | curr->vruntime += delta_exec_weighted; |
516 | update_min_vruntime(cfs_rq); | 563 | update_min_vruntime(cfs_rq); |
564 | |||
565 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
566 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
567 | #endif | ||
517 | } | 568 | } |
518 | 569 | ||
519 | static void update_curr(struct cfs_rq *cfs_rq) | 570 | static void update_curr(struct cfs_rq *cfs_rq) |
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
633 | list_add(&se->group_node, &cfs_rq->tasks); | 684 | list_add(&se->group_node, &cfs_rq->tasks); |
634 | } | 685 | } |
635 | cfs_rq->nr_running++; | 686 | cfs_rq->nr_running++; |
636 | se->on_rq = 1; | ||
637 | } | 687 | } |
638 | 688 | ||
639 | static void | 689 | static void |
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
647 | list_del_init(&se->group_node); | 697 | list_del_init(&se->group_node); |
648 | } | 698 | } |
649 | cfs_rq->nr_running--; | 699 | cfs_rq->nr_running--; |
650 | se->on_rq = 0; | ||
651 | } | 700 | } |
652 | 701 | ||
702 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
703 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
704 | int global_update) | ||
705 | { | ||
706 | struct task_group *tg = cfs_rq->tg; | ||
707 | long load_avg; | ||
708 | |||
709 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
710 | load_avg -= cfs_rq->load_contribution; | ||
711 | |||
712 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
713 | atomic_add(load_avg, &tg->load_weight); | ||
714 | cfs_rq->load_contribution += load_avg; | ||
715 | } | ||
716 | } | ||
717 | |||
718 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
719 | { | ||
720 | u64 period = sysctl_sched_shares_window; | ||
721 | u64 now, delta; | ||
722 | unsigned long load = cfs_rq->load.weight; | ||
723 | |||
724 | if (!cfs_rq) | ||
725 | return; | ||
726 | |||
727 | now = rq_of(cfs_rq)->clock; | ||
728 | delta = now - cfs_rq->load_stamp; | ||
729 | |||
730 | /* truncate load history at 4 idle periods */ | ||
731 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
732 | now - cfs_rq->load_last > 4 * period) { | ||
733 | cfs_rq->load_period = 0; | ||
734 | cfs_rq->load_avg = 0; | ||
735 | } | ||
736 | |||
737 | cfs_rq->load_stamp = now; | ||
738 | cfs_rq->load_unacc_exec_time = 0; | ||
739 | cfs_rq->load_period += delta; | ||
740 | if (load) { | ||
741 | cfs_rq->load_last = now; | ||
742 | cfs_rq->load_avg += delta * load; | ||
743 | } | ||
744 | |||
745 | /* consider updating load contribution on each fold or truncate */ | ||
746 | if (global_update || cfs_rq->load_period > period | ||
747 | || !cfs_rq->load_period) | ||
748 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
749 | |||
750 | while (cfs_rq->load_period > period) { | ||
751 | /* | ||
752 | * Inline assembly required to prevent the compiler | ||
753 | * optimising this loop into a divmod call. | ||
754 | * See __iter_div_u64_rem() for another example of this. | ||
755 | */ | ||
756 | asm("" : "+rm" (cfs_rq->load_period)); | ||
757 | cfs_rq->load_period /= 2; | ||
758 | cfs_rq->load_avg /= 2; | ||
759 | } | ||
760 | |||
761 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
762 | list_del_leaf_cfs_rq(cfs_rq); | ||
763 | } | ||
764 | |||
765 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
766 | unsigned long weight) | ||
767 | { | ||
768 | if (se->on_rq) { | ||
769 | /* commit outstanding execution time */ | ||
770 | if (cfs_rq->curr == se) | ||
771 | update_curr(cfs_rq); | ||
772 | account_entity_dequeue(cfs_rq, se); | ||
773 | } | ||
774 | |||
775 | update_load_set(&se->load, weight); | ||
776 | |||
777 | if (se->on_rq) | ||
778 | account_entity_enqueue(cfs_rq, se); | ||
779 | } | ||
780 | |||
781 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | ||
782 | { | ||
783 | struct task_group *tg; | ||
784 | struct sched_entity *se; | ||
785 | long load_weight, load, shares; | ||
786 | |||
787 | if (!cfs_rq) | ||
788 | return; | ||
789 | |||
790 | tg = cfs_rq->tg; | ||
791 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | ||
792 | if (!se) | ||
793 | return; | ||
794 | |||
795 | load = cfs_rq->load.weight + weight_delta; | ||
796 | |||
797 | load_weight = atomic_read(&tg->load_weight); | ||
798 | load_weight -= cfs_rq->load_contribution; | ||
799 | load_weight += load; | ||
800 | |||
801 | shares = (tg->shares * load); | ||
802 | if (load_weight) | ||
803 | shares /= load_weight; | ||
804 | |||
805 | if (shares < MIN_SHARES) | ||
806 | shares = MIN_SHARES; | ||
807 | if (shares > tg->shares) | ||
808 | shares = tg->shares; | ||
809 | |||
810 | reweight_entity(cfs_rq_of(se), se, shares); | ||
811 | } | ||
812 | |||
813 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
814 | { | ||
815 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
816 | update_cfs_load(cfs_rq, 0); | ||
817 | update_cfs_shares(cfs_rq, 0); | ||
818 | } | ||
819 | } | ||
820 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
821 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
822 | { | ||
823 | } | ||
824 | |||
825 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | ||
826 | { | ||
827 | } | ||
828 | |||
829 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
830 | { | ||
831 | } | ||
832 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
833 | |||
653 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 834 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
654 | { | 835 | { |
655 | #ifdef CONFIG_SCHEDSTATS | 836 | #ifdef CONFIG_SCHEDSTATS |
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
771 | * Update run-time statistics of the 'current'. | 952 | * Update run-time statistics of the 'current'. |
772 | */ | 953 | */ |
773 | update_curr(cfs_rq); | 954 | update_curr(cfs_rq); |
955 | update_cfs_load(cfs_rq, 0); | ||
956 | update_cfs_shares(cfs_rq, se->load.weight); | ||
774 | account_entity_enqueue(cfs_rq, se); | 957 | account_entity_enqueue(cfs_rq, se); |
775 | 958 | ||
776 | if (flags & ENQUEUE_WAKEUP) { | 959 | if (flags & ENQUEUE_WAKEUP) { |
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
782 | check_spread(cfs_rq, se); | 965 | check_spread(cfs_rq, se); |
783 | if (se != cfs_rq->curr) | 966 | if (se != cfs_rq->curr) |
784 | __enqueue_entity(cfs_rq, se); | 967 | __enqueue_entity(cfs_rq, se); |
968 | se->on_rq = 1; | ||
969 | |||
970 | if (cfs_rq->nr_running == 1) | ||
971 | list_add_leaf_cfs_rq(cfs_rq); | ||
785 | } | 972 | } |
786 | 973 | ||
787 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 974 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
825 | 1012 | ||
826 | if (se != cfs_rq->curr) | 1013 | if (se != cfs_rq->curr) |
827 | __dequeue_entity(cfs_rq, se); | 1014 | __dequeue_entity(cfs_rq, se); |
1015 | se->on_rq = 0; | ||
1016 | update_cfs_load(cfs_rq, 0); | ||
828 | account_entity_dequeue(cfs_rq, se); | 1017 | account_entity_dequeue(cfs_rq, se); |
829 | update_min_vruntime(cfs_rq); | 1018 | update_min_vruntime(cfs_rq); |
1019 | update_cfs_shares(cfs_rq, 0); | ||
830 | 1020 | ||
831 | /* | 1021 | /* |
832 | * Normalize the entity after updating the min_vruntime because the | 1022 | * Normalize the entity after updating the min_vruntime because the |
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
955 | */ | 1145 | */ |
956 | update_curr(cfs_rq); | 1146 | update_curr(cfs_rq); |
957 | 1147 | ||
1148 | /* | ||
1149 | * Update share accounting for long-running entities. | ||
1150 | */ | ||
1151 | update_entity_shares_tick(cfs_rq); | ||
1152 | |||
958 | #ifdef CONFIG_SCHED_HRTICK | 1153 | #ifdef CONFIG_SCHED_HRTICK |
959 | /* | 1154 | /* |
960 | * queued ticks are scheduled to match the slice, so don't bother | 1155 | * queued ticks are scheduled to match the slice, so don't bother |
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1055 | flags = ENQUEUE_WAKEUP; | 1250 | flags = ENQUEUE_WAKEUP; |
1056 | } | 1251 | } |
1057 | 1252 | ||
1253 | for_each_sched_entity(se) { | ||
1254 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1255 | |||
1256 | update_cfs_load(cfs_rq, 0); | ||
1257 | update_cfs_shares(cfs_rq, 0); | ||
1258 | } | ||
1259 | |||
1058 | hrtick_update(rq); | 1260 | hrtick_update(rq); |
1059 | } | 1261 | } |
1060 | 1262 | ||
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1071 | for_each_sched_entity(se) { | 1273 | for_each_sched_entity(se) { |
1072 | cfs_rq = cfs_rq_of(se); | 1274 | cfs_rq = cfs_rq_of(se); |
1073 | dequeue_entity(cfs_rq, se, flags); | 1275 | dequeue_entity(cfs_rq, se, flags); |
1276 | |||
1074 | /* Don't dequeue parent if it has other entities besides us */ | 1277 | /* Don't dequeue parent if it has other entities besides us */ |
1075 | if (cfs_rq->load.weight) | 1278 | if (cfs_rq->load.weight) |
1076 | break; | 1279 | break; |
1077 | flags |= DEQUEUE_SLEEP; | 1280 | flags |= DEQUEUE_SLEEP; |
1078 | } | 1281 | } |
1079 | 1282 | ||
1283 | for_each_sched_entity(se) { | ||
1284 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1285 | |||
1286 | update_cfs_load(cfs_rq, 0); | ||
1287 | update_cfs_shares(cfs_rq, 0); | ||
1288 | } | ||
1289 | |||
1080 | hrtick_update(rq); | 1290 | hrtick_update(rq); |
1081 | } | 1291 | } |
1082 | 1292 | ||
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) | |||
1143 | * Adding load to a group doesn't make a group heavier, but can cause movement | 1353 | * Adding load to a group doesn't make a group heavier, but can cause movement |
1144 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 1354 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
1145 | * can calculate the shift in shares. | 1355 | * can calculate the shift in shares. |
1146 | * | ||
1147 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
1148 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
1149 | * this change. | ||
1150 | * | ||
1151 | * We compensate this by not only taking the current delta into account, but | ||
1152 | * also considering the delta between when the shares were last adjusted and | ||
1153 | * now. | ||
1154 | * | ||
1155 | * We still saw a performance dip, some tracing learned us that between | ||
1156 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
1157 | * significantly. Therefore try to bias the error in direction of failing | ||
1158 | * the affine wakeup. | ||
1159 | * | ||
1160 | */ | 1356 | */ |
1161 | static long effective_load(struct task_group *tg, int cpu, | 1357 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
1162 | long wl, long wg) | ||
1163 | { | 1358 | { |
1164 | struct sched_entity *se = tg->se[cpu]; | 1359 | struct sched_entity *se = tg->se[cpu]; |
1165 | 1360 | ||
1166 | if (!tg->parent) | 1361 | if (!tg->parent) |
1167 | return wl; | 1362 | return wl; |
1168 | 1363 | ||
1169 | /* | ||
1170 | * By not taking the decrease of shares on the other cpu into | ||
1171 | * account our error leans towards reducing the affine wakeups. | ||
1172 | */ | ||
1173 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
1174 | return wl; | ||
1175 | |||
1176 | for_each_sched_entity(se) { | 1364 | for_each_sched_entity(se) { |
1177 | long S, rw, s, a, b; | 1365 | long S, rw, s, a, b; |
1178 | long more_w; | ||
1179 | |||
1180 | /* | ||
1181 | * Instead of using this increment, also add the difference | ||
1182 | * between when the shares were last updated and now. | ||
1183 | */ | ||
1184 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1185 | wl += more_w; | ||
1186 | wg += more_w; | ||
1187 | 1366 | ||
1188 | S = se->my_q->tg->shares; | 1367 | S = se->my_q->tg->shares; |
1189 | s = se->my_q->shares; | 1368 | s = se->load.weight; |
1190 | rw = se->my_q->rq_weight; | 1369 | rw = se->my_q->load.weight; |
1191 | 1370 | ||
1192 | a = S*(rw + wl); | 1371 | a = S*(rw + wl); |
1193 | b = S*rw + s*wg; | 1372 | b = S*rw + s*wg; |
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1508 | sd = tmp; | 1687 | sd = tmp; |
1509 | } | 1688 | } |
1510 | 1689 | ||
1511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1512 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
1513 | /* | ||
1514 | * Pick the largest domain to update shares over | ||
1515 | */ | ||
1516 | tmp = sd; | ||
1517 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) | ||
1518 | tmp = affine_sd; | ||
1519 | |||
1520 | if (tmp) { | ||
1521 | raw_spin_unlock(&rq->lock); | ||
1522 | update_shares(tmp); | ||
1523 | raw_spin_lock(&rq->lock); | ||
1524 | } | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1528 | if (affine_sd) { | 1690 | if (affine_sd) { |
1529 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1691 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1530 | return select_idle_sibling(p, cpu); | 1692 | return select_idle_sibling(p, cpu); |
@@ -1758,10 +1920,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
1758 | set_task_cpu(p, this_cpu); | 1920 | set_task_cpu(p, this_cpu); |
1759 | activate_task(this_rq, p, 0); | 1921 | activate_task(this_rq, p, 0); |
1760 | check_preempt_curr(this_rq, p, 0); | 1922 | check_preempt_curr(this_rq, p, 0); |
1761 | |||
1762 | /* re-arm NEWIDLE balancing when moving tasks */ | ||
1763 | src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
1764 | this_rq->idle_stamp = 0; | ||
1765 | } | 1923 | } |
1766 | 1924 | ||
1767 | /* | 1925 | /* |
@@ -1913,6 +2071,48 @@ out: | |||
1913 | } | 2071 | } |
1914 | 2072 | ||
1915 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2073 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2074 | /* | ||
2075 | * update tg->load_weight by folding this cpu's load_avg | ||
2076 | */ | ||
2077 | static int update_shares_cpu(struct task_group *tg, int cpu) | ||
2078 | { | ||
2079 | struct cfs_rq *cfs_rq; | ||
2080 | unsigned long flags; | ||
2081 | struct rq *rq; | ||
2082 | |||
2083 | if (!tg->se[cpu]) | ||
2084 | return 0; | ||
2085 | |||
2086 | rq = cpu_rq(cpu); | ||
2087 | cfs_rq = tg->cfs_rq[cpu]; | ||
2088 | |||
2089 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
2090 | |||
2091 | update_rq_clock(rq); | ||
2092 | update_cfs_load(cfs_rq, 1); | ||
2093 | |||
2094 | /* | ||
2095 | * We need to update shares after updating tg->load_weight in | ||
2096 | * order to adjust the weight of groups with long running tasks. | ||
2097 | */ | ||
2098 | update_cfs_shares(cfs_rq, 0); | ||
2099 | |||
2100 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
2101 | |||
2102 | return 0; | ||
2103 | } | ||
2104 | |||
2105 | static void update_shares(int cpu) | ||
2106 | { | ||
2107 | struct cfs_rq *cfs_rq; | ||
2108 | struct rq *rq = cpu_rq(cpu); | ||
2109 | |||
2110 | rcu_read_lock(); | ||
2111 | for_each_leaf_cfs_rq(rq, cfs_rq) | ||
2112 | update_shares_cpu(cfs_rq->tg, cpu); | ||
2113 | rcu_read_unlock(); | ||
2114 | } | ||
2115 | |||
1916 | static unsigned long | 2116 | static unsigned long |
1917 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2117 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1918 | unsigned long max_load_move, | 2118 | unsigned long max_load_move, |
@@ -1960,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1960 | return max_load_move - rem_load_move; | 2160 | return max_load_move - rem_load_move; |
1961 | } | 2161 | } |
1962 | #else | 2162 | #else |
2163 | static inline void update_shares(int cpu) | ||
2164 | { | ||
2165 | } | ||
2166 | |||
1963 | static unsigned long | 2167 | static unsigned long |
1964 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2168 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1965 | unsigned long max_load_move, | 2169 | unsigned long max_load_move, |
@@ -3036,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3036 | schedstat_inc(sd, lb_count[idle]); | 3240 | schedstat_inc(sd, lb_count[idle]); |
3037 | 3241 | ||
3038 | redo: | 3242 | redo: |
3039 | update_shares(sd); | ||
3040 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3243 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3041 | cpus, balance); | 3244 | cpus, balance); |
3042 | 3245 | ||
@@ -3178,8 +3381,6 @@ out_one_pinned: | |||
3178 | else | 3381 | else |
3179 | ld_moved = 0; | 3382 | ld_moved = 0; |
3180 | out: | 3383 | out: |
3181 | if (ld_moved) | ||
3182 | update_shares(sd); | ||
3183 | return ld_moved; | 3384 | return ld_moved; |
3184 | } | 3385 | } |
3185 | 3386 | ||
@@ -3203,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3203 | */ | 3404 | */ |
3204 | raw_spin_unlock(&this_rq->lock); | 3405 | raw_spin_unlock(&this_rq->lock); |
3205 | 3406 | ||
3407 | update_shares(this_cpu); | ||
3206 | for_each_domain(this_cpu, sd) { | 3408 | for_each_domain(this_cpu, sd) { |
3207 | unsigned long interval; | 3409 | unsigned long interval; |
3208 | int balance = 1; | 3410 | int balance = 1; |
@@ -3219,8 +3421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3219 | interval = msecs_to_jiffies(sd->balance_interval); | 3421 | interval = msecs_to_jiffies(sd->balance_interval); |
3220 | if (time_after(next_balance, sd->last_balance + interval)) | 3422 | if (time_after(next_balance, sd->last_balance + interval)) |
3221 | next_balance = sd->last_balance + interval; | 3423 | next_balance = sd->last_balance + interval; |
3222 | if (pulled_task) | 3424 | if (pulled_task) { |
3425 | this_rq->idle_stamp = 0; | ||
3223 | break; | 3426 | break; |
3427 | } | ||
3224 | } | 3428 | } |
3225 | 3429 | ||
3226 | raw_spin_lock(&this_rq->lock); | 3430 | raw_spin_lock(&this_rq->lock); |
@@ -3571,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3571 | int update_next_balance = 0; | 3775 | int update_next_balance = 0; |
3572 | int need_serialize; | 3776 | int need_serialize; |
3573 | 3777 | ||
3778 | update_shares(cpu); | ||
3779 | |||
3574 | for_each_domain(cpu, sd) { | 3780 | for_each_domain(cpu, sd) { |
3575 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3781 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3576 | continue; | 3782 | continue; |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a..68e69acc29b 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) | |||
52 | SCHED_FEAT(HRTICK, 0) | 52 | SCHED_FEAT(HRTICK, 0) |
53 | SCHED_FEAT(DOUBLE_TICK, 0) | 53 | SCHED_FEAT(DOUBLE_TICK, 0) |
54 | SCHED_FEAT(LB_BIAS, 1) | 54 | SCHED_FEAT(LB_BIAS, 1) |
55 | SCHED_FEAT(LB_SHARES_UPDATE, 1) | ||
56 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
57 | 55 | ||
58 | /* | 56 | /* |
59 | * Spin-wait on mutex acquisition when the mutex owner is running on | 57 | * Spin-wait on mutex acquisition when the mutex owner is running on |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9..c914ec747ca 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
184 | } | 184 | } |
185 | 185 | ||
186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
187 | { | ||
188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | ||
189 | &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); | ||
190 | } | ||
191 | |||
192 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
193 | { | ||
194 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
195 | } | ||
196 | |||
186 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 197 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
187 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | 198 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) |
188 | 199 | ||
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
276 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 287 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
277 | } | 288 | } |
278 | 289 | ||
290 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
291 | { | ||
292 | } | ||
293 | |||
294 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
295 | { | ||
296 | } | ||
297 | |||
279 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 298 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
280 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | 299 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) |
281 | 300 | ||
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 844 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
826 | return; | 845 | return; |
827 | 846 | ||
847 | if (!rt_rq->rt_nr_running) | ||
848 | list_add_leaf_rt_rq(rt_rq); | ||
849 | |||
828 | if (head) | 850 | if (head) |
829 | list_add(&rt_se->run_list, queue); | 851 | list_add(&rt_se->run_list, queue); |
830 | else | 852 | else |
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
844 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 866 | __clear_bit(rt_se_prio(rt_se), array->bitmap); |
845 | 867 | ||
846 | dec_rt_tasks(rt_se, rt_rq); | 868 | dec_rt_tasks(rt_se, rt_rq); |
869 | if (!rt_rq->rt_nr_running) | ||
870 | list_del_leaf_rt_rq(rt_rq); | ||
847 | } | 871 | } |
848 | 872 | ||
849 | /* | 873 | /* |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 18f4be0d5fe..c10150cb456 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
853 | cpumask_any(cpu_online_mask)); | 853 | cpumask_any(cpu_online_mask)); |
854 | case CPU_DEAD: | 854 | case CPU_DEAD: |
855 | case CPU_DEAD_FROZEN: { | 855 | case CPU_DEAD_FROZEN: { |
856 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 856 | static const struct sched_param param = { |
857 | .sched_priority = MAX_RT_PRIO-1 | ||
858 | }; | ||
857 | 859 | ||
858 | p = per_cpu(ksoftirqd, hotcpu); | 860 | p = per_cpu(ksoftirqd, hotcpu); |
859 | per_cpu(ksoftirqd, hotcpu) = NULL; | 861 | per_cpu(ksoftirqd, hotcpu) = NULL; |
diff --git a/kernel/srcu.c b/kernel/srcu.c index c71e0750053..98d8c1e80ed 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
33 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
34 | #include <linux/delay.h> | ||
34 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
35 | 36 | ||
36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 37 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
203 | * all srcu_read_lock() calls using the old counters have completed. | 204 | * all srcu_read_lock() calls using the old counters have completed. |
204 | * Their corresponding critical sections might well be still | 205 | * Their corresponding critical sections might well be still |
205 | * executing, but the srcu_read_lock() primitives themselves | 206 | * executing, but the srcu_read_lock() primitives themselves |
206 | * will have finished executing. | 207 | * will have finished executing. We initially give readers |
208 | * an arbitrarily chosen 10 microseconds to get out of their | ||
209 | * SRCU read-side critical sections, then loop waiting 1/HZ | ||
210 | * seconds per iteration. | ||
207 | */ | 211 | */ |
208 | 212 | ||
213 | if (srcu_readers_active_idx(sp, idx)) | ||
214 | udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); | ||
209 | while (srcu_readers_active_idx(sp, idx)) | 215 | while (srcu_readers_active_idx(sp, idx)) |
210 | schedule_timeout_interruptible(1); | 216 | schedule_timeout_interruptible(1); |
211 | 217 | ||
diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a..2745dcdb6c6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid) | |||
1080 | err = session; | 1080 | err = session; |
1081 | out: | 1081 | out: |
1082 | write_unlock_irq(&tasklist_lock); | 1082 | write_unlock_irq(&tasklist_lock); |
1083 | if (err > 0) | 1083 | if (err > 0) { |
1084 | proc_sid_connector(group_leader); | 1084 | proc_sid_connector(group_leader); |
1085 | sched_autogroup_create_attach(group_leader); | ||
1086 | } | ||
1085 | return err; | 1087 | return err; |
1086 | } | 1088 | } |
1087 | 1089 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5abfa151855..ae5cbb1e3ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ | |||
259 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 259 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
262 | static int min_sched_shares_ratelimit = 100000; /* 100 usec */ | ||
263 | static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ | ||
264 | #endif | 262 | #endif |
265 | 263 | ||
266 | #ifdef CONFIG_COMPACTION | 264 | #ifdef CONFIG_COMPACTION |
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = { | |||
305 | .extra2 = &max_wakeup_granularity_ns, | 303 | .extra2 = &max_wakeup_granularity_ns, |
306 | }, | 304 | }, |
307 | { | 305 | { |
308 | .procname = "sched_shares_ratelimit", | ||
309 | .data = &sysctl_sched_shares_ratelimit, | ||
310 | .maxlen = sizeof(unsigned int), | ||
311 | .mode = 0644, | ||
312 | .proc_handler = sched_proc_update_handler, | ||
313 | .extra1 = &min_sched_shares_ratelimit, | ||
314 | .extra2 = &max_sched_shares_ratelimit, | ||
315 | }, | ||
316 | { | ||
317 | .procname = "sched_tunable_scaling", | 306 | .procname = "sched_tunable_scaling", |
318 | .data = &sysctl_sched_tunable_scaling, | 307 | .data = &sysctl_sched_tunable_scaling, |
319 | .maxlen = sizeof(enum sched_tunable_scaling), | 308 | .maxlen = sizeof(enum sched_tunable_scaling), |
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = { | |||
323 | .extra2 = &max_sched_tunable_scaling, | 312 | .extra2 = &max_sched_tunable_scaling, |
324 | }, | 313 | }, |
325 | { | 314 | { |
326 | .procname = "sched_shares_thresh", | ||
327 | .data = &sysctl_sched_shares_thresh, | ||
328 | .maxlen = sizeof(unsigned int), | ||
329 | .mode = 0644, | ||
330 | .proc_handler = proc_dointvec_minmax, | ||
331 | .extra1 = &zero, | ||
332 | }, | ||
333 | { | ||
334 | .procname = "sched_migration_cost", | 315 | .procname = "sched_migration_cost", |
335 | .data = &sysctl_sched_migration_cost, | 316 | .data = &sysctl_sched_migration_cost, |
336 | .maxlen = sizeof(unsigned int), | 317 | .maxlen = sizeof(unsigned int), |
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = { | |||
352 | .proc_handler = proc_dointvec, | 333 | .proc_handler = proc_dointvec, |
353 | }, | 334 | }, |
354 | { | 335 | { |
336 | .procname = "sched_shares_window", | ||
337 | .data = &sysctl_sched_shares_window, | ||
338 | .maxlen = sizeof(unsigned int), | ||
339 | .mode = 0644, | ||
340 | .proc_handler = proc_dointvec, | ||
341 | }, | ||
342 | { | ||
355 | .procname = "timer_migration", | 343 | .procname = "timer_migration", |
356 | .data = &sysctl_timer_migration, | 344 | .data = &sysctl_timer_migration, |
357 | .maxlen = sizeof(unsigned int), | 345 | .maxlen = sizeof(unsigned int), |
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = { | |||
382 | .mode = 0644, | 370 | .mode = 0644, |
383 | .proc_handler = proc_dointvec, | 371 | .proc_handler = proc_dointvec, |
384 | }, | 372 | }, |
373 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
374 | { | ||
375 | .procname = "sched_autogroup_enabled", | ||
376 | .data = &sysctl_sched_autogroup_enabled, | ||
377 | .maxlen = sizeof(unsigned int), | ||
378 | .mode = 0644, | ||
379 | .proc_handler = proc_dointvec, | ||
380 | .extra1 = &zero, | ||
381 | .extra2 = &one, | ||
382 | }, | ||
383 | #endif | ||
385 | #ifdef CONFIG_PROVE_LOCKING | 384 | #ifdef CONFIG_PROVE_LOCKING |
386 | { | 385 | { |
387 | .procname = "prove_locking", | 386 | .procname = "prove_locking", |
@@ -745,21 +744,21 @@ static struct ctl_table kern_table[] = { | |||
745 | .extra1 = &zero, | 744 | .extra1 = &zero, |
746 | .extra2 = &one, | 745 | .extra2 = &one, |
747 | }, | 746 | }, |
748 | #endif | ||
749 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
750 | { | 747 | { |
751 | .procname = "unknown_nmi_panic", | 748 | .procname = "nmi_watchdog", |
752 | .data = &unknown_nmi_panic, | 749 | .data = &watchdog_enabled, |
753 | .maxlen = sizeof (int), | 750 | .maxlen = sizeof (int), |
754 | .mode = 0644, | 751 | .mode = 0644, |
755 | .proc_handler = proc_dointvec, | 752 | .proc_handler = proc_dowatchdog_enabled, |
756 | }, | 753 | }, |
754 | #endif | ||
755 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
757 | { | 756 | { |
758 | .procname = "nmi_watchdog", | 757 | .procname = "unknown_nmi_panic", |
759 | .data = &nmi_watchdog_enabled, | 758 | .data = &unknown_nmi_panic, |
760 | .maxlen = sizeof (int), | 759 | .maxlen = sizeof (int), |
761 | .mode = 0644, | 760 | .mode = 0644, |
762 | .proc_handler = proc_nmi_enabled, | 761 | .proc_handler = proc_dointvec, |
763 | }, | 762 | }, |
764 | #endif | 763 | #endif |
765 | #if defined(CONFIG_X86) | 764 | #if defined(CONFIG_X86) |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c578606..4b2545a136f 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { | |||
136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, | 136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, |
137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
139 | { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, | ||
140 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
141 | {} | 140 | {} |
142 | }; | 141 | }; |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index c8231fb1570..3308fd7f1b5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask) | |||
349 | return ret; | 349 | return ret; |
350 | } | 350 | } |
351 | 351 | ||
352 | #ifdef CONFIG_IA64 | ||
353 | #define TASKSTATS_NEEDS_PADDING 1 | ||
354 | #endif | ||
355 | |||
352 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | 356 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) |
353 | { | 357 | { |
354 | struct nlattr *na, *ret; | 358 | struct nlattr *na, *ret; |
355 | int aggr; | 359 | int aggr; |
356 | 360 | ||
357 | /* If we don't pad, we end up with alignment on a 4 byte boundary. | ||
358 | * This causes lots of runtime warnings on systems requiring 8 byte | ||
359 | * alignment */ | ||
360 | u32 pids[2] = { pid, 0 }; | ||
361 | int pid_size = ALIGN(sizeof(pid), sizeof(long)); | ||
362 | |||
363 | aggr = (type == TASKSTATS_TYPE_PID) | 361 | aggr = (type == TASKSTATS_TYPE_PID) |
364 | ? TASKSTATS_TYPE_AGGR_PID | 362 | ? TASKSTATS_TYPE_AGGR_PID |
365 | : TASKSTATS_TYPE_AGGR_TGID; | 363 | : TASKSTATS_TYPE_AGGR_TGID; |
366 | 364 | ||
365 | /* | ||
366 | * The taskstats structure is internally aligned on 8 byte | ||
367 | * boundaries but the layout of the aggregate reply, with | ||
368 | * two NLA headers and the pid (each 4 bytes), actually | ||
369 | * forces the entire structure to be unaligned. This causes | ||
370 | * the kernel to issue unaligned access warnings on some | ||
371 | * architectures like ia64. Unfortunately, some software out there | ||
372 | * doesn't properly unroll the NLA packet and assumes that the start | ||
373 | * of the taskstats structure will always be 20 bytes from the start | ||
374 | * of the netlink payload. Aligning the start of the taskstats | ||
375 | * structure breaks this software, which we don't want. So, for now | ||
376 | * the alignment only happens on architectures that require it | ||
377 | * and those users will have to update to fixed versions of those | ||
378 | * packages. Space is reserved in the packet only when needed. | ||
379 | * This ifdef should be removed in several years e.g. 2012 once | ||
380 | * we can be confident that fixed versions are installed on most | ||
381 | * systems. We add the padding before the aggregate since the | ||
382 | * aggregate is already a defined type. | ||
383 | */ | ||
384 | #ifdef TASKSTATS_NEEDS_PADDING | ||
385 | if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) | ||
386 | goto err; | ||
387 | #endif | ||
367 | na = nla_nest_start(skb, aggr); | 388 | na = nla_nest_start(skb, aggr); |
368 | if (!na) | 389 | if (!na) |
369 | goto err; | 390 | goto err; |
370 | if (nla_put(skb, type, pid_size, pids) < 0) | 391 | |
392 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | ||
371 | goto err; | 393 | goto err; |
372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | 394 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); |
373 | if (!ret) | 395 | if (!ret) |
@@ -456,6 +478,18 @@ out: | |||
456 | return rc; | 478 | return rc; |
457 | } | 479 | } |
458 | 480 | ||
481 | static size_t taskstats_packet_size(void) | ||
482 | { | ||
483 | size_t size; | ||
484 | |||
485 | size = nla_total_size(sizeof(u32)) + | ||
486 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
487 | #ifdef TASKSTATS_NEEDS_PADDING | ||
488 | size += nla_total_size(0); /* Padding for alignment */ | ||
489 | #endif | ||
490 | return size; | ||
491 | } | ||
492 | |||
459 | static int cmd_attr_pid(struct genl_info *info) | 493 | static int cmd_attr_pid(struct genl_info *info) |
460 | { | 494 | { |
461 | struct taskstats *stats; | 495 | struct taskstats *stats; |
@@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info) | |||
464 | u32 pid; | 498 | u32 pid; |
465 | int rc; | 499 | int rc; |
466 | 500 | ||
467 | size = nla_total_size(sizeof(u32)) + | 501 | size = taskstats_packet_size(); |
468 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
469 | 502 | ||
470 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | 503 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
471 | if (rc < 0) | 504 | if (rc < 0) |
@@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info) | |||
494 | u32 tgid; | 527 | u32 tgid; |
495 | int rc; | 528 | int rc; |
496 | 529 | ||
497 | size = nla_total_size(sizeof(u32)) + | 530 | size = taskstats_packet_size(); |
498 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
499 | 531 | ||
500 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | 532 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
501 | if (rc < 0) | 533 | if (rc < 0) |
@@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
570 | /* | 602 | /* |
571 | * Size includes space for nested attributes | 603 | * Size includes space for nested attributes |
572 | */ | 604 | */ |
573 | size = nla_total_size(sizeof(u32)) + | 605 | size = taskstats_packet_size(); |
574 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
575 | 606 | ||
576 | is_thread_group = !!taskstats_tgid_alloc(tsk); | 607 | is_thread_group = !!taskstats_tgid_alloc(tsk); |
577 | if (is_thread_group) { | 608 | if (is_thread_group) { |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4..df140cd3ea4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
152 | */ | 152 | */ |
153 | for (sft = 32; sft > 0; sft--) { | 153 | for (sft = 32; sft > 0; sft--) { |
154 | tmp = (u64) to << sft; | 154 | tmp = (u64) to << sft; |
155 | tmp += from / 2; | ||
155 | do_div(tmp, from); | 156 | do_div(tmp, from); |
156 | if ((tmp >> sftacc) == 0) | 157 | if ((tmp >> sftacc) == 0) |
157 | break; | 158 | break; |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176c..a9ae369925c 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
23 | #include <linux/math64.h> | 23 | #include <linux/math64.h> |
24 | #include <linux/kernel.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * fixed point arithmetic scale factor for skew | 27 | * fixed point arithmetic scale factor for skew |
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, | |||
57 | int index; | 58 | int index; |
58 | int num_samples = sync->num_samples; | 59 | int num_samples = sync->num_samples; |
59 | 60 | ||
60 | if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { | 61 | if (num_samples > ARRAY_SIZE(buffer)) { |
61 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | 62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); |
62 | if (!samples) { | 63 | if (!samples) { |
63 | samples = buffer; | 64 | samples = buffer; |
64 | num_samples = sizeof(buffer)/sizeof(buffer[0]); | 65 | num_samples = ARRAY_SIZE(buffer); |
65 | } | 66 | } |
66 | } else { | 67 | } else { |
67 | samples = buffer; | 68 | samples = buffer; |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f7..5bb86da8200 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -32,6 +32,8 @@ struct timekeeper { | |||
32 | cycle_t cycle_interval; | 32 | cycle_t cycle_interval; |
33 | /* Number of clock shifted nano seconds in one NTP interval. */ | 33 | /* Number of clock shifted nano seconds in one NTP interval. */ |
34 | u64 xtime_interval; | 34 | u64 xtime_interval; |
35 | /* shifted nano seconds left over when rounding cycle_interval */ | ||
36 | s64 xtime_remainder; | ||
35 | /* Raw nano seconds accumulated per NTP interval. */ | 37 | /* Raw nano seconds accumulated per NTP interval. */ |
36 | u32 raw_interval; | 38 | u32 raw_interval; |
37 | 39 | ||
@@ -62,7 +64,7 @@ struct timekeeper timekeeper; | |||
62 | static void timekeeper_setup_internals(struct clocksource *clock) | 64 | static void timekeeper_setup_internals(struct clocksource *clock) |
63 | { | 65 | { |
64 | cycle_t interval; | 66 | cycle_t interval; |
65 | u64 tmp; | 67 | u64 tmp, ntpinterval; |
66 | 68 | ||
67 | timekeeper.clock = clock; | 69 | timekeeper.clock = clock; |
68 | clock->cycle_last = clock->read(clock); | 70 | clock->cycle_last = clock->read(clock); |
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
70 | /* Do the ns -> cycle conversion first, using original mult */ | 72 | /* Do the ns -> cycle conversion first, using original mult */ |
71 | tmp = NTP_INTERVAL_LENGTH; | 73 | tmp = NTP_INTERVAL_LENGTH; |
72 | tmp <<= clock->shift; | 74 | tmp <<= clock->shift; |
75 | ntpinterval = tmp; | ||
73 | tmp += clock->mult/2; | 76 | tmp += clock->mult/2; |
74 | do_div(tmp, clock->mult); | 77 | do_div(tmp, clock->mult); |
75 | if (tmp == 0) | 78 | if (tmp == 0) |
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
80 | 83 | ||
81 | /* Go back from cycles -> shifted ns */ | 84 | /* Go back from cycles -> shifted ns */ |
82 | timekeeper.xtime_interval = (u64) interval * clock->mult; | 85 | timekeeper.xtime_interval = (u64) interval * clock->mult; |
86 | timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; | ||
83 | timekeeper.raw_interval = | 87 | timekeeper.raw_interval = |
84 | ((u64) interval * clock->mult) >> clock->shift; | 88 | ((u64) interval * clock->mult) >> clock->shift; |
85 | 89 | ||
@@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
719 | 723 | ||
720 | /* Accumulate error between NTP and clock interval */ | 724 | /* Accumulate error between NTP and clock interval */ |
721 | timekeeper.ntp_error += tick_length << shift; | 725 | timekeeper.ntp_error += tick_length << shift; |
722 | timekeeper.ntp_error -= timekeeper.xtime_interval << | 726 | timekeeper.ntp_error -= |
727 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | ||
723 | (timekeeper.ntp_error_shift + shift); | 728 | (timekeeper.ntp_error_shift + shift); |
724 | 729 | ||
725 | return offset; | 730 | return offset; |
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa9..32a19f9397f 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | |||
79 | { | 79 | { |
80 | struct hrtimer *timer, tmp; | 80 | struct hrtimer *timer, tmp; |
81 | unsigned long next = 0, i; | 81 | unsigned long next = 0, i; |
82 | struct rb_node *curr; | 82 | struct timerqueue_node *curr; |
83 | unsigned long flags; | 83 | unsigned long flags; |
84 | 84 | ||
85 | next_one: | 85 | next_one: |
86 | i = 0; | 86 | i = 0; |
87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); | 87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); |
88 | 88 | ||
89 | curr = base->first; | 89 | curr = timerqueue_getnext(&base->active); |
90 | /* | 90 | /* |
91 | * Crude but we have to do this O(N*N) thing, because | 91 | * Crude but we have to do this O(N*N) thing, because |
92 | * we have to unlock the base when printing: | 92 | * we have to unlock the base when printing: |
93 | */ | 93 | */ |
94 | while (curr && i < next) { | 94 | while (curr && i < next) { |
95 | curr = rb_next(curr); | 95 | curr = timerqueue_iterate_next(curr); |
96 | i++; | 96 | i++; |
97 | } | 97 | } |
98 | 98 | ||
99 | if (curr) { | 99 | if (curr) { |
100 | 100 | ||
101 | timer = rb_entry(curr, struct hrtimer, node); | 101 | timer = container_of(curr, struct hrtimer, node); |
102 | tmp = *timer; | 102 | tmp = *timer; |
103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); | 103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); |
104 | 104 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 68a9ae7679b..43ca9936f2d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; | |||
88 | EXPORT_SYMBOL(boot_tvec_bases); | 88 | EXPORT_SYMBOL(boot_tvec_bases); |
89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
90 | 90 | ||
91 | /* | ||
92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | ||
93 | * base in timer_list is guaranteed to be zero. Use the LSB to | ||
94 | * indicate whether the timer is deferrable. | ||
95 | * | ||
96 | * A deferrable timer will work normally when the system is busy, but | ||
97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
99 | * subsequent non-deferrable timer. | ||
100 | */ | ||
101 | #define TBASE_DEFERRABLE_FLAG (0x1) | ||
102 | |||
103 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
104 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
105 | { | 93 | { |
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) | |||
113 | 101 | ||
114 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
115 | { | 103 | { |
116 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | | 104 | timer->base = TBASE_MAKE_DEFERRED(timer->base); |
117 | TBASE_DEFERRABLE_FLAG)); | ||
118 | } | 105 | } |
119 | 106 | ||
120 | static inline void | 107 | static inline void |
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) | |||
343 | } | 330 | } |
344 | EXPORT_SYMBOL_GPL(set_timer_slack); | 331 | EXPORT_SYMBOL_GPL(set_timer_slack); |
345 | 332 | ||
346 | |||
347 | static inline void set_running_timer(struct tvec_base *base, | ||
348 | struct timer_list *timer) | ||
349 | { | ||
350 | #ifdef CONFIG_SMP | ||
351 | base->running_timer = timer; | ||
352 | #endif | ||
353 | } | ||
354 | |||
355 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 333 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
356 | { | 334 | { |
357 | unsigned long expires = timer->expires; | 335 | unsigned long expires = timer->expires; |
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer) | |||
936 | } | 914 | } |
937 | EXPORT_SYMBOL(del_timer); | 915 | EXPORT_SYMBOL(del_timer); |
938 | 916 | ||
939 | #ifdef CONFIG_SMP | ||
940 | /** | 917 | /** |
941 | * try_to_del_timer_sync - Try to deactivate a timer | 918 | * try_to_del_timer_sync - Try to deactivate a timer |
942 | * @timer: timer to del | 919 | * @timer: timer to del |
943 | * | 920 | * |
944 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 921 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
945 | * exit the timer is not queued and the handler is not running on any CPU. | 922 | * exit the timer is not queued and the handler is not running on any CPU. |
946 | * | ||
947 | * It must not be called from interrupt contexts. | ||
948 | */ | 923 | */ |
949 | int try_to_del_timer_sync(struct timer_list *timer) | 924 | int try_to_del_timer_sync(struct timer_list *timer) |
950 | { | 925 | { |
@@ -973,6 +948,7 @@ out: | |||
973 | } | 948 | } |
974 | EXPORT_SYMBOL(try_to_del_timer_sync); | 949 | EXPORT_SYMBOL(try_to_del_timer_sync); |
975 | 950 | ||
951 | #ifdef CONFIG_SMP | ||
976 | /** | 952 | /** |
977 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 953 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
978 | * @timer: the timer to be deactivated | 954 | * @timer: the timer to be deactivated |
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
983 | * | 959 | * |
984 | * Synchronization rules: Callers must prevent restarting of the timer, | 960 | * Synchronization rules: Callers must prevent restarting of the timer, |
985 | * otherwise this function is meaningless. It must not be called from | 961 | * otherwise this function is meaningless. It must not be called from |
986 | * interrupt contexts. The caller must not hold locks which would prevent | 962 | * hardirq contexts. The caller must not hold locks which would prevent |
987 | * completion of the timer's handler. The timer's handler must not call | 963 | * completion of the timer's handler. The timer's handler must not call |
988 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 964 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
989 | * not running on any CPU. | 965 | * not running on any CPU. |
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
993 | int del_timer_sync(struct timer_list *timer) | 969 | int del_timer_sync(struct timer_list *timer) |
994 | { | 970 | { |
995 | #ifdef CONFIG_LOCKDEP | 971 | #ifdef CONFIG_LOCKDEP |
996 | unsigned long flags; | 972 | local_bh_disable(); |
997 | |||
998 | local_irq_save(flags); | ||
999 | lock_map_acquire(&timer->lockdep_map); | 973 | lock_map_acquire(&timer->lockdep_map); |
1000 | lock_map_release(&timer->lockdep_map); | 974 | lock_map_release(&timer->lockdep_map); |
1001 | local_irq_restore(flags); | 975 | local_bh_enable(); |
1002 | #endif | 976 | #endif |
1003 | 977 | /* | |
978 | * don't use it in hardirq context, because it | ||
979 | * could lead to deadlock. | ||
980 | */ | ||
981 | WARN_ON(in_irq()); | ||
1004 | for (;;) { | 982 | for (;;) { |
1005 | int ret = try_to_del_timer_sync(timer); | 983 | int ret = try_to_del_timer_sync(timer); |
1006 | if (ret >= 0) | 984 | if (ret >= 0) |
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1111 | 1089 | ||
1112 | timer_stats_account_timer(timer); | 1090 | timer_stats_account_timer(timer); |
1113 | 1091 | ||
1114 | set_running_timer(base, timer); | 1092 | base->running_timer = timer; |
1115 | detach_timer(timer, 1); | 1093 | detach_timer(timer, 1); |
1116 | 1094 | ||
1117 | spin_unlock_irq(&base->lock); | 1095 | spin_unlock_irq(&base->lock); |
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1119 | spin_lock_irq(&base->lock); | 1097 | spin_lock_irq(&base->lock); |
1120 | } | 1098 | } |
1121 | } | 1099 | } |
1122 | set_running_timer(base, NULL); | 1100 | base->running_timer = NULL; |
1123 | spin_unlock_irq(&base->lock); | 1101 | spin_unlock_irq(&base->lock); |
1124 | } | 1102 | } |
1125 | 1103 | ||
@@ -1249,9 +1227,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
1249 | */ | 1227 | */ |
1250 | unsigned long get_next_timer_interrupt(unsigned long now) | 1228 | unsigned long get_next_timer_interrupt(unsigned long now) |
1251 | { | 1229 | { |
1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1230 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1253 | unsigned long expires; | 1231 | unsigned long expires; |
1254 | 1232 | ||
1233 | /* | ||
1234 | * Pretend that there is no timer pending if the cpu is offline. | ||
1235 | * Possible pending timers will be migrated later to an active cpu. | ||
1236 | */ | ||
1237 | if (cpu_is_offline(smp_processor_id())) | ||
1238 | return now + NEXT_TIMER_MAX_DELTA; | ||
1255 | spin_lock(&base->lock); | 1239 | spin_lock(&base->lock); |
1256 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1240 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
1257 | base->next_timer = __next_timer_interrupt(base); | 1241 | base->next_timer = __next_timer_interrupt(base); |
@@ -1292,7 +1276,7 @@ void update_process_times(int user_tick) | |||
1292 | */ | 1276 | */ |
1293 | static void run_timer_softirq(struct softirq_action *h) | 1277 | static void run_timer_softirq(struct softirq_action *h) |
1294 | { | 1278 | { |
1295 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1279 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1296 | 1280 | ||
1297 | hrtimer_run_pending(); | 1281 | hrtimer_run_pending(); |
1298 | 1282 | ||
@@ -1319,7 +1303,7 @@ void do_timer(unsigned long ticks) | |||
1319 | { | 1303 | { |
1320 | jiffies_64 += ticks; | 1304 | jiffies_64 += ticks; |
1321 | update_wall_time(); | 1305 | update_wall_time(); |
1322 | calc_global_load(); | 1306 | calc_global_load(ticks); |
1323 | } | 1307 | } |
1324 | 1308 | ||
1325 | #ifdef __ARCH_WANT_SYS_ALARM | 1309 | #ifdef __ARCH_WANT_SYS_ALARM |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ea37e2ff416..14674dce77a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -69,6 +69,21 @@ config EVENT_TRACING | |||
69 | select CONTEXT_SWITCH_TRACER | 69 | select CONTEXT_SWITCH_TRACER |
70 | bool | 70 | bool |
71 | 71 | ||
72 | config EVENT_POWER_TRACING_DEPRECATED | ||
73 | depends on EVENT_TRACING | ||
74 | bool "Deprecated power event trace API, to be removed" | ||
75 | default y | ||
76 | help | ||
77 | Provides old power event types: | ||
78 | C-state/idle accounting events: | ||
79 | power:power_start | ||
80 | power:power_end | ||
81 | and old cpufreq accounting event: | ||
82 | power:power_frequency | ||
83 | This is for userspace compatibility | ||
84 | and will vanish after 5 kernel iterations, | ||
85 | namely 2.6.41. | ||
86 | |||
72 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
73 | bool | 88 | bool |
74 | 89 | ||
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a0616..f55fcf61b22 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
@@ -13,5 +13,8 @@ | |||
13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
15 | 15 | ||
16 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); | 16 | #ifdef EVENT_POWER_TRACING_DEPRECATED |
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
18 | #endif | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | ||
17 | 20 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ed509a015d..bd1c35a4fbc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3853 | 3853 | ||
3854 | /* Need to copy one event at a time */ | 3854 | /* Need to copy one event at a time */ |
3855 | do { | 3855 | do { |
3856 | /* We need the size of one event, because | ||
3857 | * rb_advance_reader only advances by one event, | ||
3858 | * whereas rb_event_ts_length may include the size of | ||
3859 | * one or two events. | ||
3860 | * We have already ensured there's enough space if this | ||
3861 | * is a time extend. */ | ||
3862 | size = rb_event_length(event); | ||
3856 | memcpy(bpage->data + pos, rpage->data + rpos, size); | 3863 | memcpy(bpage->data + pos, rpage->data + rpos, size); |
3857 | 3864 | ||
3858 | len -= size; | 3865 | len -= size; |
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3867 | event = rb_reader_event(cpu_buffer); | 3874 | event = rb_reader_event(cpu_buffer); |
3868 | /* Always keep the time extend and data together */ | 3875 | /* Always keep the time extend and data together */ |
3869 | size = rb_event_ts_length(event); | 3876 | size = rb_event_ts_length(event); |
3870 | } while (len > size); | 3877 | } while (len >= size); |
3871 | 3878 | ||
3872 | /* update bpage */ | 3879 | /* update bpage */ |
3873 | local_set(&bpage->commit, pos); | 3880 | local_set(&bpage->commit, pos); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 04208415798..f8cf959bad4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1283,6 +1283,8 @@ void trace_dump_stack(void) | |||
1283 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); | 1283 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); |
1284 | } | 1284 | } |
1285 | 1285 | ||
1286 | static DEFINE_PER_CPU(int, user_stack_count); | ||
1287 | |||
1286 | void | 1288 | void |
1287 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | 1289 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) |
1288 | { | 1290 | { |
@@ -1301,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1301 | if (unlikely(in_nmi())) | 1303 | if (unlikely(in_nmi())) |
1302 | return; | 1304 | return; |
1303 | 1305 | ||
1306 | /* | ||
1307 | * prevent recursion, since the user stack tracing may | ||
1308 | * trigger other kernel events. | ||
1309 | */ | ||
1310 | preempt_disable(); | ||
1311 | if (__this_cpu_read(user_stack_count)) | ||
1312 | goto out; | ||
1313 | |||
1314 | __this_cpu_inc(user_stack_count); | ||
1315 | |||
1316 | |||
1317 | |||
1304 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1318 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
1305 | sizeof(*entry), flags, pc); | 1319 | sizeof(*entry), flags, pc); |
1306 | if (!event) | 1320 | if (!event) |
@@ -1318,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1318 | save_stack_trace_user(&trace); | 1332 | save_stack_trace_user(&trace); |
1319 | if (!filter_check_discard(call, entry, buffer, event)) | 1333 | if (!filter_check_discard(call, entry, buffer, event)) |
1320 | ring_buffer_unlock_commit(buffer, event); | 1334 | ring_buffer_unlock_commit(buffer, event); |
1335 | |||
1336 | __this_cpu_dec(user_stack_count); | ||
1337 | |||
1338 | out: | ||
1339 | preempt_enable(); | ||
1321 | } | 1340 | } |
1322 | 1341 | ||
1323 | #ifdef UNUSED | 1342 | #ifdef UNUSED |
@@ -2319,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, | |||
2319 | return count; | 2338 | return count; |
2320 | } | 2339 | } |
2321 | 2340 | ||
2341 | static loff_t tracing_seek(struct file *file, loff_t offset, int origin) | ||
2342 | { | ||
2343 | if (file->f_mode & FMODE_READ) | ||
2344 | return seq_lseek(file, offset, origin); | ||
2345 | else | ||
2346 | return 0; | ||
2347 | } | ||
2348 | |||
2322 | static const struct file_operations tracing_fops = { | 2349 | static const struct file_operations tracing_fops = { |
2323 | .open = tracing_open, | 2350 | .open = tracing_open, |
2324 | .read = seq_read, | 2351 | .read = seq_read, |
2325 | .write = tracing_write_stub, | 2352 | .write = tracing_write_stub, |
2326 | .llseek = seq_lseek, | 2353 | .llseek = tracing_seek, |
2327 | .release = tracing_release, | 2354 | .release = tracing_release, |
2328 | }; | 2355 | }; |
2329 | 2356 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670..19a359d5e6d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | |||
21 | /* Count the events in use (per event id, not per instance) */ | 21 | /* Count the events in use (per event id, not per instance) */ |
22 | static int total_ref_count; | 22 | static int total_ref_count; |
23 | 23 | ||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | ||
25 | struct perf_event *p_event) | ||
26 | { | ||
27 | /* No tracing, just counting, so no obvious leak */ | ||
28 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | ||
29 | return 0; | ||
30 | |||
31 | /* Some events are ok to be traced by non-root users... */ | ||
32 | if (p_event->attach_state == PERF_ATTACH_TASK) { | ||
33 | if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | /* | ||
38 | * ...otherwise raw tracepoint data can be a severe data leak, | ||
39 | * only allow root to have these. | ||
40 | */ | ||
41 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | ||
42 | return -EPERM; | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 47 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 48 | struct perf_event *p_event) |
26 | { | 49 | { |
27 | struct hlist_head __percpu *list; | 50 | struct hlist_head __percpu *list; |
28 | int ret = -ENOMEM; | 51 | int ret; |
29 | int cpu; | 52 | int cpu; |
30 | 53 | ||
54 | ret = perf_trace_event_perm(tp_event, p_event); | ||
55 | if (ret) | ||
56 | return ret; | ||
57 | |||
31 | p_event->tp_event = tp_event; | 58 | p_event->tp_event = tp_event; |
32 | if (tp_event->perf_refcount++ > 0) | 59 | if (tp_event->perf_refcount++ > 0) |
33 | return 0; | 60 | return 0; |
34 | 61 | ||
62 | ret = -ENOMEM; | ||
63 | |||
35 | list = alloc_percpu(struct hlist_head); | 64 | list = alloc_percpu(struct hlist_head); |
36 | if (!list) | 65 | if (!list) |
37 | goto fail; | 66 | goto fail; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab193..35fde09b81d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -27,6 +27,12 @@ | |||
27 | 27 | ||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | DEFINE_MUTEX(event_storage_mutex); | ||
31 | EXPORT_SYMBOL_GPL(event_storage_mutex); | ||
32 | |||
33 | char event_storage[EVENT_STORAGE_SIZE]; | ||
34 | EXPORT_SYMBOL_GPL(event_storage); | ||
35 | |||
30 | LIST_HEAD(ftrace_events); | 36 | LIST_HEAD(ftrace_events); |
31 | LIST_HEAD(ftrace_common_fields); | 37 | LIST_HEAD(ftrace_common_fields); |
32 | 38 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac2..4b74d71705c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
83 | 83 | ||
84 | #undef __array | 84 | #undef __array |
85 | #define __array(type, item, len) \ | 85 | #define __array(type, item, len) \ |
86 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ | 86 | do { \ |
87 | ret = trace_define_field(event_call, #type "[" #len "]", #item, \ | 87 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ |
88 | mutex_lock(&event_storage_mutex); \ | ||
89 | snprintf(event_storage, sizeof(event_storage), \ | ||
90 | "%s[%d]", #type, len); \ | ||
91 | ret = trace_define_field(event_call, event_storage, #item, \ | ||
88 | offsetof(typeof(field), item), \ | 92 | offsetof(typeof(field), item), \ |
89 | sizeof(field.item), \ | 93 | sizeof(field.item), \ |
90 | is_signed_type(type), FILTER_OTHER); \ | 94 | is_signed_type(type), FILTER_OTHER); \ |
91 | if (ret) \ | 95 | mutex_unlock(&event_storage_mutex); \ |
92 | return ret; | 96 | if (ret) \ |
97 | return ret; \ | ||
98 | } while (0); | ||
93 | 99 | ||
94 | #undef __array_desc | 100 | #undef __array_desc |
95 | #define __array_desc(type, container, item, len) \ | 101 | #define __array_desc(type, container, item, len) \ |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b320..659732eba07 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
558 | static int trace_wakeup_test_thread(void *data) | 558 | static int trace_wakeup_test_thread(void *data) |
559 | { | 559 | { |
560 | /* Make this a RT thread, doesn't need to be too high */ | 560 | /* Make this a RT thread, doesn't need to be too high */ |
561 | struct sched_param param = { .sched_priority = 5 }; | 561 | static const struct sched_param param = { .sched_priority = 5 }; |
562 | struct completion *x = data; | 562 | struct completion *x = data; |
563 | 563 | ||
564 | sched_setscheduler(current, SCHED_FIFO, &param); | 564 | sched_setscheduler(current, SCHED_FIFO, &param); |
diff --git a/kernel/user.c b/kernel/user.c index 2c7d8d5914b..5c598ca781d 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
158 | spin_lock_irq(&uidhash_lock); | 158 | spin_lock_irq(&uidhash_lock); |
159 | up = uid_hash_find(uid, hashent); | 159 | up = uid_hash_find(uid, hashent); |
160 | if (up) { | 160 | if (up) { |
161 | put_user_ns(ns); | ||
161 | key_put(new->uid_keyring); | 162 | key_put(new->uid_keyring); |
162 | key_put(new->session_keyring); | 163 | key_put(new->session_keyring); |
163 | kmem_cache_free(uid_cachep, new); | 164 | kmem_cache_free(uid_cachep, new); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e3c41a4024..6e7b575ac33 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str) | |||
57 | { | 57 | { |
58 | if (!strncmp(str, "panic", 5)) | 58 | if (!strncmp(str, "panic", 5)) |
59 | hardlockup_panic = 1; | 59 | hardlockup_panic = 1; |
60 | else if (!strncmp(str, "0", 1)) | ||
61 | no_watchdog = 1; | ||
60 | return 1; | 62 | return 1; |
61 | } | 63 | } |
62 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 64 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
307 | */ | 309 | */ |
308 | static int watchdog(void *unused) | 310 | static int watchdog(void *unused) |
309 | { | 311 | { |
310 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 312 | static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
311 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 313 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
312 | 314 | ||
313 | sched_setscheduler(current, SCHED_FIFO, &param); | 315 | sched_setscheduler(current, SCHED_FIFO, &param); |
@@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu) | |||
364 | goto out_save; | 366 | goto out_save; |
365 | } | 367 | } |
366 | 368 | ||
367 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | 369 | printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", |
370 | cpu, PTR_ERR(event)); | ||
368 | return PTR_ERR(event); | 371 | return PTR_ERR(event); |
369 | 372 | ||
370 | /* success path */ | 373 | /* success path */ |
@@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
547 | .notifier_call = cpu_callback | 550 | .notifier_call = cpu_callback |
548 | }; | 551 | }; |
549 | 552 | ||
550 | static int __init spawn_watchdog_task(void) | 553 | void __init lockup_detector_init(void) |
551 | { | 554 | { |
552 | void *cpu = (void *)(long)smp_processor_id(); | 555 | void *cpu = (void *)(long)smp_processor_id(); |
553 | int err; | 556 | int err; |
554 | 557 | ||
555 | if (no_watchdog) | 558 | if (no_watchdog) |
556 | return 0; | 559 | return; |
557 | 560 | ||
558 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 561 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
559 | WARN_ON(notifier_to_errno(err)); | 562 | WARN_ON(notifier_to_errno(err)); |
@@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void) | |||
561 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 564 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
562 | register_cpu_notifier(&cpu_nfb); | 565 | register_cpu_notifier(&cpu_nfb); |
563 | 566 | ||
564 | return 0; | 567 | return; |
565 | } | 568 | } |
566 | early_initcall(spawn_watchdog_task); | ||