Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c  54
-rw-r--r--  kernel/cpu.c  29
-rw-r--r--  kernel/debug/kdb/kdb_main.c  21
-rw-r--r--  kernel/exit.c  11
-rw-r--r--  kernel/fork.c  5
-rw-r--r--  kernel/futex.c  238
-rw-r--r--  kernel/futex_compat.c  3
-rw-r--r--  kernel/hrtimer.c  85
-rw-r--r--  kernel/hw_breakpoint.c  5
-rw-r--r--  kernel/irq/manage.c  4
-rw-r--r--  kernel/irq/proc.c  2
-rw-r--r--  kernel/irq_work.c  22
-rw-r--r--  kernel/kprobes.c  573
-rw-r--r--  kernel/kthread.c  13
-rw-r--r--  kernel/latencytop.c  17
-rw-r--r--  kernel/lockdep_proc.c  16
-rw-r--r--  kernel/module.c  183
-rw-r--r--  kernel/mutex.c  2
-rw-r--r--  kernel/perf_event.c  741
-rw-r--r--  kernel/pm_qos_params.c  4
-rw-r--r--  kernel/posix-cpu-timers.c  12
-rw-r--r--  kernel/posix-timers.c  10
-rw-r--r--  kernel/power/Kconfig  4
-rw-r--r--  kernel/power/hibernate.c  22
-rw-r--r--  kernel/power/suspend.c  8
-rw-r--r--  kernel/power/swap.c  55
-rw-r--r--  kernel/power/user.c  4
-rw-r--r--  kernel/printk.c  39
-rw-r--r--  kernel/range.c  2
-rw-r--r--  kernel/rcutiny.c  105
-rw-r--r--  kernel/rcutiny_plugin.h  433
-rw-r--r--  kernel/rcutorture.c  270
-rw-r--r--  kernel/rcutree.c  160
-rw-r--r--  kernel/rcutree.h  61
-rw-r--r--  kernel/rcutree_plugin.h  135
-rw-r--r--  kernel/rcutree_trace.c  12
-rw-r--r--  kernel/resource.c  104
-rw-r--r--  kernel/sched.c  1005
-rw-r--r--  kernel/sched_autogroup.c  238
-rw-r--r--  kernel/sched_autogroup.h  32
-rw-r--r--  kernel/sched_clock.c  2
-rw-r--r--  kernel/sched_debug.c  91
-rw-r--r--  kernel/sched_fair.c  370
-rw-r--r--  kernel/sched_features.h  2
-rw-r--r--  kernel/sched_rt.c  24
-rw-r--r--  kernel/sched_stoptask.c  4
-rw-r--r--  kernel/softirq.c  46
-rw-r--r--  kernel/srcu.c  8
-rw-r--r--  kernel/sys.c  4
-rw-r--r--  kernel/sysctl.c  62
-rw-r--r--  kernel/sysctl_binary.c  1
-rw-r--r--  kernel/taskstats.c  62
-rw-r--r--  kernel/time/clocksource.c  1
-rw-r--r--  kernel/time/tick-common.c  2
-rw-r--r--  kernel/time/tick-oneshot.c  4
-rw-r--r--  kernel/time/timecompare.c  5
-rw-r--r--  kernel/time/timekeeping.c  9
-rw-r--r--  kernel/time/timer_list.c  8
-rw-r--r--  kernel/timer.c  58
-rw-r--r--  kernel/trace/Kconfig  17
-rw-r--r--  kernel/trace/blktrace.c  4
-rw-r--r--  kernel/trace/power-traces.c  5
-rw-r--r--  kernel/trace/ring_buffer.c  9
-rw-r--r--  kernel/trace/trace.c  30
-rw-r--r--  kernel/trace/trace_event_perf.c  31
-rw-r--r--  kernel/trace/trace_events.c  6
-rw-r--r--  kernel/trace/trace_export.c  14
-rw-r--r--  kernel/trace/trace_selftest.c  2
-rw-r--r--  kernel/user.c  1
-rw-r--r--  kernel/watchdog.c  50
-rw-r--r--  kernel/workqueue.c  67
71 files changed, 3918 insertions, 1820 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..51cddc11cd85 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -763,6 +763,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
763 * -> cgroup_mkdir. 763 * -> cgroup_mkdir.
764 */ 764 */
765 765
766static struct dentry *cgroup_lookup(struct inode *dir,
767 struct dentry *dentry, struct nameidata *nd);
766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 768static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
767static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 769static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
768static int cgroup_populate_dir(struct cgroup *cgrp); 770static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -874,25 +876,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
874 struct list_head *node; 876 struct list_head *node;
875 877
876 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 878 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
877 spin_lock(&dcache_lock); 879 spin_lock(&dentry->d_lock);
878 node = dentry->d_subdirs.next; 880 node = dentry->d_subdirs.next;
879 while (node != &dentry->d_subdirs) { 881 while (node != &dentry->d_subdirs) {
880 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 882 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
883
884 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
881 list_del_init(node); 885 list_del_init(node);
882 if (d->d_inode) { 886 if (d->d_inode) {
883 /* This should never be called on a cgroup 887 /* This should never be called on a cgroup
884 * directory with child cgroups */ 888 * directory with child cgroups */
885 BUG_ON(d->d_inode->i_mode & S_IFDIR); 889 BUG_ON(d->d_inode->i_mode & S_IFDIR);
886 d = dget_locked(d); 890 dget_dlock(d);
887 spin_unlock(&dcache_lock); 891 spin_unlock(&d->d_lock);
892 spin_unlock(&dentry->d_lock);
888 d_delete(d); 893 d_delete(d);
889 simple_unlink(dentry->d_inode, d); 894 simple_unlink(dentry->d_inode, d);
890 dput(d); 895 dput(d);
891 spin_lock(&dcache_lock); 896 spin_lock(&dentry->d_lock);
892 } 897 } else
898 spin_unlock(&d->d_lock);
893 node = dentry->d_subdirs.next; 899 node = dentry->d_subdirs.next;
894 } 900 }
895 spin_unlock(&dcache_lock); 901 spin_unlock(&dentry->d_lock);
896} 902}
897 903
898/* 904/*
@@ -900,11 +906,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
900 */ 906 */
901static void cgroup_d_remove_dir(struct dentry *dentry) 907static void cgroup_d_remove_dir(struct dentry *dentry)
902{ 908{
909 struct dentry *parent;
910
903 cgroup_clear_directory(dentry); 911 cgroup_clear_directory(dentry);
904 912
905 spin_lock(&dcache_lock); 913 parent = dentry->d_parent;
914 spin_lock(&parent->d_lock);
915 spin_lock(&dentry->d_lock);
906 list_del_init(&dentry->d_u.d_child); 916 list_del_init(&dentry->d_u.d_child);
907 spin_unlock(&dcache_lock); 917 spin_unlock(&dentry->d_lock);
918 spin_unlock(&parent->d_lock);
908 remove_dir(dentry); 919 remove_dir(dentry);
909} 920}
910 921
@@ -2180,7 +2191,7 @@ static const struct file_operations cgroup_file_operations = {
2180}; 2191};
2181 2192
2182static const struct inode_operations cgroup_dir_inode_operations = { 2193static const struct inode_operations cgroup_dir_inode_operations = {
2183 .lookup = simple_lookup, 2194 .lookup = cgroup_lookup,
2184 .mkdir = cgroup_mkdir, 2195 .mkdir = cgroup_mkdir,
2185 .rmdir = cgroup_rmdir, 2196 .rmdir = cgroup_rmdir,
2186 .rename = cgroup_rename, 2197 .rename = cgroup_rename,
@@ -2196,13 +2207,29 @@ static inline struct cftype *__file_cft(struct file *file)
2196 return __d_cft(file->f_dentry); 2207 return __d_cft(file->f_dentry);
2197} 2208}
2198 2209
2199static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2210static int cgroup_delete_dentry(const struct dentry *dentry)
2200 struct super_block *sb) 2211{
2212 return 1;
2213}
2214
2215static struct dentry *cgroup_lookup(struct inode *dir,
2216 struct dentry *dentry, struct nameidata *nd)
2201{ 2217{
2202 static const struct dentry_operations cgroup_dops = { 2218 static const struct dentry_operations cgroup_dentry_operations = {
2219 .d_delete = cgroup_delete_dentry,
2203 .d_iput = cgroup_diput, 2220 .d_iput = cgroup_diput,
2204 }; 2221 };
2205 2222
2223 if (dentry->d_name.len > NAME_MAX)
2224 return ERR_PTR(-ENAMETOOLONG);
2225 d_set_d_op(dentry, &cgroup_dentry_operations);
2226 d_add(dentry, NULL);
2227 return NULL;
2228}
2229
2230static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2231 struct super_block *sb)
2232{
2206 struct inode *inode; 2233 struct inode *inode;
2207 2234
2208 if (!dentry) 2235 if (!dentry)
@@ -2228,7 +2255,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2228 inode->i_size = 0; 2255 inode->i_size = 0;
2229 inode->i_fop = &cgroup_file_operations; 2256 inode->i_fop = &cgroup_file_operations;
2230 } 2257 }
2231 dentry->d_op = &cgroup_dops;
2232 d_instantiate(dentry, inode); 2258 d_instantiate(dentry, inode);
2233 dget(dentry); /* Extra count - pin the dentry in core */ 2259 dget(dentry); /* Extra count - pin the dentry in core */
2234 return 0; 2260 return 0;
@@ -3638,9 +3664,7 @@ again:
3638 list_del(&cgrp->sibling); 3664 list_del(&cgrp->sibling);
3639 cgroup_unlock_hierarchy(cgrp->root); 3665 cgroup_unlock_hierarchy(cgrp->root);
3640 3666
3641 spin_lock(&cgrp->dentry->d_lock);
3642 d = dget(cgrp->dentry); 3667 d = dget(cgrp->dentry);
3643 spin_unlock(&d->d_lock);
3644 3668
3645 cgroup_d_remove_dir(d); 3669 cgroup_d_remove_dir(d);
3646 dput(d); 3670 dput(d);
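
The cgroup.c hunks above are part of the dcache scalability work: the global dcache_lock goes away, the parent directory's d_lock is taken first, each child dentry is locked with spin_lock_nested() at DENTRY_D_LOCK_NESTED, and dget_locked() becomes dget_dlock(). For readability, here is the resulting cgroup_clear_directory() walk condensed into one sketch; illustration only, the function name clear_dir_children is made up, everything else mirrors the new side of the hunk.

#include <linux/dcache.h>
#include <linux/fs.h>

/* Illustration of the per-dentry locking that replaces dcache_lock above. */
static void clear_dir_children(struct dentry *dir)
{
        struct list_head *node;

        spin_lock(&dir->d_lock);                /* parent's d_lock first */
        node = dir->d_subdirs.next;
        while (node != &dir->d_subdirs) {
                struct dentry *d = list_entry(node, struct dentry, d_u.d_child);

                /* child lock nests inside the parent's d_lock */
                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                list_del_init(node);
                if (d->d_inode) {
                        dget_dlock(d);          /* take a ref while d_lock is held */
                        spin_unlock(&d->d_lock);
                        spin_unlock(&dir->d_lock);
                        d_delete(d);
                        simple_unlink(dir->d_inode, d);
                        dput(d);
                        spin_lock(&dir->d_lock);        /* retake before continuing */
                } else
                        spin_unlock(&d->d_lock);
                node = dir->d_subdirs.next;
        }
        spin_unlock(&dir->d_lock);
}
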
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
388 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
389int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
390{ 396{
391 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
398 */ 404 */
399 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
400 407
401 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
412 } 419 }
413 } 420 }
414 421
422 arch_disable_nonboot_cpus_end();
423
415 if (!error) { 424 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
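
The cpu.c hunk also adds two empty __weak hooks around the loop that offlines non-boot CPUs during suspend. An architecture can override them simply by providing strong definitions. The sketch below is hypothetical arch code: the pr_info() bodies are placeholders, and it assumes the matching declarations live in a header such as <linux/cpu.h>, which this diff does not show.

#include <linux/kernel.h>
#include <linux/cpu.h>

void arch_disable_nonboot_cpus_begin(void)
{
        /* placeholder: e.g. move timekeeping or IRQ duties off the dying CPUs */
        pr_info("taking non-boot CPUs offline\n");
}

void arch_disable_nonboot_cpus_end(void)
{
        /* placeholder: undo whatever _begin() arranged */
        pr_info("non-boot CPUs are offline\n");
}
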
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 37755d621924..a6e729766821 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2739,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2739 } 2740 }
2740 if (kdb_commands) { 2741 if (kdb_commands) {
2741 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2742 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2743 kfree(kdb_commands); 2744 kfree(kdb_commands);
2744 } 2745 }
2745 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2746 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2747 kdb_commands = new; 2748 kdb_commands = new;
2748 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2749 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2750 } 2751 }
2751 2752
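
The first kdb hunk fixes the iteration order in for_each_kdbcmd(): num must be incremented before it is compared against KDB_BASE_CMD_MAX, otherwise the cursor steps one entry past the static kdb_base_commands[] array and then lags one entry behind in the dynamically allocated kdb_commands table. The standalone program below reproduces the corrected two-table walk with made-up tables; base_tbl, ext_tbl, BASE_MAX and the command strings are stand-ins, not kdb code.

#include <stdio.h>

#define BASE_MAX 3                      /* stands in for KDB_BASE_CMD_MAX */

static const char *base_tbl[BASE_MAX] = { "md", "go", "bt" };
static const char *ext_tbl[]          = { "pid", "kgdb" };
static int max_cmds = BASE_MAX + 2;     /* stands in for kdb_max_commands */

/* counter advances first, then the cursor hops tables exactly at the boundary */
#define for_each_cmd(cmd, num) \
        for ((cmd) = base_tbl, (num) = 0; \
             (num) < max_cmds; \
             (num)++, (num) == BASE_MAX ? ((cmd) = ext_tbl) : (cmd)++)

int main(void)
{
        const char **cmd;
        int num;

        for_each_cmd(cmd, num)
                printf("%d: %s\n", num, *cmd);
        return 0;
}
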
diff --git a/kernel/exit.c b/kernel/exit.c
index 21aa7b3001fb..89c74861a3da 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling); 71 list_del_init(&p->sibling);
72 __get_cpu_var(process_counts)--; 72 __this_cpu_dec(process_counts);
73 } 73 }
74 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
75} 75}
@@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code)
914 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
915 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
916 916
917 /*
918 * If do_exit is called because this processes oopsed, it's possible
919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920 * continuing. Amongst other possible reasons, this is to prevent
921 * mm_release()->clear_child_tid() from writing to a user-controlled
922 * kernel address.
923 */
924 set_fs(USER_DS);
925
917 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
918 927
919 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
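
The exit.c hunk resets the usercopy segment override at the start of do_exit(). The scenario the new comment describes is the classic get_fs()/set_fs() window: code that temporarily switches to KERNEL_DS and then oopses never runs its restore, so the dying task would still be under KERNEL_DS when mm_release() writes the clear_child_tid word. A sketch of that idiom for context; illustration only, not from this patch, and read_file_from_kernel with its vfs_read() call is just a generic example of the pattern.

#include <linux/fs.h>
#include <linux/uaccess.h>

static ssize_t read_file_from_kernel(struct file *filp, void *buf,
                                     size_t len, loff_t *pos)
{
        mm_segment_t old_fs = get_fs();
        ssize_t ret;

        set_fs(KERNEL_DS);              /* make the usercopy checks accept 'buf' */
        ret = vfs_read(filp, (char __user *)buf, len, pos);
        set_fs(old_fs);                 /* skipped entirely if we oops above */

        return ret;
}
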
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..d9b44f20b6b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -169,6 +169,7 @@ EXPORT_SYMBOL(free_task);
169static inline void free_signal_struct(struct signal_struct *sig) 169static inline void free_signal_struct(struct signal_struct *sig)
170{ 170{
171 taskstats_tgid_free(sig); 171 taskstats_tgid_free(sig);
172 sched_autogroup_exit(sig);
172 kmem_cache_free(signal_cachep, sig); 173 kmem_cache_free(signal_cachep, sig);
173} 174}
174 175
@@ -273,6 +274,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
273 274
274 setup_thread_stack(tsk, orig); 275 setup_thread_stack(tsk, orig);
275 clear_user_return_notifier(tsk); 276 clear_user_return_notifier(tsk);
277 clear_tsk_need_resched(tsk);
276 stackend = end_of_stack(tsk); 278 stackend = end_of_stack(tsk);
277 *stackend = STACK_END_MAGIC; /* for overflow detection */ 279 *stackend = STACK_END_MAGIC; /* for overflow detection */
278 280
@@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 posix_cpu_timers_init_group(sig); 906 posix_cpu_timers_init_group(sig);
905 907
906 tty_audit_fork(sig); 908 tty_audit_fork(sig);
909 sched_autogroup_fork(sig);
907 910
908 sig->oom_adj = current->signal->oom_adj; 911 sig->oom_adj = current->signal->oom_adj;
909 sig->oom_score_adj = current->signal->oom_score_adj; 912 sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1282,7 +1285,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1282 attach_pid(p, PIDTYPE_SID, task_session(current)); 1285 attach_pid(p, PIDTYPE_SID, task_session(current));
1283 list_add_tail(&p->sibling, &p->real_parent->children); 1286 list_add_tail(&p->sibling, &p->real_parent->children);
1284 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1287 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1285 __get_cpu_var(process_counts)++; 1288 __this_cpu_inc(process_counts);
1286 } 1289 }
1287 attach_pid(p, PIDTYPE_PID, pid); 1290 attach_pid(p, PIDTYPE_PID, pid);
1288 nr_threads++; 1291 nr_threads++;
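
Both the exit.c and fork.c hunks also convert the process_counts update from __get_cpu_var()++/-- to __this_cpu_inc()/__this_cpu_dec(), which lets the architecture emit a single per-cpu increment instead of computing the per-cpu address and doing a separate read-modify-write. A minimal sketch of the idiom; my_event_count and the helper names are made up.

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_event_count);

static void note_event(void)
{
        /* old idiom: __get_cpu_var(my_event_count)++; */
        __this_cpu_inc(my_event_count);
}

static unsigned long events_on_this_cpu(void)
{
        return __this_cpu_read(my_event_count);
}
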
diff --git a/kernel/futex.c b/kernel/futex.c
index 6c683b37f2ce..3019b92e6917 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
123 u32 bitset; 131 u32 bitset;
124}; 132};
125 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me()*/
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
126/* 140/*
127 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
128 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
@@ -283,8 +297,7 @@ again:
283 return 0; 297 return 0;
284} 298}
285 299
286static inline 300static inline void put_futex_key(union futex_key *key)
287void put_futex_key(int fshared, union futex_key *key)
288{ 301{
289 drop_futex_key_refs(key); 302 drop_futex_key_refs(key);
290} 303}
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
870/* 883/*
871 * Wake up waiters matching bitset queued on this futex (uaddr). 884 * Wake up waiters matching bitset queued on this futex (uaddr).
872 */ 885 */
873static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 886static int
887futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
874{ 888{
875 struct futex_hash_bucket *hb; 889 struct futex_hash_bucket *hb;
876 struct futex_q *this, *next; 890 struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
881 if (!bitset) 895 if (!bitset)
882 return -EINVAL; 896 return -EINVAL;
883 897
884 ret = get_futex_key(uaddr, fshared, &key); 898 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
885 if (unlikely(ret != 0)) 899 if (unlikely(ret != 0))
886 goto out; 900 goto out;
887 901
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
907 } 921 }
908 922
909 spin_unlock(&hb->lock); 923 spin_unlock(&hb->lock);
910 put_futex_key(fshared, &key); 924 put_futex_key(&key);
911out: 925out:
912 return ret; 926 return ret;
913} 927}
@@ -917,7 +931,7 @@ out:
917 * to this virtual address: 931 * to this virtual address:
918 */ 932 */
919static int 933static int
920futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 934futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
921 int nr_wake, int nr_wake2, int op) 935 int nr_wake, int nr_wake2, int op)
922{ 936{
923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 937 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
927 int ret, op_ret; 941 int ret, op_ret;
928 942
929retry: 943retry:
930 ret = get_futex_key(uaddr1, fshared, &key1); 944 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
931 if (unlikely(ret != 0)) 945 if (unlikely(ret != 0))
932 goto out; 946 goto out;
933 ret = get_futex_key(uaddr2, fshared, &key2); 947 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
934 if (unlikely(ret != 0)) 948 if (unlikely(ret != 0))
935 goto out_put_key1; 949 goto out_put_key1;
936 950
@@ -962,11 +976,11 @@ retry_private:
962 if (ret) 976 if (ret)
963 goto out_put_keys; 977 goto out_put_keys;
964 978
965 if (!fshared) 979 if (!(flags & FLAGS_SHARED))
966 goto retry_private; 980 goto retry_private;
967 981
968 put_futex_key(fshared, &key2); 982 put_futex_key(&key2);
969 put_futex_key(fshared, &key1); 983 put_futex_key(&key1);
970 goto retry; 984 goto retry;
971 } 985 }
972 986
@@ -996,9 +1010,9 @@ retry_private:
996 1010
997 double_unlock_hb(hb1, hb2); 1011 double_unlock_hb(hb1, hb2);
998out_put_keys: 1012out_put_keys:
999 put_futex_key(fshared, &key2); 1013 put_futex_key(&key2);
1000out_put_key1: 1014out_put_key1:
1001 put_futex_key(fshared, &key1); 1015 put_futex_key(&key1);
1002out: 1016out:
1003 return ret; 1017 return ret;
1004} 1018}
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1133/** 1147/**
1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1148 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1135 * @uaddr1: source futex user address 1149 * @uaddr1: source futex user address
1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 1150 * @flags: futex flags (FLAGS_SHARED, etc.)
1137 * @uaddr2: target futex user address 1151 * @uaddr2: target futex user address
1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1152 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1153 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL) 1154 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1155 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1142 * pi futex (pi to pi requeue is not supported) 1156 * pi futex (pi to pi requeue is not supported)
1143 * 1157 *
1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1158 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1145 * uaddr2 atomically on behalf of the top waiter. 1159 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1148 * >=0 - on success, the number of tasks requeued or woken 1162 * >=0 - on success, the number of tasks requeued or woken
1149 * <0 - on error 1163 * <0 - on error
1150 */ 1164 */
1151static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1165static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1152 int nr_wake, int nr_requeue, u32 *cmpval, 1166 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1153 int requeue_pi) 1167 u32 *cmpval, int requeue_pi)
1154{ 1168{
1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1169 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156 int drop_count = 0, task_count = 0, ret; 1170 int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
1191 pi_state = NULL; 1205 pi_state = NULL;
1192 } 1206 }
1193 1207
1194 ret = get_futex_key(uaddr1, fshared, &key1); 1208 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1195 if (unlikely(ret != 0)) 1209 if (unlikely(ret != 0))
1196 goto out; 1210 goto out;
1197 ret = get_futex_key(uaddr2, fshared, &key2); 1211 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1198 if (unlikely(ret != 0)) 1212 if (unlikely(ret != 0))
1199 goto out_put_key1; 1213 goto out_put_key1;
1200 1214
@@ -1216,11 +1230,11 @@ retry_private:
1216 if (ret) 1230 if (ret)
1217 goto out_put_keys; 1231 goto out_put_keys;
1218 1232
1219 if (!fshared) 1233 if (!(flags & FLAGS_SHARED))
1220 goto retry_private; 1234 goto retry_private;
1221 1235
1222 put_futex_key(fshared, &key2); 1236 put_futex_key(&key2);
1223 put_futex_key(fshared, &key1); 1237 put_futex_key(&key1);
1224 goto retry; 1238 goto retry;
1225 } 1239 }
1226 if (curval != *cmpval) { 1240 if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
1260 break; 1274 break;
1261 case -EFAULT: 1275 case -EFAULT:
1262 double_unlock_hb(hb1, hb2); 1276 double_unlock_hb(hb1, hb2);
1263 put_futex_key(fshared, &key2); 1277 put_futex_key(&key2);
1264 put_futex_key(fshared, &key1); 1278 put_futex_key(&key1);
1265 ret = fault_in_user_writeable(uaddr2); 1279 ret = fault_in_user_writeable(uaddr2);
1266 if (!ret) 1280 if (!ret)
1267 goto retry; 1281 goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
1269 case -EAGAIN: 1283 case -EAGAIN:
1270 /* The owner was exiting, try again. */ 1284 /* The owner was exiting, try again. */
1271 double_unlock_hb(hb1, hb2); 1285 double_unlock_hb(hb1, hb2);
1272 put_futex_key(fshared, &key2); 1286 put_futex_key(&key2);
1273 put_futex_key(fshared, &key1); 1287 put_futex_key(&key1);
1274 cond_resched(); 1288 cond_resched();
1275 goto retry; 1289 goto retry;
1276 default: 1290 default:
@@ -1352,9 +1366,9 @@ out_unlock:
1352 drop_futex_key_refs(&key1); 1366 drop_futex_key_refs(&key1);
1353 1367
1354out_put_keys: 1368out_put_keys:
1355 put_futex_key(fshared, &key2); 1369 put_futex_key(&key2);
1356out_put_key1: 1370out_put_key1:
1357 put_futex_key(fshared, &key1); 1371 put_futex_key(&key1);
1358out: 1372out:
1359 if (pi_state != NULL) 1373 if (pi_state != NULL)
1360 free_pi_state(pi_state); 1374 free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
1494 * private futexes. 1508 * private futexes.
1495 */ 1509 */
1496static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1510static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1497 struct task_struct *newowner, int fshared) 1511 struct task_struct *newowner)
1498{ 1512{
1499 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1513 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1500 struct futex_pi_state *pi_state = q->pi_state; 1514 struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
1587 goto retry; 1601 goto retry;
1588} 1602}
1589 1603
1590/*
1591 * In case we must use restart_block to restart a futex_wait,
1592 * we encode in the 'flags' shared capability
1593 */
1594#define FLAGS_SHARED 0x01
1595#define FLAGS_CLOCKRT 0x02
1596#define FLAGS_HAS_TIMEOUT 0x04
1597
1598static long futex_wait_restart(struct restart_block *restart); 1604static long futex_wait_restart(struct restart_block *restart);
1599 1605
1600/** 1606/**
1601 * fixup_owner() - Post lock pi_state and corner case management 1607 * fixup_owner() - Post lock pi_state and corner case management
1602 * @uaddr: user address of the futex 1608 * @uaddr: user address of the futex
1603 * @fshared: whether the futex is shared (1) or not (0)
1604 * @q: futex_q (contains pi_state and access to the rt_mutex) 1609 * @q: futex_q (contains pi_state and access to the rt_mutex)
1605 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1610 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1606 * 1611 *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
1613 * 0 - success, lock not taken 1618 * 0 - success, lock not taken
1614 * <0 - on error (-EFAULT) 1619 * <0 - on error (-EFAULT)
1615 */ 1620 */
1616static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1621static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1617 int locked)
1618{ 1622{
1619 struct task_struct *owner; 1623 struct task_struct *owner;
1620 int ret = 0; 1624 int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1625 * did a lock-steal - fix up the PI-state in that case: 1629 * did a lock-steal - fix up the PI-state in that case:
1626 */ 1630 */
1627 if (q->pi_state->owner != current) 1631 if (q->pi_state->owner != current)
1628 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1632 ret = fixup_pi_state_owner(uaddr, q, current);
1629 goto out; 1633 goto out;
1630 } 1634 }
1631 1635
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1652 * lock. Fix the state up. 1656 * lock. Fix the state up.
1653 */ 1657 */
1654 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1658 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1655 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1659 ret = fixup_pi_state_owner(uaddr, q, owner);
1656 goto out; 1660 goto out;
1657 } 1661 }
1658 1662
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1715 * futex_wait_setup() - Prepare to wait on a futex 1719 * futex_wait_setup() - Prepare to wait on a futex
1716 * @uaddr: the futex userspace address 1720 * @uaddr: the futex userspace address
1717 * @val: the expected value 1721 * @val: the expected value
1718 * @fshared: whether the futex is shared (1) or not (0) 1722 * @flags: futex flags (FLAGS_SHARED, etc.)
1719 * @q: the associated futex_q 1723 * @q: the associated futex_q
1720 * @hb: storage for hash_bucket pointer to be returned to caller 1724 * @hb: storage for hash_bucket pointer to be returned to caller
1721 * 1725 *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1728 * 0 - uaddr contains val and hb has been locked 1732 * 0 - uaddr contains val and hb has been locked
1729 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1733 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1730 */ 1734 */
1731static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1735static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1732 struct futex_q *q, struct futex_hash_bucket **hb) 1736 struct futex_q *q, struct futex_hash_bucket **hb)
1733{ 1737{
1734 u32 uval; 1738 u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1752 * rare, but normal. 1756 * rare, but normal.
1753 */ 1757 */
1754retry: 1758retry:
1755 q->key = FUTEX_KEY_INIT; 1759 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1756 ret = get_futex_key(uaddr, fshared, &q->key);
1757 if (unlikely(ret != 0)) 1760 if (unlikely(ret != 0))
1758 return ret; 1761 return ret;
1759 1762
@@ -1769,10 +1772,10 @@ retry_private:
1769 if (ret) 1772 if (ret)
1770 goto out; 1773 goto out;
1771 1774
1772 if (!fshared) 1775 if (!(flags & FLAGS_SHARED))
1773 goto retry_private; 1776 goto retry_private;
1774 1777
1775 put_futex_key(fshared, &q->key); 1778 put_futex_key(&q->key);
1776 goto retry; 1779 goto retry;
1777 } 1780 }
1778 1781
@@ -1783,32 +1786,29 @@ retry_private:
1783 1786
1784out: 1787out:
1785 if (ret) 1788 if (ret)
1786 put_futex_key(fshared, &q->key); 1789 put_futex_key(&q->key);
1787 return ret; 1790 return ret;
1788} 1791}
1789 1792
1790static int futex_wait(u32 __user *uaddr, int fshared, 1793static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1791 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1794 ktime_t *abs_time, u32 bitset)
1792{ 1795{
1793 struct hrtimer_sleeper timeout, *to = NULL; 1796 struct hrtimer_sleeper timeout, *to = NULL;
1794 struct restart_block *restart; 1797 struct restart_block *restart;
1795 struct futex_hash_bucket *hb; 1798 struct futex_hash_bucket *hb;
1796 struct futex_q q; 1799 struct futex_q q = futex_q_init;
1797 int ret; 1800 int ret;
1798 1801
1799 if (!bitset) 1802 if (!bitset)
1800 return -EINVAL; 1803 return -EINVAL;
1801
1802 q.pi_state = NULL;
1803 q.bitset = bitset; 1804 q.bitset = bitset;
1804 q.rt_waiter = NULL;
1805 q.requeue_pi_key = NULL;
1806 1805
1807 if (abs_time) { 1806 if (abs_time) {
1808 to = &timeout; 1807 to = &timeout;
1809 1808
1810 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1809 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1811 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1810 CLOCK_REALTIME : CLOCK_MONOTONIC,
1811 HRTIMER_MODE_ABS);
1812 hrtimer_init_sleeper(to, current); 1812 hrtimer_init_sleeper(to, current);
1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1814 current->timer_slack_ns); 1814 current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments 1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs. 1820 * q.key refs.
1821 */ 1821 */
1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1823 if (ret) 1823 if (ret)
1824 goto out; 1824 goto out;
1825 1825
@@ -1852,12 +1852,7 @@ retry:
1852 restart->futex.val = val; 1852 restart->futex.val = val;
1853 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1854 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
1855 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1855 restart->futex.flags = flags;
1856
1857 if (fshared)
1858 restart->futex.flags |= FLAGS_SHARED;
1859 if (clockrt)
1860 restart->futex.flags |= FLAGS_CLOCKRT;
1861 1856
1862 ret = -ERESTART_RESTARTBLOCK; 1857 ret = -ERESTART_RESTARTBLOCK;
1863 1858
@@ -1873,7 +1868,6 @@ out:
1873static long futex_wait_restart(struct restart_block *restart) 1868static long futex_wait_restart(struct restart_block *restart)
1874{ 1869{
1875 u32 __user *uaddr = restart->futex.uaddr; 1870 u32 __user *uaddr = restart->futex.uaddr;
1876 int fshared = 0;
1877 ktime_t t, *tp = NULL; 1871 ktime_t t, *tp = NULL;
1878 1872
1879 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1873 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
1881 tp = &t; 1875 tp = &t;
1882 } 1876 }
1883 restart->fn = do_no_restart_syscall; 1877 restart->fn = do_no_restart_syscall;
1884 if (restart->futex.flags & FLAGS_SHARED) 1878
1885 fshared = 1; 1879 return (long)futex_wait(uaddr, restart->futex.flags,
1886 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1880 restart->futex.val, tp, restart->futex.bitset);
1887 restart->futex.bitset,
1888 restart->futex.flags & FLAGS_CLOCKRT);
1889} 1881}
1890 1882
1891 1883
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
1895 * if there are waiters then it will block, it does PI, etc. (Due to 1887 * if there are waiters then it will block, it does PI, etc. (Due to
1896 * races the kernel might see a 0 value of the futex too.) 1888 * races the kernel might see a 0 value of the futex too.)
1897 */ 1889 */
1898static int futex_lock_pi(u32 __user *uaddr, int fshared, 1890static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1899 int detect, ktime_t *time, int trylock) 1891 ktime_t *time, int trylock)
1900{ 1892{
1901 struct hrtimer_sleeper timeout, *to = NULL; 1893 struct hrtimer_sleeper timeout, *to = NULL;
1902 struct futex_hash_bucket *hb; 1894 struct futex_hash_bucket *hb;
1903 struct futex_q q; 1895 struct futex_q q = futex_q_init;
1904 int res, ret; 1896 int res, ret;
1905 1897
1906 if (refill_pi_state_cache()) 1898 if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1914 hrtimer_set_expires(&to->timer, *time); 1906 hrtimer_set_expires(&to->timer, *time);
1915 } 1907 }
1916 1908
1917 q.pi_state = NULL;
1918 q.rt_waiter = NULL;
1919 q.requeue_pi_key = NULL;
1920retry: 1909retry:
1921 q.key = FUTEX_KEY_INIT; 1910 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1922 ret = get_futex_key(uaddr, fshared, &q.key);
1923 if (unlikely(ret != 0)) 1911 if (unlikely(ret != 0))
1924 goto out; 1912 goto out;
1925 1913
@@ -1941,7 +1929,7 @@ retry_private:
1941 * exit to complete. 1929 * exit to complete.
1942 */ 1930 */
1943 queue_unlock(&q, hb); 1931 queue_unlock(&q, hb);
1944 put_futex_key(fshared, &q.key); 1932 put_futex_key(&q.key);
1945 cond_resched(); 1933 cond_resched();
1946 goto retry; 1934 goto retry;
1947 default: 1935 default:
@@ -1971,7 +1959,7 @@ retry_private:
1971 * Fixup the pi_state owner and possibly acquire the lock if we 1959 * Fixup the pi_state owner and possibly acquire the lock if we
1972 * haven't already. 1960 * haven't already.
1973 */ 1961 */
1974 res = fixup_owner(uaddr, fshared, &q, !ret); 1962 res = fixup_owner(uaddr, &q, !ret);
1975 /* 1963 /*
1976 * If fixup_owner() returned an error, proprogate that. If it acquired 1964 * If fixup_owner() returned an error, proprogate that. If it acquired
1977 * the lock, clear our -ETIMEDOUT or -EINTR. 1965 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
1995 queue_unlock(&q, hb); 1983 queue_unlock(&q, hb);
1996 1984
1997out_put_key: 1985out_put_key:
1998 put_futex_key(fshared, &q.key); 1986 put_futex_key(&q.key);
1999out: 1987out:
2000 if (to) 1988 if (to)
2001 destroy_hrtimer_on_stack(&to->timer); 1989 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
2008 if (ret) 1996 if (ret)
2009 goto out_put_key; 1997 goto out_put_key;
2010 1998
2011 if (!fshared) 1999 if (!(flags & FLAGS_SHARED))
2012 goto retry_private; 2000 goto retry_private;
2013 2001
2014 put_futex_key(fshared, &q.key); 2002 put_futex_key(&q.key);
2015 goto retry; 2003 goto retry;
2016} 2004}
2017 2005
@@ -2020,7 +2008,7 @@ uaddr_faulted:
2020 * This is the in-kernel slowpath: we look up the PI state (if any), 2008 * This is the in-kernel slowpath: we look up the PI state (if any),
2021 * and do the rt-mutex unlock. 2009 * and do the rt-mutex unlock.
2022 */ 2010 */
2023static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2011static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2024{ 2012{
2025 struct futex_hash_bucket *hb; 2013 struct futex_hash_bucket *hb;
2026 struct futex_q *this, *next; 2014 struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
2038 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2026 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2039 return -EPERM; 2027 return -EPERM;
2040 2028
2041 ret = get_futex_key(uaddr, fshared, &key); 2029 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2042 if (unlikely(ret != 0)) 2030 if (unlikely(ret != 0))
2043 goto out; 2031 goto out;
2044 2032
@@ -2093,14 +2081,14 @@ retry:
2093 2081
2094out_unlock: 2082out_unlock:
2095 spin_unlock(&hb->lock); 2083 spin_unlock(&hb->lock);
2096 put_futex_key(fshared, &key); 2084 put_futex_key(&key);
2097 2085
2098out: 2086out:
2099 return ret; 2087 return ret;
2100 2088
2101pi_faulted: 2089pi_faulted:
2102 spin_unlock(&hb->lock); 2090 spin_unlock(&hb->lock);
2103 put_futex_key(fshared, &key); 2091 put_futex_key(&key);
2104 2092
2105 ret = fault_in_user_writeable(uaddr); 2093 ret = fault_in_user_writeable(uaddr);
2106 if (!ret) 2094 if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2148/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2149 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2150 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2151 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2152 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2153 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2154 * @abs_time: absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2186 * 0 - On success
2199 * <0 - On error 2187 * <0 - On error
2200 */ 2188 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2189static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2190 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2191 u32 __user *uaddr2)
2204{ 2192{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2193 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2194 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2195 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2196 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2197 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2198 struct futex_q q = futex_q_init;
2211 int res, ret; 2199 int res, ret;
2212 2200
2213 if (!bitset) 2201 if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2203
2216 if (abs_time) { 2204 if (abs_time) {
2217 to = &timeout; 2205 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2206 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2207 CLOCK_REALTIME : CLOCK_MONOTONIC,
2208 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2209 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2210 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2211 current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2218 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2219 rt_waiter.task = NULL;
2231 2220
2232 key2 = FUTEX_KEY_INIT; 2221 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2222 if (unlikely(ret != 0))
2235 goto out; 2223 goto out;
2236 2224
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2225 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2226 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2227 q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2230 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2231 * count.
2245 */ 2232 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2233 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2234 if (ret)
2248 goto out_key2; 2235 goto out_key2;
2249 2236
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2260 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2261 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2262 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2263 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2264 spin_unlock(q.lock_ptr);
2279 } 2265 }
2280 } else { 2266 } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2279 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2280 * haven't already.
2295 */ 2281 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2282 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2283 /*
2298 * If fixup_owner() returned an error, proprogate that. If it 2284 * If fixup_owner() returned an error, proprogate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2285 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2310 }
2325 2311
2326out_put_keys: 2312out_put_keys:
2327 put_futex_key(fshared, &q.key); 2313 put_futex_key(&q.key);
2328out_key2: 2314out_key2:
2329 put_futex_key(fshared, &key2); 2315 put_futex_key(&key2);
2330 2316
2331out: 2317out:
2332 if (to) { 2318 if (to) {
@@ -2489,7 +2475,8 @@ void exit_robust_list(struct task_struct *curr)
2489{ 2475{
2490 struct robust_list_head __user *head = curr->robust_list; 2476 struct robust_list_head __user *head = curr->robust_list;
2491 struct robust_list __user *entry, *next_entry, *pending; 2477 struct robust_list __user *entry, *next_entry, *pending;
2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2478 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2479 unsigned int uninitialized_var(next_pi);
2493 unsigned long futex_offset; 2480 unsigned long futex_offset;
2494 int rc; 2481 int rc;
2495 2482
@@ -2550,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
2550long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2537long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2551 u32 __user *uaddr2, u32 val2, u32 val3) 2538 u32 __user *uaddr2, u32 val2, u32 val3)
2552{ 2539{
2553 int clockrt, ret = -ENOSYS; 2540 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2554 int cmd = op & FUTEX_CMD_MASK; 2541 unsigned int flags = 0;
2555 int fshared = 0;
2556 2542
2557 if (!(op & FUTEX_PRIVATE_FLAG)) 2543 if (!(op & FUTEX_PRIVATE_FLAG))
2558 fshared = 1; 2544 flags |= FLAGS_SHARED;
2559 2545
2560 clockrt = op & FUTEX_CLOCK_REALTIME; 2546 if (op & FUTEX_CLOCK_REALTIME) {
2561 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2547 flags |= FLAGS_CLOCKRT;
2562 return -ENOSYS; 2548 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2549 return -ENOSYS;
2550 }
2563 2551
2564 switch (cmd) { 2552 switch (cmd) {
2565 case FUTEX_WAIT: 2553 case FUTEX_WAIT:
2566 val3 = FUTEX_BITSET_MATCH_ANY; 2554 val3 = FUTEX_BITSET_MATCH_ANY;
2567 case FUTEX_WAIT_BITSET: 2555 case FUTEX_WAIT_BITSET:
2568 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2556 ret = futex_wait(uaddr, flags, val, timeout, val3);
2569 break; 2557 break;
2570 case FUTEX_WAKE: 2558 case FUTEX_WAKE:
2571 val3 = FUTEX_BITSET_MATCH_ANY; 2559 val3 = FUTEX_BITSET_MATCH_ANY;
2572 case FUTEX_WAKE_BITSET: 2560 case FUTEX_WAKE_BITSET:
2573 ret = futex_wake(uaddr, fshared, val, val3); 2561 ret = futex_wake(uaddr, flags, val, val3);
2574 break; 2562 break;
2575 case FUTEX_REQUEUE: 2563 case FUTEX_REQUEUE:
2576 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2564 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2577 break; 2565 break;
2578 case FUTEX_CMP_REQUEUE: 2566 case FUTEX_CMP_REQUEUE:
2579 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2567 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2580 0);
2581 break; 2568 break;
2582 case FUTEX_WAKE_OP: 2569 case FUTEX_WAKE_OP:
2583 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2570 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2584 break; 2571 break;
2585 case FUTEX_LOCK_PI: 2572 case FUTEX_LOCK_PI:
2586 if (futex_cmpxchg_enabled) 2573 if (futex_cmpxchg_enabled)
2587 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2574 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2588 break; 2575 break;
2589 case FUTEX_UNLOCK_PI: 2576 case FUTEX_UNLOCK_PI:
2590 if (futex_cmpxchg_enabled) 2577 if (futex_cmpxchg_enabled)
2591 ret = futex_unlock_pi(uaddr, fshared); 2578 ret = futex_unlock_pi(uaddr, flags);
2592 break; 2579 break;
2593 case FUTEX_TRYLOCK_PI: 2580 case FUTEX_TRYLOCK_PI:
2594 if (futex_cmpxchg_enabled) 2581 if (futex_cmpxchg_enabled)
2595 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2582 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2596 break; 2583 break;
2597 case FUTEX_WAIT_REQUEUE_PI: 2584 case FUTEX_WAIT_REQUEUE_PI:
2598 val3 = FUTEX_BITSET_MATCH_ANY; 2585 val3 = FUTEX_BITSET_MATCH_ANY;
2599 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2586 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2600 clockrt, uaddr2); 2587 uaddr2);
2601 break; 2588 break;
2602 case FUTEX_CMP_REQUEUE_PI: 2589 case FUTEX_CMP_REQUEUE_PI:
2603 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2590 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2604 1);
2605 break; 2591 break;
2606 default: 2592 default:
2607 ret = -ENOSYS; 2593 ret = -ENOSYS;
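
The futex.c rework folds the old fshared/clockrt parameters into one flags word: do_futex() derives FLAGS_SHARED from the absence of FUTEX_PRIVATE_FLAG and FLAGS_CLOCKRT from FUTEX_CLOCK_REALTIME, and that same word is stashed in restart_block for restarts. The userspace ABI is unchanged; the self-contained program below is not part of the patch (sys_futex(), futex_word and waker() are local names) and just shows the op encodings that end up in those flag bits.

/* Build: gcc -pthread futex-demo.c */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdio.h>
#include <errno.h>
#include <time.h>

static int futex_word;

static long sys_futex(int *uaddr, int op, int val, const struct timespec *timeout)
{
        return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

static void *waker(void *arg)
{
        usleep(100 * 1000);
        __sync_lock_test_and_set(&futex_word, 1);       /* publish the new value */
        /* private wake: the kernel takes the FLAGS_SHARED-clear path */
        sys_futex(&futex_word, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waker, NULL);

        /* private wait: 0 on wake-up, EAGAIN if the word already changed */
        while (futex_word == 0) {
                long ret = sys_futex(&futex_word,
                                     FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, NULL);
                if (ret == -1 && errno != EAGAIN && errno != EINTR) {
                        perror("futex wait");
                        break;
                }
        }
        printf("woken, futex word = %d\n", futex_word);

        pthread_join(t, NULL);
        return 0;
}
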
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 06da4dfc339b..a7934ac75e5b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..45da2b6920ab 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)
497 */ 497 */
498static inline int hrtimer_hres_active(void) 498static inline int hrtimer_hres_active(void)
499{ 499{
500 return __get_cpu_var(hrtimer_bases).hres_active; 500 return __this_cpu_read(hrtimer_bases.hres_active);
501} 501}
502 502
503/* 503/*
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
911 /* Reprogram the clock event device. if enabled */ 878 /* Reprogram the clock event device. if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
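The hrtimer.c hunks above replace the open-coded rbtree handling (rb_link_node/rb_insert_color/rb_erase plus the cached base->first pointer) with the timerqueue helpers. A minimal sketch of that API, assuming the include/linux/timerqueue.h interface this series introduces (timerqueue_init_head(), timerqueue_add()/timerqueue_del(), timerqueue_getnext()) and a hypothetical my_timer wrapper:

#include <linux/timerqueue.h>
#include <linux/ktime.h>

struct my_timer {
	struct timerqueue_node node;	/* embeds the rb_node plus cached expires */
	void (*fn)(struct my_timer *);
};

static struct timerqueue_head my_queue;

static void my_timer_setup(void)
{
	timerqueue_init_head(&my_queue);	/* empty rbtree, ->next == NULL */
}

static void my_timer_add(struct my_timer *t, ktime_t expires)
{
	timerqueue_init(&t->node);
	t->node.expires = expires;		/* key used for rbtree ordering */
	timerqueue_add(&my_queue, &t->node);	/* also re-caches the leftmost node */
}

static struct my_timer *my_timer_peek(void)
{
	struct timerqueue_node *next = timerqueue_getnext(&my_queue);

	return next ? container_of(next, struct my_timer, node) : NULL;
}

static void my_timer_remove(struct my_timer *t)
{
	timerqueue_del(&my_queue, &t->node);	/* re-caches the next expiring node */
}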
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
620 .read = hw_breakpoint_pmu_read, 620 .read = hw_breakpoint_pmu_read,
621}; 621};
622 622
623static int __init init_hw_breakpoint(void) 623int __init init_hw_breakpoint(void)
624{ 624{
625 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
626 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -641,7 +641,7 @@ static int __init init_hw_breakpoint(void)
641 641
642 constraints_initialized = 1; 642 constraints_initialized = 1;
643 643
644 perf_pmu_register(&perf_breakpoint); 644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645 645
646 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
647 647
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
655 655
656 return -ENOMEM; 656 return -ENOMEM;
657} 657}
658core_initcall(init_hw_breakpoint);
659 658
660 659
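The hw_breakpoint hunk drops the core_initcall() and un-statics init_hw_breakpoint() so the perf core can call it directly, and perf_pmu_register() now takes a name and a type. A sketch of the three-argument call with a hypothetical PMU whose callbacks are elided; the meaning of a negative type is an assumption based on the idr use added to kernel/perf_event.c further down:

/* hypothetical PMU, callbacks not shown */
static struct pmu my_pmu;

static int __init my_pmu_init(void)
{
	/*
	 * The name identifies the PMU to userspace; the type is either a
	 * fixed PERF_TYPE_* value (as the breakpoint PMU passes above) or
	 * a negative value to request a dynamically allocated id
	 * (assumption, see the idr addition in kernel/perf_event.c below).
	 */
	return perf_pmu_register(&my_pmu, "my_pmu", -1);
}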
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..0caa59f747dd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static const struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
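The irq_thread() change is purely about how the sched_param is built: a static const initializer keeps it in rodata instead of on every thread's stack. The same idiom for any kernel thread that sets a fixed RT priority, as a minimal sketch:

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_rt_thread(void *data)
{
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO / 2,
	};

	/* sched_setscheduler() only reads the parameter block */
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}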
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 214
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 215static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 216{
217 return single_open(file, irq_spurious_proc_show, NULL); 217 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 218}
219 219
220static const struct file_operations irq_spurious_proc_fops = { 220static const struct file_operations irq_spurious_proc_fops = {
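The irq_spurious_proc_open() fix forwards the proc entry's private data instead of NULL; single_open() stores it in seq_file->private, which is where the show callback expects to find it. The general pattern, with hypothetical names:

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct my_stats {
	unsigned long count;
};

static int my_proc_show(struct seq_file *m, void *v)
{
	struct my_stats *st = m->private;	/* what single_open() was handed */

	seq_printf(m, "count: %lu\n", st->count);
	return 0;
}

static int my_proc_open(struct inode *inode, struct file *file)
{
	/* PDE(inode)->data is whatever proc_create_data() was given */
	return single_open(file, my_proc_show, PDE(inode)->data);
}

static const struct file_operations my_proc_fops = {
	.open		= my_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};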
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
77 */ 77 */
78static void __irq_work_queue(struct irq_work *entry) 78static void __irq_work_queue(struct irq_work *entry)
79{ 79{
80 struct irq_work **head, *next; 80 struct irq_work *next;
81 81
82 head = &get_cpu_var(irq_work_list); 82 preempt_disable();
83 83
84 do { 84 do {
85 next = *head; 85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */ 86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS); 87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next); 88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89 89
90 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 91 if (!irq_work_next(entry))
92 arch_irq_work_raise(); 92 arch_irq_work_raise();
93 93
94 put_cpu_var(irq_work_list); 94 preempt_enable();
95} 95}
96 96
97/* 97/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 120 */
121void irq_work_run(void) 121void irq_work_run(void)
122{ 122{
123 struct irq_work *list, **head; 123 struct irq_work *list;
124 124
125 head = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
126 if (*head == NULL)
127 return; 126 return;
128 127
129 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
131 130
132 list = xchg(head, NULL); 131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) { 133 while (list != NULL) {
134 struct irq_work *entry = list; 134 struct irq_work *entry = list;
135 135
@@ -145,7 +145,9 @@ void irq_work_run(void)
145 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
147 */ 147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
149 } 151 }
150} 152}
151EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
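The irq_work hunks swap get_cpu_var()/xchg()/cmpxchg() for the this_cpu_*() accessors, which fold the per-CPU address calculation into the atomic operation itself. A distilled sketch of the same lock-free per-CPU push/drain pattern, assuming the this_cpu_cmpxchg()/this_cpu_xchg() primitives this code now depends on and a hypothetical struct node:

#include <linux/percpu.h>
#include <linux/preempt.h>

struct node {
	struct node *next;
};

static DEFINE_PER_CPU(struct node *, pending);

static void push(struct node *n)
{
	struct node *head;

	preempt_disable();			/* stay on one CPU's list */
	do {
		head = __this_cpu_read(pending);
		n->next = head;
	} while (this_cpu_cmpxchg(pending, head, n) != head);
	preempt_enable();
}

static void drain(void)				/* e.g. from the self-IPI handler */
{
	struct node *n = this_cpu_xchg(pending, NULL);

	while (n) {
		struct node *next = n->next;
		/* ... process n ... */
		n = next;
	}
}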
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
317/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
318static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
319{ 319{
320 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
321} 321}
322 322
323static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
324{ 324{
325 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
326} 326}
327 327
328/* 328/*
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
433 struct optimized_kprobe *op, *tmp; 483 /* Optimization never be done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
442 * Wait for quiesence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiesence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
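The rewritten optimizer above batches its work in four steps (unoptimize, wait for quiescence, optimize, reclaim), driven by a single delayed work item plus a completion so callers can block until both queues have drained. The kick/wait pairing is reusable on its own; a minimal sketch outside of kprobes, with hypothetical names:

#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/mutex.h>
#include <linux/list.h>

static void batch_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(batch_work, batch_worker);
static DECLARE_COMPLETION(batch_done);
static LIST_HEAD(batch_list);
static DEFINE_MUTEX(batch_mutex);
#define BATCH_DELAY	5	/* jiffies, lets requests coalesce */

/* Callers queue onto batch_list under batch_mutex, then: */
static void kick_batch(void)
{
	if (!delayed_work_pending(&batch_work))
		schedule_delayed_work(&batch_work, BATCH_DELAY);
}

/* Anyone needing the queues empty blocks here, as the kprobe paths above do */
static void wait_batch(void)
{
	if (delayed_work_pending(&batch_work))
		wait_for_completion(&batch_done);
}

static void batch_worker(struct work_struct *work)
{
	mutex_lock(&batch_mutex);
	/* ... process and empty batch_list ... */
	if (!list_empty(&batch_list))
		kick_batch();			/* more arrived meanwhile */
	else
		complete_all(&batch_done);	/* release all waiters */
	mutex_unlock(&batch_mutex);
}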
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
775static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
776 int trapnr) 966 int trapnr)
777{ 967{
778 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
779 969
780 /* 970 /*
781 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
790 980
791static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
792{ 982{
793 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
794 int ret = 0; 984 int ret = 0;
795 985
796 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
1247 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent(and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
1471 * !disarmed could be happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If disabling probe has special handlers, update aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
1957already_disabled: 2161 /* Wait for disarming all kprobes by optimizer */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
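Despite the internal rework above (delayed unoptimization, __disable_kprobe(), the disarmed/queued states), the caller-visible kprobes API is unchanged. A short usage sketch; the handler and target symbol are placeholders:

#include <linux/kprobes.h>
#include <linux/module.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %p\n", p->addr);
	return 0;				/* let the probed instruction run */
}

static struct kprobe my_probe = {
	.symbol_name	= "do_fork",		/* placeholder target */
	.pre_handler	= my_pre_handler,
};

static int __init my_init(void)
{
	int ret = register_kprobe(&my_probe);	/* arms, and may optimize later */

	if (ret)
		return ret;
	disable_kprobe(&my_probe);	/* unoptimizing may happen via delayed work */
	enable_kprobe(&my_probe);	/* re-arm and try to optimize again */
	return 0;
}

static void __exit my_exit(void)
{
	unregister_kprobe(&my_probe);
}
module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");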
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
265 return 0; 265 return 0;
266} 266}
267 267
268void __init_kthread_worker(struct kthread_worker *worker,
269 const char *name,
270 struct lock_class_key *key)
271{
272 spin_lock_init(&worker->lock);
273 lockdep_set_class_and_name(&worker->lock, key, name);
274 INIT_LIST_HEAD(&worker->work_list);
275 worker->task = NULL;
276}
277EXPORT_SYMBOL_GPL(__init_kthread_worker);
278
268/** 279/**
269 * kthread_worker_fn - kthread function to process kthread_worker 280 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 281 * @worker_ptr: pointer to initialized kthread_worker
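__init_kthread_worker() gives every worker its own lockdep class; callers are assumed to keep using an init_kthread_worker() wrapper that supplies a static struct lock_class_key. A usage sketch of the kthread_worker API with hypothetical names:

#include <linux/kthread.h>
#include <linux/err.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
	/* runs in my_worker's dedicated kthread */
}

static int __init my_setup(void)
{
	struct task_struct *task;

	init_kthread_worker(&my_worker);	/* spinlock + lockdep class + list */
	task = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	init_kthread_work(&my_work, my_work_fn);
	queue_kthread_work(&my_worker, &my_work);
	flush_kthread_work(&my_work);		/* wait for it to finish */
	return 0;
}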
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
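The latencytop reordering scans and merges into existing records first, and only consumes a new slot (bumping latency_record_count) when nothing matched, so the LT_SAVECOUNT limit no longer discards mergeable samples. The corrected search-then-append shape, reduced to a sketch with a hypothetical records_match() helper:

static void account_latency(struct latency_record *records, int *countp,
			    const struct latency_record *new_rec)
{
	int i, count = *countp;

	/* merge into an existing slot if one matches */
	for (i = 0; i < count; i++) {
		if (records_match(&records[i], new_rec)) {	/* hypothetical */
			records[i].count++;
			records[i].time += new_rec->time;
			return;
		}
	}

	/* only now enforce the capacity limit ... */
	if (count >= LT_SAVECOUNT)
		return;

	/* ... and consume a slot for the genuinely new entry */
	records[count] = *new_rec;
	*countp = count + 1;
}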
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..1969d2fc4b36 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 494 namelen += 2;
495 495
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 496 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 497 char ip[32];
499 498
500 if (class->contention_point[i] == 0) 499 if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 502 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 503 seq_line(m, '-', 40-namelen, namelen);
505 504
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 505 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 506 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 507 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 508 name, stats->contention_point[i],
511 ip, sym); 509 ip, (void *)class->contention_point[i]);
512 } 510 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 511 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 512 char ip[32];
516 513
517 if (class->contending_point[i] == 0) 514 if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 517 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 518 seq_line(m, '-', 40-namelen, namelen);
522 519
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 520 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 521 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 522 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 523 name, stats->contending_point[i],
528 ip, sym); 524 ip, (void *)class->contending_point[i]);
529 } 525 }
530 if (i) { 526 if (i) {
531 seq_puts(m, "\n"); 527 seq_puts(m, "\n");
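The lockdep_proc hunks drop the KSYM_SYMBOL_LEN stack buffers and let the printf core resolve symbols through the %pS extension. Before and after, as a sketch:

#include <linux/kallsyms.h>
#include <linux/seq_file.h>

static void print_point(struct seq_file *m, unsigned long addr)
{
#if 0	/* before: resolve the symbol into a stack buffer first */
	char sym[KSYM_SYMBOL_LEN];

	sprint_symbol(sym, addr);
	seq_printf(m, "%s\n", sym);
#else	/* after: the %pS extension resolves it inside vsnprintf() */
	seq_printf(m, "%pS\n", (void *)addr);
#endif
}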
diff --git a/kernel/module.c b/kernel/module.c
index 437a74a7524a..34e00b708fad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
86 * Given BASE and SIZE this macro calculates the number of pages the
87 * memory regions occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
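debug_align() and MOD_NUMBER_OF_PAGES() are plain page arithmetic; a worked example with PAGE_SIZE = 4096 and illustrative values:

/* base = 0xffffffffa0001000, size = 0x2100 (two pages plus 0x100 bytes):
 *
 *	PFN_DOWN(base)            = 0xffffffffa0001
 *	PFN_DOWN(base + size - 1) = 0xffffffffa0003
 *	MOD_NUMBER_OF_PAGES(...)  = 0xffffffffa0003 - 0xffffffffa0001 + 1 = 3
 *
 * and with CONFIG_DEBUG_SET_MODULE_RONX=y:
 *
 *	debug_align(0x2100) = ALIGN(0x2100, 4096) = 0x3000
 *
 * so each layout boundary lands on a page boundary, which is what lets
 * set_memory_ro()/set_memory_nx() be applied per section further down.
 */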
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw()
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro()
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
@@ -2326,6 +2480,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2480 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL); 2481 mod->num_trace_events, GFP_KERNEL);
2328#endif 2482#endif
2483#ifdef CONFIG_TRACING
2484 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2485 sizeof(*mod->trace_bprintk_fmt_start),
2486 &mod->num_trace_bprintk_fmt);
2487 /*
2488 * This section contains pointers to allocated objects in the trace
2489 * code and not scanning it leads to false positives.
2490 */
2491 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2492 sizeof(*mod->trace_bprintk_fmt_start) *
2493 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2494#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2495#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */ 2496 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2497 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
@@ -2710,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2710 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2711 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2712 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2713 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2714 /* Start the module */ 2892 /* Start the module */
2715 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2753,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2753 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2754 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2755#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2756 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2757 mod->module_init = NULL; 2936 mod->module_init = NULL;
2758 mod->init_size = 0; 2937 mod->init_size = 0;
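The RO/NX protection added above has to be lifted whenever module text is legitimately patched at runtime, which is what the exported set_all_modules_text_rw()/set_all_modules_text_ro() pair is for. A sketch of how a code patcher would bracket its writes; the caller shown here is hypothetical and not part of this diff:

static int do_the_patching(void);	/* hypothetical text rewriter */

static int patch_module_text(void)
{
	int ret;

	set_all_modules_text_rw();	/* make module .text writable again */
	ret = do_the_patching();	/* e.g. rewrite call sites */
	set_all_modules_text_ro();	/* seal it back up */

	return ret;
}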
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f4982..11847bf1e8cc 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -31,6 +34,7 @@
31#include <linux/kernel_stat.h> 34#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 35#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h>
34 38
35#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
36 40
@@ -132,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
132 } 136 }
133} 137}
134 138
139static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
140{
141 /*
142 * only top level events have the pid namespace they were created in
143 */
144 if (event->parent)
145 event = event->parent;
146
147 return task_tgid_nr_ns(p, event->ns);
148}
149
150static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
151{
152 /*
153 * only top level events have the pid namespace they were created in
154 */
155 if (event->parent)
156 event = event->parent;
157
158 return task_pid_nr_ns(p, event->ns);
159}
160
135/* 161/*
136 * If we inherit events we want to return the parent event id 162 * If we inherit events we want to return the parent event id
137 * to userspace. 163 * to userspace.
@@ -311,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
311 ctx->nr_stat++; 337 ctx->nr_stat++;
312} 338}
313 339
340/*
341 * Called at perf_event creation and when events are attached/detached from a
342 * group.
343 */
344static void perf_event__read_size(struct perf_event *event)
345{
346 int entry = sizeof(u64); /* value */
347 int size = 0;
348 int nr = 1;
349
350 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
351 size += sizeof(u64);
352
353 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
354 size += sizeof(u64);
355
356 if (event->attr.read_format & PERF_FORMAT_ID)
357 entry += sizeof(u64);
358
359 if (event->attr.read_format & PERF_FORMAT_GROUP) {
360 nr += event->group_leader->nr_siblings;
361 size += sizeof(u64);
362 }
363
364 size += entry * nr;
365 event->read_size = size;
366}
367
368static void perf_event__header_size(struct perf_event *event)
369{
370 struct perf_sample_data *data;
371 u64 sample_type = event->attr.sample_type;
372 u16 size = 0;
373
374 perf_event__read_size(event);
375
376 if (sample_type & PERF_SAMPLE_IP)
377 size += sizeof(data->ip);
378
379 if (sample_type & PERF_SAMPLE_ADDR)
380 size += sizeof(data->addr);
381
382 if (sample_type & PERF_SAMPLE_PERIOD)
383 size += sizeof(data->period);
384
385 if (sample_type & PERF_SAMPLE_READ)
386 size += event->read_size;
387
388 event->header_size = size;
389}
390
391static void perf_event__id_header_size(struct perf_event *event)
392{
393 struct perf_sample_data *data;
394 u64 sample_type = event->attr.sample_type;
395 u16 size = 0;
396
397 if (sample_type & PERF_SAMPLE_TID)
398 size += sizeof(data->tid_entry);
399
400 if (sample_type & PERF_SAMPLE_TIME)
401 size += sizeof(data->time);
402
403 if (sample_type & PERF_SAMPLE_ID)
404 size += sizeof(data->id);
405
406 if (sample_type & PERF_SAMPLE_STREAM_ID)
407 size += sizeof(data->stream_id);
408
409 if (sample_type & PERF_SAMPLE_CPU)
410 size += sizeof(data->cpu_entry);
411
412 event->id_header_size = size;
413}
414
314static void perf_group_attach(struct perf_event *event) 415static void perf_group_attach(struct perf_event *event)
315{ 416{
316 struct perf_event *group_leader = event->group_leader; 417 struct perf_event *group_leader = event->group_leader, *pos;
317 418
318 /* 419 /*
319 * We can have double attach due to group movement in perf_event_open. 420 * We can have double attach due to group movement in perf_event_open.
@@ -332,6 +433,11 @@ static void perf_group_attach(struct perf_event *event)
332 433
333 list_add_tail(&event->group_entry, &group_leader->sibling_list); 434 list_add_tail(&event->group_entry, &group_leader->sibling_list);
334 group_leader->nr_siblings++; 435 group_leader->nr_siblings++;
436
437 perf_event__header_size(group_leader);
438
439 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
440 perf_event__header_size(pos);
335} 441}
336 442
337/* 443/*
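
Note: perf_event__read_size() precomputes how many bytes a read() or PERF_SAMPLE_READ record occupies, so the value no longer has to be recomputed per sample. The same arithmetic can be checked from user space, assuming only the standard PERF_FORMAT_* bits; the function name below is local to the example:

    #include <stdio.h>
    #include <stdint.h>
    #include <linux/perf_event.h>

    /* One u64 value per counter, an optional u64 id per counter, plus
     * optional global fields (nr, time_enabled, time_running). */
    static int read_size(uint64_t read_format, int nr_siblings)
    {
        int entry = sizeof(uint64_t);   /* value */
        int size = 0;
        int nr = 1;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
            size += sizeof(uint64_t);
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
            size += sizeof(uint64_t);
        if (read_format & PERF_FORMAT_ID)
            entry += sizeof(uint64_t);
        if (read_format & PERF_FORMAT_GROUP) {
            nr += nr_siblings;
            size += sizeof(uint64_t);   /* the leading "nr" field */
        }
        return size + entry * nr;
    }

    int main(void)
    {
        /* Leader plus two siblings, with ids and both times:
         * 8 (nr) + 8 + 8 (times) + 3 * 16 (value+id) = 72 bytes. */
        printf("%d\n", read_size(PERF_FORMAT_GROUP | PERF_FORMAT_ID |
                                 PERF_FORMAT_TOTAL_TIME_ENABLED |
                                 PERF_FORMAT_TOTAL_TIME_RUNNING, 2));
        return 0;
    }
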
@@ -390,7 +496,7 @@ static void perf_group_detach(struct perf_event *event)
390 if (event->group_leader != event) { 496 if (event->group_leader != event) {
391 list_del_init(&event->group_entry); 497 list_del_init(&event->group_entry);
392 event->group_leader->nr_siblings--; 498 event->group_leader->nr_siblings--;
393 return; 499 goto out;
394 } 500 }
395 501
396 if (!list_empty(&event->group_entry)) 502 if (!list_empty(&event->group_entry))
@@ -409,6 +515,12 @@ static void perf_group_detach(struct perf_event *event)
409 /* Inherit group flags from the previous leader */ 515 /* Inherit group flags from the previous leader */
410 sibling->group_flags = event->group_flags; 516 sibling->group_flags = event->group_flags;
411 } 517 }
518
519out:
520 perf_event__header_size(event->group_leader);
521
522 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
523 perf_event__header_size(tmp);
412} 524}
413 525
414static inline int 526static inline int
@@ -674,6 +786,8 @@ event_sched_in(struct perf_event *event,
674 786
675 event->tstamp_running += ctx->time - event->tstamp_stopped; 787 event->tstamp_running += ctx->time - event->tstamp_stopped;
676 788
789 event->shadow_ctx_time = ctx->time - ctx->timestamp;
790
677 if (!is_software_event(event)) 791 if (!is_software_event(event))
678 cpuctx->active_oncpu++; 792 cpuctx->active_oncpu++;
679 ctx->nr_active++; 793 ctx->nr_active++;
@@ -1070,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1070 /* 1184 /*
1071 * not supported on inherited events 1185 * not supported on inherited events
1072 */ 1186 */
1073 if (event->attr.inherit) 1187 if (event->attr.inherit || !is_sampling_event(event))
1074 return -EINVAL; 1188 return -EINVAL;
1075 1189
1076 atomic_add(refresh, &event->event_limit); 1190 atomic_add(refresh, &event->event_limit);
@@ -1284,8 +1398,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1284{ 1398{
1285 int ctxn; 1399 int ctxn;
1286 1400
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn) 1401 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next); 1402 perf_event_context_sched_out(task, ctxn, next);
1291} 1403}
@@ -1619,8 +1731,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1619{ 1731{
1620 raw_spin_lock(&ctx->lock); 1732 raw_spin_lock(&ctx->lock);
1621 1733
1622 /* Rotate the first entry last of non-pinned groups */ 1734 /*
1623 list_rotate_left(&ctx->flexible_groups); 1735 * Rotate the first entry last of non-pinned groups. Rotation might be
1736 * disabled by the inheritance code.
1737 */
1738 if (!ctx->rotate_disable)
1739 list_rotate_left(&ctx->flexible_groups);
1624 1740
1625 raw_spin_unlock(&ctx->lock); 1741 raw_spin_unlock(&ctx->lock);
1626} 1742}
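
Note: the new ctx->rotate_disable flag lets the inheritance path further down walk ->flexible_groups without holding ctx->lock (it may allocate and sleep) while still keeping rotate_ctx() from reordering the list underneath it. A small pthread analogue of the "disable flag checked under the lock" pattern; all names are local to the example:

    #include <pthread.h>
    #include <stdio.h>

    struct ctx {
        pthread_mutex_t lock;
        int rotate_disable;
        int groups[4];              /* stand-in for flexible_groups */
    };

    /* Rotate the first entry last, unless a walker disabled rotation. */
    static void rotate_ctx(struct ctx *c)
    {
        pthread_mutex_lock(&c->lock);
        if (!c->rotate_disable) {
            int i, first = c->groups[0];
            for (i = 0; i < 3; i++)
                c->groups[i] = c->groups[i + 1];
            c->groups[3] = first;
        }
        pthread_mutex_unlock(&c->lock);
    }

    /* Walk without the lock held, bracketed by disable/enable. */
    static void walk_groups(struct ctx *c)
    {
        int i;

        pthread_mutex_lock(&c->lock);
        c->rotate_disable = 1;
        pthread_mutex_unlock(&c->lock);

        for (i = 0; i < 4; i++)     /* may sleep or allocate here */
            printf("group %d\n", c->groups[i]);

        pthread_mutex_lock(&c->lock);
        c->rotate_disable = 0;
        pthread_mutex_unlock(&c->lock);
    }

    int main(void)
    {
        struct ctx c = { PTHREAD_MUTEX_INITIALIZER, 0, { 1, 2, 3, 4 } };

        walk_groups(&c);
        rotate_ctx(&c);
        walk_groups(&c);
        return 0;
    }
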
@@ -2232,11 +2348,6 @@ int perf_event_release_kernel(struct perf_event *event)
2232 raw_spin_unlock_irq(&ctx->lock); 2348 raw_spin_unlock_irq(&ctx->lock);
2233 mutex_unlock(&ctx->mutex); 2349 mutex_unlock(&ctx->mutex);
2234 2350
2235 mutex_lock(&event->owner->perf_event_mutex);
2236 list_del_init(&event->owner_entry);
2237 mutex_unlock(&event->owner->perf_event_mutex);
2238 put_task_struct(event->owner);
2239
2240 free_event(event); 2351 free_event(event);
2241 2352
2242 return 0; 2353 return 0;
@@ -2249,35 +2360,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2249static int perf_release(struct inode *inode, struct file *file) 2360static int perf_release(struct inode *inode, struct file *file)
2250{ 2361{
2251 struct perf_event *event = file->private_data; 2362 struct perf_event *event = file->private_data;
2363 struct task_struct *owner;
2252 2364
2253 file->private_data = NULL; 2365 file->private_data = NULL;
2254 2366
2255 return perf_event_release_kernel(event); 2367 rcu_read_lock();
2256} 2368 owner = ACCESS_ONCE(event->owner);
2257 2369 /*
2258static int perf_event_read_size(struct perf_event *event) 2370 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2259{ 2371 * !owner it means the list deletion is complete and we can indeed
2260 int entry = sizeof(u64); /* value */ 2372 * free this event, otherwise we need to serialize on
2261 int size = 0; 2373 * owner->perf_event_mutex.
2262 int nr = 1; 2374 */
2263 2375 smp_read_barrier_depends();
2264 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2376 if (owner) {
2265 size += sizeof(u64); 2377 /*
2266 2378 * Since delayed_put_task_struct() also drops the last
2267 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2379 * task reference we can safely take a new reference
2268 size += sizeof(u64); 2380 * while holding the rcu_read_lock().
2269 2381 */
2270 if (event->attr.read_format & PERF_FORMAT_ID) 2382 get_task_struct(owner);
2271 entry += sizeof(u64);
2272
2273 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2274 nr += event->group_leader->nr_siblings;
2275 size += sizeof(u64);
2276 } 2383 }
2384 rcu_read_unlock();
2277 2385
2278 size += entry * nr; 2386 if (owner) {
2387 mutex_lock(&owner->perf_event_mutex);
2388 /*
2389 * We have to re-check the event->owner field: if it is cleared
2390 * we raced with perf_event_exit_task(); acquiring the mutex
2391 * ensured they're done, and we can proceed with freeing the
2392 * event.
2393 */
2394 if (event->owner)
2395 list_del_init(&event->owner_entry);
2396 mutex_unlock(&owner->perf_event_mutex);
2397 put_task_struct(owner);
2398 }
2279 2399
2280 return size; 2400 return perf_event_release_kernel(event);
2281} 2401}
2282 2402
2283u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2403u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
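
Note: together with the perf_event_exit_task() hunk further down, this establishes a lock-free hand-off of the owner link: the exiting task unlinks owner_entry, issues smp_wmb(), then clears event->owner; the release path reads event->owner under RCU and, only if it is still set, takes owner->perf_event_mutex and re-checks before unlinking. A rough userspace analogue of that publish/re-check pattern using C11 release/acquire atomics in place of smp_wmb()/smp_read_barrier_depends(); the RCU-based task-lifetime handling (get_task_struct() under rcu_read_lock()) is left out, and all names are local to the example:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct owner {
        pthread_mutex_t mutex;      /* owner->perf_event_mutex */
        int on_list;                /* stand-in for owner_entry */
    };

    struct event {
        _Atomic(struct owner *) owner;
    };

    /* Exiting-owner side: unlink first, then publish "no owner". */
    static void owner_exit(struct event *ev, struct owner *o)
    {
        pthread_mutex_lock(&o->mutex);
        o->on_list = 0;                                  /* list_del_init() */
        atomic_store_explicit(&ev->owner, NULL,
                              memory_order_release);     /* smp_wmb(); owner = NULL */
        pthread_mutex_unlock(&o->mutex);
    }

    /* Release side: only touch the owner if it is still published. */
    static void event_release(struct event *ev)
    {
        struct owner *o = atomic_load_explicit(&ev->owner,
                                               memory_order_acquire);

        if (o) {
            pthread_mutex_lock(&o->mutex);
            /* Re-check: the owner may have exited while we waited. */
            if (atomic_load_explicit(&ev->owner, memory_order_relaxed))
                o->on_list = 0;
            pthread_mutex_unlock(&o->mutex);
        }
        printf("event freed\n");
    }

    int main(void)
    {
        struct owner o = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct event ev;

        atomic_init(&ev.owner, &o);
        event_release(&ev);
        owner_exit(&ev, &o);
        return 0;
    }
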
@@ -2394,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2394 if (event->state == PERF_EVENT_STATE_ERROR) 2514 if (event->state == PERF_EVENT_STATE_ERROR)
2395 return 0; 2515 return 0;
2396 2516
2397 if (count < perf_event_read_size(event)) 2517 if (count < event->read_size)
2398 return -ENOSPC; 2518 return -ENOSPC;
2399 2519
2400 WARN_ON_ONCE(event->ctx->parent_ctx); 2520 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2480,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2480 int ret = 0; 2600 int ret = 0;
2481 u64 value; 2601 u64 value;
2482 2602
2483 if (!event->attr.sample_period) 2603 if (!is_sampling_event(event))
2484 return -EINVAL; 2604 return -EINVAL;
2485 2605
2486 if (copy_from_user(&value, arg, sizeof(value))) 2606 if (copy_from_user(&value, arg, sizeof(value)))
@@ -3271,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3271 } while (len); 3391 } while (len);
3272} 3392}
3273 3393
3394static void __perf_event_header__init_id(struct perf_event_header *header,
3395 struct perf_sample_data *data,
3396 struct perf_event *event)
3397{
3398 u64 sample_type = event->attr.sample_type;
3399
3400 data->type = sample_type;
3401 header->size += event->id_header_size;
3402
3403 if (sample_type & PERF_SAMPLE_TID) {
3404 /* namespace issues */
3405 data->tid_entry.pid = perf_event_pid(event, current);
3406 data->tid_entry.tid = perf_event_tid(event, current);
3407 }
3408
3409 if (sample_type & PERF_SAMPLE_TIME)
3410 data->time = perf_clock();
3411
3412 if (sample_type & PERF_SAMPLE_ID)
3413 data->id = primary_event_id(event);
3414
3415 if (sample_type & PERF_SAMPLE_STREAM_ID)
3416 data->stream_id = event->id;
3417
3418 if (sample_type & PERF_SAMPLE_CPU) {
3419 data->cpu_entry.cpu = raw_smp_processor_id();
3420 data->cpu_entry.reserved = 0;
3421 }
3422}
3423
3424static void perf_event_header__init_id(struct perf_event_header *header,
3425 struct perf_sample_data *data,
3426 struct perf_event *event)
3427{
3428 if (event->attr.sample_id_all)
3429 __perf_event_header__init_id(header, data, event);
3430}
3431
3432static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3433 struct perf_sample_data *data)
3434{
3435 u64 sample_type = data->type;
3436
3437 if (sample_type & PERF_SAMPLE_TID)
3438 perf_output_put(handle, data->tid_entry);
3439
3440 if (sample_type & PERF_SAMPLE_TIME)
3441 perf_output_put(handle, data->time);
3442
3443 if (sample_type & PERF_SAMPLE_ID)
3444 perf_output_put(handle, data->id);
3445
3446 if (sample_type & PERF_SAMPLE_STREAM_ID)
3447 perf_output_put(handle, data->stream_id);
3448
3449 if (sample_type & PERF_SAMPLE_CPU)
3450 perf_output_put(handle, data->cpu_entry);
3451}
3452
3453static void perf_event__output_id_sample(struct perf_event *event,
3454 struct perf_output_handle *handle,
3455 struct perf_sample_data *sample)
3456{
3457 if (event->attr.sample_id_all)
3458 __perf_event__output_id_sample(handle, sample);
3459}
3460
3274int perf_output_begin(struct perf_output_handle *handle, 3461int perf_output_begin(struct perf_output_handle *handle,
3275 struct perf_event *event, unsigned int size, 3462 struct perf_event *event, unsigned int size,
3276 int nmi, int sample) 3463 int nmi, int sample)
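
Note: with attr.sample_id_all set, every non-sample record (MMAP, COMM, LOST, THROTTLE, ...) now carries a trailer of the selected identification fields, written in the fixed order used by __perf_event__output_id_sample() above. A sketch of how a ring-buffer consumer might size that trailer, assuming only the standard PERF_SAMPLE_* bits; the helper is local to the example:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>
    #include <linux/perf_event.h>

    /* Bytes appended after a non-sample record when sample_id_all is
     * set, in the order: pid/tid, time, id, stream_id, cpu/res. */
    static size_t id_sample_size(uint64_t sample_type)
    {
        size_t size = 0;

        if (sample_type & PERF_SAMPLE_TID)
            size += 2 * sizeof(uint32_t);   /* pid, tid */
        if (sample_type & PERF_SAMPLE_TIME)
            size += sizeof(uint64_t);
        if (sample_type & PERF_SAMPLE_ID)
            size += sizeof(uint64_t);
        if (sample_type & PERF_SAMPLE_STREAM_ID)
            size += sizeof(uint64_t);
        if (sample_type & PERF_SAMPLE_CPU)
            size += 2 * sizeof(uint32_t);   /* cpu, reserved */
        return size;
    }

    int main(void)
    {
        uint64_t st = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CPU;

        /* 8 (pid/tid) + 8 (time) + 8 (cpu/res) = 24 trailer bytes. */
        printf("%zu\n", id_sample_size(st));
        return 0;
    }
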
@@ -3278,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3278 struct perf_buffer *buffer; 3465 struct perf_buffer *buffer;
3279 unsigned long tail, offset, head; 3466 unsigned long tail, offset, head;
3280 int have_lost; 3467 int have_lost;
3468 struct perf_sample_data sample_data;
3281 struct { 3469 struct {
3282 struct perf_event_header header; 3470 struct perf_event_header header;
3283 u64 id; 3471 u64 id;
@@ -3304,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3304 goto out; 3492 goto out;
3305 3493
3306 have_lost = local_read(&buffer->lost); 3494 have_lost = local_read(&buffer->lost);
3307 if (have_lost) 3495 if (have_lost) {
3308 size += sizeof(lost_event); 3496 lost_event.header.size = sizeof(lost_event);
3497 perf_event_header__init_id(&lost_event.header, &sample_data,
3498 event);
3499 size += lost_event.header.size;
3500 }
3309 3501
3310 perf_output_get_handle(handle); 3502 perf_output_get_handle(handle);
3311 3503
@@ -3336,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3336 if (have_lost) { 3528 if (have_lost) {
3337 lost_event.header.type = PERF_RECORD_LOST; 3529 lost_event.header.type = PERF_RECORD_LOST;
3338 lost_event.header.misc = 0; 3530 lost_event.header.misc = 0;
3339 lost_event.header.size = sizeof(lost_event);
3340 lost_event.id = event->id; 3531 lost_event.id = event->id;
3341 lost_event.lost = local_xchg(&buffer->lost, 0); 3532 lost_event.lost = local_xchg(&buffer->lost, 0);
3342 3533
3343 perf_output_put(handle, lost_event); 3534 perf_output_put(handle, lost_event);
3535 perf_event__output_id_sample(event, handle, &sample_data);
3344 } 3536 }
3345 3537
3346 return 0; 3538 return 0;
@@ -3373,30 +3565,9 @@ void perf_output_end(struct perf_output_handle *handle)
3373 rcu_read_unlock(); 3565 rcu_read_unlock();
3374} 3566}
3375 3567
3376static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3377{
3378 /*
3379 * only top level events have the pid namespace they were created in
3380 */
3381 if (event->parent)
3382 event = event->parent;
3383
3384 return task_tgid_nr_ns(p, event->ns);
3385}
3386
3387static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3388{
3389 /*
3390 * only top level events have the pid namespace they were created in
3391 */
3392 if (event->parent)
3393 event = event->parent;
3394
3395 return task_pid_nr_ns(p, event->ns);
3396}
3397
3398static void perf_output_read_one(struct perf_output_handle *handle, 3568static void perf_output_read_one(struct perf_output_handle *handle,
3399 struct perf_event *event) 3569 struct perf_event *event,
3570 u64 enabled, u64 running)
3400{ 3571{
3401 u64 read_format = event->attr.read_format; 3572 u64 read_format = event->attr.read_format;
3402 u64 values[4]; 3573 u64 values[4];
@@ -3404,11 +3575,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3404 3575
3405 values[n++] = perf_event_count(event); 3576 values[n++] = perf_event_count(event);
3406 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3577 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3407 values[n++] = event->total_time_enabled + 3578 values[n++] = enabled +
3408 atomic64_read(&event->child_total_time_enabled); 3579 atomic64_read(&event->child_total_time_enabled);
3409 } 3580 }
3410 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3581 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3411 values[n++] = event->total_time_running + 3582 values[n++] = running +
3412 atomic64_read(&event->child_total_time_running); 3583 atomic64_read(&event->child_total_time_running);
3413 } 3584 }
3414 if (read_format & PERF_FORMAT_ID) 3585 if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3592,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3421 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3592 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3422 */ 3593 */
3423static void perf_output_read_group(struct perf_output_handle *handle, 3594static void perf_output_read_group(struct perf_output_handle *handle,
3424 struct perf_event *event) 3595 struct perf_event *event,
3596 u64 enabled, u64 running)
3425{ 3597{
3426 struct perf_event *leader = event->group_leader, *sub; 3598 struct perf_event *leader = event->group_leader, *sub;
3427 u64 read_format = event->attr.read_format; 3599 u64 read_format = event->attr.read_format;
@@ -3431,10 +3603,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3431 values[n++] = 1 + leader->nr_siblings; 3603 values[n++] = 1 + leader->nr_siblings;
3432 3604
3433 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3605 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3434 values[n++] = leader->total_time_enabled; 3606 values[n++] = enabled;
3435 3607
3436 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3608 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3437 values[n++] = leader->total_time_running; 3609 values[n++] = running;
3438 3610
3439 if (leader != event) 3611 if (leader != event)
3440 leader->pmu->read(leader); 3612 leader->pmu->read(leader);
@@ -3459,13 +3631,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3459 } 3631 }
3460} 3632}
3461 3633
3634#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3635 PERF_FORMAT_TOTAL_TIME_RUNNING)
3636
3462static void perf_output_read(struct perf_output_handle *handle, 3637static void perf_output_read(struct perf_output_handle *handle,
3463 struct perf_event *event) 3638 struct perf_event *event)
3464{ 3639{
3640 u64 enabled = 0, running = 0, now, ctx_time;
3641 u64 read_format = event->attr.read_format;
3642
3643 /*
3644 * compute total_time_enabled, total_time_running
3645 * based on snapshot values taken when the event
3646 * was last scheduled in.
3647 *
3648 * we cannot simply call update_context_time()
3649 * because of locking issues, as we are called in
3650 * NMI context
3651 */
3652 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3653 now = perf_clock();
3654 ctx_time = event->shadow_ctx_time + now;
3655 enabled = ctx_time - event->tstamp_enabled;
3656 running = ctx_time - event->tstamp_running;
3657 }
3658
3465 if (event->attr.read_format & PERF_FORMAT_GROUP) 3659 if (event->attr.read_format & PERF_FORMAT_GROUP)
3466 perf_output_read_group(handle, event); 3660 perf_output_read_group(handle, event, enabled, running);
3467 else 3661 else
3468 perf_output_read_one(handle, event); 3662 perf_output_read_one(handle, event, enabled, running);
3469} 3663}
3470 3664
3471void perf_output_sample(struct perf_output_handle *handle, 3665void perf_output_sample(struct perf_output_handle *handle,
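
Note: because this path can run in NMI context, the enabled/running times are reconstructed from the shadow_ctx_time snapshot taken at sched-in (event->shadow_ctx_time = ctx->time - ctx->timestamp) instead of calling update_context_time() under the lock. A small numeric check of that arithmetic with made-up timestamps (the unsigned wrap-around is intentional):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* At sched-in: the context had accumulated 100us of time and
         * the wall clock read 1000us. */
        uint64_t ctx_time_at_schedin = 100, timestamp = 1000;
        uint64_t shadow_ctx_time = ctx_time_at_schedin - timestamp;

        /* Event bookkeeping, expressed in context time. */
        uint64_t tstamp_enabled = 40, tstamp_running = 60;

        /* In the NMI, 50us of wall clock later. */
        uint64_t now = 1050;
        uint64_t ctx_time = shadow_ctx_time + now;  /* 150us of context time */

        printf("enabled=%llu running=%llu\n",
               (unsigned long long)(ctx_time - tstamp_enabled),   /* 110 */
               (unsigned long long)(ctx_time - tstamp_running));  /*  90 */
        return 0;
    }
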
@@ -3545,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3545{ 3739{
3546 u64 sample_type = event->attr.sample_type; 3740 u64 sample_type = event->attr.sample_type;
3547 3741
3548 data->type = sample_type;
3549
3550 header->type = PERF_RECORD_SAMPLE; 3742 header->type = PERF_RECORD_SAMPLE;
3551 header->size = sizeof(*header); 3743 header->size = sizeof(*header) + event->header_size;
3552 3744
3553 header->misc = 0; 3745 header->misc = 0;
3554 header->misc |= perf_misc_flags(regs); 3746 header->misc |= perf_misc_flags(regs);
3555 3747
3556 if (sample_type & PERF_SAMPLE_IP) { 3748 __perf_event_header__init_id(header, data, event);
3557 data->ip = perf_instruction_pointer(regs);
3558
3559 header->size += sizeof(data->ip);
3560 }
3561
3562 if (sample_type & PERF_SAMPLE_TID) {
3563 /* namespace issues */
3564 data->tid_entry.pid = perf_event_pid(event, current);
3565 data->tid_entry.tid = perf_event_tid(event, current);
3566
3567 header->size += sizeof(data->tid_entry);
3568 }
3569
3570 if (sample_type & PERF_SAMPLE_TIME) {
3571 data->time = perf_clock();
3572
3573 header->size += sizeof(data->time);
3574 }
3575
3576 if (sample_type & PERF_SAMPLE_ADDR)
3577 header->size += sizeof(data->addr);
3578
3579 if (sample_type & PERF_SAMPLE_ID) {
3580 data->id = primary_event_id(event);
3581
3582 header->size += sizeof(data->id);
3583 }
3584
3585 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3586 data->stream_id = event->id;
3587 3749
3588 header->size += sizeof(data->stream_id); 3750 if (sample_type & PERF_SAMPLE_IP)
3589 } 3751 data->ip = perf_instruction_pointer(regs);
3590
3591 if (sample_type & PERF_SAMPLE_CPU) {
3592 data->cpu_entry.cpu = raw_smp_processor_id();
3593 data->cpu_entry.reserved = 0;
3594
3595 header->size += sizeof(data->cpu_entry);
3596 }
3597
3598 if (sample_type & PERF_SAMPLE_PERIOD)
3599 header->size += sizeof(data->period);
3600
3601 if (sample_type & PERF_SAMPLE_READ)
3602 header->size += perf_event_read_size(event);
3603 3752
3604 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3753 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3605 int size = 1; 3754 int size = 1;
@@ -3664,23 +3813,26 @@ perf_event_read_event(struct perf_event *event,
3664 struct task_struct *task) 3813 struct task_struct *task)
3665{ 3814{
3666 struct perf_output_handle handle; 3815 struct perf_output_handle handle;
3816 struct perf_sample_data sample;
3667 struct perf_read_event read_event = { 3817 struct perf_read_event read_event = {
3668 .header = { 3818 .header = {
3669 .type = PERF_RECORD_READ, 3819 .type = PERF_RECORD_READ,
3670 .misc = 0, 3820 .misc = 0,
3671 .size = sizeof(read_event) + perf_event_read_size(event), 3821 .size = sizeof(read_event) + event->read_size,
3672 }, 3822 },
3673 .pid = perf_event_pid(event, task), 3823 .pid = perf_event_pid(event, task),
3674 .tid = perf_event_tid(event, task), 3824 .tid = perf_event_tid(event, task),
3675 }; 3825 };
3676 int ret; 3826 int ret;
3677 3827
3828 perf_event_header__init_id(&read_event.header, &sample, event);
3678 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3829 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3679 if (ret) 3830 if (ret)
3680 return; 3831 return;
3681 3832
3682 perf_output_put(&handle, read_event); 3833 perf_output_put(&handle, read_event);
3683 perf_output_read(&handle, event); 3834 perf_output_read(&handle, event);
3835 perf_event__output_id_sample(event, &handle, &sample);
3684 3836
3685 perf_output_end(&handle); 3837 perf_output_end(&handle);
3686} 3838}
@@ -3710,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event,
3710 struct perf_task_event *task_event) 3862 struct perf_task_event *task_event)
3711{ 3863{
3712 struct perf_output_handle handle; 3864 struct perf_output_handle handle;
3865 struct perf_sample_data sample;
3713 struct task_struct *task = task_event->task; 3866 struct task_struct *task = task_event->task;
3714 int size, ret; 3867 int ret, size = task_event->event_id.header.size;
3715 3868
3716 size = task_event->event_id.header.size; 3869 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3717 ret = perf_output_begin(&handle, event, size, 0, 0);
3718 3870
3871 ret = perf_output_begin(&handle, event,
3872 task_event->event_id.header.size, 0, 0);
3719 if (ret) 3873 if (ret)
3720 return; 3874 goto out;
3721 3875
3722 task_event->event_id.pid = perf_event_pid(event, task); 3876 task_event->event_id.pid = perf_event_pid(event, task);
3723 task_event->event_id.ppid = perf_event_pid(event, current); 3877 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3727,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event,
3727 3881
3728 perf_output_put(&handle, task_event->event_id); 3882 perf_output_put(&handle, task_event->event_id);
3729 3883
3884 perf_event__output_id_sample(event, &handle, &sample);
3885
3730 perf_output_end(&handle); 3886 perf_output_end(&handle);
3887out:
3888 task_event->event_id.header.size = size;
3731} 3889}
3732 3890
3733static int perf_event_task_match(struct perf_event *event) 3891static int perf_event_task_match(struct perf_event *event)
@@ -3766,6 +3924,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3766 rcu_read_lock(); 3924 rcu_read_lock();
3767 list_for_each_entry_rcu(pmu, &pmus, entry) { 3925 list_for_each_entry_rcu(pmu, &pmus, entry) {
3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3926 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3927 if (cpuctx->active_pmu != pmu)
3928 goto next;
3769 perf_event_task_ctx(&cpuctx->ctx, task_event); 3929 perf_event_task_ctx(&cpuctx->ctx, task_event);
3770 3930
3771 ctx = task_event->task_ctx; 3931 ctx = task_event->task_ctx;
@@ -3840,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event,
3840 struct perf_comm_event *comm_event) 4000 struct perf_comm_event *comm_event)
3841{ 4001{
3842 struct perf_output_handle handle; 4002 struct perf_output_handle handle;
4003 struct perf_sample_data sample;
3843 int size = comm_event->event_id.header.size; 4004 int size = comm_event->event_id.header.size;
3844 int ret = perf_output_begin(&handle, event, size, 0, 0); 4005 int ret;
4006
4007 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4008 ret = perf_output_begin(&handle, event,
4009 comm_event->event_id.header.size, 0, 0);
3845 4010
3846 if (ret) 4011 if (ret)
3847 return; 4012 goto out;
3848 4013
3849 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4014 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3850 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4015 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3852,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event,
3852 perf_output_put(&handle, comm_event->event_id); 4017 perf_output_put(&handle, comm_event->event_id);
3853 perf_output_copy(&handle, comm_event->comm, 4018 perf_output_copy(&handle, comm_event->comm,
3854 comm_event->comm_size); 4019 comm_event->comm_size);
4020
4021 perf_event__output_id_sample(event, &handle, &sample);
4022
3855 perf_output_end(&handle); 4023 perf_output_end(&handle);
4024out:
4025 comm_event->event_id.header.size = size;
3856} 4026}
3857 4027
3858static int perf_event_comm_match(struct perf_event *event) 4028static int perf_event_comm_match(struct perf_event *event)
@@ -3897,10 +4067,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3897 comm_event->comm_size = size; 4067 comm_event->comm_size = size;
3898 4068
3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4069 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3900
3901 rcu_read_lock(); 4070 rcu_read_lock();
3902 list_for_each_entry_rcu(pmu, &pmus, entry) { 4071 list_for_each_entry_rcu(pmu, &pmus, entry) {
3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4072 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4073 if (cpuctx->active_pmu != pmu)
4074 goto next;
3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4075 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3905 4076
3906 ctxn = pmu->task_ctx_nr; 4077 ctxn = pmu->task_ctx_nr;
@@ -3976,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event,
3976 struct perf_mmap_event *mmap_event) 4147 struct perf_mmap_event *mmap_event)
3977{ 4148{
3978 struct perf_output_handle handle; 4149 struct perf_output_handle handle;
4150 struct perf_sample_data sample;
3979 int size = mmap_event->event_id.header.size; 4151 int size = mmap_event->event_id.header.size;
3980 int ret = perf_output_begin(&handle, event, size, 0, 0); 4152 int ret;
3981 4153
4154 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4155 ret = perf_output_begin(&handle, event,
4156 mmap_event->event_id.header.size, 0, 0);
3982 if (ret) 4157 if (ret)
3983 return; 4158 goto out;
3984 4159
3985 mmap_event->event_id.pid = perf_event_pid(event, current); 4160 mmap_event->event_id.pid = perf_event_pid(event, current);
3986 mmap_event->event_id.tid = perf_event_tid(event, current); 4161 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -3988,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3988 perf_output_put(&handle, mmap_event->event_id); 4163 perf_output_put(&handle, mmap_event->event_id);
3989 perf_output_copy(&handle, mmap_event->file_name, 4164 perf_output_copy(&handle, mmap_event->file_name,
3990 mmap_event->file_size); 4165 mmap_event->file_size);
4166
4167 perf_event__output_id_sample(event, &handle, &sample);
4168
3991 perf_output_end(&handle); 4169 perf_output_end(&handle);
4170out:
4171 mmap_event->event_id.header.size = size;
3992} 4172}
3993 4173
3994static int perf_event_mmap_match(struct perf_event *event, 4174static int perf_event_mmap_match(struct perf_event *event,
@@ -4086,6 +4266,8 @@ got_name:
4086 rcu_read_lock(); 4266 rcu_read_lock();
4087 list_for_each_entry_rcu(pmu, &pmus, entry) { 4267 list_for_each_entry_rcu(pmu, &pmus, entry) {
4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4268 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4269 if (cpuctx->active_pmu != pmu)
4270 goto next;
4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4271 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4090 vma->vm_flags & VM_EXEC); 4272 vma->vm_flags & VM_EXEC);
4091 4273
@@ -4141,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4141static void perf_log_throttle(struct perf_event *event, int enable) 4323static void perf_log_throttle(struct perf_event *event, int enable)
4142{ 4324{
4143 struct perf_output_handle handle; 4325 struct perf_output_handle handle;
4326 struct perf_sample_data sample;
4144 int ret; 4327 int ret;
4145 4328
4146 struct { 4329 struct {
@@ -4162,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4162 if (enable) 4345 if (enable)
4163 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4346 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4164 4347
4165 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4348 perf_event_header__init_id(&throttle_event.header, &sample, event);
4349
4350 ret = perf_output_begin(&handle, event,
4351 throttle_event.header.size, 1, 0);
4166 if (ret) 4352 if (ret)
4167 return; 4353 return;
4168 4354
4169 perf_output_put(&handle, throttle_event); 4355 perf_output_put(&handle, throttle_event);
4356 perf_event__output_id_sample(event, &handle, &sample);
4170 perf_output_end(&handle); 4357 perf_output_end(&handle);
4171} 4358}
4172 4359
@@ -4182,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4182 struct hw_perf_event *hwc = &event->hw; 4369 struct hw_perf_event *hwc = &event->hw;
4183 int ret = 0; 4370 int ret = 0;
4184 4371
4372 /*
4373 * Non-sampling counters might still use the PMI to fold short
4374 * hardware counters, ignore those.
4375 */
4376 if (unlikely(!is_sampling_event(event)))
4377 return 0;
4378
4185 if (!throttle) { 4379 if (!throttle) {
4186 hwc->interrupts++; 4380 hwc->interrupts++;
4187 } else { 4381 } else {
@@ -4327,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4327 if (!regs) 4521 if (!regs)
4328 return; 4522 return;
4329 4523
4330 if (!hwc->sample_period) 4524 if (!is_sampling_event(event))
4331 return; 4525 return;
4332 4526
4333 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4527 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4490,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4490 struct hw_perf_event *hwc = &event->hw; 4684 struct hw_perf_event *hwc = &event->hw;
4491 struct hlist_head *head; 4685 struct hlist_head *head;
4492 4686
4493 if (hwc->sample_period) { 4687 if (is_sampling_event(event)) {
4494 hwc->last_period = hwc->sample_period; 4688 hwc->last_period = hwc->sample_period;
4495 perf_swevent_set_period(event); 4689 perf_swevent_set_period(event);
4496 } 4690 }
@@ -4655,7 +4849,7 @@ static int perf_swevent_init(struct perf_event *event)
4655 break; 4849 break;
4656 } 4850 }
4657 4851
4658 if (event_id > PERF_COUNT_SW_MAX) 4852 if (event_id >= PERF_COUNT_SW_MAX)
4659 return -ENOENT; 4853 return -ENOENT;
4660 4854
4661 if (!event->parent) { 4855 if (!event->parent) {
@@ -4747,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event)
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4941 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT; 4942 return -ENOENT;
4749 4943
4750 /*
4751 * Raw tracepoint data is a severe data leak, only allow root to
4752 * have these.
4753 */
4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4755 perf_paranoid_tracepoint_raw() &&
4756 !capable(CAP_SYS_ADMIN))
4757 return -EPERM;
4758
4759 err = perf_trace_init(event); 4944 err = perf_trace_init(event);
4760 if (err) 4945 if (err)
4761 return err; 4946 return err;
@@ -4778,7 +4963,7 @@ static struct pmu perf_tracepoint = {
4778 4963
4779static inline void perf_tp_register(void) 4964static inline void perf_tp_register(void)
4780{ 4965{
4781 perf_pmu_register(&perf_tracepoint); 4966 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4782} 4967}
4783 4968
4784static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4969static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4868,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4868static void perf_swevent_start_hrtimer(struct perf_event *event) 5053static void perf_swevent_start_hrtimer(struct perf_event *event)
4869{ 5054{
4870 struct hw_perf_event *hwc = &event->hw; 5055 struct hw_perf_event *hwc = &event->hw;
5056 s64 period;
5057
5058 if (!is_sampling_event(event))
5059 return;
4871 5060
4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5061 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer; 5062 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4876 5063
4877 if (period) { 5064 period = local64_read(&hwc->period_left);
4878 if (period < 0) 5065 if (period) {
4879 period = 10000; 5066 if (period < 0)
5067 period = 10000;
4880 5068
4881 local64_set(&hwc->period_left, 0); 5069 local64_set(&hwc->period_left, 0);
4882 } else { 5070 } else {
4883 period = max_t(u64, 10000, hwc->sample_period); 5071 period = max_t(u64, 10000, hwc->sample_period);
4884 } 5072 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer, 5073 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0, 5074 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0); 5075 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4889} 5076}
4890 5077
4891static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5078static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4892{ 5079{
4893 struct hw_perf_event *hwc = &event->hw; 5080 struct hw_perf_event *hwc = &event->hw;
4894 5081
4895 if (hwc->sample_period) { 5082 if (is_sampling_event(event)) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5083 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5084 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898 5085
@@ -5087,25 +5274,94 @@ static void *find_pmu_context(int ctxn)
5087 return NULL; 5274 return NULL;
5088} 5275}
5089 5276
5090static void free_pmu_context(void * __percpu cpu_context) 5277static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5091{ 5278{
5092 struct pmu *pmu; 5279 int cpu;
5280
5281 for_each_possible_cpu(cpu) {
5282 struct perf_cpu_context *cpuctx;
5283
5284 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5285
5286 if (cpuctx->active_pmu == old_pmu)
5287 cpuctx->active_pmu = pmu;
5288 }
5289}
5290
5291static void free_pmu_context(struct pmu *pmu)
5292{
5293 struct pmu *i;
5093 5294
5094 mutex_lock(&pmus_lock); 5295 mutex_lock(&pmus_lock);
5095 /* 5296 /*
5096 * Like a real lame refcount. 5297 * Like a real lame refcount.
5097 */ 5298 */
5098 list_for_each_entry(pmu, &pmus, entry) { 5299 list_for_each_entry(i, &pmus, entry) {
5099 if (pmu->pmu_cpu_context == cpu_context) 5300 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5301 update_pmu_context(i, pmu);
5100 goto out; 5302 goto out;
5303 }
5101 } 5304 }
5102 5305
5103 free_percpu(cpu_context); 5306 free_percpu(pmu->pmu_cpu_context);
5104out: 5307out:
5105 mutex_unlock(&pmus_lock); 5308 mutex_unlock(&pmus_lock);
5106} 5309}
5310static struct idr pmu_idr;
5311
5312static ssize_t
5313type_show(struct device *dev, struct device_attribute *attr, char *page)
5314{
5315 struct pmu *pmu = dev_get_drvdata(dev);
5316
5317 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5318}
5319
5320static struct device_attribute pmu_dev_attrs[] = {
5321 __ATTR_RO(type),
5322 __ATTR_NULL,
5323};
5107 5324
5108int perf_pmu_register(struct pmu *pmu) 5325static int pmu_bus_running;
5326static struct bus_type pmu_bus = {
5327 .name = "event_source",
5328 .dev_attrs = pmu_dev_attrs,
5329};
5330
5331static void pmu_dev_release(struct device *dev)
5332{
5333 kfree(dev);
5334}
5335
5336static int pmu_dev_alloc(struct pmu *pmu)
5337{
5338 int ret = -ENOMEM;
5339
5340 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5341 if (!pmu->dev)
5342 goto out;
5343
5344 device_initialize(pmu->dev);
5345 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5346 if (ret)
5347 goto free_dev;
5348
5349 dev_set_drvdata(pmu->dev, pmu);
5350 pmu->dev->bus = &pmu_bus;
5351 pmu->dev->release = pmu_dev_release;
5352 ret = device_add(pmu->dev);
5353 if (ret)
5354 goto free_dev;
5355
5356out:
5357 return ret;
5358
5359free_dev:
5360 put_device(pmu->dev);
5361 goto out;
5362}
5363
5364int perf_pmu_register(struct pmu *pmu, char *name, int type)
5109{ 5365{
5110 int cpu, ret; 5366 int cpu, ret;
5111 5367
@@ -5115,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu)
5115 if (!pmu->pmu_disable_count) 5371 if (!pmu->pmu_disable_count)
5116 goto unlock; 5372 goto unlock;
5117 5373
5374 pmu->type = -1;
5375 if (!name)
5376 goto skip_type;
5377 pmu->name = name;
5378
5379 if (type < 0) {
5380 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5381 if (!err)
5382 goto free_pdc;
5383
5384 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5385 if (err) {
5386 ret = err;
5387 goto free_pdc;
5388 }
5389 }
5390 pmu->type = type;
5391
5392 if (pmu_bus_running) {
5393 ret = pmu_dev_alloc(pmu);
5394 if (ret)
5395 goto free_idr;
5396 }
5397
5398skip_type:
5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5399 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5119 if (pmu->pmu_cpu_context) 5400 if (pmu->pmu_cpu_context)
5120 goto got_cpu_context; 5401 goto got_cpu_context;
5121 5402
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5403 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context) 5404 if (!pmu->pmu_cpu_context)
5124 goto free_pdc; 5405 goto free_dev;
5125 5406
5126 for_each_possible_cpu(cpu) { 5407 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx; 5408 struct perf_cpu_context *cpuctx;
@@ -5132,6 +5413,7 @@ int perf_pmu_register(struct pmu *pmu)
5132 cpuctx->ctx.pmu = pmu; 5413 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1; 5414 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list); 5415 INIT_LIST_HEAD(&cpuctx->rotation_list);
5416 cpuctx->active_pmu = pmu;
5135 } 5417 }
5136 5418
5137got_cpu_context: 5419got_cpu_context:
@@ -5164,6 +5446,14 @@ unlock:
5164 5446
5165 return ret; 5447 return ret;
5166 5448
5449free_dev:
5450 device_del(pmu->dev);
5451 put_device(pmu->dev);
5452
5453free_idr:
5454 if (pmu->type >= PERF_TYPE_MAX)
5455 idr_remove(&pmu_idr, pmu->type);
5456
5167free_pdc: 5457free_pdc:
5168 free_percpu(pmu->pmu_disable_count); 5458 free_percpu(pmu->pmu_disable_count);
5169 goto unlock; 5459 goto unlock;
@@ -5183,7 +5473,11 @@ void perf_pmu_unregister(struct pmu *pmu)
5183 synchronize_rcu(); 5473 synchronize_rcu();
5184 5474
5185 free_percpu(pmu->pmu_disable_count); 5475 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context); 5476 if (pmu->type >= PERF_TYPE_MAX)
5477 idr_remove(&pmu_idr, pmu->type);
5478 device_del(pmu->dev);
5479 put_device(pmu->dev);
5480 free_pmu_context(pmu);
5187} 5481}
5188 5482
5189struct pmu *perf_init_event(struct perf_event *event) 5483struct pmu *perf_init_event(struct perf_event *event)
@@ -5192,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5192 int idx; 5486 int idx;
5193 5487
5194 idx = srcu_read_lock(&pmus_srcu); 5488 idx = srcu_read_lock(&pmus_srcu);
5489
5490 rcu_read_lock();
5491 pmu = idr_find(&pmu_idr, event->attr.type);
5492 rcu_read_unlock();
5493 if (pmu)
5494 goto unlock;
5495
5195 list_for_each_entry_rcu(pmu, &pmus, entry) { 5496 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event); 5497 int ret = pmu->event_init(event);
5197 if (!ret) 5498 if (!ret)
@@ -5651,12 +5952,18 @@ SYSCALL_DEFINE5(perf_event_open,
5651 mutex_unlock(&ctx->mutex); 5952 mutex_unlock(&ctx->mutex);
5652 5953
5653 event->owner = current; 5954 event->owner = current;
5654 get_task_struct(current); 5955
5655 mutex_lock(&current->perf_event_mutex); 5956 mutex_lock(&current->perf_event_mutex);
5656 list_add_tail(&event->owner_entry, &current->perf_event_list); 5957 list_add_tail(&event->owner_entry, &current->perf_event_list);
5657 mutex_unlock(&current->perf_event_mutex); 5958 mutex_unlock(&current->perf_event_mutex);
5658 5959
5659 /* 5960 /*
5961 * Precalculate sample_data sizes
5962 */
5963 perf_event__header_size(event);
5964 perf_event__id_header_size(event);
5965
5966 /*
5660 * Drop the reference on the group_event after placing the 5967 * Drop the reference on the group_event after placing the
5661 * new event on the sibling_list. This ensures destruction 5968 * new event on the sibling_list. This ensures destruction
5662 * of the group leader will find the pointer to itself in 5969 * of the group leader will find the pointer to itself in
@@ -5719,12 +6026,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5719 ++ctx->generation; 6026 ++ctx->generation;
5720 mutex_unlock(&ctx->mutex); 6027 mutex_unlock(&ctx->mutex);
5721 6028
5722 event->owner = current;
5723 get_task_struct(current);
5724 mutex_lock(&current->perf_event_mutex);
5725 list_add_tail(&event->owner_entry, &current->perf_event_list);
5726 mutex_unlock(&current->perf_event_mutex);
5727
5728 return event; 6029 return event;
5729 6030
5730err_free: 6031err_free:
@@ -5875,8 +6176,24 @@ again:
5875 */ 6176 */
5876void perf_event_exit_task(struct task_struct *child) 6177void perf_event_exit_task(struct task_struct *child)
5877{ 6178{
6179 struct perf_event *event, *tmp;
5878 int ctxn; 6180 int ctxn;
5879 6181
6182 mutex_lock(&child->perf_event_mutex);
6183 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6184 owner_entry) {
6185 list_del_init(&event->owner_entry);
6186
6187 /*
6188 * Ensure the list deletion is visible before we clear
6189 * the owner; this closes a race against perf_release(), where
6190 * we need to serialize on the owner->perf_event_mutex.
6191 */
6192 smp_wmb();
6193 event->owner = NULL;
6194 }
6195 mutex_unlock(&child->perf_event_mutex);
6196
5880 for_each_task_context_nr(ctxn) 6197 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn); 6198 perf_event_exit_task_context(child, ctxn);
5882} 6199}
@@ -5999,6 +6316,12 @@ inherit_event(struct perf_event *parent_event,
5999 child_event->overflow_handler = parent_event->overflow_handler; 6316 child_event->overflow_handler = parent_event->overflow_handler;
6000 6317
6001 /* 6318 /*
6319 * Precalculate sample_data sizes
6320 */
6321 perf_event__header_size(child_event);
6322 perf_event__id_header_size(child_event);
6323
6324 /*
6002 * Link it up in the child's context: 6325 * Link it up in the child's context:
6003 */ 6326 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6327 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6096,6 +6419,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6096 struct perf_event *event; 6419 struct perf_event *event;
6097 struct task_struct *parent = current; 6420 struct task_struct *parent = current;
6098 int inherited_all = 1; 6421 int inherited_all = 1;
6422 unsigned long flags;
6099 int ret = 0; 6423 int ret = 0;
6100 6424
6101 child->perf_event_ctxp[ctxn] = NULL; 6425 child->perf_event_ctxp[ctxn] = NULL;
@@ -6136,6 +6460,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6136 break; 6460 break;
6137 } 6461 }
6138 6462
6463 /*
6464 * We can't hold ctx->lock when iterating the ->flexible_group list due
6465 * to allocations, but we need to prevent rotation because
6466 * rotate_ctx() will change the list from interrupt context.
6467 */
6468 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6469 parent_ctx->rotate_disable = 1;
6470 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6471
6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6472 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6140 ret = inherit_task_group(event, parent, parent_ctx, 6473 ret = inherit_task_group(event, parent, parent_ctx,
6141 child, ctxn, &inherited_all); 6474 child, ctxn, &inherited_all);
@@ -6143,6 +6476,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6143 break; 6476 break;
6144 } 6477 }
6145 6478
6479 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6480 parent_ctx->rotate_disable = 0;
6481 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6482
6146 child_ctx = child->perf_event_ctxp[ctxn]; 6483 child_ctx = child->perf_event_ctxp[ctxn];
6147 6484
6148 if (child_ctx && inherited_all) { 6485 if (child_ctx && inherited_all) {
@@ -6215,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6215 mutex_unlock(&swhash->hlist_mutex); 6552 mutex_unlock(&swhash->hlist_mutex);
6216} 6553}
6217 6554
6218#ifdef CONFIG_HOTPLUG_CPU 6555#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6219static void perf_pmu_rotate_stop(struct pmu *pmu) 6556static void perf_pmu_rotate_stop(struct pmu *pmu)
6220{ 6557{
6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6558 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6269,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu)
6269static inline void perf_event_exit_cpu(int cpu) { } 6606static inline void perf_event_exit_cpu(int cpu) { }
6270#endif 6607#endif
6271 6608
6609static int
6610perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6611{
6612 int cpu;
6613
6614 for_each_online_cpu(cpu)
6615 perf_event_exit_cpu(cpu);
6616
6617 return NOTIFY_OK;
6618}
6619
6620/*
6621 * Run the perf reboot notifier at the very last possible moment so that
6622 * the generic watchdog code runs as long as possible.
6623 */
6624static struct notifier_block perf_reboot_notifier = {
6625 .notifier_call = perf_reboot,
6626 .priority = INT_MIN,
6627};
6628
6272static int __cpuinit 6629static int __cpuinit
6273perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6630perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6274{ 6631{
@@ -6295,11 +6652,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6295 6652
6296void __init perf_event_init(void) 6653void __init perf_event_init(void)
6297{ 6654{
6655 int ret;
6656
6657 idr_init(&pmu_idr);
6658
6298 perf_event_init_all_cpus(); 6659 perf_event_init_all_cpus();
6299 init_srcu_struct(&pmus_srcu); 6660 init_srcu_struct(&pmus_srcu);
6300 perf_pmu_register(&perf_swevent); 6661 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6301 perf_pmu_register(&perf_cpu_clock); 6662 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6302 perf_pmu_register(&perf_task_clock); 6663 perf_pmu_register(&perf_task_clock, NULL, -1);
6303 perf_tp_register(); 6664 perf_tp_register();
6304 perf_cpu_notifier(perf_cpu_notify); 6665 perf_cpu_notifier(perf_cpu_notify);
6666 register_reboot_notifier(&perf_reboot_notifier);
6667
6668 ret = init_hw_breakpoint();
6669 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6670}
6671
6672static int __init perf_event_sysfs_init(void)
6673{
6674 struct pmu *pmu;
6675 int ret;
6676
6677 mutex_lock(&pmus_lock);
6678
6679 ret = bus_register(&pmu_bus);
6680 if (ret)
6681 goto unlock;
6682
6683 list_for_each_entry(pmu, &pmus, entry) {
6684 if (!pmu->name || pmu->type < 0)
6685 continue;
6686
6687 ret = pmu_dev_alloc(pmu);
6688 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6689 }
6690 pmu_bus_running = 1;
6691 ret = 0;
6692
6693unlock:
6694 mutex_unlock(&pmus_lock);
6695
6696 return ret;
6305} 6697}
6698device_initcall(perf_event_sysfs_init);
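
Note: with the new "event_source" bus, every named PMU appears under /sys/bus/event_source/devices/<name> with a "type" attribute, and that value is what user space puts into perf_event_attr.type to target a dynamically registered PMU. A sketch of that lookup from user space; the sysfs path is the one added here, the config value is a placeholder and error handling is minimal:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                    int cpu, int group_fd, unsigned long flags)
    {
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
        struct perf_event_attr attr;
        char path[128];
        FILE *f;
        int type, fd;

        /* Read the (possibly dynamic) type id exported for this PMU. */
        snprintf(path, sizeof(path),
                 "/sys/bus/event_source/devices/%s/type", "tracepoint");
        f = fopen(path, "r");
        if (!f || fscanf(f, "%d", &type) != 1)
            return 1;
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;       /* PERF_TYPE_TRACEPOINT for this PMU */
        attr.config = 1;        /* PMU-specific encoding (placeholder) */

        fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
        printf("perf_event_open: %d\n", fd);
        if (fd >= 0)
            close(fd);
        return 0;
    }
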
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index c7a8f453919e..aeaa7f846821 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
121 121
122 switch (o->type) { 122 switch (o->type) {
123 case PM_QOS_MIN: 123 case PM_QOS_MIN:
124 return plist_last(&o->requests)->prio; 124 return plist_first(&o->requests)->prio;
125 125
126 case PM_QOS_MAX: 126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio; 127 return plist_last(&o->requests)->prio;
128 128
129 default: 129 default:
130 /* runtime check for not using enum */ 130 /* runtime check for not using enum */
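
Note: the fix hinges on plist ordering: a plist keeps its nodes sorted by ascending prio, so plist_first() yields the smallest value and plist_last() the largest; the old code had the two inverted for PM_QOS_MIN/PM_QOS_MAX. A trivial illustration with a sorted array standing in for the plist:

    #include <stdio.h>

    int main(void)
    {
        /* A plist keeps its entries sorted by ascending prio. */
        int prio[] = { 10, 25, 40, 100 };
        int n = sizeof(prio) / sizeof(prio[0]);

        printf("PM_QOS_MIN -> first entry: %d\n", prio[0]);     /* smallest */
        printf("PM_QOS_MAX -> last entry:  %d\n", prio[n - 1]); /* largest  */
        return 0;
    }
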
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 390
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 391 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 392
393 read_lock(&tasklist_lock); 393 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 395 if (pid == 0) {
396 p = current; 396 p = current;
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 404 p = current->group_leader;
405 } else { 405 } else {
406 p = find_task_by_vpid(pid); 406 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 407 if (p && !has_group_leader_pid(p))
408 p = NULL; 408 p = NULL;
409 } 409 }
410 } 410 }
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 414 } else {
415 ret = -EINVAL; 415 ret = -EINVAL;
416 } 416 }
417 read_unlock(&tasklist_lock); 417 rcu_read_unlock();
418 418
419 return ret; 419 return ret;
420} 420}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
@@ -619,7 +625,7 @@ out:
619 * the find to the timer lock. To avoid a deadlock, the timer id MUST 625 * the find to the timer lock. To avoid a deadlock, the timer id MUST
620 * be released without holding the timer lock. 626 * be released without holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
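
Note: the lock_timer() wrapper exists for sparse: __cond_lock() tells the checker that it_lock is held only when __lock_timer() returns a timer, so callers that unlock on the success path no longer trigger context-imbalance warnings. A compile-only userspace sketch of the same idiom; in the kernel __cond_lock() comes from <linux/compiler.h> (it is just (c) for a normal compiler), and everything else here is local to the example:

    #include <pthread.h>
    #include <stdio.h>

    /* Under sparse this would expand to an annotation marking the lock
     * as acquired when the condition is non-zero; plain compilers see
     * only the condition itself. */
    #define __cond_lock(x, c)   (c)

    struct timer {
        pthread_mutex_t it_lock;
        int id;
    };

    static struct timer timers[2] = {
        { PTHREAD_MUTEX_INITIALIZER, 0 },
        { PTHREAD_MUTEX_INITIALIZER, 1 },
    };

    /* Returns the timer locked on success, NULL (nothing held) on failure. */
    static struct timer *__lock_timer(int id)
    {
        if (id < 0 || id > 1)
            return NULL;
        pthread_mutex_lock(&timers[id].it_lock);
        return &timers[id];
    }

    /* The wrapper ties "non-NULL return" to "it_lock acquired". */
    #define lock_timer(id)                                              \
    ({  struct timer *__timr;                                           \
        __cond_lock(&__timr->it_lock, __timr = __lock_timer(id));      \
        __timr;                                                         \
    })

    int main(void)
    {
        struct timer *t = lock_timer(1);

        if (t) {
            printf("locked timer %d\n", t->id);
            pthread_mutex_unlock(&t->it_lock);
        }
        return 0;
    }
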
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 29bff6117abc..a5aff3ebad38 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -246,9 +246,13 @@ config PM_OPS
246 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
247 default y 247 default y
248 248
249config ARCH_HAS_OPP
250 bool
251
249config PM_OPP 252config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library" 253 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM 254 depends on PM
255 depends on ARCH_HAS_OPP
252 ---help--- 256 ---help---
253 SOCs have a standard set of tuples consisting of frequency and 257 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This 258 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..048d0b514831 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -327,7 +327,6 @@ static int create_image(int platform_mode)
327int hibernation_snapshot(int platform_mode) 327int hibernation_snapshot(int platform_mode)
328{ 328{
329 int error; 329 int error;
330 gfp_t saved_mask;
331 330
332 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
333 if (error) 332 if (error)
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
339 goto Close; 338 goto Close;
340 339
341 suspend_console(); 340 suspend_console();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 pm_restrict_gfp_mask();
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
345 goto Recover_platform; 344 goto Recover_platform;
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode)
348 goto Recover_platform; 347 goto Recover_platform;
349 348
350 error = create_image(platform_mode); 349 error = create_image(platform_mode);
351 /* Control returns here after successful restore */ 350 /*
351 * Control returns here (1) after the image has been created or the
352 * image creation has failed and (2) after a successful restore.
353 */
352 354
353 Resume_devices: 355 Resume_devices:
354 /* We may need to release the preallocated image pages here. */ 356 /* We may need to release the preallocated image pages here. */
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode)
357 359
358 dpm_resume_end(in_suspend ? 360 dpm_resume_end(in_suspend ?
359 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
360 set_gfp_allowed_mask(saved_mask); 362
363 if (error || !in_suspend)
364 pm_restore_gfp_mask();
365
361 resume_console(); 366 resume_console();
362 Close: 367 Close:
363 platform_end(platform_mode); 368 platform_end(platform_mode);
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode)
452int hibernation_restore(int platform_mode) 457int hibernation_restore(int platform_mode)
453{ 458{
454 int error; 459 int error;
455 gfp_t saved_mask;
456 460
457 pm_prepare_console(); 461 pm_prepare_console();
458 suspend_console(); 462 suspend_console();
459 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 463 pm_restrict_gfp_mask();
460 error = dpm_suspend_start(PMSG_QUIESCE); 464 error = dpm_suspend_start(PMSG_QUIESCE);
461 if (!error) { 465 if (!error) {
462 error = resume_target_kernel(platform_mode); 466 error = resume_target_kernel(platform_mode);
463 dpm_resume_end(PMSG_RECOVER); 467 dpm_resume_end(PMSG_RECOVER);
464 } 468 }
465 set_gfp_allowed_mask(saved_mask); 469 pm_restore_gfp_mask();
466 resume_console(); 470 resume_console();
467 pm_restore_console(); 471 pm_restore_console();
468 return error; 472 return error;
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode)
476int hibernation_platform_enter(void) 480int hibernation_platform_enter(void)
477{ 481{
478 int error; 482 int error;
479 gfp_t saved_mask;
480 483
481 if (!hibernation_ops) 484 if (!hibernation_ops)
482 return -ENOSYS; 485 return -ENOSYS;
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void)
492 495
493 entering_platform_hibernation = true; 496 entering_platform_hibernation = true;
494 suspend_console(); 497 suspend_console();
495 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
496 error = dpm_suspend_start(PMSG_HIBERNATE); 498 error = dpm_suspend_start(PMSG_HIBERNATE);
497 if (error) { 499 if (error) {
498 if (hibernation_ops->recover) 500 if (hibernation_ops->recover)
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void)
536 Resume_devices: 538 Resume_devices:
537 entering_platform_hibernation = false; 539 entering_platform_hibernation = false;
538 dpm_resume_end(PMSG_RESTORE); 540 dpm_resume_end(PMSG_RESTORE);
539 set_gfp_allowed_mask(saved_mask);
540 resume_console(); 541 resume_console();
541 542
542 Close: 543 Close:
@@ -646,6 +647,7 @@ int hibernate(void)
646 swsusp_free(); 647 swsusp_free();
647 if (!error) 648 if (!error)
648 power_down(); 649 power_down();
650 pm_restore_gfp_mask();
649 } else { 651 } else {
650 pr_debug("PM: Image restored successfully.\n"); 652 pr_debug("PM: Image restored successfully.\n");
651 } 653 }
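
The per-caller clear_gfp_allowed_mask()/set_gfp_allowed_mask() pair is replaced here by pm_restrict_gfp_mask()/pm_restore_gfp_mask(), which keep the saved mask in one place so the restore can happen in a different function than the restrict (as the hibernate() hunk above relies on). A minimal sketch of that shape, not the mm-side implementation, with the I/O+FS bit value assumed for illustration:

#define GFP_IOFS_BITS	0x50u		/* assumed placeholder for the __GFP_IO|__GFP_FS bits */

static unsigned int gfp_allowed_mask = ~0u;
static unsigned int saved_gfp_mask;

void pm_restrict_gfp_mask_sketch(void)
{
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~GFP_IOFS_BITS;	/* no I/O or FS allocations while suspended */
}

void pm_restore_gfp_mask_sketch(void)
{
	if (saved_gfp_mask) {			/* restore only if a restrict is still pending */
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}
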
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..031d5e3a6197 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <trace/events/power.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -197,18 +198,18 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 198int suspend_devices_and_enter(suspend_state_t state)
198{ 199{
199 int error; 200 int error;
200 gfp_t saved_mask;
201 201
202 if (!suspend_ops) 202 if (!suspend_ops)
203 return -ENOSYS; 203 return -ENOSYS;
204 204
205 trace_machine_suspend(state);
205 if (suspend_ops->begin) { 206 if (suspend_ops->begin) {
206 error = suspend_ops->begin(state); 207 error = suspend_ops->begin(state);
207 if (error) 208 if (error)
208 goto Close; 209 goto Close;
209 } 210 }
210 suspend_console(); 211 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 212 pm_restrict_gfp_mask();
212 suspend_test_start(); 213 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 215 if (error) {
@@ -225,11 +226,12 @@ int suspend_devices_and_enter(suspend_state_t state)
225 suspend_test_start(); 226 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask); 229 pm_restore_gfp_mask();
229 resume_console(); 230 resume_console();
230 Close: 231 Close:
231 if (suspend_ops->end) 232 if (suspend_ops->end)
232 suspend_ops->end(); 233 suspend_ops->end();
234 trace_machine_suspend(PWR_EVENT_EXIT);
233 return error; 235 return error;
234 236
235 Recover_platform: 237 Recover_platform:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..8c7e4832b9be 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -29,7 +30,7 @@
29 30
30#include "power.h" 31#include "power.h"
31 32
32#define HIBERNATE_SIG "LINHIB0001" 33#define HIBERNATE_SIG "S1SUSPEND"
33 34
34/* 35/*
35 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle,
753{ 754{
754 unsigned int m; 755 unsigned int m;
755 int error = 0; 756 int error = 0;
757 struct bio *bio;
756 struct timeval start; 758 struct timeval start;
757 struct timeval stop; 759 struct timeval stop;
758 unsigned nr_pages; 760 unsigned nr_pages;
759 size_t off, unc_len, cmp_len; 761 size_t i, off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
761 763
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
763 if (!page) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 766 if (!page[i]) {
765 return -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
766 } 774 }
767 775
768 unc = vmalloc(LZO_UNC_SIZE); 776 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) { 777 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page); 779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
772 return -ENOMEM; 783 return -ENOMEM;
773 } 784 }
774 785
775 cmp = vmalloc(LZO_CMP_SIZE); 786 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) { 787 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
778 vfree(unc); 790 vfree(unc);
779 free_page((unsigned long)page); 791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
780 return -ENOMEM; 794 return -ENOMEM;
781 } 795 }
782 796
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
787 if (!m) 801 if (!m)
788 m = 1; 802 m = 1;
789 nr_pages = 0; 803 nr_pages = 0;
804 bio = NULL;
790 do_gettimeofday(&start); 805 do_gettimeofday(&start);
791 806
792 error = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle,
794 goto out_finish; 809 goto out_finish;
795 810
796 for (;;) { 811 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */ 812 error = swap_read_page(handle, page[0], NULL); /* sync */
798 if (error) 813 if (error)
799 break; 814 break;
800 815
801 cmp_len = *(size_t *)page; 816 cmp_len = *(size_t *)page[0];
802 if (unlikely(!cmp_len || 817 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
806 break; 821 break;
807 } 822 }
808 823
809 memcpy(cmp, page, PAGE_SIZE); 824 for (off = PAGE_SIZE, i = 1;
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
811 error = swap_read_page(handle, page, NULL); /* sync */ 826 error = swap_read_page(handle, page[i], &bio);
812 if (error) 827 if (error)
813 goto out_finish; 828 goto out_finish;
829 }
814 830
815 memcpy(cmp + off, page, PAGE_SIZE); 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
816 } 838 }
817 839
818 unc_len = LZO_UNC_SIZE; 840 unc_len = LZO_UNC_SIZE;
@@ -857,7 +879,8 @@ out_finish:
857 879
858 vfree(cmp); 880 vfree(cmp);
859 vfree(unc); 881 vfree(unc);
860 free_page((unsigned long)page); 882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
861 884
862 return error; 885 return error;
863} 886}
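
The reworked loop above changes the I/O shape of image loading: instead of reading each compressed page synchronously into one bounce page, it queues all pages of a compressed block asynchronously and waits once for the whole chain before assembling the contiguous buffer. A compressed-down sketch of that shape, with read_page_async() and wait_for_all() standing in for swap_read_page() and hib_wait_on_bio_chain():

#include <string.h>

#define PG	4096
#define NPAGES	8	/* illustrative bound; the patch uses LZO_CMP_PAGES */

/* Stand-ins for the real I/O helpers. */
static int read_page_async(void *dst, void **chain) { (void)dst; (void)chain; return 0; }
static int wait_for_all(void **chain) { (void)chain; return 0; }

/* total_len covers the block header plus the compressed data. */
static int load_block(unsigned char *cmp, size_t total_len,
		      unsigned char *page[NPAGES])
{
	void *chain = NULL;
	size_t off;
	int i, error;

	/* page[0] was already read synchronously to learn the length. */
	for (off = PG, i = 1; off < total_len; off += PG, i++) {
		error = read_page_async(page[i], &chain);
		if (error)
			return error;
	}
	error = wait_for_all(&chain);		/* need all of the data now */
	if (error)
		return error;
	for (off = 0, i = 0; off < total_len; off += PG, i++)
		memcpy(cmp + off, page[i], PG);
	return 0;
}
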
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen)
139 thaw_processes(); 139 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 140 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 141 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 142 atomic_inc(&snapshot_device_available);
143 143
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 263 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 264 if (!data->frozen || data->ready)
265 break; 265 break;
266 pm_restore_gfp_mask();
266 thaw_processes(); 267 thaw_processes();
267 usermodehelper_enable(); 268 usermodehelper_enable();
268 data->frozen = 0; 269 data->frozen = 0;
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 276 error = -EPERM;
276 break; 277 break;
277 } 278 }
279 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 280 error = hibernation_snapshot(data->platform_support);
279 if (!error) 281 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 282 error = put_user(in_suspend, (int __user *)arg);
diff --git a/kernel/printk.c b/kernel/printk.c
index b2ebaee8c377..4642a5c439eb 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -43,12 +43,6 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44 44
45/* 45/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 46 * Architectures can override it:
53 */ 47 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -261,6 +255,12 @@ static inline void boot_delay_msec(void)
261} 255}
262#endif 256#endif
263 257
258#ifdef CONFIG_SECURITY_DMESG_RESTRICT
259int dmesg_restrict = 1;
260#else
261int dmesg_restrict;
262#endif
263
264int do_syslog(int type, char __user *buf, int len, bool from_file) 264int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 265{
266 unsigned i, j, limit, count; 266 unsigned i, j, limit, count;
@@ -268,7 +268,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 268 char c;
269 int error = 0; 269 int error = 0;
270 270
271 error = security_syslog(type, from_file); 271 /*
272 * If this is from /proc/kmsg we only do the capabilities checks
273 * at open time.
274 */
275 if (type == SYSLOG_ACTION_OPEN || !from_file) {
276 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
277 return -EPERM;
278 if ((type != SYSLOG_ACTION_READ_ALL &&
279 type != SYSLOG_ACTION_SIZE_BUFFER) &&
280 !capable(CAP_SYS_ADMIN))
281 return -EPERM;
282 }
283
284 error = security_syslog(type);
272 if (error) 285 if (error)
273 return error; 286 return error;
274 287
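
Restated, the policy added above is: callers coming through /proc/kmsg are only checked when the file is opened; with dmesg_restrict set, every action needs CAP_SYS_ADMIN; without it, only actions other than read-all and size-buffer do. A hedged standalone restatement of that decision, with capable() stubbed and names invented:

#include <stdbool.h>

enum syslog_action { ACT_OPEN, ACT_READ, ACT_READ_ALL, ACT_SIZE_BUFFER, ACT_CLEAR };

static bool capable_admin(void)
{
	return false;			/* stand-in for capable(CAP_SYS_ADMIN) */
}

/* Returns true when the caller may proceed, mirroring the checks above. */
static bool syslog_allowed(enum syslog_action type, bool from_proc_kmsg,
			   bool dmesg_restricted)
{
	if (type != ACT_OPEN && from_proc_kmsg)
		return true;		/* /proc/kmsg readers were checked at open time */
	if (dmesg_restricted && !capable_admin())
		return false;
	if (type != ACT_READ_ALL && type != ACT_SIZE_BUFFER && !capable_admin())
		return false;
	return true;
}
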
@@ -1055,21 +1068,23 @@ static DEFINE_PER_CPU(int, printk_pending);
1055 1068
1056void printk_tick(void) 1069void printk_tick(void)
1057{ 1070{
1058 if (__get_cpu_var(printk_pending)) { 1071 if (__this_cpu_read(printk_pending)) {
1059 __get_cpu_var(printk_pending) = 0; 1072 __this_cpu_write(printk_pending, 0);
1060 wake_up_interruptible(&log_wait); 1073 wake_up_interruptible(&log_wait);
1061 } 1074 }
1062} 1075}
1063 1076
1064int printk_needs_cpu(int cpu) 1077int printk_needs_cpu(int cpu)
1065{ 1078{
1066 return per_cpu(printk_pending, cpu); 1079 if (cpu_is_offline(cpu))
1080 printk_tick();
1081 return __this_cpu_read(printk_pending);
1067} 1082}
1068 1083
1069void wake_up_klogd(void) 1084void wake_up_klogd(void)
1070{ 1085{
1071 if (waitqueue_active(&log_wait)) 1086 if (waitqueue_active(&log_wait))
1072 __raw_get_cpu_var(printk_pending) = 1; 1087 this_cpu_write(printk_pending, 1);
1073} 1088}
1074 1089
1075/** 1090/**
@@ -1338,6 +1353,7 @@ void register_console(struct console *newcon)
1338 spin_unlock_irqrestore(&logbuf_lock, flags); 1353 spin_unlock_irqrestore(&logbuf_lock, flags);
1339 } 1354 }
1340 release_console_sem(); 1355 release_console_sem();
1356 console_sysfs_notify();
1341 1357
1342 /* 1358 /*
1343 * By unregistering the bootconsoles after we enable the real console 1359 * By unregistering the bootconsoles after we enable the real console
@@ -1396,6 +1412,7 @@ int unregister_console(struct console *console)
1396 console_drivers->flags |= CON_CONSDEV; 1412 console_drivers->flags |= CON_CONSDEV;
1397 1413
1398 release_console_sem(); 1414 release_console_sem();
1415 console_sysfs_notify();
1399 return res; 1416 return res;
1400} 1417}
1401EXPORT_SYMBOL(unregister_console); 1418EXPORT_SYMBOL(unregister_console);
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..034493724749 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
155 * Helper function for rcu_process_callbacks() that operates on the 140 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
156 * specified rcu_ctrlblk structure. 141 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
193 morework = rcu_boost();
194 local_irq_save(flags);
195 work = have_rcu_kthread_work;
196 have_rcu_kthread_work = morework;
197 local_irq_restore(flags);
198 if (work) {
199 rcu_process_callbacks(&rcu_sched_ctrlblk);
200 rcu_process_callbacks(&rcu_bh_ctrlblk);
201 rcu_preempt_process_callbacks();
202 }
203 schedule_timeout_interruptible(1); /* Leave CPU for others. */
204 }
205
206 return 0; /* Not reached, but needed to shut gcc up. */
207}
208
209/*
210 * Wake up rcu_kthread() to process callbacks now eligible for invocation
211 * or to boost readers.
212 */
213static void invoke_rcu_kthread(void)
214{
215 unsigned long flags;
216
217 local_irq_save(flags);
218 have_rcu_kthread_work = 1;
219 wake_up(&rcu_kthread_wq);
220 local_irq_restore(flags);
196} 221}
197 222
198/* 223/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 255 local_irq_save(flags);
231 *rcp->curtail = head; 256 *rcp->curtail = head;
232 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
258 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 259 local_irq_restore(flags);
234} 260}
235 261
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
282} 308}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 309EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310
285void __init rcu_init(void) 311/*
312 * Spawn the kthread that invokes RCU callbacks.
313 */
314static int __init rcu_spawn_kthreads(void)
286{ 315{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 316 struct sched_param sp;
317
318 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
319 sp.sched_priority = RCU_BOOST_PRIO;
320 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
321 return 0;
288} 322}
323early_initcall(rcu_spawn_kthreads);
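
rcu_kthread() above follows a common handoff shape: wakers set a work-pending flag and wake the wait queue, and the kthread snapshots and clears the flag before draining the callback lists. A userspace-flavoured sketch of that handoff, with pthread primitives standing in for the kernel wait queue and the per-flavor callback processing elided (this is not the kernel code):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static bool have_work;

static void invoke_worker(void)			/* analog of invoke_rcu_kthread() */
{
	pthread_mutex_lock(&lock);
	have_work = true;
	pthread_cond_signal(&wq);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)			/* analog of rcu_kthread() */
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!have_work)
			pthread_cond_wait(&wq, &lock);
		have_work = false;		/* snapshot-and-clear before processing */
		pthread_mutex_unlock(&lock);
		/* ... process the queued callbacks here ... */
	}
	return NULL;
}
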
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
188 * NULL instead if at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If readers are still blocking the
153 * then there is nothing more to do. 422 * current grace period, it cannot end yet, so leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
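
The RCU_TRACE() wrapper introduced at the top of this file is the usual trick for compiling statistics code out entirely when tracing is disabled: the statement argument simply disappears from the build. A self-contained illustration of the idiom, with the config symbol invented for the example:

#include <stdio.h>

#define MY_RCU_TRACE_ENABLED 1		/* stands in for CONFIG_RCU_TRACE */

#if MY_RCU_TRACE_ENABLED
#define MY_TRACE(stmt) stmt
#else
#define MY_TRACE(stmt)			/* statement vanishes entirely */
#endif

int main(void)
{
	long qlen = 0;

	MY_TRACE(qlen++);			/* counts only when tracing is built in */
	MY_TRACE(printf("qlen=%ld\n", qlen));
	return 0;
}
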
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
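
Both waiting loops in rcu_torture_boost() above compare times with expressions of the form "jiffies - target > ULONG_MAX / 2", which is an unsigned-wraparound way of asking whether the counter is still before the target, the same idea as the kernel's time_before(). A standalone check of that comparison:

#include <assert.h>
#include <limits.h>

static int before(unsigned long a, unsigned long b)
{
	return a - b > ULONG_MAX / 2;	/* true when a is (cyclically) earlier than b */
}

int main(void)
{
	assert(before(5, 10));
	assert(!before(10, 5));
	assert(before(ULONG_MAX - 2, 3));	/* still correct across counter wraparound */
	return 0;
}
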
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
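
The booster-management code above follows a common pattern for per-CPU helper kthreads: the task pointer is published and cleared under boost_mutex, but the blocking kthread_stop() is deliberately issued only after the mutex is dropped, because the booster itself takes that mutex while it runs. The userspace sketch below mirrors that shape with pthreads; all names (start_worker, stop_worker, slot_mutex) are illustrative and are not taken from the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NSLOTS 4

static pthread_mutex_t slot_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t workers[NSLOTS];
static int worker_live[NSLOTS];
static atomic_bool worker_stop[NSLOTS];
static long shared_ticks;		/* updated by workers under slot_mutex */

static void *worker_fn(void *arg)
{
	long slot = (long)arg;

	while (!atomic_load(&worker_stop[slot])) {
		/* Like rcu_torture_boost(), the worker takes the shared mutex. */
		pthread_mutex_lock(&slot_mutex);
		shared_ticks++;
		pthread_mutex_unlock(&slot_mutex);
		usleep(1000);
	}
	return NULL;
}

static int start_worker(long slot)
{
	int ret = 0;

	pthread_mutex_lock(&slot_mutex);
	if (!worker_live[slot]) {
		atomic_store(&worker_stop[slot], 0);
		ret = pthread_create(&workers[slot], NULL, worker_fn, (void *)slot);
		worker_live[slot] = (ret == 0);
	}
	pthread_mutex_unlock(&slot_mutex);
	return ret;
}

static void stop_worker(long slot)
{
	pthread_t t;
	int live;

	pthread_mutex_lock(&slot_mutex);
	live = worker_live[slot];
	t = workers[slot];
	worker_live[slot] = 0;
	atomic_store(&worker_stop[slot], 1);
	pthread_mutex_unlock(&slot_mutex);

	if (live)
		pthread_join(t, NULL);	/* join outside the mutex, as in the patch */
}

int main(void)
{
	start_worker(0);
	usleep(10000);
	stop_worker(0);
	printf("ticks=%ld\n", shared_ticks);
	return 0;
}

Joining while still holding slot_mutex could deadlock here for the same reason kthread_stop() under boost_mutex could in the patch: the worker may be blocked waiting for the very mutex the stopper holds.
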
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -367,8 +364,8 @@ void rcu_irq_exit(void)
367 WARN_ON_ONCE(rdtp->dynticks & 0x1); 364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
368 365
369 /* If the interrupt queued a callback, get out of dyntick mode. */ 366 /* If the interrupt queued a callback, get out of dyntick mode. */
370 if (__get_cpu_var(rcu_sched_data).nxtlist || 367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
371 __get_cpu_var(rcu_bh_data).nxtlist) 368 __this_cpu_read(rcu_bh_data.nxtlist))
372 set_need_resched(); 369 set_need_resched();
373} 370}
374 371
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
692 * missed some grace periods that others CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
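
The catch-up check added above compares grace-period counters with ULONG_CMP_LT() rather than a plain "<", so the test keeps working after the counters wrap. The standalone snippet below illustrates the modular-comparison idiom with a locally written helper; it is an illustration only, not a quote of the kernel macro, and it tolerates counters at most ULONG_MAX/2 apart.

#include <limits.h>
#include <stdio.h>

/* True iff counter a is "older" than b, tolerating wraparound. */
static int counter_before(unsigned long a, unsigned long b)
{
	return ULONG_MAX / 2 < a - b;
}

int main(void)
{
	unsigned long completed = ULONG_MAX - 1;	/* about to wrap */
	unsigned long gpnum = completed + 3;		/* wrapped around to 1 */

	printf("plain <   : %d\n", completed < gpnum);			/* 0: wrong */
	printf("wrap-safe : %d\n", counter_before(completed, gpnum));	/* 1: right */
	return 0;
}
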
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
1017 /* current DYING CPU is cleared in the cpu_online_mask */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
1036/* 1037/*
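
The splice in rcu_send_cbs_to_online() is O(1) because each callback queue carries both a head pointer and a pointer to the last ->next field, so appending an entire queue is two pointer assignments plus the length bookkeeping. A minimal userspace version of that idiom, with illustrative names, looks like this:

#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cb_queue {
	struct cb *head;
	struct cb **tail;	/* points at the last ->next (or at head) */
	long qlen;
};

static void queue_init(struct cb_queue *q)
{
	q->head = NULL;
	q->tail = &q->head;
	q->qlen = 0;
}

static void queue_add(struct cb_queue *q, struct cb *cb)
{
	cb->next = NULL;
	*q->tail = cb;
	q->tail = &cb->next;
	q->qlen++;
}

/* Move everything from src onto the end of dst, leaving src empty. */
static void queue_splice(struct cb_queue *dst, struct cb_queue *src)
{
	if (src->head == NULL)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	dst->qlen += src->qlen;
	queue_init(src);
}

int main(void)
{
	struct cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct cb_queue dying, online;
	struct cb *p;

	queue_init(&dying);
	queue_init(&online);
	queue_add(&online, &a);
	queue_add(&dying, &b);
	queue_add(&dying, &c);
	queue_splice(&online, &dying);	/* the dying queue's callbacks move over */
	for (p = online.head; p; p = p->next)
		printf("%d\n", p->id);	/* prints 1 2 3 */
	return 0;
}
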
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1462 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
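
The restructured overflow path keeps the existing hysteresis: the expensive work (re-checking grace-period state, possibly starting one, or forcing quiescent states) is attempted only when the callback queue has grown by more than qhimark entries since the last snapshot, and the snapshot is then refreshed. The sketch below isolates that rule; the threshold value and the names are assumptions chosen for illustration only.

#include <stdio.h>

#define QHIMARK 10000	/* assumed threshold, standing in for the qhimark parameter */

struct queue_state {
	long qlen;			/* callbacks currently queued */
	long qlen_last_check;		/* qlen when we last acted */
};

/* Returns 1 when the caller should try to push the grace period along. */
static int enqueue_one(struct queue_state *q)
{
	if (++q->qlen > q->qlen_last_check + QHIMARK) {
		q->qlen_last_check = q->qlen;	/* re-snapshot, as __call_rcu() does */
		return 1;
	}
	return 0;
}

int main(void)
{
	struct queue_state q = { 0, 0 };
	long i, kicks = 0;

	for (i = 0; i < 25000; i++)
		kicks += enqueue_one(&q);
	printf("kicks=%ld\n", kicks);	/* 2: around the 10001st and 20002nd callback */
	return 0;
}
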
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
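
With the leaf fanout now capped at 16, the level sizes follow directly from the RCU_FANOUT_1..4 products above. As a worked example under assumed values (NR_CPUS=4096, CONFIG_RCU_FANOUT=64), the standalone program below reproduces the arithmetic: three levels, 4 interior rcu_node structures, 256 leaf rcu_node structures, and NUM_RCU_NODES = 261.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

#define NR_CPUS		4096		/* assumption for the example */
#define RCU_FANOUT	64		/* assumption for the example */
#define FANOUT_LEAF	(RCU_FANOUT > 16 ? 16 : RCU_FANOUT)

int main(void)
{
	long f1 = FANOUT_LEAF;			/* 16   */
	long f2 = f1 * RCU_FANOUT;		/* 1024: one interior node's reach */
	long lvl1 = DIV_ROUND_UP(NR_CPUS, f2);	/* 4 interior nodes   */
	long lvl2 = DIV_ROUND_UP(NR_CPUS, f1);	/* 256 leaf rcu_nodes */

	printf("levels=3 root=1 lvl1=%ld lvl2=%ld covering %d cpus\n",
	       lvl1, lvl2, NR_CPUS);
	printf("NUM_RCU_NODES=%ld\n", 1 + lvl1 + lvl2);	/* 261 */
	return 0;
}
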
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from dying CPU to other online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
1039 * above condition is already met when the control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
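
The started/done counters described in the comment block above behave like the two halves of a ticket lock. A compressed userspace sketch of that control flow is shown below; do_heavy_op() stands in for try_stop_cpus(), plain signed-distance comparisons replace the wraparound-safe macros, and the fallback to synchronize_sched() is omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint started;	/* tickets handed out */
static atomic_uint done;	/* tickets whose work is complete */

static bool do_heavy_op(void)	/* stand-in for try_stop_cpus(); may fail */
{
	return true;
}

static void expedited(void)
{
	unsigned int firstsnap, snap, s;

	/* Take a ticket; a pass we complete covers everyone up to our snapshot. */
	firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

	while (!do_heavy_op()) {
		/* Did someone else complete a pass after we took our ticket? */
		s = atomic_load(&done);
		if ((int)(s - firstsnap) >= 0)
			return;			/* their pass covers us */
		/*
		 * Refresh the snapshot: anyone who took a ticket before this
		 * retry is covered by the pass we are about to attempt.
		 */
		snap = atomic_load(&started);
	}

	/* Publish completion, but never move "done" backwards. */
	do {
		s = atomic_load(&done);
		if ((int)(s - snap) >= 0)
			break;
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}

int main(void)
{
	expedited();
	printf("started=%u done=%u\n", atomic_load(&started), atomic_load(&done));
	return 0;
}
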
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
60static void *r_next(struct seq_file *m, void *v, loff_t *pos) 43static void *r_next(struct seq_file *m, void *v, loff_t *pos)
61{ 44{
62 struct resource *p = v; 45 struct resource *p = v;
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
375} 358}
376 359
360void __weak arch_remove_reservations(struct resource *avail)
361{
362}
363
377static resource_size_t simple_align_resource(void *data, 364static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail, 365 const struct resource *avail,
379 resource_size_t size, 366 resource_size_t size,
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
397} 384}
398 385
399/* 386/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment. 387 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
465/*
466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
468 */ 388 */
469static int find_resource(struct resource *root, struct resource *new, 389static int find_resource(struct resource *root, struct resource *new,
470 resource_size_t size, resource_size_t min, 390 resource_size_t size, resource_size_t min,
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new,
478 struct resource *this = root->child; 398 struct resource *this = root->child;
479 struct resource tmp = *new, avail, alloc; 399 struct resource tmp = *new, avail, alloc;
480 400
401 tmp.flags = new->flags;
481 tmp.start = root->start; 402 tmp.start = root->start;
482 /* 403 /*
483 * Skip past an allocated resource that starts at 0, since the 404 * Skip past an allocated resource that starts at 0, since the assignment
484 * assignment of this->start - 1 to tmp->end below would cause an 405 * of this->start - 1 to tmp->end below would cause an underflow.
485 * underflow.
486 */ 406 */
487 if (this && this->start == 0) { 407 if (this && this->start == 0) {
488 tmp.start = this->end + 1; 408 tmp.start = this->end + 1;
489 this = this->sibling; 409 this = this->sibling;
490 } 410 }
491 for (;;) { 411 for(;;) {
492 if (this) 412 if (this)
493 tmp.end = this->start - 1; 413 tmp.end = this->start - 1;
494 else 414 else
495 tmp.end = root->end; 415 tmp.end = root->end;
496 416
497 resource_clip(&tmp, min, max); 417 resource_clip(&tmp, min, max);
418 arch_remove_reservations(&tmp);
498 419
499 /* Check for overflow after ALIGN() */ 420 /* Check for overflow after ALIGN() */
500 avail = *new; 421 avail = *new;
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new,
509 return 0; 430 return 0;
510 } 431 }
511 } 432 }
512
513 if (!this) 433 if (!this)
514 break; 434 break;
515
516 tmp.start = this->end + 1; 435 tmp.start = this->end + 1;
517 this = this->sibling; 436 this = this->sibling;
518 } 437 }
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new,
545 alignf = simple_align_resource; 464 alignf = simple_align_resource;
546 465
547 write_lock(&resource_lock); 466 write_lock(&resource_lock);
548 if (resource_alloc_from_bottom) 467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
552 if (err >= 0 && __request_resource(root, new)) 468 if (err >= 0 && __request_resource(root, new))
553 err = -EBUSY; 469 err = -EBUSY;
554 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
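
With the top-down allocator removed, find_resource() is again a single bottom-up walk: for each gap between sorted children it clips the candidate window, lets the architecture drop reserved ranges, aligns the start, and checks whether the request fits. The userspace sketch below keeps only the gap walk and the alignment/overflow checks; the reserved ranges, request size, and power-of-two alignment are assumptions chosen for the example, and the arch hook is left out.

#include <stdint.h>
#include <stdio.h>

struct range {
	uint64_t start, end;		/* inclusive, like struct resource */
};

/* Reserved children of the root, sorted by start; none starts at 0 here. */
static const struct range busy[] = {
	{ 0x1000, 0x1fff },
	{ 0x3000, 0x3fff },
};
#define NBUSY (sizeof(busy) / sizeof(busy[0]))

static int find_gap(uint64_t root_start, uint64_t root_end,
		    uint64_t size, uint64_t align, struct range *out)
{
	uint64_t start = root_start;
	size_t i;

	for (i = 0; ; i++) {
		uint64_t end = (i < NBUSY) ? busy[i].start - 1 : root_end;
		uint64_t s = (start + align - 1) & ~(align - 1);	/* ALIGN() */

		/* s >= start guards against overflow after aligning. */
		if (s >= start && s + size - 1 <= end) {
			out->start = s;
			out->end = s + size - 1;
			return 0;
		}
		if (i >= NBUSY)
			return -1;		/* -EBUSY in the kernel */
		start = busy[i].end + 1;	/* next gap begins past this child */
	}
}

int main(void)
{
	struct range r;

	if (!find_gap(0x0, 0xffff, 0x1800, 0x1000, &r))
		printf("allocated [%#llx-%#llx]\n",	/* [0x4000-0x57ff] */
		       (unsigned long long)r.start, (unsigned long long)r.end);
	return 0;
}
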
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..a0eb0941fa84 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in system belong to this group at bootup. 303 * Every task in system belong to this group at bootup.
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -560,18 +561,8 @@ struct rq {
560 561
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 562static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 563
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 564
568 /* 565static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 566
576static inline int cpu_of(struct rq *rq) 567static inline int cpu_of(struct rq *rq)
577{ 568{
@@ -615,11 +606,14 @@ static inline int cpu_of(struct rq *rq)
615 */ 606 */
616static inline struct task_group *task_group(struct task_struct *p) 607static inline struct task_group *task_group(struct task_struct *p)
617{ 608{
609 struct task_group *tg;
618 struct cgroup_subsys_state *css; 610 struct cgroup_subsys_state *css;
619 611
620 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
621 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
622 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
623} 617}
624 618
625/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -646,22 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p)
646 640
647#endif /* CONFIG_CGROUP_SCHED */ 641#endif /* CONFIG_CGROUP_SCHED */
648 642
649static u64 irq_time_cpu(int cpu); 643static void update_rq_clock_task(struct rq *rq, s64 delta);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651 644
652inline void update_rq_clock(struct rq *rq) 645static void update_rq_clock(struct rq *rq)
653{ 646{
654 if (!rq->skip_clock_update) { 647 s64 delta;
655 int cpu = cpu_of(rq);
656 u64 irq_time;
657 648
658 rq->clock = sched_clock_cpu(cpu); 649 if (rq->skip_clock_update)
659 irq_time = irq_time_cpu(cpu); 650 return;
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662 651
663 sched_irq_time_avg_update(rq, irq_time); 652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
664 } 653 rq->clock += delta;
654 update_rq_clock_task(rq, delta);
665} 655}
666 656
667/* 657/*
@@ -751,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
751 buf[cnt] = 0; 741 buf[cnt] = 0;
752 cmp = strstrip(buf); 742 cmp = strstrip(buf);
753 743
754 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
755 neg = 1; 745 neg = 1;
756 cmp += 3; 746 cmp += 3;
757 } 747 }
@@ -807,20 +797,6 @@ late_initcall(sched_init_debug);
807const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 798
809/* 799/*
810 * ratelimit for updating the group shares.
811 * default: 0.25ms
812 */
813unsigned int sysctl_sched_shares_ratelimit = 250000;
814unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816/*
817 * Inject some fuzzyness into changing the per-cpu group shares
818 * this avoids remote rq-locks at the expense of fairness.
819 * default: 4
820 */
821unsigned int sysctl_sched_shares_thresh = 4;
822
823/*
824 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
825 * in ms. 801 * in ms.
826 * 802 *
@@ -1369,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1370} 1346}
1371 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1372/* 1354/*
1373 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1374 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557 1539
1558#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1559 1541
1560static __read_mostly unsigned long __percpu *update_shares_data;
1561
1562static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564/*
1565 * Calculate and set the cpu's group shares.
1566 */
1567static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568 unsigned long sd_shares,
1569 unsigned long sd_rq_weight,
1570 unsigned long *usd_rq_weight)
1571{
1572 unsigned long shares, rq_weight;
1573 int boost = 0;
1574
1575 rq_weight = usd_rq_weight[cpu];
1576 if (!rq_weight) {
1577 boost = 1;
1578 rq_weight = NICE_0_LOAD;
1579 }
1580
1581 /*
1582 * \Sum_j shares_j * rq_weight_i
1583 * shares_i = -----------------------------
1584 * \Sum_j rq_weight_j
1585 */
1586 shares = (sd_shares * rq_weight) / sd_rq_weight;
1587 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589 if (abs(shares - tg->se[cpu]->load.weight) >
1590 sysctl_sched_shares_thresh) {
1591 struct rq *rq = cpu_rq(cpu);
1592 unsigned long flags;
1593
1594 raw_spin_lock_irqsave(&rq->lock, flags);
1595 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597 __set_se_shares(tg->se[cpu], shares);
1598 raw_spin_unlock_irqrestore(&rq->lock, flags);
1599 }
1600}
1601
1602/*
1603 * Re-compute the task group their per cpu shares over the given domain.
1604 * This needs to be done in a bottom-up fashion because the rq weight of a
1605 * parent group depends on the shares of its child groups.
1606 */
1607static int tg_shares_up(struct task_group *tg, void *data)
1608{
1609 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1610 unsigned long *usd_rq_weight;
1611 struct sched_domain *sd = data;
1612 unsigned long flags;
1613 int i;
1614
1615 if (!tg->se[0])
1616 return 0;
1617
1618 local_irq_save(flags);
1619 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1620
1621 for_each_cpu(i, sched_domain_span(sd)) {
1622 weight = tg->cfs_rq[i]->load.weight;
1623 usd_rq_weight[i] = weight;
1624
1625 rq_weight += weight;
1626 /*
1627 * If there are currently no tasks on the cpu pretend there
1628 * is one of average load so that when a new task gets to
1629 * run here it will not get delayed by group starvation.
1630 */
1631 if (!weight)
1632 weight = NICE_0_LOAD;
1633
1634 sum_weight += weight;
1635 shares += tg->cfs_rq[i]->shares;
1636 }
1637
1638 if (!rq_weight)
1639 rq_weight = sum_weight;
1640
1641 if ((!shares && rq_weight) || shares > tg->shares)
1642 shares = tg->shares;
1643
1644 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1645 shares = tg->shares;
1646
1647 for_each_cpu(i, sched_domain_span(sd))
1648 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1649
1650 local_irq_restore(flags);
1651
1652 return 0;
1653}
1654
1655/* 1542/*
1656 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1657 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1666,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1667 } else { 1554 } else {
1668 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1669 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1670 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671 } 1558 }
1672 1559
@@ -1675,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1675 return 0; 1562 return 0;
1676} 1563}
1677 1564
1678static void update_shares(struct sched_domain *sd)
1679{
1680 s64 elapsed;
1681 u64 now;
1682
1683 if (root_task_group_empty())
1684 return;
1685
1686 now = local_clock();
1687 elapsed = now - sd->last_update;
1688
1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1690 sd->last_update = now;
1691 walk_tg_tree(tg_nop, tg_shares_up, sd);
1692 }
1693}
1694
1695static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1696{ 1566{
1697 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1698} 1568}
1699 1569
1700#else
1701
1702static inline void update_shares(struct sched_domain *sd)
1703{
1704}
1705
1706#endif 1570#endif
1707 1571
1708#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
@@ -1824,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824 1688
1825#endif 1689#endif
1826 1690
1827#ifdef CONFIG_FAIR_GROUP_SCHED
1828static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829{
1830#ifdef CONFIG_SMP
1831 cfs_rq->shares = shares;
1832#endif
1833}
1834#endif
1835
1836static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1837static void update_sysctl(void); 1692static void update_sysctl(void);
1838static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -1934,10 +1789,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934 * They are read and saved off onto struct rq in update_rq_clock(). 1789 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can 1790 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old 1791 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of 1792 * or new value with a side effect of accounting a slice of irq time to wrong
1938 * accounting a slice of irq time to wrong task when irq is in progress 1793 * task when irq is in progress while we read rq->clock. That is a worthy
1939 * while we read rq->clock. That is a worthy compromise in place of having 1794 * compromise in place of having locks on each irq in account_system_time.
1940 * locks on each irq in account_system_time.
1941 */ 1795 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1796static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time); 1797static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1955,19 +1809,58 @@ void disable_sched_clock_irqtime(void)
1955 sched_clock_irqtime = 0; 1809 sched_clock_irqtime = 0;
1956} 1810}
1957 1811
1958static u64 irq_time_cpu(int cpu) 1812#ifndef CONFIG_64BIT
1813static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1814
1815static inline void irq_time_write_begin(void)
1959{ 1816{
1960 if (!sched_clock_irqtime) 1817 __this_cpu_inc(irq_time_seq.sequence);
1961 return 0; 1818 smp_wmb();
1819}
1820
1821static inline void irq_time_write_end(void)
1822{
1823 smp_wmb();
1824 __this_cpu_inc(irq_time_seq.sequence);
1825}
1826
1827static inline u64 irq_time_read(int cpu)
1828{
1829 u64 irq_time;
1830 unsigned seq;
1831
1832 do {
1833 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1834 irq_time = per_cpu(cpu_softirq_time, cpu) +
1835 per_cpu(cpu_hardirq_time, cpu);
1836 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1837
1838 return irq_time;
1839}
1840#else /* CONFIG_64BIT */
1841static inline void irq_time_write_begin(void)
1842{
1843}
1844
1845static inline void irq_time_write_end(void)
1846{
1847}
1962 1848
1849static inline u64 irq_time_read(int cpu)
1850{
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1851 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964} 1852}
1853#endif /* CONFIG_64BIT */
1965 1854
1855/*
1856 * Called before incrementing preempt_count on {soft,}irq_enter
1857 * and before decrementing preempt_count on {soft,}irq_exit.
1858 */
1966void account_system_vtime(struct task_struct *curr) 1859void account_system_vtime(struct task_struct *curr)
1967{ 1860{
1968 unsigned long flags; 1861 unsigned long flags;
1862 s64 delta;
1969 int cpu; 1863 int cpu;
1970 u64 now, delta;
1971 1864
1972 if (!sched_clock_irqtime) 1865 if (!sched_clock_irqtime)
1973 return; 1866 return;
@@ -1975,9 +1868,10 @@ void account_system_vtime(struct task_struct *curr)
1975 local_irq_save(flags); 1868 local_irq_save(flags);
1976 1869
1977 cpu = smp_processor_id(); 1870 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu); 1871 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1979 delta = now - per_cpu(irq_start_time, cpu); 1872 __this_cpu_add(irq_start_time, delta);
1980 per_cpu(irq_start_time, cpu) = now; 1873
1874 irq_time_write_begin();
1981 /* 1875 /*
1982 * We do not account for softirq time from ksoftirqd here. 1876 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread 1877 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1985,37 +1879,60 @@ void account_system_vtime(struct task_struct *curr)
1985 * that do not consume any time, but still wants to run. 1879 * that do not consume any time, but still wants to run.
1986 */ 1880 */
1987 if (hardirq_count()) 1881 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta; 1882 __this_cpu_add(cpu_hardirq_time, delta);
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta; 1884 __this_cpu_add(cpu_softirq_time, delta);
1991 1885
1886 irq_time_write_end();
1992 local_irq_restore(flags); 1887 local_irq_restore(flags);
1993} 1888}
1994EXPORT_SYMBOL_GPL(account_system_vtime); 1889EXPORT_SYMBOL_GPL(account_system_vtime);
1995 1890
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 1891static void update_rq_clock_task(struct rq *rq, s64 delta)
1997{ 1892{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 1893 s64 irq_delta;
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 1894
2000 rq->prev_irq_time = curr_irq_time; 1895 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2001 sched_rt_avg_update(rq, delta_irq); 1896
2002 } 1897 /*
1898 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1899 * this case when a previous update_rq_clock() happened inside a
1900 * {soft,}irq region.
1901 *
1902 * When this happens, we stop ->clock_task and only update the
1903 * prev_irq_time stamp to account for the part that fit, so that a next
1904 * update will consume the rest. This ensures ->clock_task is
1905 * monotonic.
1906 *
1907 * It does however cause some slight mis-attribution of {soft,}irq
1908 * time, a more accurate solution would be to update the irq_time using
1909 * the current rq->clock timestamp, except that would require using
1910 * atomic ops.
1911 */
1912 if (irq_delta > delta)
1913 irq_delta = delta;
1914
1915 rq->prev_irq_time += irq_delta;
1916 delta -= irq_delta;
1917 rq->clock_task += delta;
1918
1919 if (irq_delta && sched_feat(NONIRQ_POWER))
1920 sched_rt_avg_update(rq, irq_delta);
2003} 1921}
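The clamp above is what keeps rq->clock_task monotonic when the irq time sampled at the previous update ran ahead of the clock window being accounted now; the part that does not fit is carried over via prev_irq_time and consumed on the next update. A small user-space model of just that arithmetic, with made-up numbers and names (units are arbitrary, nothing here is kernel API):

#include <assert.h>
#include <stdint.h>

struct toy_rq {
	uint64_t clock_task;
	uint64_t prev_irq_time;
};

static void toy_update_clock_task(struct toy_rq *rq, uint64_t irq_time, uint64_t delta)
{
	uint64_t irq_delta = irq_time - rq->prev_irq_time;

	if (irq_delta > delta)			/* the clamp */
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;		/* consume only what fits in this window ... */
	rq->clock_task += delta - irq_delta;	/* ... so clock_task never moves backwards */
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	toy_update_clock_task(&rq, 140, 100);	/* irq time ran ahead of the window */
	assert(rq.clock_task == 0 && rq.prev_irq_time == 100);
	toy_update_clock_task(&rq, 140, 100);	/* the leftover 40 is consumed now */
	assert(rq.clock_task == 60 && rq.prev_irq_time == 140);
	return 0;
}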
2004 1922
2005#else 1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2006 1924
2007static u64 irq_time_cpu(int cpu) 1925static void update_rq_clock_task(struct rq *rq, s64 delta)
2008{ 1926{
2009 return 0; 1927 rq->clock_task += delta;
2010} 1928}
2011 1929
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 1930#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2013
2014#endif
2015 1931
2016#include "sched_idletask.c" 1932#include "sched_idletask.c"
2017#include "sched_fair.c" 1933#include "sched_fair.c"
2018#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
2019#include "sched_stoptask.c" 1936#include "sched_stoptask.c"
2020#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
2021# include "sched_debug.c" 1938# include "sched_debug.c"
@@ -2118,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2035 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2036}
2120 2037
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2039{
2040 const struct sched_class *class;
2041
2042 if (p->sched_class == rq->curr->sched_class) {
2043 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2044 } else {
2045 for_each_class(class) {
2046 if (class == rq->curr->sched_class)
2047 break;
2048 if (class == p->sched_class) {
2049 resched_task(rq->curr);
2050 break;
2051 }
2052 }
2053 }
2054
2055 /*
2056 * A queue event has occurred, and we're going to schedule. In
2057 * this case, we can save a useless back to back clock update.
2058 */
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1;
2061}
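The cross-class branch of the new helper leans on the fact that for_each_class() walks the scheduling classes in priority order (stop, rt, fair, idle in this kernel), so whichever of the two classes is reached first wins. A toy model of that decision, under the assumption of that fixed ordering (all names here are invented for illustration):

#include <assert.h>
#include <stdbool.h>

enum toy_class { TOY_STOP, TOY_RT, TOY_FAIR, TOY_IDLE };	/* for_each_class() order */

static bool toy_should_resched(enum toy_class curr, enum toy_class wakee)
{
	enum toy_class c;

	if (curr == wakee)
		return false;	/* same class: the class's own check_preempt_curr() decides */

	for (c = TOY_STOP; c <= TOY_IDLE; c++) {
		if (c == curr)
			return false;	/* current task's class is higher: no preemption */
		if (c == wakee)
			return true;	/* woken task's class is higher: resched */
	}
	return false;
}

int main(void)
{
	assert(toy_should_resched(TOY_FAIR, TOY_RT));	/* an RT wakeup preempts a CFS task */
	assert(!toy_should_resched(TOY_RT, TOY_FAIR));	/* a CFS wakeup never preempts RT */
	return 0;
}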
2062
2121#ifdef CONFIG_SMP 2063#ifdef CONFIG_SMP
2122/* 2064/*
2123 * Is this task likely cache-hot: 2065 * Is this task likely cache-hot:
@@ -2183,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2183 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2184 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2185 */ 2127 */
2186static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2187{ 2129{
2188 struct rq *rq = task_rq(p);
2189
2190 /* 2130 /*
2191 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2366,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2366 return dest_cpu; 2306 return dest_cpu;
2367 2307
2368 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2369 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2371 /* 2311 * Don't tell them about moving exiting tasks or
2372 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2373 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2374 * leave kernel. 2314 */
2375 */ 2315 if (p->mm && printk_ratelimit()) {
2376 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2377 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2378 "longer affine to cpu%d\n",
2379 task_pid_nr(p), p->comm, cpu);
2380 }
2381 } 2318 }
2382 2319
2383 return dest_cpu; 2320 return dest_cpu;
@@ -2713,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2713 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2714 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2715#endif 2652#endif
2653#ifdef CONFIG_SMP
2716 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2717 2656
2718 put_cpu(); 2657 put_cpu();
2719} 2658}
@@ -3104,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3104 return delta; 3043 return delta;
3105} 3044}
3106 3045
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 load += 1UL << (FSHIFT - 1);
3052 return load >> FSHIFT;
3053}
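calc_load() is a fixed-point exponential moving average; compared with the version removed further down in this diff, it also rounds to nearest instead of truncating. A worked step with the constants from <linux/sched.h> (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884): on an idle system that suddenly has three runnable tasks, one 5-second update moves the 1-minute average to 492/2048, i.e. a displayed loadavg of 0.24. The helper name below is made up for the demo:

#include <assert.h>
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884		/* 1/exp(5s/1min), in fixed point */

static unsigned long demo_calc_load(unsigned long load, unsigned long exp,
				    unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* the rounding term added by this patch */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun_1 = demo_calc_load(0, EXP_1, 3 * FIXED_1);

	assert(avenrun_1 == 492);
	printf("loadavg after one tick: %lu.%02lu\n",
	       avenrun_1 >> FSHIFT, (avenrun_1 & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}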
3054
3107#ifdef CONFIG_NO_HZ 3055#ifdef CONFIG_NO_HZ
3108/* 3056/*
3109 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3057 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3133,6 +3081,128 @@ static long calc_load_fold_idle(void)
3133 3081
3134 return delta; 3082 return delta;
3135} 3083}
3084
3085/**
3086 * fixed_power_int - compute: x^n, in O(log n) time
3087 *
3088 * @x: base of the power
3089 * @frac_bits: fractional bits of @x
3090 * @n: power to raise @x to.
3091 *
3092 * By exploiting the relation between the definition of the natural power
3093 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3094 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3095 * (where: n_i \elem {0, 1}, the binary vector representing n),
3096 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3097 * of course trivially computable in O(log_2 n), the length of our binary
3098 * vector.
3099 */
3100static unsigned long
3101fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3102{
3103 unsigned long result = 1UL << frac_bits;
3104
3105 if (n) for (;;) {
3106 if (n & 1) {
3107 result *= x;
3108 result += 1UL << (frac_bits - 1);
3109 result >>= frac_bits;
3110 }
3111 n >>= 1;
3112 if (!n)
3113 break;
3114 x *= x;
3115 x += 1UL << (frac_bits - 1);
3116 x >>= frac_bits;
3117 }
3118
3119 return result;
3120}
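fixed_power_int() is exponentiation by squaring carried out on FSHIFT-bit fixed-point values, rounding after every multiply, so x^n costs O(log n) multiplies instead of n. A compact standalone check of the same scheme; the helper names are made up, and EXP_1 = 1884 is the per-5-second decay factor of the 1-minute average:

#include <assert.h>
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884

static unsigned long fp_mul(unsigned long a, unsigned long b)
{
	return (a * b + (1UL << (FSHIFT - 1))) >> FSHIFT;	/* multiply with rounding */
}

static unsigned long fp_pow(unsigned long x, unsigned int n)
{
	unsigned long result = FIXED_1;		/* 1.0 in fixed point */

	while (n) {
		if (n & 1)
			result = fp_mul(result, x);
		n >>= 1;
		if (n)
			x = fp_mul(x, x);	/* square the base, halve the exponent */
	}
	return result;
}

int main(void)
{
	assert(fp_pow(EXP_1, 0) == FIXED_1);
	assert(fp_pow(EXP_1, 1) == EXP_1);
	assert(fp_pow(EXP_1, 2) == fp_mul(EXP_1, EXP_1));

	/* Twelve 5-second periods make one minute: the decay is roughly 1/e. */
	printf("EXP_1^12 = %lu (FIXED_1/e ~= 753)\n", fp_pow(EXP_1, 12));
	return 0;
}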
3121
3122/*
3123 * a1 = a0 * e + a * (1 - e)
3124 *
3125 * a2 = a1 * e + a * (1 - e)
3126 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3127 * = a0 * e^2 + a * (1 - e) * (1 + e)
3128 *
3129 * a3 = a2 * e + a * (1 - e)
3130 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3131 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3132 *
3133 * ...
3134 *
3135 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3136 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3137 * = a0 * e^n + a * (1 - e^n)
3138 *
3139 * [1] application of the geometric series:
3140 *
3141 *          n            1 - x^(n+1)
3142 *   S_n := \Sum x^i  =  -------------
3143 *          i=0              1 - x
3144 */
3145static unsigned long
3146calc_load_n(unsigned long load, unsigned long exp,
3147 unsigned long active, unsigned int n)
3148{
3149
3150 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3151}
3152
3153/*
3154 * NO_HZ can leave us missing all per-cpu ticks calling
3155 * calc_load_account_active(), but since an idle CPU folds its delta into
3156 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3157 * in the pending idle delta if our idle period crossed a load cycle boundary.
3158 *
3159 * Once we've updated the global active value, we need to apply the exponential
3160 * weights adjusted to the number of cycles missed.
3161 */
3162static void calc_global_nohz(unsigned long ticks)
3163{
3164 long delta, active, n;
3165
3166 if (time_before(jiffies, calc_load_update))
3167 return;
3168
3169 /*
3170 * If we crossed a calc_load_update boundary, make sure to fold
3171 * any pending idle changes, the respective CPUs might have
3172 * missed the tick driven calc_load_account_active() update
3173 * due to NO_HZ.
3174 */
3175 delta = calc_load_fold_idle();
3176 if (delta)
3177 atomic_long_add(delta, &calc_load_tasks);
3178
3179 /*
3180 * If we were idle for multiple load cycles, apply them.
3181 */
3182 if (ticks >= LOAD_FREQ) {
3183 n = ticks / LOAD_FREQ;
3184
3185 active = atomic_long_read(&calc_load_tasks);
3186 active = active > 0 ? active * FIXED_1 : 0;
3187
3188 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3189 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3190 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3191
3192 calc_load_update += n * LOAD_FREQ;
3193 }
3194
3195 /*
3196 * It's possible the remainder of the above division also crosses
3197 * a LOAD_FREQ period, the regular check in calc_global_load()
3198 * which comes after this will take care of that.
3199 *
3200 * Consider us being 11 ticks before a cycle completion, and us
3201 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3202 * age us 4 cycles, and the test in calc_global_load() will
3203 * pick up the final one.
3204 */
3205}
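calc_global_nohz() relies on the closed form derived in the comment block above: folding n missed periods in a single step gives the same result as applying the one-period update n times. A quick floating-point check of that algebra (the kernel does the same thing in fixed point through fixed_power_int(); build with -lm, all names below are illustrative):

#include <assert.h>
#include <math.h>
#include <stdio.h>

/* One EMA step: a' = a*e + target*(1 - e). */
static double step(double a, double e, double target)
{
	return a * e + target * (1.0 - e);
}

int main(void)
{
	double e = exp(-5.0 / 60.0);	/* per-5s decay of the 1-minute average */
	double a = 2.0, target = 0.0;	/* loadavg 2.00, then the CPU goes idle  */
	double iterated = a, folded;
	int i, n = 7;			/* asleep across 7 missed LOAD_FREQ periods */

	for (i = 0; i < n; i++)
		iterated = step(iterated, e, target);

	/* Closed form used by calc_load_n(): fold all n periods at once. */
	folded = a * pow(e, n) + target * (1.0 - pow(e, n));

	assert(fabs(iterated - folded) < 1e-9);
	printf("after %d idle periods: %.3f\n", n, folded);
	return 0;
}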
3136#else 3206#else
3137static void calc_load_account_idle(struct rq *this_rq) 3207static void calc_load_account_idle(struct rq *this_rq)
3138{ 3208{
@@ -3142,6 +3212,10 @@ static inline long calc_load_fold_idle(void)
3142{ 3212{
3143 return 0; 3213 return 0;
3144} 3214}
3215
3216static void calc_global_nohz(unsigned long ticks)
3217{
3218}
3145#endif 3219#endif
3146 3220
3147/** 3221/**
@@ -3159,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3159 loads[2] = (avenrun[2] + offset) << shift; 3233 loads[2] = (avenrun[2] + offset) << shift;
3160} 3234}
3161 3235
3162static unsigned long
3163calc_load(unsigned long load, unsigned long exp, unsigned long active)
3164{
3165 load *= exp;
3166 load += active * (FIXED_1 - exp);
3167 return load >> FSHIFT;
3168}
3169
3170/* 3236/*
3171 * calc_load - update the avenrun load estimates 10 ticks after the 3237 * calc_load - update the avenrun load estimates 10 ticks after the
3172 * CPUs have updated calc_load_tasks. 3238 * CPUs have updated calc_load_tasks.
3173 */ 3239 */
3174void calc_global_load(void) 3240void calc_global_load(unsigned long ticks)
3175{ 3241{
3176 unsigned long upd = calc_load_update + 10;
3177 long active; 3242 long active;
3178 3243
3179 if (time_before(jiffies, upd)) 3244 calc_global_nohz(ticks);
3245
3246 if (time_before(jiffies, calc_load_update + 10))
3180 return; 3247 return;
3181 3248
3182 active = atomic_long_read(&calc_load_tasks); 3249 active = atomic_long_read(&calc_load_tasks);
@@ -3349,7 +3416,7 @@ void sched_exec(void)
3349 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3350 */ 3417 */
3351 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3352 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3353 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3354 3421
3355 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -3830,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3830{ 3897{
3831 if (prev->se.on_rq) 3898 if (prev->se.on_rq)
3832 update_rq_clock(rq); 3899 update_rq_clock(rq);
3833 rq->skip_clock_update = 0;
3834 prev->sched_class->put_prev_task(rq, prev); 3900 prev->sched_class->put_prev_task(rq, prev);
3835} 3901}
3836 3902
@@ -3888,7 +3954,6 @@ need_resched_nonpreemptible:
3888 hrtick_clear(rq); 3954 hrtick_clear(rq);
3889 3955
3890 raw_spin_lock_irq(&rq->lock); 3956 raw_spin_lock_irq(&rq->lock);
3891 clear_tsk_need_resched(prev);
3892 3957
3893 switch_count = &prev->nivcsw; 3958 switch_count = &prev->nivcsw;
3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3959 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3920,6 +3985,8 @@ need_resched_nonpreemptible:
3920 3985
3921 put_prev_task(rq, prev); 3986 put_prev_task(rq, prev);
3922 next = pick_next_task(rq); 3987 next = pick_next_task(rq);
3988 clear_tsk_need_resched(prev);
3989 rq->skip_clock_update = 0;
3923 3990
3924 if (likely(prev != next)) { 3991 if (likely(prev != next)) {
3925 sched_info_switch(prev, next); 3992 sched_info_switch(prev, next);
@@ -4014,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4014 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
4015 return 0; 4082 return 0;
4016 4083
4017 cpu_relax(); 4084 arch_mutex_cpu_relax();
4018 } 4085 }
4019 4086
4020 return 1; 4087 return 1;
@@ -4326,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4326 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4327 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4328 */ 4395 */
4329unsigned long __sched 4396long __sched
4330wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4331 unsigned long timeout) 4398 unsigned long timeout)
4332{ 4399{
@@ -4359,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4359 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4360 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4361 */ 4428 */
4362unsigned long __sched 4429long __sched
4363wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4364 unsigned long timeout) 4431 unsigned long timeout)
4365{ 4432{
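Both timeout variants can return -ERESTARTSYS when the wait is interrupted, so the earlier unsigned long prototype left callers unable to distinguish that error from a huge remaining timeout once the result landed in a matching unsigned variable. A sketch of the intended caller pattern with the signed return type; my_wait() and its argument are illustrative names only:

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static int my_wait(struct completion *done)
{
	long ret = wait_for_completion_interruptible_timeout(done, HZ);

	if (ret < 0)		/* -ERESTARTSYS: interrupted by a signal */
		return ret;
	if (ret == 0)		/* timed out after one second */
		return -ETIMEDOUT;
	return 0;		/* completed; ret is the remaining jiffies */
}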
@@ -4701,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4701} 4768}
4702 4769
4703static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4704 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4705{ 4772{
4706 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4707 unsigned long flags; 4774 unsigned long flags;
@@ -4856,7 +4923,7 @@ recheck:
4856 * NOTE that the task may be already dead. 4923 * NOTE that the task may be already dead.
4857 */ 4924 */
4858int sched_setscheduler(struct task_struct *p, int policy, 4925int sched_setscheduler(struct task_struct *p, int policy,
4859 struct sched_param *param) 4926 const struct sched_param *param)
4860{ 4927{
4861 return __sched_setscheduler(p, policy, param, true); 4928 return __sched_setscheduler(p, policy, param, true);
4862} 4929}
@@ -4874,7 +4941,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 * but our caller might not have that capability. 4941 * but our caller might not have that capability.
4875 */ 4942 */
4876int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4943int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4877 struct sched_param *param) 4944 const struct sched_param *param)
4878{ 4945{
4879 return __sched_setscheduler(p, policy, param, false); 4946 return __sched_setscheduler(p, policy, param, false);
4880} 4947}
@@ -5390,7 +5457,7 @@ void sched_show_task(struct task_struct *p)
5390 unsigned state; 5457 unsigned state;
5391 5458
5392 state = p->state ? __ffs(p->state) + 1 : 0; 5459 state = p->state ? __ffs(p->state) + 1 : 0;
5393 printk(KERN_INFO "%-13.13s %c", p->comm, 5460 printk(KERN_INFO "%-15.15s %c", p->comm,
5394 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5461 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5395#if BITS_PER_LONG == 32 5462#if BITS_PER_LONG == 32
5396 if (state == TASK_RUNNING) 5463 if (state == TASK_RUNNING)
@@ -5554,7 +5621,6 @@ static void update_sysctl(void)
5554 SET_SYSCTL(sched_min_granularity); 5621 SET_SYSCTL(sched_min_granularity);
5555 SET_SYSCTL(sched_latency); 5622 SET_SYSCTL(sched_latency);
5556 SET_SYSCTL(sched_wakeup_granularity); 5623 SET_SYSCTL(sched_wakeup_granularity);
5557 SET_SYSCTL(sched_shares_ratelimit);
5558#undef SET_SYSCTL 5624#undef SET_SYSCTL
5559} 5625}
5560 5626
@@ -5630,7 +5696,7 @@ again:
5630 goto out; 5696 goto out;
5631 5697
5632 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5698 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5633 if (migrate_task(p, dest_cpu)) { 5699 if (migrate_task(p, rq)) {
5634 struct migration_arg arg = { p, dest_cpu }; 5700 struct migration_arg arg = { p, dest_cpu };
5635 /* Need help from migration thread: drop lock and wait. */ 5701 /* Need help from migration thread: drop lock and wait. */
5636 task_rq_unlock(rq, &flags); 5702 task_rq_unlock(rq, &flags);
@@ -5712,29 +5778,20 @@ static int migration_cpu_stop(void *data)
5712} 5778}
5713 5779
5714#ifdef CONFIG_HOTPLUG_CPU 5780#ifdef CONFIG_HOTPLUG_CPU
5781
5715/* 5782/*
5716 * Figure out where task on dead CPU should go, use force if necessary. 5783 * Ensures that the idle task is using init_mm right before its cpu goes
5784 * offline.
5717 */ 5785 */
5718void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5786void idle_task_exit(void)
5719{ 5787{
5720 struct rq *rq = cpu_rq(dead_cpu); 5788 struct mm_struct *mm = current->active_mm;
5721 int needs_cpu, uninitialized_var(dest_cpu);
5722 unsigned long flags;
5723 5789
5724 local_irq_save(flags); 5790 BUG_ON(cpu_online(smp_processor_id()));
5725 5791
5726 raw_spin_lock(&rq->lock); 5792 if (mm != &init_mm)
5727 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5793 switch_mm(mm, &init_mm, current);
5728 if (needs_cpu) 5794 mmdrop(mm);
5729 dest_cpu = select_fallback_rq(dead_cpu, p);
5730 raw_spin_unlock(&rq->lock);
5731 /*
5732 * It can only fail if we race with set_cpus_allowed(),
5733 * in the racer should migrate the task anyway.
5734 */
5735 if (needs_cpu)
5736 __migrate_task(p, dead_cpu, dest_cpu);
5737 local_irq_restore(flags);
5738} 5795}
5739 5796
5740/* 5797/*
@@ -5747,128 +5804,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5747static void migrate_nr_uninterruptible(struct rq *rq_src) 5804static void migrate_nr_uninterruptible(struct rq *rq_src)
5748{ 5805{
5749 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5806 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5750 unsigned long flags;
5751 5807
5752 local_irq_save(flags);
5753 double_rq_lock(rq_src, rq_dest);
5754 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5808 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5755 rq_src->nr_uninterruptible = 0; 5809 rq_src->nr_uninterruptible = 0;
5756 double_rq_unlock(rq_src, rq_dest);
5757 local_irq_restore(flags);
5758}
5759
5760/* Run through task list and migrate tasks from the dead cpu. */
5761static void migrate_live_tasks(int src_cpu)
5762{
5763 struct task_struct *p, *t;
5764
5765 read_lock(&tasklist_lock);
5766
5767 do_each_thread(t, p) {
5768 if (p == current)
5769 continue;
5770
5771 if (task_cpu(p) == src_cpu)
5772 move_task_off_dead_cpu(src_cpu, p);
5773 } while_each_thread(t, p);
5774
5775 read_unlock(&tasklist_lock);
5776} 5810}
5777 5811
5778/* 5812/*
5779 * Schedules idle task to be the next runnable task on current CPU. 5813 * remove the tasks which were accounted by rq from calc_load_tasks.
5780 * It does so by boosting its priority to highest possible.
5781 * Used by CPU offline code.
5782 */ 5814 */
5783void sched_idle_next(void) 5815static void calc_global_load_remove(struct rq *rq)
5784{ 5816{
5785 int this_cpu = smp_processor_id(); 5817 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5786 struct rq *rq = cpu_rq(this_cpu); 5818 rq->calc_load_active = 0;
5787 struct task_struct *p = rq->idle;
5788 unsigned long flags;
5789
5790 /* cpu has to be offline */
5791 BUG_ON(cpu_online(this_cpu));
5792
5793 /*
5794 * Strictly not necessary since rest of the CPUs are stopped by now
5795 * and interrupts disabled on the current cpu.
5796 */
5797 raw_spin_lock_irqsave(&rq->lock, flags);
5798
5799 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5800
5801 activate_task(rq, p, 0);
5802
5803 raw_spin_unlock_irqrestore(&rq->lock, flags);
5804} 5819}
5805 5820
5806/* 5821/*
5807 * Ensures that the idle task is using init_mm right before its cpu goes 5822 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5808 * offline. 5823 * try_to_wake_up()->select_task_rq().
5824 *
5825 * Called with rq->lock held even though we're in stop_machine() and
5826 * there's no concurrency possible, we hold the required locks anyway
5827 * because of lock validation efforts.
5809 */ 5828 */
5810void idle_task_exit(void) 5829static void migrate_tasks(unsigned int dead_cpu)
5811{
5812 struct mm_struct *mm = current->active_mm;
5813
5814 BUG_ON(cpu_online(smp_processor_id()));
5815
5816 if (mm != &init_mm)
5817 switch_mm(mm, &init_mm, current);
5818 mmdrop(mm);
5819}
5820
5821/* called under rq->lock with disabled interrupts */
5822static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5823{ 5830{
5824 struct rq *rq = cpu_rq(dead_cpu); 5831 struct rq *rq = cpu_rq(dead_cpu);
5825 5832 struct task_struct *next, *stop = rq->stop;
5826 /* Must be exiting, otherwise would be on tasklist. */ 5833 int dest_cpu;
5827 BUG_ON(!p->exit_state);
5828
5829 /* Cannot have done final schedule yet: would have vanished. */
5830 BUG_ON(p->state == TASK_DEAD);
5831
5832 get_task_struct(p);
5833 5834
5834 /* 5835 /*
5835 * Drop lock around migration; if someone else moves it, 5836 * Fudge the rq selection such that the below task selection loop
5836 * that's OK. No task can be added to this CPU, so iteration is 5837 * doesn't get stuck on the currently eligible stop task.
5837 * fine. 5838 *
5839 * We're currently inside stop_machine() and the rq is either stuck
5840 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5841 * either way we should never end up calling schedule() until we're
5842 * done here.
5838 */ 5843 */
5839 raw_spin_unlock_irq(&rq->lock); 5844 rq->stop = NULL;
5840 move_task_off_dead_cpu(dead_cpu, p);
5841 raw_spin_lock_irq(&rq->lock);
5842
5843 put_task_struct(p);
5844}
5845
5846/* release_task() removes task from tasklist, so we won't find dead tasks. */
5847static void migrate_dead_tasks(unsigned int dead_cpu)
5848{
5849 struct rq *rq = cpu_rq(dead_cpu);
5850 struct task_struct *next;
5851 5845
5852 for ( ; ; ) { 5846 for ( ; ; ) {
5853 if (!rq->nr_running) 5847 /*
5848 * There's this thread running, bail when that's the only
5849 * remaining thread.
5850 */
5851 if (rq->nr_running == 1)
5854 break; 5852 break;
5853
5855 next = pick_next_task(rq); 5854 next = pick_next_task(rq);
5856 if (!next) 5855 BUG_ON(!next);
5857 break;
5858 next->sched_class->put_prev_task(rq, next); 5856 next->sched_class->put_prev_task(rq, next);
5859 migrate_dead(dead_cpu, next);
5860 5857
5858 /* Find suitable destination for @next, with force if needed. */
5859 dest_cpu = select_fallback_rq(dead_cpu, next);
5860 raw_spin_unlock(&rq->lock);
5861
5862 __migrate_task(next, dead_cpu, dest_cpu);
5863
5864 raw_spin_lock(&rq->lock);
5861 } 5865 }
5862}
5863 5866
5864/* 5867 rq->stop = stop;
5865 * remove the tasks which were accounted by rq from calc_load_tasks.
5866 */
5867static void calc_global_load_remove(struct rq *rq)
5868{
5869 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5870 rq->calc_load_active = 0;
5871} 5868}
5869
5872#endif /* CONFIG_HOTPLUG_CPU */ 5870#endif /* CONFIG_HOTPLUG_CPU */
5873 5871
5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5872#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6076,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6078 unsigned long flags; 6076 unsigned long flags;
6079 struct rq *rq = cpu_rq(cpu); 6077 struct rq *rq = cpu_rq(cpu);
6080 6078
6081 switch (action) { 6079 switch (action & ~CPU_TASKS_FROZEN) {
6082 6080
6083 case CPU_UP_PREPARE: 6081 case CPU_UP_PREPARE:
6084 case CPU_UP_PREPARE_FROZEN:
6085 rq->calc_load_update = calc_load_update; 6082 rq->calc_load_update = calc_load_update;
6086 break; 6083 break;
6087 6084
6088 case CPU_ONLINE: 6085 case CPU_ONLINE:
6089 case CPU_ONLINE_FROZEN:
6090 /* Update our root-domain */ 6086 /* Update our root-domain */
6091 raw_spin_lock_irqsave(&rq->lock, flags); 6087 raw_spin_lock_irqsave(&rq->lock, flags);
6092 if (rq->rd) { 6088 if (rq->rd) {
@@ -6098,30 +6094,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6098 break; 6094 break;
6099 6095
6100#ifdef CONFIG_HOTPLUG_CPU 6096#ifdef CONFIG_HOTPLUG_CPU
6101 case CPU_DEAD:
6102 case CPU_DEAD_FROZEN:
6103 migrate_live_tasks(cpu);
6104 /* Idle task back to normal (off runqueue, low prio) */
6105 raw_spin_lock_irq(&rq->lock);
6106 deactivate_task(rq, rq->idle, 0);
6107 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6108 rq->idle->sched_class = &idle_sched_class;
6109 migrate_dead_tasks(cpu);
6110 raw_spin_unlock_irq(&rq->lock);
6111 migrate_nr_uninterruptible(rq);
6112 BUG_ON(rq->nr_running != 0);
6113 calc_global_load_remove(rq);
6114 break;
6115
6116 case CPU_DYING: 6097 case CPU_DYING:
6117 case CPU_DYING_FROZEN:
6118 /* Update our root-domain */ 6098 /* Update our root-domain */
6119 raw_spin_lock_irqsave(&rq->lock, flags); 6099 raw_spin_lock_irqsave(&rq->lock, flags);
6120 if (rq->rd) { 6100 if (rq->rd) {
6121 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6101 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6122 set_rq_offline(rq); 6102 set_rq_offline(rq);
6123 } 6103 }
6104 migrate_tasks(cpu);
6105 BUG_ON(rq->nr_running != 1); /* the migration thread */
6124 raw_spin_unlock_irqrestore(&rq->lock, flags); 6106 raw_spin_unlock_irqrestore(&rq->lock, flags);
6107
6108 migrate_nr_uninterruptible(rq);
6109 calc_global_load_remove(rq);
6125 break; 6110 break;
6126#endif 6111#endif
6127 } 6112 }
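The switch in migration_call() now masks off CPU_TASKS_FROZEN: every *_FROZEN notifier value is the corresponding base value with that bit set, so one case label covers both the regular hotplug path and the suspend/resume path. Sketch of the same pattern in an arbitrary notifier; demo_callback is a made-up name:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int demo_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:		/* also matches CPU_UP_PREPARE_FROZEN */
		/* set up per-cpu state here */
		break;
	case CPU_DEAD:			/* also matches CPU_DEAD_FROZEN */
		/* tear per-cpu state down here */
		break;
	}
	return NOTIFY_OK;
}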
@@ -6960,6 +6945,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6945 if (cpu != group_first_cpu(sd->groups))
6961 return; 6946 return;
6962 6947
6948 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6949
6963 child = sd->child; 6950 child = sd->child;
6964 6951
6965 sd->groups->cpu_power = 0; 6952 sd->groups->cpu_power = 0;
@@ -7850,18 +7837,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7850 7837
7851#ifdef CONFIG_FAIR_GROUP_SCHED 7838#ifdef CONFIG_FAIR_GROUP_SCHED
7852static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7839static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7853 struct sched_entity *se, int cpu, int add, 7840 struct sched_entity *se, int cpu,
7854 struct sched_entity *parent) 7841 struct sched_entity *parent)
7855{ 7842{
7856 struct rq *rq = cpu_rq(cpu); 7843 struct rq *rq = cpu_rq(cpu);
7857 tg->cfs_rq[cpu] = cfs_rq; 7844 tg->cfs_rq[cpu] = cfs_rq;
7858 init_cfs_rq(cfs_rq, rq); 7845 init_cfs_rq(cfs_rq, rq);
7859 cfs_rq->tg = tg; 7846 cfs_rq->tg = tg;
7860 if (add)
7861 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7862 7847
7863 tg->se[cpu] = se; 7848 tg->se[cpu] = se;
7864 /* se could be NULL for init_task_group */ 7849 /* se could be NULL for root_task_group */
7865 if (!se) 7850 if (!se)
7866 return; 7851 return;
7867 7852
@@ -7871,15 +7856,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7871 se->cfs_rq = parent->my_q; 7856 se->cfs_rq = parent->my_q;
7872 7857
7873 se->my_q = cfs_rq; 7858 se->my_q = cfs_rq;
7874 se->load.weight = tg->shares; 7859 update_load_set(&se->load, 0);
7875 se->load.inv_weight = 0;
7876 se->parent = parent; 7860 se->parent = parent;
7877} 7861}
7878#endif 7862#endif
7879 7863
7880#ifdef CONFIG_RT_GROUP_SCHED 7864#ifdef CONFIG_RT_GROUP_SCHED
7881static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7865static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7882 struct sched_rt_entity *rt_se, int cpu, int add, 7866 struct sched_rt_entity *rt_se, int cpu,
7883 struct sched_rt_entity *parent) 7867 struct sched_rt_entity *parent)
7884{ 7868{
7885 struct rq *rq = cpu_rq(cpu); 7869 struct rq *rq = cpu_rq(cpu);
@@ -7888,8 +7872,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7888 init_rt_rq(rt_rq, rq); 7872 init_rt_rq(rt_rq, rq);
7889 rt_rq->tg = tg; 7873 rt_rq->tg = tg;
7890 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7874 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7891 if (add)
7892 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7893 7875
7894 tg->rt_se[cpu] = rt_se; 7876 tg->rt_se[cpu] = rt_se;
7895 if (!rt_se) 7877 if (!rt_se)
@@ -7924,18 +7906,18 @@ void __init sched_init(void)
7924 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7906 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7925 7907
7926#ifdef CONFIG_FAIR_GROUP_SCHED 7908#ifdef CONFIG_FAIR_GROUP_SCHED
7927 init_task_group.se = (struct sched_entity **)ptr; 7909 root_task_group.se = (struct sched_entity **)ptr;
7928 ptr += nr_cpu_ids * sizeof(void **); 7910 ptr += nr_cpu_ids * sizeof(void **);
7929 7911
7930 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7912 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7931 ptr += nr_cpu_ids * sizeof(void **); 7913 ptr += nr_cpu_ids * sizeof(void **);
7932 7914
7933#endif /* CONFIG_FAIR_GROUP_SCHED */ 7915#endif /* CONFIG_FAIR_GROUP_SCHED */
7934#ifdef CONFIG_RT_GROUP_SCHED 7916#ifdef CONFIG_RT_GROUP_SCHED
7935 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7917 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7936 ptr += nr_cpu_ids * sizeof(void **); 7918 ptr += nr_cpu_ids * sizeof(void **);
7937 7919
7938 init_task_group.rt_rq = (struct rt_rq **)ptr; 7920 root_task_group.rt_rq = (struct rt_rq **)ptr;
7939 ptr += nr_cpu_ids * sizeof(void **); 7921 ptr += nr_cpu_ids * sizeof(void **);
7940 7922
7941#endif /* CONFIG_RT_GROUP_SCHED */ 7923#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7955,20 +7937,16 @@ void __init sched_init(void)
7955 global_rt_period(), global_rt_runtime()); 7937 global_rt_period(), global_rt_runtime());
7956 7938
7957#ifdef CONFIG_RT_GROUP_SCHED 7939#ifdef CONFIG_RT_GROUP_SCHED
7958 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7940 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7959 global_rt_period(), global_rt_runtime()); 7941 global_rt_period(), global_rt_runtime());
7960#endif /* CONFIG_RT_GROUP_SCHED */ 7942#endif /* CONFIG_RT_GROUP_SCHED */
7961 7943
7962#ifdef CONFIG_CGROUP_SCHED 7944#ifdef CONFIG_CGROUP_SCHED
7963 list_add(&init_task_group.list, &task_groups); 7945 list_add(&root_task_group.list, &task_groups);
7964 INIT_LIST_HEAD(&init_task_group.children); 7946 INIT_LIST_HEAD(&root_task_group.children);
7965 7947 autogroup_init(&init_task);
7966#endif /* CONFIG_CGROUP_SCHED */ 7948#endif /* CONFIG_CGROUP_SCHED */
7967 7949
7968#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7969 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7970 __alignof__(unsigned long));
7971#endif
7972 for_each_possible_cpu(i) { 7950 for_each_possible_cpu(i) {
7973 struct rq *rq; 7951 struct rq *rq;
7974 7952
@@ -7980,38 +7958,34 @@ void __init sched_init(void)
7980 init_cfs_rq(&rq->cfs, rq); 7958 init_cfs_rq(&rq->cfs, rq);
7981 init_rt_rq(&rq->rt, rq); 7959 init_rt_rq(&rq->rt, rq);
7982#ifdef CONFIG_FAIR_GROUP_SCHED 7960#ifdef CONFIG_FAIR_GROUP_SCHED
7983 init_task_group.shares = init_task_group_load; 7961 root_task_group.shares = root_task_group_load;
7984 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7962 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7985#ifdef CONFIG_CGROUP_SCHED
7986 /* 7963 /*
7987 * How much cpu bandwidth does init_task_group get? 7964 * How much cpu bandwidth does root_task_group get?
7988 * 7965 *
7989 * In case of task-groups formed thr' the cgroup filesystem, it 7966 * In case of task-groups formed thr' the cgroup filesystem, it
7990 * gets 100% of the cpu resources in the system. This overall 7967 * gets 100% of the cpu resources in the system. This overall
7991 * system cpu resource is divided among the tasks of 7968 * system cpu resource is divided among the tasks of
7992 * init_task_group and its child task-groups in a fair manner, 7969 * root_task_group and its child task-groups in a fair manner,
7993 * based on each entity's (task or task-group's) weight 7970 * based on each entity's (task or task-group's) weight
7994 * (se->load.weight). 7971 * (se->load.weight).
7995 * 7972 *
7996 * In other words, if init_task_group has 10 tasks of weight 7973 * In other words, if root_task_group has 10 tasks of weight
7997 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7974 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7998 * then A0's share of the cpu resource is: 7975 * then A0's share of the cpu resource is:
7999 * 7976 *
8000 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7977 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8001 * 7978 *
8002 * We achieve this by letting init_task_group's tasks sit 7979 * We achieve this by letting root_task_group's tasks sit
8003 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7980 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8004 */ 7981 */
8005 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7982 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8006#endif
8007#endif /* CONFIG_FAIR_GROUP_SCHED */ 7983#endif /* CONFIG_FAIR_GROUP_SCHED */
8008 7984
8009 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7985 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8010#ifdef CONFIG_RT_GROUP_SCHED 7986#ifdef CONFIG_RT_GROUP_SCHED
8011 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7987 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8012#ifdef CONFIG_CGROUP_SCHED 7988 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8013 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8014#endif
8015#endif 7989#endif
8016 7990
8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7991 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8091,8 +8065,6 @@ void __init sched_init(void)
8091 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8065 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8092#endif /* SMP */ 8066#endif /* SMP */
8093 8067
8094 perf_event_init();
8095
8096 scheduler_running = 1; 8068 scheduler_running = 1;
8097} 8069}
8098 8070
@@ -8286,7 +8258,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8286 if (!se) 8258 if (!se)
8287 goto err_free_rq; 8259 goto err_free_rq;
8288 8260
8289 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8261 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8290 } 8262 }
8291 8263
8292 return 1; 8264 return 1;
@@ -8297,15 +8269,21 @@ err:
8297 return 0; 8269 return 0;
8298} 8270}
8299 8271
8300static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8301{
8302 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8303 &cpu_rq(cpu)->leaf_cfs_rq_list);
8304}
8305
8306static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8272static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8307{ 8273{
8308 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8274 struct rq *rq = cpu_rq(cpu);
8275 unsigned long flags;
8276
8277 /*
8278 * Only empty task groups can be destroyed; so we can speculatively
8279 * check on_list without danger of it being re-added.
8280 */
8281 if (!tg->cfs_rq[cpu]->on_list)
8282 return;
8283
8284 raw_spin_lock_irqsave(&rq->lock, flags);
8285 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8286 raw_spin_unlock_irqrestore(&rq->lock, flags);
8309} 8287}
8310#else /* !CONFG_FAIR_GROUP_SCHED */ 8288#else /* !CONFG_FAIR_GROUP_SCHED */
8311static inline void free_fair_sched_group(struct task_group *tg) 8289static inline void free_fair_sched_group(struct task_group *tg)
@@ -8318,10 +8296,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8318 return 1; 8296 return 1;
8319} 8297}
8320 8298
8321static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8322{
8323}
8324
8325static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8299static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8326{ 8300{
8327} 8301}
@@ -8376,7 +8350,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8376 if (!rt_se) 8350 if (!rt_se)
8377 goto err_free_rq; 8351 goto err_free_rq;
8378 8352
8379 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8353 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8380 } 8354 }
8381 8355
8382 return 1; 8356 return 1;
@@ -8386,17 +8360,6 @@ err_free_rq:
8386err: 8360err:
8387 return 0; 8361 return 0;
8388} 8362}
8389
8390static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8391{
8392 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8393 &cpu_rq(cpu)->leaf_rt_rq_list);
8394}
8395
8396static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8397{
8398 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8399}
8400#else /* !CONFIG_RT_GROUP_SCHED */ 8363#else /* !CONFIG_RT_GROUP_SCHED */
8401static inline void free_rt_sched_group(struct task_group *tg) 8364static inline void free_rt_sched_group(struct task_group *tg)
8402{ 8365{
@@ -8407,14 +8370,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8407{ 8370{
8408 return 1; 8371 return 1;
8409} 8372}
8410
8411static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8412{
8413}
8414
8415static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8416{
8417}
8418#endif /* CONFIG_RT_GROUP_SCHED */ 8373#endif /* CONFIG_RT_GROUP_SCHED */
8419 8374
8420#ifdef CONFIG_CGROUP_SCHED 8375#ifdef CONFIG_CGROUP_SCHED
@@ -8422,6 +8377,7 @@ static void free_sched_group(struct task_group *tg)
8422{ 8377{
8423 free_fair_sched_group(tg); 8378 free_fair_sched_group(tg);
8424 free_rt_sched_group(tg); 8379 free_rt_sched_group(tg);
8380 autogroup_free(tg);
8425 kfree(tg); 8381 kfree(tg);
8426} 8382}
8427 8383
@@ -8430,7 +8386,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8430{ 8386{
8431 struct task_group *tg; 8387 struct task_group *tg;
8432 unsigned long flags; 8388 unsigned long flags;
8433 int i;
8434 8389
8435 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8390 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8436 if (!tg) 8391 if (!tg)
@@ -8443,10 +8398,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8443 goto err; 8398 goto err;
8444 8399
8445 spin_lock_irqsave(&task_group_lock, flags); 8400 spin_lock_irqsave(&task_group_lock, flags);
8446 for_each_possible_cpu(i) {
8447 register_fair_sched_group(tg, i);
8448 register_rt_sched_group(tg, i);
8449 }
8450 list_add_rcu(&tg->list, &task_groups); 8401 list_add_rcu(&tg->list, &task_groups);
8451 8402
8452 WARN_ON(!parent); /* root should already exist */ 8403 WARN_ON(!parent); /* root should already exist */
@@ -8476,11 +8427,11 @@ void sched_destroy_group(struct task_group *tg)
8476 unsigned long flags; 8427 unsigned long flags;
8477 int i; 8428 int i;
8478 8429
8479 spin_lock_irqsave(&task_group_lock, flags); 8430 /* end participation in shares distribution */
8480 for_each_possible_cpu(i) { 8431 for_each_possible_cpu(i)
8481 unregister_fair_sched_group(tg, i); 8432 unregister_fair_sched_group(tg, i);
8482 unregister_rt_sched_group(tg, i); 8433
8483 } 8434 spin_lock_irqsave(&task_group_lock, flags);
8484 list_del_rcu(&tg->list); 8435 list_del_rcu(&tg->list);
8485 list_del_rcu(&tg->siblings); 8436 list_del_rcu(&tg->siblings);
8486 spin_unlock_irqrestore(&task_group_lock, flags); 8437 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8527,33 +8478,6 @@ void sched_move_task(struct task_struct *tsk)
8527#endif /* CONFIG_CGROUP_SCHED */ 8478#endif /* CONFIG_CGROUP_SCHED */
8528 8479
8529#ifdef CONFIG_FAIR_GROUP_SCHED 8480#ifdef CONFIG_FAIR_GROUP_SCHED
8530static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8531{
8532 struct cfs_rq *cfs_rq = se->cfs_rq;
8533 int on_rq;
8534
8535 on_rq = se->on_rq;
8536 if (on_rq)
8537 dequeue_entity(cfs_rq, se, 0);
8538
8539 se->load.weight = shares;
8540 se->load.inv_weight = 0;
8541
8542 if (on_rq)
8543 enqueue_entity(cfs_rq, se, 0);
8544}
8545
8546static void set_se_shares(struct sched_entity *se, unsigned long shares)
8547{
8548 struct cfs_rq *cfs_rq = se->cfs_rq;
8549 struct rq *rq = cfs_rq->rq;
8550 unsigned long flags;
8551
8552 raw_spin_lock_irqsave(&rq->lock, flags);
8553 __set_se_shares(se, shares);
8554 raw_spin_unlock_irqrestore(&rq->lock, flags);
8555}
8556
8557static DEFINE_MUTEX(shares_mutex); 8481static DEFINE_MUTEX(shares_mutex);
8558 8482
8559int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8483int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8576,37 +8500,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8576 if (tg->shares == shares) 8500 if (tg->shares == shares)
8577 goto done; 8501 goto done;
8578 8502
8579 spin_lock_irqsave(&task_group_lock, flags);
8580 for_each_possible_cpu(i)
8581 unregister_fair_sched_group(tg, i);
8582 list_del_rcu(&tg->siblings);
8583 spin_unlock_irqrestore(&task_group_lock, flags);
8584
8585 /* wait for any ongoing reference to this group to finish */
8586 synchronize_sched();
8587
8588 /*
8589 * Now we are free to modify the group's share on each cpu
8590 * w/o tripping rebalance_share or load_balance_fair.
8591 */
8592 tg->shares = shares; 8503 tg->shares = shares;
8593 for_each_possible_cpu(i) { 8504 for_each_possible_cpu(i) {
8594 /* 8505 struct rq *rq = cpu_rq(i);
8595 * force a rebalance 8506 struct sched_entity *se;
8596 */ 8507
8597 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8508 se = tg->se[i];
8598 set_se_shares(tg->se[i], shares); 8509 /* Propagate contribution to hierarchy */
8510 raw_spin_lock_irqsave(&rq->lock, flags);
8511 for_each_sched_entity(se)
8512 update_cfs_shares(group_cfs_rq(se), 0);
8513 raw_spin_unlock_irqrestore(&rq->lock, flags);
8599 } 8514 }
8600 8515
8601 /*
8602 * Enable load balance activity on this group, by inserting it back on
8603 * each cpu's rq->leaf_cfs_rq_list.
8604 */
8605 spin_lock_irqsave(&task_group_lock, flags);
8606 for_each_possible_cpu(i)
8607 register_fair_sched_group(tg, i);
8608 list_add_rcu(&tg->siblings, &tg->parent->children);
8609 spin_unlock_irqrestore(&task_group_lock, flags);
8610done: 8516done:
8611 mutex_unlock(&shares_mutex); 8517 mutex_unlock(&shares_mutex);
8612 return 0; 8518 return 0;
@@ -8905,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8905 8811
8906 if (!cgrp->parent) { 8812 if (!cgrp->parent) {
8907 /* This is early initialization for the top cgroup */ 8813 /* This is early initialization for the top cgroup */
8908 return &init_task_group.css; 8814 return &root_task_group.css;
8909 } 8815 }
8910 8816
8911 parent = cgroup_tg(cgrp->parent); 8817 parent = cgroup_tg(cgrp->parent);
@@ -9332,72 +9238,3 @@ struct cgroup_subsys cpuacct_subsys = {
9332}; 9238};
9333#endif /* CONFIG_CGROUP_CPUACCT */ 9239#endif /* CONFIG_CGROUP_CPUACCT */
9334 9240
9335#ifndef CONFIG_SMP
9336
9337void synchronize_sched_expedited(void)
9338{
9339 barrier();
9340}
9341EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9342
9343#else /* #ifndef CONFIG_SMP */
9344
9345static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9346
9347static int synchronize_sched_expedited_cpu_stop(void *data)
9348{
9349 /*
9350 * There must be a full memory barrier on each affected CPU
9351 * between the time that try_stop_cpus() is called and the
9352 * time that it returns.
9353 *
9354 * In the current initial implementation of cpu_stop, the
9355 * above condition is already met when the control reaches
9356 * this point and the following smp_mb() is not strictly
9357 * necessary. Do smp_mb() anyway for documentation and
9358 * robustness against future implementation changes.
9359 */
9360 smp_mb(); /* See above comment block. */
9361 return 0;
9362}
9363
9364/*
9365 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9366 * approach to force grace period to end quickly. This consumes
9367 * significant time on all CPUs, and is thus not recommended for
9368 * any sort of common-case code.
9369 *
9370 * Note that it is illegal to call this function while holding any
9371 * lock that is acquired by a CPU-hotplug notifier. Failing to
9372 * observe this restriction will result in deadlock.
9373 */
9374void synchronize_sched_expedited(void)
9375{
9376 int snap, trycount = 0;
9377
9378 smp_mb(); /* ensure prior mod happens before capturing snap. */
9379 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9380 get_online_cpus();
9381 while (try_stop_cpus(cpu_online_mask,
9382 synchronize_sched_expedited_cpu_stop,
9383 NULL) == -EAGAIN) {
9384 put_online_cpus();
9385 if (trycount++ < 10)
9386 udelay(trycount * num_online_cpus());
9387 else {
9388 synchronize_sched();
9389 return;
9390 }
9391 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9392 smp_mb(); /* ensure test happens before caller kfree */
9393 return;
9394 }
9395 get_online_cpus();
9396 }
9397 atomic_inc(&synchronize_sched_expedited_count);
9398 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9399 put_online_cpus();
9400}
9401EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9402
9403#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..32a723b8f84c
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&root_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kmalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
146/* Allocates GFP_KERNEL, cannot be called under any spinlock */
147void sched_autogroup_create_attach(struct task_struct *p)
148{
149 struct autogroup *ag = autogroup_create();
150
151 autogroup_move_group(p, ag);
152 /* drop extra reference added by autogroup_create() */
153 autogroup_kref_put(ag);
154}
155EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
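From user space, the autogroup created by sched_autogroup_create_attach() (hooked into setsid() elsewhere in this series) is visible through the two proc_ helpers above. A small sketch of reading the group and lowering its weight via the per-task autogroup file; the /proc path and the nice value written are assumptions of this example, since the proc plumbing lives outside kernel/:

#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/proc/self/autogroup", "r+");

	if (!f)
		return 1;		/* CONFIG_SCHED_AUTOGROUP=n or file not wired up */

	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "/autogroup-42 nice 0" */

	fseek(f, 0, SEEK_SET);		/* switch from reading to writing */
	fprintf(f, "10\n");		/* ends up in proc_sched_autogroup_set_nice() */
	fclose(f);
	return 0;
}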
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
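The P()/PN() macros in sched_debug_show() rely on the preprocessor's stringification operator: #x turns the expression into its own label, so each line prints the symbol name next to its value without repeating it by hand. A user-space restatement of the idiom (plain printf standing in for SEQ_printf/seq_file):

#include <stdio.h>

#define P(x)	printf("%-40s: %lld\n", #x, (long long)(x))

int main(void)
{
	long long jiffies   = 4295892864LL;	/* made-up sample value */
	long long sched_clk = 123456789LL;	/* made-up sample value */

	P(jiffies);	/* prints "jiffies", padded to 40 columns, then 4295892864 */
	P(sched_clk);
	return 0;
}

The same trick is why the debug file can grow a new field with a single P(name) line.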
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{
823}
824
825static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
826{
827}
828
829static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
830{
831}
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
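update_cfs_shares() above reduces to: give this cpu's group entity tg->shares scaled by its runqueue's portion of the group-wide load, clamped between MIN_SHARES and the group total. A standalone restatement of that arithmetic; calc_group_shares is an illustrative name and MIN_SHARES is assumed to be 2 here:

static long calc_group_shares(long tg_shares,	/* tg->shares (group total)           */
			      long tg_load,	/* atomic_read(&tg->load_weight)      */
			      long contrib,	/* cfs_rq->load_contribution          */
			      long cfs_load)	/* cfs_rq->load.weight + weight_delta */
{
	/* replace our stale global contribution with the current local load */
	long load_weight = tg_load - contrib + cfs_load;
	long shares = tg_shares * cfs_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < 2)			/* MIN_SHARES, assumed to be 2 */
		shares = 2;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

For example, with tg_shares = 1024 and the group's load split 3:1 across two cpus (tg_load = 4, contrib = 3, cfs_load = 3), the busier cpu's entity is reweighted to 1024 * 3 / 4 = 768, which reweight_entity() then applies.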
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1145 */
956 update_curr(cfs_rq); 1146 update_curr(cfs_rq);
957 1147
1148 /*
1149 * Update share accounting for long-running entities.
1150 */
1151 update_entity_shares_tick(cfs_rq);
1152
958#ifdef CONFIG_SCHED_HRTICK 1153#ifdef CONFIG_SCHED_HRTICK
959 /* 1154 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1155 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1250 flags = ENQUEUE_WAKEUP;
1056 } 1251 }
1057 1252
1253 for_each_sched_entity(se) {
1254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1255
1256 update_cfs_load(cfs_rq, 0);
1257 update_cfs_shares(cfs_rq, 0);
1258 }
1259
1058 hrtick_update(rq); 1260 hrtick_update(rq);
1059} 1261}
1060 1262
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1273 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1274 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1275 dequeue_entity(cfs_rq, se, flags);
1276
1074 /* Don't dequeue parent if it has other entities besides us */ 1277 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1278 if (cfs_rq->load.weight)
1076 break; 1279 break;
1077 flags |= DEQUEUE_SLEEP; 1280 flags |= DEQUEUE_SLEEP;
1078 } 1281 }
1079 1282
1283 for_each_sched_entity(se) {
1284 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1285
1286 update_cfs_load(cfs_rq, 0);
1287 update_cfs_shares(cfs_rq, 0);
1288 }
1289
1080 hrtick_update(rq); 1290 hrtick_update(rq);
1081} 1291}
1082 1292
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1353 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1354 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1355 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1356 */
1161static long effective_load(struct task_group *tg, int cpu, 1357static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1358{
1164 struct sched_entity *se = tg->se[cpu]; 1359 struct sched_entity *se = tg->se[cpu];
1165 1360
1166 if (!tg->parent) 1361 if (!tg->parent)
1167 return wl; 1362 return wl;
1168 1363
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1364 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1365 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1366
1188 S = se->my_q->tg->shares; 1367 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1368 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1369 rw = se->my_q->load.weight;
1191 1370
1192 a = S*(rw + wl); 1371 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1372 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1687 sd = tmp;
1509 } 1688 }
1510 1689
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1690 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1691 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1692 return select_idle_sibling(p, cpu);
@@ -1654,12 +1816,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1816 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1817 int scale = cfs_rq->nr_running >= sched_nr_latency;
1656 1818
1657 if (unlikely(rt_prio(p->prio)))
1658 goto preempt;
1659
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1819 if (unlikely(se == pse))
1664 return; 1820 return;
1665 1821
@@ -1764,10 +1920,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1920 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1921 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1922 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1771} 1923}
1772 1924
1773/* 1925/*
@@ -1919,6 +2071,48 @@ out:
1919} 2071}
1920 2072
1921#ifdef CONFIG_FAIR_GROUP_SCHED 2073#ifdef CONFIG_FAIR_GROUP_SCHED
2074/*
2075 * update tg->load_weight by folding this cpu's load_avg
2076 */
2077static int update_shares_cpu(struct task_group *tg, int cpu)
2078{
2079 struct cfs_rq *cfs_rq;
2080 unsigned long flags;
2081 struct rq *rq;
2082
2083 if (!tg->se[cpu])
2084 return 0;
2085
2086 rq = cpu_rq(cpu);
2087 cfs_rq = tg->cfs_rq[cpu];
2088
2089 raw_spin_lock_irqsave(&rq->lock, flags);
2090
2091 update_rq_clock(rq);
2092 update_cfs_load(cfs_rq, 1);
2093
2094 /*
2095 * We need to update shares after updating tg->load_weight in
2096 * order to adjust the weight of groups with long running tasks.
2097 */
2098 update_cfs_shares(cfs_rq, 0);
2099
2100 raw_spin_unlock_irqrestore(&rq->lock, flags);
2101
2102 return 0;
2103}
2104
2105static void update_shares(int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 struct rq *rq = cpu_rq(cpu);
2109
2110 rcu_read_lock();
2111 for_each_leaf_cfs_rq(rq, cfs_rq)
2112 update_shares_cpu(cfs_rq->tg, cpu);
2113 rcu_read_unlock();
2114}
2115
1922static unsigned long 2116static unsigned long
1923load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2117load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1924 unsigned long max_load_move, 2118 unsigned long max_load_move,
@@ -1966,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1966 return max_load_move - rem_load_move; 2160 return max_load_move - rem_load_move;
1967} 2161}
1968#else 2162#else
2163static inline void update_shares(int cpu)
2164{
2165}
2166
1969static unsigned long 2167static unsigned long
1970load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2168load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1971 unsigned long max_load_move, 2169 unsigned long max_load_move,
@@ -2035,13 +2233,16 @@ struct sd_lb_stats {
2035 unsigned long this_load_per_task; 2233 unsigned long this_load_per_task;
2036 unsigned long this_nr_running; 2234 unsigned long this_nr_running;
2037 unsigned long this_has_capacity; 2235 unsigned long this_has_capacity;
2236 unsigned int this_idle_cpus;
2038 2237
2039 /* Statistics of the busiest group */ 2238 /* Statistics of the busiest group */
2239 unsigned int busiest_idle_cpus;
2040 unsigned long max_load; 2240 unsigned long max_load;
2041 unsigned long busiest_load_per_task; 2241 unsigned long busiest_load_per_task;
2042 unsigned long busiest_nr_running; 2242 unsigned long busiest_nr_running;
2043 unsigned long busiest_group_capacity; 2243 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity; 2244 unsigned long busiest_has_capacity;
2245 unsigned int busiest_group_weight;
2045 2246
2046 int group_imb; /* Is there imbalance in this sd */ 2247 int group_imb; /* Is there imbalance in this sd */
2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2248#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2264,8 @@ struct sg_lb_stats {
2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2264 unsigned long sum_nr_running; /* Nr tasks running in the group */
2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2265 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065 unsigned long group_capacity; 2266 unsigned long group_capacity;
2267 unsigned long idle_cpus;
2268 unsigned long group_weight;
2066 int group_imb; /* Is there an imbalance in the group ? */ 2269 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */ 2270 int group_has_capacity; /* Is there extra capacity in the group? */
2068}; 2271};
@@ -2431,7 +2634,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431 sgs->group_load += load; 2634 sgs->group_load += load;
2432 sgs->sum_nr_running += rq->nr_running; 2635 sgs->sum_nr_running += rq->nr_running;
2433 sgs->sum_weighted_load += weighted_cpuload(i); 2636 sgs->sum_weighted_load += weighted_cpuload(i);
2434 2637 if (idle_cpu(i))
2638 sgs->idle_cpus++;
2435 } 2639 }
2436 2640
2437 /* 2641 /*
@@ -2469,6 +2673,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2673 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470 if (!sgs->group_capacity) 2674 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group); 2675 sgs->group_capacity = fix_small_capacity(sd, group);
2676 sgs->group_weight = group->group_weight;
2472 2677
2473 if (sgs->group_capacity > sgs->sum_nr_running) 2678 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1; 2679 sgs->group_has_capacity = 1;
@@ -2576,13 +2781,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576 sds->this_nr_running = sgs.sum_nr_running; 2781 sds->this_nr_running = sgs.sum_nr_running;
2577 sds->this_load_per_task = sgs.sum_weighted_load; 2782 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity; 2783 sds->this_has_capacity = sgs.group_has_capacity;
2784 sds->this_idle_cpus = sgs.idle_cpus;
2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2785 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580 sds->max_load = sgs.avg_load; 2786 sds->max_load = sgs.avg_load;
2581 sds->busiest = sg; 2787 sds->busiest = sg;
2582 sds->busiest_nr_running = sgs.sum_nr_running; 2788 sds->busiest_nr_running = sgs.sum_nr_running;
2789 sds->busiest_idle_cpus = sgs.idle_cpus;
2583 sds->busiest_group_capacity = sgs.group_capacity; 2790 sds->busiest_group_capacity = sgs.group_capacity;
2584 sds->busiest_load_per_task = sgs.sum_weighted_load; 2791 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity; 2792 sds->busiest_has_capacity = sgs.group_has_capacity;
2793 sds->busiest_group_weight = sgs.group_weight;
2586 sds->group_imb = sgs.group_imb; 2794 sds->group_imb = sgs.group_imb;
2587 } 2795 }
2588 2796
@@ -2860,8 +3068,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860 if (sds.this_load >= sds.avg_load) 3068 if (sds.this_load >= sds.avg_load)
2861 goto out_balanced; 3069 goto out_balanced;
2862 3070
2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3071 /*
 2864 goto out_balanced; 3072 * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
3073 * And to check for busy balance use !idle_cpu instead of
3074 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3075 * even when they are idle.
3076 */
3077 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3078 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3079 goto out_balanced;
3080 } else {
3081 /*
 3082 * This cpu is idle. If the busiest group does not run
 3083 * more tasks than it has available cpus and there is no
 3084 * imbalance between this group and the busiest group
 3085 * wrt idle cpus, consider it balanced.
3086 */
3087 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3088 sds.busiest_nr_running <= sds.busiest_group_weight)
3089 goto out_balanced;
3090 }
2865 3091
2866force_balance: 3092force_balance:
2867 /* Looks like there is an imbalance. Compute it */ 3093 /* Looks like there is an imbalance. Compute it */
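The new else-branch applies a second "already balanced" test that only an idle cpu uses; restated as a standalone predicate whose parameters mirror the sd_lb_stats fields added above (the function name is illustrative):

static int idle_cpu_sees_balanced_groups(unsigned int this_idle_cpus,
					 unsigned int busiest_idle_cpus,
					 unsigned long busiest_nr_running,
					 unsigned int busiest_group_weight)
{
	/*
	 * e.g. this group has 4 idle cpus, the busiest has 3 idle cpus
	 * and runs 2 tasks on 4 cpus: 4 <= 3 + 1 and 2 <= 4, so pulling
	 * a task here would only shuffle the idleness around.
	 */
	return this_idle_cpus <= busiest_idle_cpus + 1 &&
	       busiest_nr_running <= busiest_group_weight;
}

When the predicate holds, find_busiest_group() jumps to out_balanced instead of computing an imbalance.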
@@ -3014,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3014 schedstat_inc(sd, lb_count[idle]); 3240 schedstat_inc(sd, lb_count[idle]);
3015 3241
3016redo: 3242redo:
3017 update_shares(sd);
3018 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3243 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3019 cpus, balance); 3244 cpus, balance);
3020 3245
@@ -3156,8 +3381,6 @@ out_one_pinned:
3156 else 3381 else
3157 ld_moved = 0; 3382 ld_moved = 0;
3158out: 3383out:
3159 if (ld_moved)
3160 update_shares(sd);
3161 return ld_moved; 3384 return ld_moved;
3162} 3385}
3163 3386
@@ -3181,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3181 */ 3404 */
3182 raw_spin_unlock(&this_rq->lock); 3405 raw_spin_unlock(&this_rq->lock);
3183 3406
3407 update_shares(this_cpu);
3184 for_each_domain(this_cpu, sd) { 3408 for_each_domain(this_cpu, sd) {
3185 unsigned long interval; 3409 unsigned long interval;
3186 int balance = 1; 3410 int balance = 1;
@@ -3197,8 +3421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3197 interval = msecs_to_jiffies(sd->balance_interval); 3421 interval = msecs_to_jiffies(sd->balance_interval);
3198 if (time_after(next_balance, sd->last_balance + interval)) 3422 if (time_after(next_balance, sd->last_balance + interval))
3199 next_balance = sd->last_balance + interval; 3423 next_balance = sd->last_balance + interval;
3200 if (pulled_task) 3424 if (pulled_task) {
3425 this_rq->idle_stamp = 0;
3201 break; 3426 break;
3427 }
3202 } 3428 }
3203 3429
3204 raw_spin_lock(&this_rq->lock); 3430 raw_spin_lock(&this_rq->lock);
@@ -3549,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3549 int update_next_balance = 0; 3775 int update_next_balance = 0;
3550 int need_serialize; 3776 int need_serialize;
3551 3777
3778 update_shares(cpu);
3779
3552 for_each_domain(cpu, sd) { 3780 for_each_domain(cpu, sd) {
3553 if (!(sd->flags & SD_LOAD_BALANCE)) 3781 if (!(sd->flags & SD_LOAD_BALANCE))
3554 continue; 3782 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19static void 19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{ 21{
22 resched_task(rq->curr); /* we preempt everything */ 22 /* we're never preempted */
23} 23}
24 24
25static struct task_struct *pick_next_task_stop(struct rq *rq) 25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 26{
27 struct task_struct *stop = rq->stop; 27 struct task_struct *stop = rq->stop;
28 28
29 if (stop && stop->state == TASK_RUNNING) 29 if (stop && stop->se.on_rq)
30 return stop; 30 return stop;
31 31
32 return NULL; 32 return NULL;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..0823778f87fc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
70static void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
388 388
389 local_irq_save(flags); 389 local_irq_save(flags);
390 t->next = NULL; 390 t->next = NULL;
391 *__get_cpu_var(tasklet_vec).tail = t; 391 *__this_cpu_read(tasklet_vec.tail) = t;
392 __get_cpu_var(tasklet_vec).tail = &(t->next); 392 __this_cpu_write(tasklet_vec.tail, &(t->next));
393 raise_softirq_irqoff(TASKLET_SOFTIRQ); 393 raise_softirq_irqoff(TASKLET_SOFTIRQ);
394 local_irq_restore(flags); 394 local_irq_restore(flags);
395} 395}
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
402 402
403 local_irq_save(flags); 403 local_irq_save(flags);
404 t->next = NULL; 404 t->next = NULL;
405 *__get_cpu_var(tasklet_hi_vec).tail = t; 405 *__this_cpu_read(tasklet_hi_vec.tail) = t;
406 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 406 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
407 raise_softirq_irqoff(HI_SOFTIRQ); 407 raise_softirq_irqoff(HI_SOFTIRQ);
408 local_irq_restore(flags); 408 local_irq_restore(flags);
409} 409}
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
414{ 414{
415 BUG_ON(!irqs_disabled()); 415 BUG_ON(!irqs_disabled());
416 416
417 t->next = __get_cpu_var(tasklet_hi_vec).head; 417 t->next = __this_cpu_read(tasklet_hi_vec.head);
418 __get_cpu_var(tasklet_hi_vec).head = t; 418 __this_cpu_write(tasklet_hi_vec.head, t);
419 __raise_softirq_irqoff(HI_SOFTIRQ); 419 __raise_softirq_irqoff(HI_SOFTIRQ);
420} 420}
421 421
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)
426 struct tasklet_struct *list; 426 struct tasklet_struct *list;
427 427
428 local_irq_disable(); 428 local_irq_disable();
429 list = __get_cpu_var(tasklet_vec).head; 429 list = __this_cpu_read(tasklet_vec.head);
430 __get_cpu_var(tasklet_vec).head = NULL; 430 __this_cpu_write(tasklet_vec.head, NULL);
431 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 431 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
432 local_irq_enable(); 432 local_irq_enable();
433 433
434 while (list) { 434 while (list) {
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)
449 449
450 local_irq_disable(); 450 local_irq_disable();
451 t->next = NULL; 451 t->next = NULL;
452 *__get_cpu_var(tasklet_vec).tail = t; 452 *__this_cpu_read(tasklet_vec.tail) = t;
453 __get_cpu_var(tasklet_vec).tail = &(t->next); 453 __this_cpu_write(tasklet_vec.tail, &(t->next));
454 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 454 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
455 local_irq_enable(); 455 local_irq_enable();
456 } 456 }
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)
461 struct tasklet_struct *list; 461 struct tasklet_struct *list;
462 462
463 local_irq_disable(); 463 local_irq_disable();
464 list = __get_cpu_var(tasklet_hi_vec).head; 464 list = __this_cpu_read(tasklet_hi_vec.head);
465 __get_cpu_var(tasklet_hi_vec).head = NULL; 465 __this_cpu_write(tasklet_hi_vec.head, NULL);
466 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 466 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
467 local_irq_enable(); 467 local_irq_enable();
468 468
469 while (list) { 469 while (list) {
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)
484 484
485 local_irq_disable(); 485 local_irq_disable();
486 t->next = NULL; 486 t->next = NULL;
487 *__get_cpu_var(tasklet_hi_vec).tail = t; 487 *__this_cpu_read(tasklet_hi_vec.tail) = t;
488 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 488 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
489 __raise_softirq_irqoff(HI_SOFTIRQ); 489 __raise_softirq_irqoff(HI_SOFTIRQ);
490 local_irq_enable(); 490 local_irq_enable();
491 } 491 }
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)
802 802
803 /* Find end, append list for that CPU. */ 803 /* Find end, append list for that CPU. */
804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
805 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 805 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
806 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 806 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
807 per_cpu(tasklet_vec, cpu).head = NULL; 807 per_cpu(tasklet_vec, cpu).head = NULL;
808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
809 } 809 }
810 raise_softirq_irqoff(TASKLET_SOFTIRQ); 810 raise_softirq_irqoff(TASKLET_SOFTIRQ);
811 811
812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
813 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 813 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
814 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 814 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
815 per_cpu(tasklet_hi_vec, cpu).head = NULL; 815 per_cpu(tasklet_hi_vec, cpu).head = NULL;
816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
817 } 817 }
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static const struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
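Every hunk in this file is the same mechanical conversion: __get_cpu_var(v).f, which first materialises the current cpu's address and then dereferences a field, becomes a single __this_cpu_read()/__this_cpu_write() operation (typically one %gs-relative instruction on x86). A minimal sketch of the two styles side by side, using an illustrative per-cpu counter; both forms assume the caller has already disabled preemption or interrupts, as the softirq paths here do:

DEFINE_PER_CPU(int, example_count);

static void example(void)
{
	int v;

	v = __get_cpu_var(example_count);	/* old: take address, then load  */
	v = __this_cpu_read(example_count);	/* new: one per-cpu read         */

	__get_cpu_var(example_count) = v + 1;	/* old: take address, then store */
	__this_cpu_write(example_count, v + 1);	/* new: one per-cpu write        */
}

this_cpu_inc_return(), used in kernel/taskstats.c further down, follows the same pattern for read-modify-write sequences.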
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..98d8c1e80edb 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
203 * all srcu_read_lock() calls using the old counters have completed. 204 * all srcu_read_lock() calls using the old counters have completed.
204 * Their corresponding critical sections might well be still 205 * Their corresponding critical sections might well be still
205 * executing, but the srcu_read_lock() primitives themselves 206 * executing, but the srcu_read_lock() primitives themselves
206 * will have finished executing. 207 * will have finished executing. We initially give readers
208 * an arbitrarily chosen 10 microseconds to get out of their
209 * SRCU read-side critical sections, then loop waiting 1/HZ
210 * seconds per iteration.
207 */ 211 */
208 212
213 if (srcu_readers_active_idx(sp, idx))
214 udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
209 while (srcu_readers_active_idx(sp, idx)) 215 while (srcu_readers_active_idx(sp, idx))
210 schedule_timeout_interruptible(1); 216 schedule_timeout_interruptible(1);
211 217
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1080 err = session;
1081out: 1081out:
1082 write_unlock_irq(&tasklist_lock); 1082 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1083 if (err > 0) {
1084 proc_sid_connector(group_leader); 1084 proc_sid_connector(group_leader);
1085 sched_autogroup_create_attach(group_leader);
1086 }
1085 return err; 1087 return err;
1086} 1088}
1087 1089
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c33a1edb799f..ae5cbb1e3ced 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 262#endif
265 263
266#ifdef CONFIG_COMPACTION 264#ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 303 .extra2 = &max_wakeup_granularity_ns,
306 }, 304 },
307 { 305 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 306 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 307 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 308 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 312 .extra2 = &max_sched_tunable_scaling,
324 }, 313 },
325 { 314 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 315 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 316 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 317 .maxlen = sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 333 .proc_handler = proc_dointvec,
353 }, 334 },
354 { 335 {
336 .procname = "sched_shares_window",
337 .data = &sysctl_sched_shares_window,
338 .maxlen = sizeof(unsigned int),
339 .mode = 0644,
340 .proc_handler = proc_dointvec,
341 },
342 {
355 .procname = "timer_migration", 343 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 344 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 345 .maxlen = sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 370 .mode = 0644,
383 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
384 }, 372 },
373#ifdef CONFIG_SCHED_AUTOGROUP
374 {
375 .procname = "sched_autogroup_enabled",
376 .data = &sysctl_sched_autogroup_enabled,
377 .maxlen = sizeof(unsigned int),
378 .mode = 0644,
379 .proc_handler = proc_dointvec,
380 .extra1 = &zero,
381 .extra2 = &one,
382 },
383#endif
385#ifdef CONFIG_PROVE_LOCKING 384#ifdef CONFIG_PROVE_LOCKING
386 { 385 {
387 .procname = "prove_locking", 386 .procname = "prove_locking",
@@ -702,6 +701,15 @@ static struct ctl_table kern_table[] = {
702 .extra1 = &zero, 701 .extra1 = &zero,
703 .extra2 = &ten_thousand, 702 .extra2 = &ten_thousand,
704 }, 703 },
704 {
705 .procname = "dmesg_restrict",
706 .data = &dmesg_restrict,
707 .maxlen = sizeof(int),
708 .mode = 0644,
709 .proc_handler = proc_dointvec_minmax,
710 .extra1 = &zero,
711 .extra2 = &one,
712 },
705#endif 713#endif
706 { 714 {
707 .procname = "ngroups_max", 715 .procname = "ngroups_max",
@@ -736,21 +744,21 @@ static struct ctl_table kern_table[] = {
736 .extra1 = &zero, 744 .extra1 = &zero,
737 .extra2 = &one, 745 .extra2 = &one,
738 }, 746 },
739#endif
740#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
741 { 747 {
742 .procname = "unknown_nmi_panic", 748 .procname = "nmi_watchdog",
743 .data = &unknown_nmi_panic, 749 .data = &watchdog_enabled,
744 .maxlen = sizeof (int), 750 .maxlen = sizeof (int),
745 .mode = 0644, 751 .mode = 0644,
746 .proc_handler = proc_dointvec, 752 .proc_handler = proc_dowatchdog_enabled,
747 }, 753 },
754#endif
755#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
748 { 756 {
749 .procname = "nmi_watchdog", 757 .procname = "unknown_nmi_panic",
750 .data = &nmi_watchdog_enabled, 758 .data = &unknown_nmi_panic,
751 .maxlen = sizeof (int), 759 .maxlen = sizeof (int),
752 .mode = 0644, 760 .mode = 0644,
753 .proc_handler = proc_nmi_enabled, 761 .proc_handler = proc_dointvec,
754 }, 762 },
755#endif 763#endif
756#if defined(CONFIG_X86) 764#if defined(CONFIG_X86)
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..4b2545a136ff 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb15708..69691eb4b715 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -349,25 +348,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 348 return ret;
350} 349}
351 350
351#ifdef CONFIG_IA64
352#define TASKSTATS_NEEDS_PADDING 1
353#endif
354
352static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 355static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
353{ 356{
354 struct nlattr *na, *ret; 357 struct nlattr *na, *ret;
355 int aggr; 358 int aggr;
356 359
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 360 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 361 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 362 : TASKSTATS_TYPE_AGGR_TGID;
366 363
364 /*
365 * The taskstats structure is internally aligned on 8 byte
 366 * boundaries but the layout of the aggregate reply, with
 367 * two NLA headers and the pid (each 4 bytes), actually
 368 * forces the entire structure to be unaligned. This causes
369 * the kernel to issue unaligned access warnings on some
370 * architectures like ia64. Unfortunately, some software out there
371 * doesn't properly unroll the NLA packet and assumes that the start
372 * of the taskstats structure will always be 20 bytes from the start
373 * of the netlink payload. Aligning the start of the taskstats
374 * structure breaks this software, which we don't want. So, for now
375 * the alignment only happens on architectures that require it
376 * and those users will have to update to fixed versions of those
377 * packages. Space is reserved in the packet only when needed.
378 * This ifdef should be removed in several years e.g. 2012 once
379 * we can be confident that fixed versions are installed on most
380 * systems. We add the padding before the aggregate since the
381 * aggregate is already a defined type.
382 */
383#ifdef TASKSTATS_NEEDS_PADDING
384 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
385 goto err;
386#endif
367 na = nla_nest_start(skb, aggr); 387 na = nla_nest_start(skb, aggr);
368 if (!na) 388 if (!na)
369 goto err; 389 goto err;
370 if (nla_put(skb, type, pid_size, pids) < 0) 390
391 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 392 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 393 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 394 if (!ret)
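The 20-byte figure in the comment above, and why one empty TASKSTATS_TYPE_NULL attribute restores 8-byte alignment, can be checked with a few additions. Sizes assumed here: a 4-byte genlmsghdr, 4-byte nlattr headers, payloads padded to 4-byte multiples:

#include <stdio.h>

int main(void)
{
	int genl_hdr = 4;	/* struct genlmsghdr    */
	int nla_hdr  = 4;	/* struct nlattr        */
	int pid      = 4;	/* u32 pid/tgid payload */

	/* genl header + AGGR nest header + (PID header + pid) + STATS header */
	int off = genl_hdr + nla_hdr + (nla_hdr + pid) + nla_hdr;
	printf("taskstats offset, unpadded: %d (mod 8 = %d)\n", off, off % 8);

	/* the empty padding attribute contributes one more 4-byte header */
	off += nla_hdr;
	printf("taskstats offset, padded:   %d (mod 8 = %d)\n", off, off % 8);
	return 0;
}

This prints 20 (mod 8 = 4) for the unpadded layout and 24 (mod 8 = 0) with the padding attribute, matching the ia64 warnings the comment describes and the fix applied here.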
@@ -456,6 +477,18 @@ out:
456 return rc; 477 return rc;
457} 478}
458 479
480static size_t taskstats_packet_size(void)
481{
482 size_t size;
483
484 size = nla_total_size(sizeof(u32)) +
485 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
486#ifdef TASKSTATS_NEEDS_PADDING
487 size += nla_total_size(0); /* Padding for alignment */
488#endif
489 return size;
490}
491
459static int cmd_attr_pid(struct genl_info *info) 492static int cmd_attr_pid(struct genl_info *info)
460{ 493{
461 struct taskstats *stats; 494 struct taskstats *stats;
@@ -464,8 +497,7 @@ static int cmd_attr_pid(struct genl_info *info)
464 u32 pid; 497 u32 pid;
465 int rc; 498 int rc;
466 499
467 size = nla_total_size(sizeof(u32)) + 500 size = taskstats_packet_size();
468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
469 501
470 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 502 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
471 if (rc < 0) 503 if (rc < 0)
@@ -494,8 +526,7 @@ static int cmd_attr_tgid(struct genl_info *info)
494 u32 tgid; 526 u32 tgid;
495 int rc; 527 int rc;
496 528
497 size = nla_total_size(sizeof(u32)) + 529 size = taskstats_packet_size();
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499 530
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 531 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0) 532 if (rc < 0)
@@ -570,8 +601,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
570 /* 601 /*
571 * Size includes space for nested attributes 602 * Size includes space for nested attributes
572 */ 603 */
573 size = nla_total_size(sizeof(u32)) + 604 size = taskstats_packet_size();
574 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
575 605
576 is_thread_group = !!taskstats_tgid_alloc(tsk); 606 is_thread_group = !!taskstats_tgid_alloc(tsk);
577 if (is_thread_group) { 607 if (is_thread_group) {
@@ -581,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
581 fill_tgid_exit(tsk); 611 fill_tgid_exit(tsk);
582 } 612 }
583 613
584 listeners = &__raw_get_cpu_var(listener_array); 614 listeners = __this_cpu_ptr(&listener_array);
585 if (list_empty(&listeners->list)) 615 if (list_empty(&listeners->list))
586 return; 616 return;
587 617
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..df140cd3ea47 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
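The single added line converts the division in clocks_calc_mult_shift() from truncation to round-to-nearest: biasing the dividend by half the divisor before dividing is the standard integer idiom. A self-contained restatement:

#include <stdint.h>
#include <stdio.h>

static uint64_t div_round_nearest(uint64_t val, uint32_t div)
{
	return (val + div / 2) / div;	/* bias by div/2, then truncate */
}

int main(void)
{
	/* 10 / 4 is 2.5: plain division truncates to 2, the biased form gives 3 */
	printf("truncated: %llu\n", (unsigned long long)(10ULL / 4));
	printf("rounded:   %llu\n", (unsigned long long)div_round_nearest(10, 4));
	return 0;
}

Rounding to the nearest value rather than always rounding down reduces the systematic error carried by the resulting mult/shift pair.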
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..051bc80a0c43 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)
49 */ 49 */
50int tick_is_oneshot_available(void) 50int tick_is_oneshot_available(void)
51{ 51{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 53
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
55} 55}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..5cbc101f908b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 95 */
96int tick_program_event(ktime_t expires, int force) 96int tick_program_event(ktime_t expires, int force)
97{ 97{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 98 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 99
100 return tick_dev_program_event(dev, expires, force); 100 return tick_dev_program_event(dev, expires, force);
101} 101}
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 167 int ret;
168 168
169 local_irq_save(flags); 169 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 170 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 171 local_irq_restore(flags);
172 172
173 return ret; 173 return ret;
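The two tick hunks above, like many hunks later in this series, convert __get_cpu_var() accesses to __this_cpu_read(): instead of materializing the address of this CPU's per-cpu instance and then dereferencing a member, the member is read directly relative to the per-cpu base, which on x86 typically compiles to a single segment-prefixed load. A kernel-context sketch of the before/after shape; this is not standalone user code, my_dev_pcpu and my_mode_is_oneshot are invented names, and only the per-cpu macros are real kernel API.

#include <linux/percpu.h>

struct my_dev { int mode; void *evtdev; };

static DEFINE_PER_CPU(struct my_dev, my_dev_pcpu);

static int my_mode_is_oneshot(void)
{
	/* old style: compute this CPU's address, then read a field:
	 *   struct my_dev *d = &__get_cpu_var(my_dev_pcpu);
	 *   return d->mode;
	 */

	/* new style: read just the field, relative to the per-cpu base.
	 * The __ prefix means the caller remains responsible for the
	 * preemption context, exactly as with __get_cpu_var().
	 */
	return __this_cpu_read(my_dev_pcpu.mode);
}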
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
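The timecompare.c cleanup swaps the open-coded sizeof(buffer)/sizeof(buffer[0]) for ARRAY_SIZE(), which reads better and, in the kernel's definition, also refuses to compile when handed a pointer instead of an array. A runnable user-space illustration of the trap the macro guards against (the simplified ARRAY_SIZE here omits the kernel's __must_be_array check):

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

int main(void)
{
	long long buffer[8];
	long long *p = buffer;

	printf("elements in buffer: %zu\n", ARRAY_SIZE(buffer));   /* 8 */

	/* The classic trap: on the pointer this silently yields
	 * sizeof(long long *) / sizeof(long long), not the element count.
	 */
	printf("bogus count via pointer: %zu\n", sizeof(p) / sizeof(p[0]));
	return 0;
}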
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..5bb86da82003 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 723
720 /* Accumulate error between NTP and clock interval */ 724 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 725 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 726 timekeeper.ntp_error -=
727 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 728 (timekeeper.ntp_error_shift + shift);
724 729
725 return offset; 730 return offset;
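The timekeeping change keeps the part of the NTP interval that is lost when it is rounded to a whole number of clock cycles (xtime_remainder) and charges it back in the ntp_error accumulation, so the error term is measured against the exact NTP interval rather than the rounded one. A small arithmetic sketch with invented numbers:

/*
 * Arithmetic sketch of xtime_remainder; the values are made up, only the
 * rounding relationship mirrors timekeeper_setup_internals().
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ntp_interval = 1000000123;  /* shifted ns per NTP tick (example) */
	uint64_t mult = 4000;                /* shifted ns per clock cycle (example) */

	/* cycles per tick, rounded to nearest as in the setup code */
	uint64_t interval = (ntp_interval + mult / 2) / mult;

	/* shifted ns actually accumulated per tick ... */
	uint64_t xtime_interval = interval * mult;
	/* ... and the part the rounding threw away (may be negative) */
	int64_t  xtime_remainder = (int64_t)(ntp_interval - xtime_interval);

	printf("interval=%llu cycles, xtime_interval=%llu, remainder=%lld\n",
	       (unsigned long long)interval,
	       (unsigned long long)xtime_interval,
	       (long long)xtime_remainder);

	/* ntp_error is now reduced by xtime_interval + remainder, i.e. by
	 * the exact NTP interval instead of the rounded one. */
	return 0;
}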
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..32a19f9397fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
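The timer_list.c hunk follows hrtimers from a bare rb_node to an embedded timerqueue_node, so the enclosing hrtimer is now recovered with container_of() instead of rb_entry(). A runnable user-space illustration of that pattern; struct fake_timer and queue_node are invented for the example.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct queue_node { struct queue_node *next; };     /* embedded handle */

struct fake_timer {
	unsigned long expires;
	struct queue_node node;                     /* lives inside the timer */
};

int main(void)
{
	struct fake_timer t = { .expires = 42 };
	struct queue_node *n = &t.node;             /* what an iterator hands back */

	/* recover the enclosing object from the embedded node */
	struct fake_timer *timer = container_of(n, struct fake_timer, node);

	printf("expires = %lu\n", timer->expires);  /* 42 */
	return 0;
}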
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..43ca9936f2d0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
936} 914}
937EXPORT_SYMBOL(del_timer); 915EXPORT_SYMBOL(del_timer);
938 916
939#ifdef CONFIG_SMP
940/** 917/**
941 * try_to_del_timer_sync - Try to deactivate a timer 918 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer do del 919 * @timer: timer do del
943 * 920 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 921 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 922 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 923 */
949int try_to_del_timer_sync(struct timer_list *timer) 924int try_to_del_timer_sync(struct timer_list *timer)
950{ 925{
@@ -973,6 +948,7 @@ out:
973} 948}
974EXPORT_SYMBOL(try_to_del_timer_sync); 949EXPORT_SYMBOL(try_to_del_timer_sync);
975 950
951#ifdef CONFIG_SMP
976/** 952/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 953 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 954 * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
983 * 959 *
984 * Synchronization rules: Callers must prevent restarting of the timer, 960 * Synchronization rules: Callers must prevent restarting of the timer,
985 * otherwise this function is meaningless. It must not be called from 961 * otherwise this function is meaningless. It must not be called from
986 * interrupt contexts. The caller must not hold locks which would prevent 962 * hardirq contexts. The caller must not hold locks which would prevent
987 * completion of the timer's handler. The timer's handler must not call 963 * completion of the timer's handler. The timer's handler must not call
988 * add_timer_on(). Upon exit the timer is not queued and the handler is 964 * add_timer_on(). Upon exit the timer is not queued and the handler is
989 * not running on any CPU. 965 * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
993int del_timer_sync(struct timer_list *timer) 969int del_timer_sync(struct timer_list *timer)
994{ 970{
995#ifdef CONFIG_LOCKDEP 971#ifdef CONFIG_LOCKDEP
996 unsigned long flags; 972 local_bh_disable();
997
998 local_irq_save(flags);
999 lock_map_acquire(&timer->lockdep_map); 973 lock_map_acquire(&timer->lockdep_map);
1000 lock_map_release(&timer->lockdep_map); 974 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 975 local_bh_enable();
1002#endif 976#endif
1003 977 /*
978 * don't use it in hardirq context, because it
979 * could lead to deadlock.
980 */
981 WARN_ON(in_irq());
1004 for (;;) { 982 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 983 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 984 if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1089
1112 timer_stats_account_timer(timer); 1090 timer_stats_account_timer(timer);
1113 1091
1114 set_running_timer(base, timer); 1092 base->running_timer = timer;
1115 detach_timer(timer, 1); 1093 detach_timer(timer, 1);
1116 1094
1117 spin_unlock_irq(&base->lock); 1095 spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1097 spin_lock_irq(&base->lock);
1120 } 1098 }
1121 } 1099 }
1122 set_running_timer(base, NULL); 1100 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1101 spin_unlock_irq(&base->lock);
1124} 1102}
1125 1103
@@ -1249,9 +1227,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1227 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1228unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1229{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1230 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1231 unsigned long expires;
1254 1232
1233 /*
1234 * Pretend that there is no timer pending if the cpu is offline.
1235 * Possible pending timers will be migrated later to an active cpu.
1236 */
1237 if (cpu_is_offline(smp_processor_id()))
1238 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1239 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1240 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1241 base->next_timer = __next_timer_interrupt(base);
@@ -1292,7 +1276,7 @@ void update_process_times(int user_tick)
1292 */ 1276 */
1293static void run_timer_softirq(struct softirq_action *h) 1277static void run_timer_softirq(struct softirq_action *h)
1294{ 1278{
1295 struct tvec_base *base = __get_cpu_var(tvec_bases); 1279 struct tvec_base *base = __this_cpu_read(tvec_bases);
1296 1280
1297 hrtimer_run_pending(); 1281 hrtimer_run_pending();
1298 1282
@@ -1319,7 +1303,7 @@ void do_timer(unsigned long ticks)
1319{ 1303{
1320 jiffies_64 += ticks; 1304 jiffies_64 += ticks;
1321 update_wall_time(); 1305 update_wall_time();
1322 calc_global_load(); 1306 calc_global_load(ticks);
1323} 1307}
1324 1308
1325#ifdef __ARCH_WANT_SYS_ALARM 1309#ifdef __ARCH_WANT_SYS_ALARM
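Among the timer.c cleanups, the deferrable-flag handling moves behind TBASE_MAKE_DEFERRED(): because tvec_base is at least 2-byte aligned, bit 0 of timer->base is free to carry the flag. A user-space sketch of that pointer-tagging idiom, with all names invented:

#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL

struct base { int dummy; };                 /* alignment >= 2, so bit 0 is spare */

static inline struct base *make_deferred(struct base *b)
{
	return (struct base *)((uintptr_t)b | DEFERRABLE_FLAG);
}

static inline int is_deferrable(struct base *tagged)
{
	return (uintptr_t)tagged & DEFERRABLE_FLAG;
}

static inline struct base *real_base(struct base *tagged)
{
	return (struct base *)((uintptr_t)tagged & ~DEFERRABLE_FLAG);
}

int main(void)
{
	static struct base b;
	struct base *tagged = make_deferred(&b);

	printf("deferrable=%d, base recovered ok=%d\n",
	       is_deferrable(tagged), real_base(tagged) == &b);
	return 0;
}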
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bcdef88..14674dce77a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
69 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
70 bool 70 bool
71 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
72config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
73 bool 88 bool
74 89
@@ -126,7 +141,7 @@ if FTRACE
126config FUNCTION_TRACER 141config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER if (!ARM_UNWIND) 144 select FRAME_POINTER if !ARM_UNWIND && !S390
130 select KALLSYMS 145 select KALLSYMS
131 select GENERIC_TRACER 146 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc251ed66724..7b8ec0281548 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
169 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
170 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
173 172
174/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
196 return; 195 return;
197 196
198 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
199 what |= MASK_TC_BIT(rw, HARDBARRIER);
200 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
201 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
202 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1807 1805
1808 if (rw & REQ_RAHEAD) 1806 if (rw & REQ_RAHEAD)
1809 rwbs[i++] = 'A'; 1807 rwbs[i++] = 'A';
1810 if (rw & REQ_HARDBARRIER)
1811 rwbs[i++] = 'B';
1812 if (rw & REQ_SYNC) 1808 if (rw & REQ_SYNC)
1813 rwbs[i++] = 'S'; 1809 rwbs[i++] = 'S';
1814 if (rw & REQ_META) 1810 if (rw & REQ_META)
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d8..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3853 3853
3854 /* Need to copy one event at a time */ 3854 /* Need to copy one event at a time */
3855 do { 3855 do {
3856 /* We need the size of one event, because
3857 * rb_advance_reader only advances by one event,
3858 * whereas rb_event_ts_length may include the size of
3859 * one or two events.
3860 * We have already ensured there's enough space if this
3861 * is a time extend. */
3862 size = rb_event_length(event);
3856 memcpy(bpage->data + pos, rpage->data + rpos, size); 3863 memcpy(bpage->data + pos, rpage->data + rpos, size);
3857 3864
3858 len -= size; 3865 len -= size;
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3867 event = rb_reader_event(cpu_buffer); 3874 event = rb_reader_event(cpu_buffer);
3868 /* Always keep the time extend and data together */ 3875 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event); 3876 size = rb_event_ts_length(event);
3870 } while (len > size); 3877 } while (len >= size);
3871 3878
3872 /* update bpage */ 3879 /* update bpage */
3873 local_set(&bpage->commit, pos); 3880 local_set(&bpage->commit, pos);
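The ring-buffer fix recomputes the size of exactly one event with rb_event_length() before each memcpy(), since rb_event_ts_length() may cover a time extend plus its payload while rb_advance_reader() steps a single event, and it loops while len >= size so an event that exactly fills the remaining space is still copied. A loose user-space sketch of that copy-loop shape, with an invented record layout:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct rec { uint8_t len; char data[15]; };   /* len = bytes used in data[] */

int main(void)
{
	struct rec src[3] = {
		{ 5, "hello" }, { 2, "hi" }, { 4, "ring" },
	};
	char page[8];
	size_t pos = 0, len = sizeof(page);
	int i = 0;

	while (i < 3) {
		size_t size = src[i].len;     /* size of exactly one record */

		if (len < size)               /* i.e. keep going while len >= size */
			break;
		memcpy(page + pos, src[i].data, size);
		pos += size;
		len -= size;
		i++;
	}
	printf("copied %zu bytes from %d records\n", pos, i);   /* 7 from 2 */
	return 0;
}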
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82d9b8106cd0..f8cf959bad45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -1284,6 +1283,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1283 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1284}
1286 1285
1286static DEFINE_PER_CPU(int, user_stack_count);
1287
1287void 1288void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1289ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1290{
@@ -1302,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1303 if (unlikely(in_nmi()))
1303 return; 1304 return;
1304 1305
1306 /*
1307 * prevent recursion, since the user stack tracing may
1308 * trigger other kernel events.
1309 */
1310 preempt_disable();
1311 if (__this_cpu_read(user_stack_count))
1312 goto out;
1313
1314 __this_cpu_inc(user_stack_count);
1315
1316
1317
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1319 sizeof(*entry), flags, pc);
1307 if (!event) 1320 if (!event)
@@ -1319,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1332 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1333 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1334 ring_buffer_unlock_commit(buffer, event);
1335
1336 __this_cpu_dec(user_stack_count);
1337
1338 out:
1339 preempt_enable();
1322} 1340}
1323 1341
1324#ifdef UNUSED 1342#ifdef UNUSED
@@ -2320,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2320 return count; 2338 return count;
2321} 2339}
2322 2340
2341static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2342{
2343 if (file->f_mode & FMODE_READ)
2344 return seq_lseek(file, offset, origin);
2345 else
2346 return 0;
2347}
2348
2323static const struct file_operations tracing_fops = { 2349static const struct file_operations tracing_fops = {
2324 .open = tracing_open, 2350 .open = tracing_open,
2325 .read = seq_read, 2351 .read = seq_read,
2326 .write = tracing_write_stub, 2352 .write = tracing_write_stub,
2327 .llseek = seq_lseek, 2353 .llseek = tracing_seek,
2328 .release = tracing_release, 2354 .release = tracing_release,
2329}; 2355};
2330 2356
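The trace.c hunk wraps the user-stack dump in a per-cpu recursion guard taken with preemption disabled, so an event triggered from inside the stack walk cannot re-enter it; it also adds tracing_seek() so lseek on a write-only open becomes a no-op. A kernel-context sketch of the guard; this is not standalone user code, in_my_hook and my_hook are invented names, and only the per-cpu and preempt macros are real kernel API.

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, in_my_hook);

static void my_hook(void)
{
	preempt_disable();                      /* pin to one CPU's counter */
	if (__this_cpu_read(in_my_hook))
		goto out;                       /* already inside: bail out */
	__this_cpu_inc(in_my_hook);

	/* ... do the work that might itself trigger my_hook() ... */

	__this_cpu_dec(in_my_hook);
out:
	preempt_enable();
}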
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
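perf_trace_event_perm() above narrows who may receive raw tracepoint payloads: counting-only events are always allowed, per-task events on tracepoints flagged TRACE_EVENT_FL_CAP_ANY are allowed, and everything else needs CAP_SYS_ADMIN while the tracepoint-raw paranoia setting is on. A user-space reduction of that decision table; the boolean parameters are stand-ins for the kernel state.

#include <stdio.h>

static int perm_check(int wants_raw_sample, int attached_to_task,
		      int event_cap_any, int paranoid, int is_root)
{
	if (!wants_raw_sample)
		return 0;                       /* just counting: fine */

	if (attached_to_task && event_cap_any)
		return 0;                       /* whitelisted per-task event */

	if (paranoid && !is_root)
		return -1;                      /* -EPERM in the kernel */

	return 0;
}

int main(void)
{
	printf("counting only, non-root:      %d\n", perm_check(0, 0, 0, 1, 0));
	printf("raw, per-task, CAP_ANY event: %d\n", perm_check(1, 1, 1, 1, 0));
	printf("raw, system-wide, non-root:   %d\n", perm_check(1, 0, 0, 1, 0));
	return 0;
}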
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab1937..35fde09b81de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..4b74d71705c0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
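The __array() rework routes the generated "type[len]" string through the shared, mutex-protected event_storage buffer declared in the trace_events.c hunk, and it wraps the now multi-statement macro body in do { ... } while (0) so it behaves as a single statement at its call sites. A runnable user-space illustration of why that wrapper matters; EMIT_FIELD is invented for the example.

#include <stdio.h>

#define EMIT_FIELD(name)			\
	do {					\
		printf("field: %s\n", name);	\
		printf("  (recorded)\n");	\
	} while (0)

int main(void)
{
	int want = 1;

	/* Without the do/while wrapper only the first printf would be
	 * guarded by the if, and the 'else' below would not even compile. */
	if (want)
		EMIT_FIELD("pid");
	else
		printf("skipped\n");
	return 0;
}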
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b1..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
158 spin_lock_irq(&uidhash_lock); 158 spin_lock_irq(&uidhash_lock);
159 up = uid_hash_find(uid, hashent); 159 up = uid_hash_find(uid, hashent);
160 if (up) { 160 if (up) {
161 put_user_ns(ns);
161 key_put(new->uid_keyring); 162 key_put(new->uid_keyring);
162 key_put(new->session_keyring); 163 key_put(new->session_keyring);
163 kmem_cache_free(uid_cachep, new); 164 kmem_cache_free(uid_cachep, new);
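The one-line user.c fix drops the user-namespace reference that was taken for the new user_struct when the hash lookup finds the uid already present and the freshly built entry is discarded; previously only the keyrings and the slab object were released, leaking the namespace reference. A toy user-space sketch of the rule being enforced, with invented types:

#include <stdio.h>

struct ns { int refcount; };

static void get_ns(struct ns *n) { n->refcount++; }
static void put_ns(struct ns *n) { n->refcount--; }

int main(void)
{
	struct ns ns = { .refcount = 1 };
	int found_existing = 1;          /* pretend uid_hash_find() hit */

	get_ns(&ns);                     /* reference taken for the new entry */

	if (found_existing) {
		/* duplicate entry is discarded: drop *all* of its references,
		 * including the namespace one (the missing put_user_ns()). */
		put_ns(&ns);
	}

	printf("refcount=%d (should be 1)\n", ns.refcount);
	return 0;
}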
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..d7ebdf4cea98 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str)
57{ 57{
58 if (!strncmp(str, "panic", 5)) 58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 59 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1;
60 return 1; 62 return 1;
61} 63}
62__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -116,12 +118,12 @@ static void __touch_watchdog(void)
116{ 118{
117 int this_cpu = smp_processor_id(); 119 int this_cpu = smp_processor_id();
118 120
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 121 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
120} 122}
121 123
122void touch_softlockup_watchdog(void) 124void touch_softlockup_watchdog(void)
123{ 125{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0; 126 __this_cpu_write(watchdog_touch_ts, 0);
125} 127}
126EXPORT_SYMBOL(touch_softlockup_watchdog); 128EXPORT_SYMBOL(touch_softlockup_watchdog);
127 129
@@ -165,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)
165/* watchdog detector functions */ 167/* watchdog detector functions */
166static int is_hardlockup(void) 168static int is_hardlockup(void)
167{ 169{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 170 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
169 171
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 172 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
171 return 1; 173 return 1;
172 174
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 175 __this_cpu_write(hrtimer_interrupts_saved, hrint);
174 return 0; 176 return 0;
175} 177}
176#endif 178#endif
@@ -203,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
203 /* Ensure the watchdog never gets throttled */ 205 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0; 206 event->hw.interrupts = 0;
205 207
206 if (__get_cpu_var(watchdog_nmi_touch) == true) { 208 if (__this_cpu_read(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false; 209 __this_cpu_write(watchdog_nmi_touch, false);
208 return; 210 return;
209 } 211 }
210 212
@@ -218,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
218 int this_cpu = smp_processor_id(); 220 int this_cpu = smp_processor_id();
219 221
220 /* only print hardlockups once */ 222 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true) 223 if (__this_cpu_read(hard_watchdog_warn) == true)
222 return; 224 return;
223 225
224 if (hardlockup_panic) 226 if (hardlockup_panic)
@@ -226,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
226 else 228 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 229 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228 230
229 __get_cpu_var(hard_watchdog_warn) = true; 231 __this_cpu_write(hard_watchdog_warn, true);
230 return; 232 return;
231 } 233 }
232 234
233 __get_cpu_var(hard_watchdog_warn) = false; 235 __this_cpu_write(hard_watchdog_warn, false);
234 return; 236 return;
235} 237}
236static void watchdog_interrupt_count(void) 238static void watchdog_interrupt_count(void)
237{ 239{
238 __get_cpu_var(hrtimer_interrupts)++; 240 __this_cpu_inc(hrtimer_interrupts);
239} 241}
240#else 242#else
241static inline void watchdog_interrupt_count(void) { return; } 243static inline void watchdog_interrupt_count(void) { return; }
@@ -244,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }
244/* watchdog kicker functions */ 246/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 247static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{ 248{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 249 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs(); 250 struct pt_regs *regs = get_irq_regs();
249 int duration; 251 int duration;
250 252
@@ -252,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
252 watchdog_interrupt_count(); 254 watchdog_interrupt_count();
253 255
254 /* kick the softlockup detector */ 256 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog)); 257 wake_up_process(__this_cpu_read(softlockup_watchdog));
256 258
257 /* .. and repeat */ 259 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 260 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259 261
260 if (touch_ts == 0) { 262 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 263 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
262 /* 264 /*
263 * If the time stamp was touched atomically 265 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date. 266 * make sure the scheduler tick is up to date.
265 */ 267 */
266 __get_cpu_var(softlockup_touch_sync) = false; 268 __this_cpu_write(softlockup_touch_sync, false);
267 sched_clock_tick(); 269 sched_clock_tick();
268 } 270 }
269 __touch_watchdog(); 271 __touch_watchdog();
@@ -279,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
279 duration = is_softlockup(touch_ts); 281 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) { 282 if (unlikely(duration)) {
281 /* only warn once */ 283 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true) 284 if (__this_cpu_read(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART; 285 return HRTIMER_RESTART;
284 286
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 287 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -294,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
294 296
295 if (softlockup_panic) 297 if (softlockup_panic)
296 panic("softlockup: hung tasks"); 298 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true; 299 __this_cpu_write(soft_watchdog_warn, true);
298 } else 300 } else
299 __get_cpu_var(soft_watchdog_warn) = false; 301 __this_cpu_write(soft_watchdog_warn, false);
300 302
301 return HRTIMER_RESTART; 303 return HRTIMER_RESTART;
302} 304}
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 309 */
308static int watchdog(void *unused) 310static int watchdog(void *unused)
309{ 311{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 312 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 314
313 sched_setscheduler(current, SCHED_FIFO, &param); 315 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu)
364 goto out_save; 366 goto out_save;
365 } 367 }
366 368
367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 369 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
370 cpu, PTR_ERR(event));
368 return PTR_ERR(event); 371 return PTR_ERR(event);
369 372
370 /* success path */ 373 /* success path */
@@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
547 .notifier_call = cpu_callback 550 .notifier_call = cpu_callback
548}; 551};
549 552
550static int __init spawn_watchdog_task(void) 553void __init lockup_detector_init(void)
551{ 554{
552 void *cpu = (void *)(long)smp_processor_id(); 555 void *cpu = (void *)(long)smp_processor_id();
553 int err; 556 int err;
554 557
555 if (no_watchdog) 558 if (no_watchdog)
556 return 0; 559 return;
557 560
558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
559 WARN_ON(notifier_to_errno(err)); 562 WARN_ON(notifier_to_errno(err));
@@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void)
561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 564 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
562 register_cpu_notifier(&cpu_nfb); 565 register_cpu_notifier(&cpu_nfb);
563 566
564 return 0; 567 return;
565} 568}
566early_initcall(spawn_watchdog_task);
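Most of the watchdog.c changes are the same __this_cpu_*() conversion seen earlier, plus handling of "nmi_watchdog=0", a clearer failure message, and the rename of the init path to lockup_detector_init(). The detection logic visible in is_hardlockup() is simple: if the per-CPU hrtimer interrupt count has not advanced since the previous NMI-driven check, the CPU is not servicing its timer interrupt. A single-CPU user-space sketch of that comparison; the variable names echo the kernel's, the harness is invented.

#include <stdio.h>

static unsigned long hrtimer_interrupts;
static unsigned long hrtimer_interrupts_saved;

static int is_hardlockup(void)
{
	unsigned long hrint = hrtimer_interrupts;

	if (hrtimer_interrupts_saved == hrint)
		return 1;                       /* counter stuck: lockup */

	hrtimer_interrupts_saved = hrint;
	return 0;
}

int main(void)
{
	hrtimer_interrupts = 5;                     /* timer ticked since boot */
	printf("check 1: %d\n", is_hardlockup());   /* 0: counter advanced */

	/* no tick happens before the next check */
	printf("check 2: %d\n", is_hardlockup());   /* 1: counter stuck */
	return 0;
}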
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 90db1bd1a978..8ee6ec82f88a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -661,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
661{ 661{
662 struct worker *worker = kthread_data(task); 662 struct worker *worker = kthread_data(task);
663 663
664 if (likely(!(worker->flags & WORKER_NOT_RUNNING))) 664 if (!(worker->flags & WORKER_NOT_RUNNING))
665 atomic_inc(get_gcwq_nr_running(cpu)); 665 atomic_inc(get_gcwq_nr_running(cpu));
666} 666}
667 667
@@ -687,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
687 struct global_cwq *gcwq = get_gcwq(cpu); 687 struct global_cwq *gcwq = get_gcwq(cpu);
688 atomic_t *nr_running = get_gcwq_nr_running(cpu); 688 atomic_t *nr_running = get_gcwq_nr_running(cpu);
689 689
690 if (unlikely(worker->flags & WORKER_NOT_RUNNING)) 690 if (worker->flags & WORKER_NOT_RUNNING)
691 return NULL; 691 return NULL;
692 692
693 /* this can only happen on the local cpu */ 693 /* this can only happen on the local cpu */
@@ -932,6 +932,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
932 wake_up_worker(gcwq); 932 wake_up_worker(gcwq);
933} 933}
934 934
935/*
936 * Test whether @work is being queued from another work executing on the
937 * same workqueue. This is rather expensive and should only be used from
938 * cold paths.
939 */
940static bool is_chained_work(struct workqueue_struct *wq)
941{
942 unsigned long flags;
943 unsigned int cpu;
944
945 for_each_gcwq_cpu(cpu) {
946 struct global_cwq *gcwq = get_gcwq(cpu);
947 struct worker *worker;
948 struct hlist_node *pos;
949 int i;
950
951 spin_lock_irqsave(&gcwq->lock, flags);
952 for_each_busy_worker(worker, i, pos, gcwq) {
953 if (worker->task != current)
954 continue;
955 spin_unlock_irqrestore(&gcwq->lock, flags);
956 /*
957 * I'm @worker, no locking necessary. See if @work
958 * is headed to the same workqueue.
959 */
960 return worker->current_cwq->wq == wq;
961 }
962 spin_unlock_irqrestore(&gcwq->lock, flags);
963 }
964 return false;
965}
966
935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 967static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
936 struct work_struct *work) 968 struct work_struct *work)
937{ 969{
@@ -943,7 +975,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
943 975
944 debug_work_activate(work); 976 debug_work_activate(work);
945 977
946 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 978 /* if dying, only works from the same workqueue are allowed */
979 if (unlikely(wq->flags & WQ_DYING) &&
980 WARN_ON_ONCE(!is_chained_work(wq)))
947 return; 981 return;
948 982
949 /* determine gcwq to use */ 983 /* determine gcwq to use */
@@ -2936,11 +2970,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2936 */ 2970 */
2937void destroy_workqueue(struct workqueue_struct *wq) 2971void destroy_workqueue(struct workqueue_struct *wq)
2938{ 2972{
2973 unsigned int flush_cnt = 0;
2939 unsigned int cpu; 2974 unsigned int cpu;
2940 2975
2976 /*
2977 * Mark @wq dying and drain all pending works. Once WQ_DYING is
2978 * set, only chain queueing is allowed. IOW, only currently
2979 * pending or running work items on @wq can queue further work
2980 * items on it. @wq is flushed repeatedly until it becomes empty.
 2981 * The number of flushes is determined by the depth of chaining and
2982 * should be relatively short. Whine if it takes too long.
2983 */
2941 wq->flags |= WQ_DYING; 2984 wq->flags |= WQ_DYING;
2985reflush:
2942 flush_workqueue(wq); 2986 flush_workqueue(wq);
2943 2987
2988 for_each_cwq_cpu(cpu, wq) {
2989 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2990
2991 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
2992 continue;
2993
2994 if (++flush_cnt == 10 ||
2995 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2996 printk(KERN_WARNING "workqueue %s: flush on "
2997 "destruction isn't complete after %u tries\n",
2998 wq->name, flush_cnt);
2999 goto reflush;
3000 }
3001
2944 /* 3002 /*
2945 * wq list is used to freeze wq, remove from list after 3003 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us. 3004 * flushing is complete in case freeze races us.
@@ -3692,7 +3750,8 @@ static int __init init_workqueues(void)
3692 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3750 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3693 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3751 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3694 WQ_UNBOUND_MAX_ACTIVE); 3752 WQ_UNBOUND_MAX_ACTIVE);
3695 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); 3753 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3754 !system_unbound_wq);
3696 return 0; 3755 return 0;
3697} 3756}
3698early_initcall(init_workqueues); 3757early_initcall(init_workqueues);
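destroy_workqueue() now tolerates chained queueing: WQ_DYING no longer forbids queueing outright, only queueing from outside the workqueue itself, and the function flushes repeatedly until no per-cpu queue has active or delayed work, warning if the chain runs suspiciously deep. A user-space sketch of that drain-until-empty control flow; the queue model is invented, only the loop structure mirrors the hunk.

#include <stdio.h>

static int pending = 4;          /* pretend each flush leaves one chained item */

static void flush_queue(void)
{
	if (pending)
		pending--;       /* one level of chained work completes */
}

int main(void)
{
	unsigned int flush_cnt = 0;

reflush:
	flush_queue();
	if (pending) {
		if (++flush_cnt == 10 ||
		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
			fprintf(stderr,
				"queue: flush on destruction isn't complete after %u tries\n",
				flush_cnt);
		goto reflush;
	}

	printf("drained after %u extra flushes\n", flush_cnt);
	return 0;
}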