diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 5 | ||||
-rw-r--r-- | kernel/cpu.c | 24 | ||||
-rw-r--r-- | kernel/cpuset.c | 38 | ||||
-rw-r--r-- | kernel/kprobes.c | 2 | ||||
-rw-r--r-- | kernel/kthread.c | 1 | ||||
-rw-r--r-- | kernel/printk.c | 5 | ||||
-rw-r--r-- | kernel/rcupreempt.c | 20 | ||||
-rw-r--r-- | kernel/sched.c | 748 | ||||
-rw-r--r-- | kernel/sched_clock.c | 137 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 174 | ||||
-rw-r--r-- | kernel/sched_cpupri.h | 36 | ||||
-rw-r--r-- | kernel/sched_debug.c | 64 | ||||
-rw-r--r-- | kernel/sched_fair.c | 413 | ||||
-rw-r--r-- | kernel/sched_features.h | 7 | ||||
-rw-r--r-- | kernel/sched_rt.c | 405 | ||||
-rw-r--r-- | kernel/sched_stats.h | 42 | ||||
-rw-r--r-- | kernel/sysctl.c | 8 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 6 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 2 |
19 files changed, 1570 insertions, 567 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 480976275d98..f6328e16dfdd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -3,7 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | 5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ |
6 | exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
@@ -39,7 +39,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | |||
39 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 39 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
40 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 40 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
41 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 41 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
42 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 42 | obj-$(CONFIG_SMP) += spinlock.o |
43 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 43 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
44 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 44 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
45 | obj-$(CONFIG_UID16) += uid16.o | 45 | obj-$(CONFIG_UID16) += uid16.o |
@@ -83,6 +83,7 @@ obj-$(CONFIG_MARKERS) += marker.o | |||
83 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | 83 | obj-$(CONFIG_LATENCYTOP) += latencytop.o |
84 | obj-$(CONFIG_FTRACE) += trace/ | 84 | obj-$(CONFIG_FTRACE) += trace/ |
85 | obj-$(CONFIG_TRACING) += trace/ | 85 | obj-$(CONFIG_TRACING) += trace/ |
86 | obj-$(CONFIG_SMP) += sched_cpupri.o | ||
86 | 87 | ||
87 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 88 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
88 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 89 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c722..b11f06dc149a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -15,6 +15,28 @@ | |||
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* | ||
19 | * Represents all cpu's present in the system | ||
20 | * In systems capable of hotplug, this map could dynamically grow | ||
21 | * as new cpu's are detected in the system via any platform specific | ||
22 | * method, such as ACPI for e.g. | ||
23 | */ | ||
24 | cpumask_t cpu_present_map __read_mostly; | ||
25 | EXPORT_SYMBOL(cpu_present_map); | ||
26 | |||
27 | #ifndef CONFIG_SMP | ||
28 | |||
29 | /* | ||
30 | * Represents all cpu's that are currently online. | ||
31 | */ | ||
32 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
33 | EXPORT_SYMBOL(cpu_online_map); | ||
34 | |||
35 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
36 | EXPORT_SYMBOL(cpu_possible_map); | ||
37 | |||
38 | #else /* CONFIG_SMP */ | ||
39 | |||
18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ | 40 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 41 | static DEFINE_MUTEX(cpu_add_remove_lock); |
20 | 42 | ||
@@ -403,3 +425,5 @@ out: | |||
403 | cpu_maps_update_done(); | 425 | cpu_maps_update_done(); |
404 | } | 426 | } |
405 | #endif /* CONFIG_PM_SLEEP_SMP */ | 427 | #endif /* CONFIG_PM_SLEEP_SMP */ |
428 | |||
429 | #endif /* CONFIG_SMP */ | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fceb97e989c..459d601947a8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, | |||
1194 | 1194 | ||
1195 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1195 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
1196 | return -ENOSPC; | 1196 | return -ENOSPC; |
1197 | if (tsk->flags & PF_THREAD_BOUND) { | ||
1198 | cpumask_t mask; | ||
1199 | |||
1200 | mutex_lock(&callback_mutex); | ||
1201 | mask = cs->cpus_allowed; | ||
1202 | mutex_unlock(&callback_mutex); | ||
1203 | if (!cpus_equal(tsk->cpus_allowed, mask)) | ||
1204 | return -EINVAL; | ||
1205 | } | ||
1197 | 1206 | ||
1198 | return security_task_setscheduler(tsk, 0, NULL); | 1207 | return security_task_setscheduler(tsk, 0, NULL); |
1199 | } | 1208 | } |
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
1207 | struct mm_struct *mm; | 1216 | struct mm_struct *mm; |
1208 | struct cpuset *cs = cgroup_cs(cont); | 1217 | struct cpuset *cs = cgroup_cs(cont); |
1209 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1218 | struct cpuset *oldcs = cgroup_cs(oldcont); |
1219 | int err; | ||
1210 | 1220 | ||
1211 | mutex_lock(&callback_mutex); | 1221 | mutex_lock(&callback_mutex); |
1212 | guarantee_online_cpus(cs, &cpus); | 1222 | guarantee_online_cpus(cs, &cpus); |
1213 | set_cpus_allowed_ptr(tsk, &cpus); | 1223 | err = set_cpus_allowed_ptr(tsk, &cpus); |
1214 | mutex_unlock(&callback_mutex); | 1224 | mutex_unlock(&callback_mutex); |
1225 | if (err) | ||
1226 | return; | ||
1215 | 1227 | ||
1216 | from = oldcs->mems_allowed; | 1228 | from = oldcs->mems_allowed; |
1217 | to = cs->mems_allowed; | 1229 | to = cs->mems_allowed; |
@@ -1882,7 +1894,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
1882 | * in order to minimize text size. | 1894 | * in order to minimize text size. |
1883 | */ | 1895 | */ |
1884 | 1896 | ||
1885 | static void common_cpu_mem_hotplug_unplug(void) | 1897 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) |
1886 | { | 1898 | { |
1887 | cgroup_lock(); | 1899 | cgroup_lock(); |
1888 | 1900 | ||
@@ -1894,7 +1906,8 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
1894 | * Scheduler destroys domains on hotplug events. | 1906 | * Scheduler destroys domains on hotplug events. |
1895 | * Rebuild them based on the current settings. | 1907 | * Rebuild them based on the current settings. |
1896 | */ | 1908 | */ |
1897 | rebuild_sched_domains(); | 1909 | if (rebuild_sd) |
1910 | rebuild_sched_domains(); | ||
1898 | 1911 | ||
1899 | cgroup_unlock(); | 1912 | cgroup_unlock(); |
1900 | } | 1913 | } |
@@ -1912,11 +1925,22 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
1912 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | 1925 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, |
1913 | unsigned long phase, void *unused_cpu) | 1926 | unsigned long phase, void *unused_cpu) |
1914 | { | 1927 | { |
1915 | if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) | 1928 | switch (phase) { |
1929 | case CPU_UP_CANCELED: | ||
1930 | case CPU_UP_CANCELED_FROZEN: | ||
1931 | case CPU_DOWN_FAILED: | ||
1932 | case CPU_DOWN_FAILED_FROZEN: | ||
1933 | case CPU_ONLINE: | ||
1934 | case CPU_ONLINE_FROZEN: | ||
1935 | case CPU_DEAD: | ||
1936 | case CPU_DEAD_FROZEN: | ||
1937 | common_cpu_mem_hotplug_unplug(1); | ||
1938 | break; | ||
1939 | default: | ||
1916 | return NOTIFY_DONE; | 1940 | return NOTIFY_DONE; |
1941 | } | ||
1917 | 1942 | ||
1918 | common_cpu_mem_hotplug_unplug(); | 1943 | return NOTIFY_OK; |
1919 | return 0; | ||
1920 | } | 1944 | } |
1921 | 1945 | ||
1922 | #ifdef CONFIG_MEMORY_HOTPLUG | 1946 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -1929,7 +1953,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | |||
1929 | 1953 | ||
1930 | void cpuset_track_online_nodes(void) | 1954 | void cpuset_track_online_nodes(void) |
1931 | { | 1955 | { |
1932 | common_cpu_mem_hotplug_unplug(); | 1956 | common_cpu_mem_hotplug_unplug(0); |
1933 | } | 1957 | } |
1934 | #endif | 1958 | #endif |
1935 | 1959 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d4998f81e229..1485ca8d0e00 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -79,7 +79,7 @@ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | |||
79 | * | 79 | * |
80 | * For such cases, we now have a blacklist | 80 | * For such cases, we now have a blacklist |
81 | */ | 81 | */ |
82 | struct kprobe_blackpoint kprobe_blacklist[] = { | 82 | static struct kprobe_blackpoint kprobe_blacklist[] = { |
83 | {"preempt_schedule",}, | 83 | {"preempt_schedule",}, |
84 | {NULL} /* Terminator */ | 84 | {NULL} /* Terminator */ |
85 | }; | 85 | }; |
diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e1..97747cdd37c9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
180 | set_task_cpu(k, cpu); | 180 | set_task_cpu(k, cpu); |
181 | k->cpus_allowed = cpumask_of_cpu(cpu); | 181 | k->cpus_allowed = cpumask_of_cpu(cpu); |
182 | k->rt.nr_cpus_allowed = 1; | 182 | k->rt.nr_cpus_allowed = 1; |
183 | k->flags |= PF_THREAD_BOUND; | ||
183 | } | 184 | } |
184 | EXPORT_SYMBOL(kthread_bind); | 185 | EXPORT_SYMBOL(kthread_bind); |
185 | 186 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 75ef3af39132..5d81a11321fd 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress); | |||
75 | static DECLARE_MUTEX(console_sem); | 75 | static DECLARE_MUTEX(console_sem); |
76 | static DECLARE_MUTEX(secondary_console_sem); | 76 | static DECLARE_MUTEX(secondary_console_sem); |
77 | struct console *console_drivers; | 77 | struct console *console_drivers; |
78 | EXPORT_SYMBOL_GPL(console_drivers); | ||
79 | |||
78 | /* | 80 | /* |
79 | * This is used for debugging the mess that is the VT code by | 81 | * This is used for debugging the mess that is the VT code by |
80 | * keeping track if we have the console semaphore held. It's | 82 | * keeping track if we have the console semaphore held. It's |
@@ -121,6 +123,8 @@ struct console_cmdline | |||
121 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; | 123 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; |
122 | static int selected_console = -1; | 124 | static int selected_console = -1; |
123 | static int preferred_console = -1; | 125 | static int preferred_console = -1; |
126 | int console_set_on_cmdline; | ||
127 | EXPORT_SYMBOL(console_set_on_cmdline); | ||
124 | 128 | ||
125 | /* Flag: console code may call schedule() */ | 129 | /* Flag: console code may call schedule() */ |
126 | static int console_may_schedule; | 130 | static int console_may_schedule; |
@@ -890,6 +894,7 @@ static int __init console_setup(char *str) | |||
890 | *s = 0; | 894 | *s = 0; |
891 | 895 | ||
892 | __add_preferred_console(buf, idx, options, brl_options); | 896 | __add_preferred_console(buf, idx, options, brl_options); |
897 | console_set_on_cmdline = 1; | ||
893 | return 1; | 898 | return 1; |
894 | } | 899 | } |
895 | __setup("console=", console_setup); | 900 | __setup("console=", console_setup); |
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5e02b7740702..41d275a81df5 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c | |||
@@ -925,26 +925,22 @@ void rcu_offline_cpu(int cpu) | |||
925 | spin_unlock_irqrestore(&rdp->lock, flags); | 925 | spin_unlock_irqrestore(&rdp->lock, flags); |
926 | } | 926 | } |
927 | 927 | ||
928 | void __devinit rcu_online_cpu(int cpu) | ||
929 | { | ||
930 | unsigned long flags; | ||
931 | |||
932 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
933 | cpu_set(cpu, rcu_cpu_online_map); | ||
934 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
935 | } | ||
936 | |||
937 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 928 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
938 | 929 | ||
939 | void rcu_offline_cpu(int cpu) | 930 | void rcu_offline_cpu(int cpu) |
940 | { | 931 | { |
941 | } | 932 | } |
942 | 933 | ||
943 | void __devinit rcu_online_cpu(int cpu) | 934 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ |
935 | |||
936 | void __cpuinit rcu_online_cpu(int cpu) | ||
944 | { | 937 | { |
945 | } | 938 | unsigned long flags; |
946 | 939 | ||
947 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | 940 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); |
941 | cpu_set(cpu, rcu_cpu_online_map); | ||
942 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
943 | } | ||
948 | 944 | ||
949 | static void rcu_process_callbacks(struct softirq_action *unused) | 945 | static void rcu_process_callbacks(struct softirq_action *unused) |
950 | { | 946 | { |
diff --git a/kernel/sched.c b/kernel/sched.c index 42899dce837d..c74b0d23c752 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -75,6 +75,8 @@ | |||
75 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | 77 | ||
78 | #include "sched_cpupri.h" | ||
79 | |||
78 | /* | 80 | /* |
79 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 81 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
80 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 82 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
@@ -290,15 +292,15 @@ struct task_group root_task_group; | |||
290 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 292 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
291 | /* Default task group's cfs_rq on each cpu */ | 293 | /* Default task group's cfs_rq on each cpu */ |
292 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 294 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
293 | #endif | 295 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
294 | 296 | ||
295 | #ifdef CONFIG_RT_GROUP_SCHED | 297 | #ifdef CONFIG_RT_GROUP_SCHED |
296 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
297 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
298 | #endif | 300 | #endif /* CONFIG_RT_GROUP_SCHED */ |
299 | #else | 301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
300 | #define root_task_group init_task_group | 302 | #define root_task_group init_task_group |
301 | #endif | 303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
302 | 304 | ||
303 | /* task_group_lock serializes add/remove of task groups and also changes to | 305 | /* task_group_lock serializes add/remove of task groups and also changes to |
304 | * a task group's cpu shares. | 306 | * a task group's cpu shares. |
@@ -308,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
308 | #ifdef CONFIG_FAIR_GROUP_SCHED | 310 | #ifdef CONFIG_FAIR_GROUP_SCHED |
309 | #ifdef CONFIG_USER_SCHED | 311 | #ifdef CONFIG_USER_SCHED |
310 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 312 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
311 | #else | 313 | #else /* !CONFIG_USER_SCHED */ |
312 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 314 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
313 | #endif | 315 | #endif /* CONFIG_USER_SCHED */ |
314 | 316 | ||
315 | /* | 317 | /* |
316 | * A weight of 0 or 1 can cause arithmetics problems. | 318 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -364,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
364 | #else | 366 | #else |
365 | 367 | ||
366 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 368 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
369 | static inline struct task_group *task_group(struct task_struct *p) | ||
370 | { | ||
371 | return NULL; | ||
372 | } | ||
367 | 373 | ||
368 | #endif /* CONFIG_GROUP_SCHED */ | 374 | #endif /* CONFIG_GROUP_SCHED */ |
369 | 375 | ||
@@ -374,6 +380,7 @@ struct cfs_rq { | |||
374 | 380 | ||
375 | u64 exec_clock; | 381 | u64 exec_clock; |
376 | u64 min_vruntime; | 382 | u64 min_vruntime; |
383 | u64 pair_start; | ||
377 | 384 | ||
378 | struct rb_root tasks_timeline; | 385 | struct rb_root tasks_timeline; |
379 | struct rb_node *rb_leftmost; | 386 | struct rb_node *rb_leftmost; |
@@ -402,6 +409,31 @@ struct cfs_rq { | |||
402 | */ | 409 | */ |
403 | struct list_head leaf_cfs_rq_list; | 410 | struct list_head leaf_cfs_rq_list; |
404 | struct task_group *tg; /* group that "owns" this runqueue */ | 411 | struct task_group *tg; /* group that "owns" this runqueue */ |
412 | |||
413 | #ifdef CONFIG_SMP | ||
414 | /* | ||
415 | * the part of load.weight contributed by tasks | ||
416 | */ | ||
417 | unsigned long task_weight; | ||
418 | |||
419 | /* | ||
420 | * h_load = weight * f(tg) | ||
421 | * | ||
422 | * Where f(tg) is the recursive weight fraction assigned to | ||
423 | * this group. | ||
424 | */ | ||
425 | unsigned long h_load; | ||
426 | |||
427 | /* | ||
428 | * this cpu's part of tg->shares | ||
429 | */ | ||
430 | unsigned long shares; | ||
431 | |||
432 | /* | ||
433 | * load.weight at the time we set shares | ||
434 | */ | ||
435 | unsigned long rq_weight; | ||
436 | #endif | ||
405 | #endif | 437 | #endif |
406 | }; | 438 | }; |
407 | 439 | ||
@@ -453,6 +485,9 @@ struct root_domain { | |||
453 | */ | 485 | */ |
454 | cpumask_t rto_mask; | 486 | cpumask_t rto_mask; |
455 | atomic_t rto_count; | 487 | atomic_t rto_count; |
488 | #ifdef CONFIG_SMP | ||
489 | struct cpupri cpupri; | ||
490 | #endif | ||
456 | }; | 491 | }; |
457 | 492 | ||
458 | /* | 493 | /* |
@@ -527,6 +562,9 @@ struct rq { | |||
527 | int push_cpu; | 562 | int push_cpu; |
528 | /* cpu of this runqueue: */ | 563 | /* cpu of this runqueue: */ |
529 | int cpu; | 564 | int cpu; |
565 | int online; | ||
566 | |||
567 | unsigned long avg_load_per_task; | ||
530 | 568 | ||
531 | struct task_struct *migration_thread; | 569 | struct task_struct *migration_thread; |
532 | struct list_head migration_queue; | 570 | struct list_head migration_queue; |
@@ -768,6 +806,12 @@ late_initcall(sched_init_debug); | |||
768 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 806 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
769 | 807 | ||
770 | /* | 808 | /* |
809 | * ratelimit for updating the group shares. | ||
810 | * default: 0.5ms | ||
811 | */ | ||
812 | const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; | ||
813 | |||
814 | /* | ||
771 | * period over which we measure -rt task cpu usage in us. | 815 | * period over which we measure -rt task cpu usage in us. |
772 | * default: 1s | 816 | * default: 1s |
773 | */ | 817 | */ |
@@ -794,82 +838,6 @@ static inline u64 global_rt_runtime(void) | |||
794 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 838 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
795 | } | 839 | } |
796 | 840 | ||
797 | unsigned long long time_sync_thresh = 100000; | ||
798 | |||
799 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
800 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
801 | |||
802 | /* | ||
803 | * Global lock which we take every now and then to synchronize | ||
804 | * the CPUs time. This method is not warp-safe, but it's good | ||
805 | * enough to synchronize slowly diverging time sources and thus | ||
806 | * it's good enough for tracing: | ||
807 | */ | ||
808 | static DEFINE_SPINLOCK(time_sync_lock); | ||
809 | static unsigned long long prev_global_time; | ||
810 | |||
811 | static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) | ||
812 | { | ||
813 | /* | ||
814 | * We want this inlined, to not get tracer function calls | ||
815 | * in this critical section: | ||
816 | */ | ||
817 | spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); | ||
818 | __raw_spin_lock(&time_sync_lock.raw_lock); | ||
819 | |||
820 | if (time < prev_global_time) { | ||
821 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
822 | time = prev_global_time; | ||
823 | } else { | ||
824 | prev_global_time = time; | ||
825 | } | ||
826 | |||
827 | __raw_spin_unlock(&time_sync_lock.raw_lock); | ||
828 | spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); | ||
829 | |||
830 | return time; | ||
831 | } | ||
832 | |||
833 | static unsigned long long __cpu_clock(int cpu) | ||
834 | { | ||
835 | unsigned long long now; | ||
836 | |||
837 | /* | ||
838 | * Only call sched_clock() if the scheduler has already been | ||
839 | * initialized (some code might call cpu_clock() very early): | ||
840 | */ | ||
841 | if (unlikely(!scheduler_running)) | ||
842 | return 0; | ||
843 | |||
844 | now = sched_clock_cpu(cpu); | ||
845 | |||
846 | return now; | ||
847 | } | ||
848 | |||
849 | /* | ||
850 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
851 | * clock constructed from sched_clock(): | ||
852 | */ | ||
853 | unsigned long long notrace cpu_clock(int cpu) | ||
854 | { | ||
855 | unsigned long long prev_cpu_time, time, delta_time; | ||
856 | unsigned long flags; | ||
857 | |||
858 | local_irq_save(flags); | ||
859 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
860 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
861 | delta_time = time-prev_cpu_time; | ||
862 | |||
863 | if (unlikely(delta_time > time_sync_thresh)) { | ||
864 | time = __sync_cpu_clock(time, cpu); | ||
865 | per_cpu(prev_cpu_time, cpu) = time; | ||
866 | } | ||
867 | local_irq_restore(flags); | ||
868 | |||
869 | return time; | ||
870 | } | ||
871 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
872 | |||
873 | #ifndef prepare_arch_switch | 841 | #ifndef prepare_arch_switch |
874 | # define prepare_arch_switch(next) do { } while (0) | 842 | # define prepare_arch_switch(next) do { } while (0) |
875 | #endif | 843 | #endif |
@@ -1332,15 +1300,15 @@ void wake_up_idle_cpu(int cpu) | |||
1332 | if (!tsk_is_polling(rq->idle)) | 1300 | if (!tsk_is_polling(rq->idle)) |
1333 | smp_send_reschedule(cpu); | 1301 | smp_send_reschedule(cpu); |
1334 | } | 1302 | } |
1335 | #endif | 1303 | #endif /* CONFIG_NO_HZ */ |
1336 | 1304 | ||
1337 | #else | 1305 | #else /* !CONFIG_SMP */ |
1338 | static void __resched_task(struct task_struct *p, int tif_bit) | 1306 | static void __resched_task(struct task_struct *p, int tif_bit) |
1339 | { | 1307 | { |
1340 | assert_spin_locked(&task_rq(p)->lock); | 1308 | assert_spin_locked(&task_rq(p)->lock); |
1341 | set_tsk_thread_flag(p, tif_bit); | 1309 | set_tsk_thread_flag(p, tif_bit); |
1342 | } | 1310 | } |
1343 | #endif | 1311 | #endif /* CONFIG_SMP */ |
1344 | 1312 | ||
1345 | #if BITS_PER_LONG == 32 | 1313 | #if BITS_PER_LONG == 32 |
1346 | # define WMULT_CONST (~0UL) | 1314 | # define WMULT_CONST (~0UL) |
@@ -1355,6 +1323,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
1355 | */ | 1323 | */ |
1356 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1324 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1357 | 1325 | ||
1326 | /* | ||
1327 | * delta *= weight / lw | ||
1328 | */ | ||
1358 | static unsigned long | 1329 | static unsigned long |
1359 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1330 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1360 | struct load_weight *lw) | 1331 | struct load_weight *lw) |
@@ -1382,12 +1353,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1382 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1353 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1383 | } | 1354 | } |
1384 | 1355 | ||
1385 | static inline unsigned long | ||
1386 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
1387 | { | ||
1388 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
1389 | } | ||
1390 | |||
1391 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1356 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1392 | { | 1357 | { |
1393 | lw->weight += inc; | 1358 | lw->weight += inc; |
@@ -1498,17 +1463,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1498 | #ifdef CONFIG_SMP | 1463 | #ifdef CONFIG_SMP |
1499 | static unsigned long source_load(int cpu, int type); | 1464 | static unsigned long source_load(int cpu, int type); |
1500 | static unsigned long target_load(int cpu, int type); | 1465 | static unsigned long target_load(int cpu, int type); |
1501 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1502 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1466 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1503 | #else /* CONFIG_SMP */ | 1467 | |
1468 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1469 | { | ||
1470 | struct rq *rq = cpu_rq(cpu); | ||
1471 | |||
1472 | if (rq->nr_running) | ||
1473 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1474 | |||
1475 | return rq->avg_load_per_task; | ||
1476 | } | ||
1504 | 1477 | ||
1505 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1478 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1506 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1479 | |
1480 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
1481 | |||
1482 | /* | ||
1483 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1484 | * leaving it for the final time. | ||
1485 | */ | ||
1486 | static void | ||
1487 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
1507 | { | 1488 | { |
1489 | struct task_group *parent, *child; | ||
1490 | |||
1491 | rcu_read_lock(); | ||
1492 | parent = &root_task_group; | ||
1493 | down: | ||
1494 | (*down)(parent, cpu, sd); | ||
1495 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
1496 | parent = child; | ||
1497 | goto down; | ||
1498 | |||
1499 | up: | ||
1500 | continue; | ||
1501 | } | ||
1502 | (*up)(parent, cpu, sd); | ||
1503 | |||
1504 | child = parent; | ||
1505 | parent = parent->parent; | ||
1506 | if (parent) | ||
1507 | goto up; | ||
1508 | rcu_read_unlock(); | ||
1509 | } | ||
1510 | |||
1511 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1512 | |||
1513 | /* | ||
1514 | * Calculate and set the cpu's group shares. | ||
1515 | */ | ||
1516 | static void | ||
1517 | __update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1518 | unsigned long sd_shares, unsigned long sd_rq_weight) | ||
1519 | { | ||
1520 | int boost = 0; | ||
1521 | unsigned long shares; | ||
1522 | unsigned long rq_weight; | ||
1523 | |||
1524 | if (!tg->se[cpu]) | ||
1525 | return; | ||
1526 | |||
1527 | rq_weight = tg->cfs_rq[cpu]->load.weight; | ||
1528 | |||
1529 | /* | ||
1530 | * If there are currently no tasks on the cpu pretend there is one of | ||
1531 | * average load so that when a new task gets to run here it will not | ||
1532 | * get delayed by group starvation. | ||
1533 | */ | ||
1534 | if (!rq_weight) { | ||
1535 | boost = 1; | ||
1536 | rq_weight = NICE_0_LOAD; | ||
1537 | } | ||
1538 | |||
1539 | if (unlikely(rq_weight > sd_rq_weight)) | ||
1540 | rq_weight = sd_rq_weight; | ||
1541 | |||
1542 | /* | ||
1543 | * \Sum shares * rq_weight | ||
1544 | * shares = ----------------------- | ||
1545 | * \Sum rq_weight | ||
1546 | * | ||
1547 | */ | ||
1548 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); | ||
1549 | |||
1550 | /* | ||
1551 | * record the actual number of shares, not the boosted amount. | ||
1552 | */ | ||
1553 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1554 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
1555 | |||
1556 | if (shares < MIN_SHARES) | ||
1557 | shares = MIN_SHARES; | ||
1558 | else if (shares > MAX_SHARES) | ||
1559 | shares = MAX_SHARES; | ||
1560 | |||
1561 | __set_se_shares(tg->se[cpu], shares); | ||
1562 | } | ||
1563 | |||
1564 | /* | ||
1565 | * Re-compute the task group their per cpu shares over the given domain. | ||
1566 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1567 | * parent group depends on the shares of its child groups. | ||
1568 | */ | ||
1569 | static void | ||
1570 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1571 | { | ||
1572 | unsigned long rq_weight = 0; | ||
1573 | unsigned long shares = 0; | ||
1574 | int i; | ||
1575 | |||
1576 | for_each_cpu_mask(i, sd->span) { | ||
1577 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1578 | shares += tg->cfs_rq[i]->shares; | ||
1579 | } | ||
1580 | |||
1581 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1582 | shares = tg->shares; | ||
1583 | |||
1584 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1585 | shares = tg->shares; | ||
1586 | |||
1587 | if (!rq_weight) | ||
1588 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; | ||
1589 | |||
1590 | for_each_cpu_mask(i, sd->span) { | ||
1591 | struct rq *rq = cpu_rq(i); | ||
1592 | unsigned long flags; | ||
1593 | |||
1594 | spin_lock_irqsave(&rq->lock, flags); | ||
1595 | __update_group_shares_cpu(tg, i, shares, rq_weight); | ||
1596 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1597 | } | ||
1598 | } | ||
1599 | |||
1600 | /* | ||
1601 | * Compute the cpu's hierarchical load factor for each task group. | ||
1602 | * This needs to be done in a top-down fashion because the load of a child | ||
1603 | * group is a fraction of its parents load. | ||
1604 | */ | ||
1605 | static void | ||
1606 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1607 | { | ||
1608 | unsigned long load; | ||
1609 | |||
1610 | if (!tg->parent) { | ||
1611 | load = cpu_rq(cpu)->load.weight; | ||
1612 | } else { | ||
1613 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
1614 | load *= tg->cfs_rq[cpu]->shares; | ||
1615 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
1616 | } | ||
1617 | |||
1618 | tg->cfs_rq[cpu]->h_load = load; | ||
1619 | } | ||
1620 | |||
1621 | static void | ||
1622 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1623 | { | ||
1624 | } | ||
1625 | |||
1626 | static void update_shares(struct sched_domain *sd) | ||
1627 | { | ||
1628 | u64 now = cpu_clock(raw_smp_processor_id()); | ||
1629 | s64 elapsed = now - sd->last_update; | ||
1630 | |||
1631 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1632 | sd->last_update = now; | ||
1633 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | ||
1634 | } | ||
1508 | } | 1635 | } |
1636 | |||
1637 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1638 | { | ||
1639 | spin_unlock(&rq->lock); | ||
1640 | update_shares(sd); | ||
1641 | spin_lock(&rq->lock); | ||
1642 | } | ||
1643 | |||
1644 | static void update_h_load(int cpu) | ||
1645 | { | ||
1646 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | ||
1647 | } | ||
1648 | |||
1649 | #else | ||
1650 | |||
1651 | static inline void update_shares(struct sched_domain *sd) | ||
1652 | { | ||
1653 | } | ||
1654 | |||
1655 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1656 | { | ||
1657 | } | ||
1658 | |||
1509 | #endif | 1659 | #endif |
1510 | 1660 | ||
1511 | #endif /* CONFIG_SMP */ | 1661 | #endif |
1662 | |||
1663 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1664 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1665 | { | ||
1666 | #ifdef CONFIG_SMP | ||
1667 | cfs_rq->shares = shares; | ||
1668 | #endif | ||
1669 | } | ||
1670 | #endif | ||
1512 | 1671 | ||
1513 | #include "sched_stats.h" | 1672 | #include "sched_stats.h" |
1514 | #include "sched_idletask.c" | 1673 | #include "sched_idletask.c" |
@@ -1519,27 +1678,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1519 | #endif | 1678 | #endif |
1520 | 1679 | ||
1521 | #define sched_class_highest (&rt_sched_class) | 1680 | #define sched_class_highest (&rt_sched_class) |
1681 | #define for_each_class(class) \ | ||
1682 | for (class = sched_class_highest; class; class = class->next) | ||
1522 | 1683 | ||
1523 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1684 | static void inc_nr_running(struct rq *rq) |
1524 | { | ||
1525 | update_load_add(&rq->load, p->se.load.weight); | ||
1526 | } | ||
1527 | |||
1528 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
1529 | { | ||
1530 | update_load_sub(&rq->load, p->se.load.weight); | ||
1531 | } | ||
1532 | |||
1533 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
1534 | { | 1685 | { |
1535 | rq->nr_running++; | 1686 | rq->nr_running++; |
1536 | inc_load(rq, p); | ||
1537 | } | 1687 | } |
1538 | 1688 | ||
1539 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1689 | static void dec_nr_running(struct rq *rq) |
1540 | { | 1690 | { |
1541 | rq->nr_running--; | 1691 | rq->nr_running--; |
1542 | dec_load(rq, p); | ||
1543 | } | 1692 | } |
1544 | 1693 | ||
1545 | static void set_load_weight(struct task_struct *p) | 1694 | static void set_load_weight(struct task_struct *p) |
@@ -1563,6 +1712,12 @@ static void set_load_weight(struct task_struct *p) | |||
1563 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1712 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
1564 | } | 1713 | } |
1565 | 1714 | ||
1715 | static void update_avg(u64 *avg, u64 sample) | ||
1716 | { | ||
1717 | s64 diff = sample - *avg; | ||
1718 | *avg += diff >> 3; | ||
1719 | } | ||
1720 | |||
1566 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1721 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
1567 | { | 1722 | { |
1568 | sched_info_queued(p); | 1723 | sched_info_queued(p); |
@@ -1572,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1572 | 1727 | ||
1573 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | 1728 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
1574 | { | 1729 | { |
1730 | if (sleep && p->se.last_wakeup) { | ||
1731 | update_avg(&p->se.avg_overlap, | ||
1732 | p->se.sum_exec_runtime - p->se.last_wakeup); | ||
1733 | p->se.last_wakeup = 0; | ||
1734 | } | ||
1735 | |||
1736 | sched_info_dequeued(p); | ||
1575 | p->sched_class->dequeue_task(rq, p, sleep); | 1737 | p->sched_class->dequeue_task(rq, p, sleep); |
1576 | p->se.on_rq = 0; | 1738 | p->se.on_rq = 0; |
1577 | } | 1739 | } |
@@ -1631,7 +1793,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1631 | rq->nr_uninterruptible--; | 1793 | rq->nr_uninterruptible--; |
1632 | 1794 | ||
1633 | enqueue_task(rq, p, wakeup); | 1795 | enqueue_task(rq, p, wakeup); |
1634 | inc_nr_running(p, rq); | 1796 | inc_nr_running(rq); |
1635 | } | 1797 | } |
1636 | 1798 | ||
1637 | /* | 1799 | /* |
@@ -1643,7 +1805,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1643 | rq->nr_uninterruptible++; | 1805 | rq->nr_uninterruptible++; |
1644 | 1806 | ||
1645 | dequeue_task(rq, p, sleep); | 1807 | dequeue_task(rq, p, sleep); |
1646 | dec_nr_running(p, rq); | 1808 | dec_nr_running(rq); |
1647 | } | 1809 | } |
1648 | 1810 | ||
1649 | /** | 1811 | /** |
@@ -1655,12 +1817,6 @@ inline int task_curr(const struct task_struct *p) | |||
1655 | return cpu_curr(task_cpu(p)) == p; | 1817 | return cpu_curr(task_cpu(p)) == p; |
1656 | } | 1818 | } |
1657 | 1819 | ||
1658 | /* Used instead of source_load when we know the type == 0 */ | ||
1659 | unsigned long weighted_cpuload(const int cpu) | ||
1660 | { | ||
1661 | return cpu_rq(cpu)->load.weight; | ||
1662 | } | ||
1663 | |||
1664 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1820 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1665 | { | 1821 | { |
1666 | set_task_rq(p, cpu); | 1822 | set_task_rq(p, cpu); |
@@ -1689,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1689 | 1845 | ||
1690 | #ifdef CONFIG_SMP | 1846 | #ifdef CONFIG_SMP |
1691 | 1847 | ||
1848 | /* Used instead of source_load when we know the type == 0 */ | ||
1849 | static unsigned long weighted_cpuload(const int cpu) | ||
1850 | { | ||
1851 | return cpu_rq(cpu)->load.weight; | ||
1852 | } | ||
1853 | |||
1692 | /* | 1854 | /* |
1693 | * Is this task likely cache-hot: | 1855 | * Is this task likely cache-hot: |
1694 | */ | 1856 | */ |
@@ -1899,7 +2061,7 @@ static unsigned long source_load(int cpu, int type) | |||
1899 | struct rq *rq = cpu_rq(cpu); | 2061 | struct rq *rq = cpu_rq(cpu); |
1900 | unsigned long total = weighted_cpuload(cpu); | 2062 | unsigned long total = weighted_cpuload(cpu); |
1901 | 2063 | ||
1902 | if (type == 0) | 2064 | if (type == 0 || !sched_feat(LB_BIAS)) |
1903 | return total; | 2065 | return total; |
1904 | 2066 | ||
1905 | return min(rq->cpu_load[type-1], total); | 2067 | return min(rq->cpu_load[type-1], total); |
@@ -1914,25 +2076,13 @@ static unsigned long target_load(int cpu, int type) | |||
1914 | struct rq *rq = cpu_rq(cpu); | 2076 | struct rq *rq = cpu_rq(cpu); |
1915 | unsigned long total = weighted_cpuload(cpu); | 2077 | unsigned long total = weighted_cpuload(cpu); |
1916 | 2078 | ||
1917 | if (type == 0) | 2079 | if (type == 0 || !sched_feat(LB_BIAS)) |
1918 | return total; | 2080 | return total; |
1919 | 2081 | ||
1920 | return max(rq->cpu_load[type-1], total); | 2082 | return max(rq->cpu_load[type-1], total); |
1921 | } | 2083 | } |
1922 | 2084 | ||
1923 | /* | 2085 | /* |
1924 | * Return the average load per task on the cpu's run queue | ||
1925 | */ | ||
1926 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1927 | { | ||
1928 | struct rq *rq = cpu_rq(cpu); | ||
1929 | unsigned long total = weighted_cpuload(cpu); | ||
1930 | unsigned long n = rq->nr_running; | ||
1931 | |||
1932 | return n ? total / n : SCHED_LOAD_SCALE; | ||
1933 | } | ||
1934 | |||
1935 | /* | ||
1936 | * find_idlest_group finds and returns the least busy CPU group within the | 2086 | * find_idlest_group finds and returns the least busy CPU group within the |
1937 | * domain. | 2087 | * domain. |
1938 | */ | 2088 | */ |
@@ -2038,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag) | |||
2038 | sd = tmp; | 2188 | sd = tmp; |
2039 | } | 2189 | } |
2040 | 2190 | ||
2191 | if (sd) | ||
2192 | update_shares(sd); | ||
2193 | |||
2041 | while (sd) { | 2194 | while (sd) { |
2042 | cpumask_t span, tmpmask; | 2195 | cpumask_t span, tmpmask; |
2043 | struct sched_group *group; | 2196 | struct sched_group *group; |
@@ -2104,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2104 | if (!sched_feat(SYNC_WAKEUPS)) | 2257 | if (!sched_feat(SYNC_WAKEUPS)) |
2105 | sync = 0; | 2258 | sync = 0; |
2106 | 2259 | ||
2260 | #ifdef CONFIG_SMP | ||
2261 | if (sched_feat(LB_WAKEUP_UPDATE)) { | ||
2262 | struct sched_domain *sd; | ||
2263 | |||
2264 | this_cpu = raw_smp_processor_id(); | ||
2265 | cpu = task_cpu(p); | ||
2266 | |||
2267 | for_each_domain(this_cpu, sd) { | ||
2268 | if (cpu_isset(cpu, sd->span)) { | ||
2269 | update_shares(sd); | ||
2270 | break; | ||
2271 | } | ||
2272 | } | ||
2273 | } | ||
2274 | #endif | ||
2275 | |||
2107 | smp_wmb(); | 2276 | smp_wmb(); |
2108 | rq = task_rq_lock(p, &flags); | 2277 | rq = task_rq_lock(p, &flags); |
2109 | old_state = p->state; | 2278 | old_state = p->state; |
@@ -2150,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2150 | } | 2319 | } |
2151 | } | 2320 | } |
2152 | } | 2321 | } |
2153 | #endif | 2322 | #endif /* CONFIG_SCHEDSTATS */ |
2154 | 2323 | ||
2155 | out_activate: | 2324 | out_activate: |
2156 | #endif /* CONFIG_SMP */ | 2325 | #endif /* CONFIG_SMP */ |
@@ -2179,6 +2348,8 @@ out_running: | |||
2179 | p->sched_class->task_wake_up(rq, p); | 2348 | p->sched_class->task_wake_up(rq, p); |
2180 | #endif | 2349 | #endif |
2181 | out: | 2350 | out: |
2351 | current->se.last_wakeup = current->se.sum_exec_runtime; | ||
2352 | |||
2182 | task_rq_unlock(rq, &flags); | 2353 | task_rq_unlock(rq, &flags); |
2183 | 2354 | ||
2184 | return success; | 2355 | return success; |
@@ -2299,7 +2470,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2299 | * management (if any): | 2470 | * management (if any): |
2300 | */ | 2471 | */ |
2301 | p->sched_class->task_new(rq, p); | 2472 | p->sched_class->task_new(rq, p); |
2302 | inc_nr_running(p, rq); | 2473 | inc_nr_running(rq); |
2303 | } | 2474 | } |
2304 | trace_mark(kernel_sched_wakeup_new, | 2475 | trace_mark(kernel_sched_wakeup_new, |
2305 | "pid %d state %ld ## rq %p task %p rq->curr %p", | 2476 | "pid %d state %ld ## rq %p task %p rq->curr %p", |
@@ -2356,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2356 | notifier->ops->sched_out(notifier, next); | 2527 | notifier->ops->sched_out(notifier, next); |
2357 | } | 2528 | } |
2358 | 2529 | ||
2359 | #else | 2530 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ |
2360 | 2531 | ||
2361 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2532 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2362 | { | 2533 | { |
@@ -2368,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2368 | { | 2539 | { |
2369 | } | 2540 | } |
2370 | 2541 | ||
2371 | #endif | 2542 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ |
2372 | 2543 | ||
2373 | /** | 2544 | /** |
2374 | * prepare_task_switch - prepare to switch tasks | 2545 | * prepare_task_switch - prepare to switch tasks |
@@ -2815,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2815 | enum cpu_idle_type idle, int *all_pinned, | 2986 | enum cpu_idle_type idle, int *all_pinned, |
2816 | int *this_best_prio, struct rq_iterator *iterator) | 2987 | int *this_best_prio, struct rq_iterator *iterator) |
2817 | { | 2988 | { |
2818 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; | 2989 | int loops = 0, pulled = 0, pinned = 0; |
2819 | struct task_struct *p; | 2990 | struct task_struct *p; |
2820 | long rem_load_move = max_load_move; | 2991 | long rem_load_move = max_load_move; |
2821 | 2992 | ||
@@ -2831,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2831 | next: | 3002 | next: |
2832 | if (!p || loops++ > sysctl_sched_nr_migrate) | 3003 | if (!p || loops++ > sysctl_sched_nr_migrate) |
2833 | goto out; | 3004 | goto out; |
2834 | /* | 3005 | |
2835 | * To help distribute high priority tasks across CPUs we don't | 3006 | if ((p->se.load.weight >> 1) > rem_load_move || |
2836 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2837 | * prio value) on its new queue regardless of its load weight | ||
2838 | */ | ||
2839 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | ||
2840 | SCHED_LOAD_SCALE_FUZZ; | ||
2841 | if ((skip_for_load && p->prio >= *this_best_prio) || | ||
2842 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 3007 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2843 | p = iterator->next(iterator->arg); | 3008 | p = iterator->next(iterator->arg); |
2844 | goto next; | 3009 | goto next; |
@@ -2893,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2893 | max_load_move - total_load_moved, | 3058 | max_load_move - total_load_moved, |
2894 | sd, idle, all_pinned, &this_best_prio); | 3059 | sd, idle, all_pinned, &this_best_prio); |
2895 | class = class->next; | 3060 | class = class->next; |
3061 | |||
3062 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3063 | break; | ||
3064 | |||
2896 | } while (class && max_load_move > total_load_moved); | 3065 | } while (class && max_load_move > total_load_moved); |
2897 | 3066 | ||
2898 | return total_load_moved > 0; | 3067 | return total_load_moved > 0; |
@@ -2969,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2969 | max_load = this_load = total_load = total_pwr = 0; | 3138 | max_load = this_load = total_load = total_pwr = 0; |
2970 | busiest_load_per_task = busiest_nr_running = 0; | 3139 | busiest_load_per_task = busiest_nr_running = 0; |
2971 | this_load_per_task = this_nr_running = 0; | 3140 | this_load_per_task = this_nr_running = 0; |
3141 | |||
2972 | if (idle == CPU_NOT_IDLE) | 3142 | if (idle == CPU_NOT_IDLE) |
2973 | load_idx = sd->busy_idx; | 3143 | load_idx = sd->busy_idx; |
2974 | else if (idle == CPU_NEWLY_IDLE) | 3144 | else if (idle == CPU_NEWLY_IDLE) |
@@ -2983,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2983 | int __group_imb = 0; | 3153 | int __group_imb = 0; |
2984 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3154 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2985 | unsigned long sum_nr_running, sum_weighted_load; | 3155 | unsigned long sum_nr_running, sum_weighted_load; |
3156 | unsigned long sum_avg_load_per_task; | ||
3157 | unsigned long avg_load_per_task; | ||
2986 | 3158 | ||
2987 | local_group = cpu_isset(this_cpu, group->cpumask); | 3159 | local_group = cpu_isset(this_cpu, group->cpumask); |
2988 | 3160 | ||
@@ -2991,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2991 | 3163 | ||
2992 | /* Tally up the load of all CPUs in the group */ | 3164 | /* Tally up the load of all CPUs in the group */ |
2993 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3165 | sum_weighted_load = sum_nr_running = avg_load = 0; |
3166 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3167 | |||
2994 | max_cpu_load = 0; | 3168 | max_cpu_load = 0; |
2995 | min_cpu_load = ~0UL; | 3169 | min_cpu_load = ~0UL; |
2996 | 3170 | ||
@@ -3024,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3024 | avg_load += load; | 3198 | avg_load += load; |
3025 | sum_nr_running += rq->nr_running; | 3199 | sum_nr_running += rq->nr_running; |
3026 | sum_weighted_load += weighted_cpuload(i); | 3200 | sum_weighted_load += weighted_cpuload(i); |
3201 | |||
3202 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
3027 | } | 3203 | } |
3028 | 3204 | ||
3029 | /* | 3205 | /* |
@@ -3045,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3045 | avg_load = sg_div_cpu_power(group, | 3221 | avg_load = sg_div_cpu_power(group, |
3046 | avg_load * SCHED_LOAD_SCALE); | 3222 | avg_load * SCHED_LOAD_SCALE); |
3047 | 3223 | ||
3048 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) | 3224 | |
3225 | /* | ||
3226 | * Consider the group unbalanced when the imbalance is larger | ||
3227 | * than the average weight of two tasks. | ||
3228 | * | ||
3229 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3230 | * might not be a suitable number - should we keep a | ||
3231 | * normalized nr_running number somewhere that negates | ||
3232 | * the hierarchy? | ||
3233 | */ | ||
3234 | avg_load_per_task = sg_div_cpu_power(group, | ||
3235 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | ||
3236 | |||
3237 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3049 | __group_imb = 1; | 3238 | __group_imb = 1; |
3050 | 3239 | ||
3051 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3240 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
@@ -3186,9 +3375,9 @@ small_imbalance: | |||
3186 | if (busiest_load_per_task > this_load_per_task) | 3375 | if (busiest_load_per_task > this_load_per_task) |
3187 | imbn = 1; | 3376 | imbn = 1; |
3188 | } else | 3377 | } else |
3189 | this_load_per_task = SCHED_LOAD_SCALE; | 3378 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
3190 | 3379 | ||
3191 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 3380 | if (max_load - this_load + 2*busiest_load_per_task >= |
3192 | busiest_load_per_task * imbn) { | 3381 | busiest_load_per_task * imbn) { |
3193 | *imbalance = busiest_load_per_task; | 3382 | *imbalance = busiest_load_per_task; |
3194 | return busiest; | 3383 | return busiest; |
@@ -3314,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3314 | schedstat_inc(sd, lb_count[idle]); | 3503 | schedstat_inc(sd, lb_count[idle]); |
3315 | 3504 | ||
3316 | redo: | 3505 | redo: |
3506 | update_shares(sd); | ||
3317 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3507 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3318 | cpus, balance); | 3508 | cpus, balance); |
3319 | 3509 | ||
@@ -3416,8 +3606,9 @@ redo: | |||
3416 | 3606 | ||
3417 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3607 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3418 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3608 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3419 | return -1; | 3609 | ld_moved = -1; |
3420 | return ld_moved; | 3610 | |
3611 | goto out; | ||
3421 | 3612 | ||
3422 | out_balanced: | 3613 | out_balanced: |
3423 | schedstat_inc(sd, lb_balanced[idle]); | 3614 | schedstat_inc(sd, lb_balanced[idle]); |
@@ -3432,8 +3623,13 @@ out_one_pinned: | |||
3432 | 3623 | ||
3433 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3624 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3434 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3625 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3435 | return -1; | 3626 | ld_moved = -1; |
3436 | return 0; | 3627 | else |
3628 | ld_moved = 0; | ||
3629 | out: | ||
3630 | if (ld_moved) | ||
3631 | update_shares(sd); | ||
3632 | return ld_moved; | ||
3437 | } | 3633 | } |
3438 | 3634 | ||
3439 | /* | 3635 | /* |
@@ -3468,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, | |||
3468 | 3664 | ||
3469 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3665 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3470 | redo: | 3666 | redo: |
3667 | update_shares_locked(this_rq, sd); | ||
3471 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3668 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3472 | &sd_idle, cpus, NULL); | 3669 | &sd_idle, cpus, NULL); |
3473 | if (!group) { | 3670 | if (!group) { |
@@ -3511,6 +3708,7 @@ redo: | |||
3511 | } else | 3708 | } else |
3512 | sd->nr_balance_failed = 0; | 3709 | sd->nr_balance_failed = 0; |
3513 | 3710 | ||
3711 | update_shares_locked(this_rq, sd); | ||
3514 | return ld_moved; | 3712 | return ld_moved; |
3515 | 3713 | ||
3516 | out_balanced: | 3714 | out_balanced: |
@@ -3702,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3702 | /* Earliest time when we have to do rebalance again */ | 3900 | /* Earliest time when we have to do rebalance again */ |
3703 | unsigned long next_balance = jiffies + 60*HZ; | 3901 | unsigned long next_balance = jiffies + 60*HZ; |
3704 | int update_next_balance = 0; | 3902 | int update_next_balance = 0; |
3903 | int need_serialize; | ||
3705 | cpumask_t tmp; | 3904 | cpumask_t tmp; |
3706 | 3905 | ||
3707 | for_each_domain(cpu, sd) { | 3906 | for_each_domain(cpu, sd) { |
@@ -3719,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3719 | if (interval > HZ*NR_CPUS/10) | 3918 | if (interval > HZ*NR_CPUS/10) |
3720 | interval = HZ*NR_CPUS/10; | 3919 | interval = HZ*NR_CPUS/10; |
3721 | 3920 | ||
3921 | need_serialize = sd->flags & SD_SERIALIZE; | ||
3722 | 3922 | ||
3723 | if (sd->flags & SD_SERIALIZE) { | 3923 | if (need_serialize) { |
3724 | if (!spin_trylock(&balancing)) | 3924 | if (!spin_trylock(&balancing)) |
3725 | goto out; | 3925 | goto out; |
3726 | } | 3926 | } |
@@ -3736,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3736 | } | 3936 | } |
3737 | sd->last_balance = jiffies; | 3937 | sd->last_balance = jiffies; |
3738 | } | 3938 | } |
3739 | if (sd->flags & SD_SERIALIZE) | 3939 | if (need_serialize) |
3740 | spin_unlock(&balancing); | 3940 | spin_unlock(&balancing); |
3741 | out: | 3941 | out: |
3742 | if (time_after(next_balance, sd->last_balance + interval)) { | 3942 | if (time_after(next_balance, sd->last_balance + interval)) { |
@@ -4121,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4121 | prev->comm, prev->pid, preempt_count()); | 4321 | prev->comm, prev->pid, preempt_count()); |
4122 | 4322 | ||
4123 | debug_show_held_locks(prev); | 4323 | debug_show_held_locks(prev); |
4324 | print_modules(); | ||
4124 | if (irqs_disabled()) | 4325 | if (irqs_disabled()) |
4125 | print_irqtrace_events(prev); | 4326 | print_irqtrace_events(prev); |
4126 | 4327 | ||
@@ -4194,7 +4395,7 @@ asmlinkage void __sched schedule(void) | |||
4194 | struct task_struct *prev, *next; | 4395 | struct task_struct *prev, *next; |
4195 | unsigned long *switch_count; | 4396 | unsigned long *switch_count; |
4196 | struct rq *rq; | 4397 | struct rq *rq; |
4197 | int cpu; | 4398 | int cpu, hrtick = sched_feat(HRTICK); |
4198 | 4399 | ||
4199 | need_resched: | 4400 | need_resched: |
4200 | preempt_disable(); | 4401 | preempt_disable(); |
@@ -4209,7 +4410,8 @@ need_resched_nonpreemptible: | |||
4209 | 4410 | ||
4210 | schedule_debug(prev); | 4411 | schedule_debug(prev); |
4211 | 4412 | ||
4212 | hrtick_clear(rq); | 4413 | if (hrtick) |
4414 | hrtick_clear(rq); | ||
4213 | 4415 | ||
4214 | /* | 4416 | /* |
4215 | * Do the rq-clock update outside the rq lock: | 4417 | * Do the rq-clock update outside the rq lock: |
@@ -4255,7 +4457,8 @@ need_resched_nonpreemptible: | |||
4255 | } else | 4457 | } else |
4256 | spin_unlock_irq(&rq->lock); | 4458 | spin_unlock_irq(&rq->lock); |
4257 | 4459 | ||
4258 | hrtick_set(rq); | 4460 | if (hrtick) |
4461 | hrtick_set(rq); | ||
4259 | 4462 | ||
4260 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 4463 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
4261 | goto need_resched_nonpreemptible; | 4464 | goto need_resched_nonpreemptible; |
@@ -4637,10 +4840,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4637 | goto out_unlock; | 4840 | goto out_unlock; |
4638 | } | 4841 | } |
4639 | on_rq = p->se.on_rq; | 4842 | on_rq = p->se.on_rq; |
4640 | if (on_rq) { | 4843 | if (on_rq) |
4641 | dequeue_task(rq, p, 0); | 4844 | dequeue_task(rq, p, 0); |
4642 | dec_load(rq, p); | ||
4643 | } | ||
4644 | 4845 | ||
4645 | p->static_prio = NICE_TO_PRIO(nice); | 4846 | p->static_prio = NICE_TO_PRIO(nice); |
4646 | set_load_weight(p); | 4847 | set_load_weight(p); |
@@ -4650,7 +4851,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4650 | 4851 | ||
4651 | if (on_rq) { | 4852 | if (on_rq) { |
4652 | enqueue_task(rq, p, 0); | 4853 | enqueue_task(rq, p, 0); |
4653 | inc_load(rq, p); | ||
4654 | /* | 4854 | /* |
4655 | * If the task increased its priority or is running and | 4855 | * If the task increased its priority or is running and |
4656 | * lowered its priority, then reschedule its CPU: | 4856 | * lowered its priority, then reschedule its CPU: |
@@ -5121,24 +5321,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
5121 | return sched_setaffinity(pid, &new_mask); | 5321 | return sched_setaffinity(pid, &new_mask); |
5122 | } | 5322 | } |
5123 | 5323 | ||
5124 | /* | ||
5125 | * Represents all cpu's present in the system | ||
5126 | * In systems capable of hotplug, this map could dynamically grow | ||
5127 | * as new cpu's are detected in the system via any platform specific | ||
5128 | * method, such as ACPI for e.g. | ||
5129 | */ | ||
5130 | |||
5131 | cpumask_t cpu_present_map __read_mostly; | ||
5132 | EXPORT_SYMBOL(cpu_present_map); | ||
5133 | |||
5134 | #ifndef CONFIG_SMP | ||
5135 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
5136 | EXPORT_SYMBOL(cpu_online_map); | ||
5137 | |||
5138 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
5139 | EXPORT_SYMBOL(cpu_possible_map); | ||
5140 | #endif | ||
5141 | |||
5142 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 5324 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
5143 | { | 5325 | { |
5144 | struct task_struct *p; | 5326 | struct task_struct *p; |
@@ -5622,6 +5804,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) | |||
5622 | goto out; | 5804 | goto out; |
5623 | } | 5805 | } |
5624 | 5806 | ||
5807 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | ||
5808 | !cpus_equal(p->cpus_allowed, *new_mask))) { | ||
5809 | ret = -EINVAL; | ||
5810 | goto out; | ||
5811 | } | ||
5812 | |||
5625 | if (p->sched_class->set_cpus_allowed) | 5813 | if (p->sched_class->set_cpus_allowed) |
5626 | p->sched_class->set_cpus_allowed(p, new_mask); | 5814 | p->sched_class->set_cpus_allowed(p, new_mask); |
5627 | else { | 5815 | else { |
@@ -5673,10 +5861,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5673 | double_rq_lock(rq_src, rq_dest); | 5861 | double_rq_lock(rq_src, rq_dest); |
5674 | /* Already moved. */ | 5862 | /* Already moved. */ |
5675 | if (task_cpu(p) != src_cpu) | 5863 | if (task_cpu(p) != src_cpu) |
5676 | goto out; | 5864 | goto done; |
5677 | /* Affinity changed (again). */ | 5865 | /* Affinity changed (again). */ |
5678 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 5866 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5679 | goto out; | 5867 | goto fail; |
5680 | 5868 | ||
5681 | on_rq = p->se.on_rq; | 5869 | on_rq = p->se.on_rq; |
5682 | if (on_rq) | 5870 | if (on_rq) |
@@ -5687,8 +5875,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5687 | activate_task(rq_dest, p, 0); | 5875 | activate_task(rq_dest, p, 0); |
5688 | check_preempt_curr(rq_dest, p); | 5876 | check_preempt_curr(rq_dest, p); |
5689 | } | 5877 | } |
5878 | done: | ||
5690 | ret = 1; | 5879 | ret = 1; |
5691 | out: | 5880 | fail: |
5692 | double_rq_unlock(rq_src, rq_dest); | 5881 | double_rq_unlock(rq_src, rq_dest); |
5693 | return ret; | 5882 | return ret; |
5694 | } | 5883 | } |
@@ -6110,6 +6299,36 @@ static void unregister_sched_domain_sysctl(void) | |||
6110 | } | 6299 | } |
6111 | #endif | 6300 | #endif |
6112 | 6301 | ||
6302 | static void set_rq_online(struct rq *rq) | ||
6303 | { | ||
6304 | if (!rq->online) { | ||
6305 | const struct sched_class *class; | ||
6306 | |||
6307 | cpu_set(rq->cpu, rq->rd->online); | ||
6308 | rq->online = 1; | ||
6309 | |||
6310 | for_each_class(class) { | ||
6311 | if (class->rq_online) | ||
6312 | class->rq_online(rq); | ||
6313 | } | ||
6314 | } | ||
6315 | } | ||
6316 | |||
6317 | static void set_rq_offline(struct rq *rq) | ||
6318 | { | ||
6319 | if (rq->online) { | ||
6320 | const struct sched_class *class; | ||
6321 | |||
6322 | for_each_class(class) { | ||
6323 | if (class->rq_offline) | ||
6324 | class->rq_offline(rq); | ||
6325 | } | ||
6326 | |||
6327 | cpu_clear(rq->cpu, rq->rd->online); | ||
6328 | rq->online = 0; | ||
6329 | } | ||
6330 | } | ||
6331 | |||
6113 | /* | 6332 | /* |
6114 | * migration_call - callback that gets triggered when a CPU is added. | 6333 | * migration_call - callback that gets triggered when a CPU is added. |
6115 | * Here we can start up the necessary migration thread for the new CPU. | 6334 | * Here we can start up the necessary migration thread for the new CPU. |
@@ -6147,7 +6366,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6147 | spin_lock_irqsave(&rq->lock, flags); | 6366 | spin_lock_irqsave(&rq->lock, flags); |
6148 | if (rq->rd) { | 6367 | if (rq->rd) { |
6149 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6368 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6150 | cpu_set(cpu, rq->rd->online); | 6369 | |
6370 | set_rq_online(rq); | ||
6151 | } | 6371 | } |
6152 | spin_unlock_irqrestore(&rq->lock, flags); | 6372 | spin_unlock_irqrestore(&rq->lock, flags); |
6153 | break; | 6373 | break; |
@@ -6208,7 +6428,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6208 | spin_lock_irqsave(&rq->lock, flags); | 6428 | spin_lock_irqsave(&rq->lock, flags); |
6209 | if (rq->rd) { | 6429 | if (rq->rd) { |
6210 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6430 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6211 | cpu_clear(cpu, rq->rd->online); | 6431 | set_rq_offline(rq); |
6212 | } | 6432 | } |
6213 | spin_unlock_irqrestore(&rq->lock, flags); | 6433 | spin_unlock_irqrestore(&rq->lock, flags); |
6214 | break; | 6434 | break; |
@@ -6242,6 +6462,28 @@ void __init migration_init(void) | |||
6242 | 6462 | ||
6243 | #ifdef CONFIG_SCHED_DEBUG | 6463 | #ifdef CONFIG_SCHED_DEBUG |
6244 | 6464 | ||
6465 | static inline const char *sd_level_to_string(enum sched_domain_level lvl) | ||
6466 | { | ||
6467 | switch (lvl) { | ||
6468 | case SD_LV_NONE: | ||
6469 | return "NONE"; | ||
6470 | case SD_LV_SIBLING: | ||
6471 | return "SIBLING"; | ||
6472 | case SD_LV_MC: | ||
6473 | return "MC"; | ||
6474 | case SD_LV_CPU: | ||
6475 | return "CPU"; | ||
6476 | case SD_LV_NODE: | ||
6477 | return "NODE"; | ||
6478 | case SD_LV_ALLNODES: | ||
6479 | return "ALLNODES"; | ||
6480 | case SD_LV_MAX: | ||
6481 | return "MAX"; | ||
6482 | |||
6483 | } | ||
6484 | return "MAX"; | ||
6485 | } | ||
6486 | |||
6245 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6487 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6246 | cpumask_t *groupmask) | 6488 | cpumask_t *groupmask) |
6247 | { | 6489 | { |
@@ -6261,7 +6503,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6261 | return -1; | 6503 | return -1; |
6262 | } | 6504 | } |
6263 | 6505 | ||
6264 | printk(KERN_CONT "span %s\n", str); | 6506 | printk(KERN_CONT "span %s level %s\n", |
6507 | str, sd_level_to_string(sd->level)); | ||
6265 | 6508 | ||
6266 | if (!cpu_isset(cpu, sd->span)) { | 6509 | if (!cpu_isset(cpu, sd->span)) { |
6267 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6510 | printk(KERN_ERR "ERROR: domain->span does not contain " |
@@ -6345,9 +6588,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6345 | } | 6588 | } |
6346 | kfree(groupmask); | 6589 | kfree(groupmask); |
6347 | } | 6590 | } |
6348 | #else | 6591 | #else /* !CONFIG_SCHED_DEBUG */ |
6349 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6592 | # define sched_domain_debug(sd, cpu) do { } while (0) |
6350 | #endif | 6593 | #endif /* CONFIG_SCHED_DEBUG */ |
6351 | 6594 | ||
6352 | static int sd_degenerate(struct sched_domain *sd) | 6595 | static int sd_degenerate(struct sched_domain *sd) |
6353 | { | 6596 | { |
@@ -6407,20 +6650,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6407 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6650 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
6408 | { | 6651 | { |
6409 | unsigned long flags; | 6652 | unsigned long flags; |
6410 | const struct sched_class *class; | ||
6411 | 6653 | ||
6412 | spin_lock_irqsave(&rq->lock, flags); | 6654 | spin_lock_irqsave(&rq->lock, flags); |
6413 | 6655 | ||
6414 | if (rq->rd) { | 6656 | if (rq->rd) { |
6415 | struct root_domain *old_rd = rq->rd; | 6657 | struct root_domain *old_rd = rq->rd; |
6416 | 6658 | ||
6417 | for (class = sched_class_highest; class; class = class->next) { | 6659 | if (cpu_isset(rq->cpu, old_rd->online)) |
6418 | if (class->leave_domain) | 6660 | set_rq_offline(rq); |
6419 | class->leave_domain(rq); | ||
6420 | } | ||
6421 | 6661 | ||
6422 | cpu_clear(rq->cpu, old_rd->span); | 6662 | cpu_clear(rq->cpu, old_rd->span); |
6423 | cpu_clear(rq->cpu, old_rd->online); | ||
6424 | 6663 | ||
6425 | if (atomic_dec_and_test(&old_rd->refcount)) | 6664 | if (atomic_dec_and_test(&old_rd->refcount)) |
6426 | kfree(old_rd); | 6665 | kfree(old_rd); |
@@ -6431,12 +6670,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6431 | 6670 | ||
6432 | cpu_set(rq->cpu, rd->span); | 6671 | cpu_set(rq->cpu, rd->span); |
6433 | if (cpu_isset(rq->cpu, cpu_online_map)) | 6672 | if (cpu_isset(rq->cpu, cpu_online_map)) |
6434 | cpu_set(rq->cpu, rd->online); | 6673 | set_rq_online(rq); |
6435 | |||
6436 | for (class = sched_class_highest; class; class = class->next) { | ||
6437 | if (class->join_domain) | ||
6438 | class->join_domain(rq); | ||
6439 | } | ||
6440 | 6674 | ||
6441 | spin_unlock_irqrestore(&rq->lock, flags); | 6675 | spin_unlock_irqrestore(&rq->lock, flags); |
6442 | } | 6676 | } |
@@ -6447,6 +6681,8 @@ static void init_rootdomain(struct root_domain *rd) | |||
6447 | 6681 | ||
6448 | cpus_clear(rd->span); | 6682 | cpus_clear(rd->span); |
6449 | cpus_clear(rd->online); | 6683 | cpus_clear(rd->online); |
6684 | |||
6685 | cpupri_init(&rd->cpupri); | ||
6450 | } | 6686 | } |
6451 | 6687 | ||
6452 | static void init_defrootdomain(void) | 6688 | static void init_defrootdomain(void) |
@@ -6589,9 +6825,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6589 | 6825 | ||
6590 | min_val = INT_MAX; | 6826 | min_val = INT_MAX; |
6591 | 6827 | ||
6592 | for (i = 0; i < MAX_NUMNODES; i++) { | 6828 | for (i = 0; i < nr_node_ids; i++) { |
6593 | /* Start at @node */ | 6829 | /* Start at @node */ |
6594 | n = (node + i) % MAX_NUMNODES; | 6830 | n = (node + i) % nr_node_ids; |
6595 | 6831 | ||
6596 | if (!nr_cpus_node(n)) | 6832 | if (!nr_cpus_node(n)) |
6597 | continue; | 6833 | continue; |
@@ -6641,7 +6877,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) | |||
6641 | cpus_or(*span, *span, *nodemask); | 6877 | cpus_or(*span, *span, *nodemask); |
6642 | } | 6878 | } |
6643 | } | 6879 | } |
6644 | #endif | 6880 | #endif /* CONFIG_NUMA */ |
6645 | 6881 | ||
6646 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6882 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6647 | 6883 | ||
@@ -6660,7 +6896,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6660 | *sg = &per_cpu(sched_group_cpus, cpu); | 6896 | *sg = &per_cpu(sched_group_cpus, cpu); |
6661 | return cpu; | 6897 | return cpu; |
6662 | } | 6898 | } |
6663 | #endif | 6899 | #endif /* CONFIG_SCHED_SMT */ |
6664 | 6900 | ||
6665 | /* | 6901 | /* |
6666 | * multi-core sched-domains: | 6902 | * multi-core sched-domains: |
@@ -6668,7 +6904,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6668 | #ifdef CONFIG_SCHED_MC | 6904 | #ifdef CONFIG_SCHED_MC |
6669 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6905 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6670 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 6906 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6671 | #endif | 6907 | #endif /* CONFIG_SCHED_MC */ |
6672 | 6908 | ||
6673 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6909 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6674 | static int | 6910 | static int |
@@ -6770,7 +7006,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
6770 | sg = sg->next; | 7006 | sg = sg->next; |
6771 | } while (sg != group_head); | 7007 | } while (sg != group_head); |
6772 | } | 7008 | } |
6773 | #endif | 7009 | #endif /* CONFIG_NUMA */ |
6774 | 7010 | ||
6775 | #ifdef CONFIG_NUMA | 7011 | #ifdef CONFIG_NUMA |
6776 | /* Free memory allocated for various sched_group structures */ | 7012 | /* Free memory allocated for various sched_group structures */ |
@@ -6785,7 +7021,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | |||
6785 | if (!sched_group_nodes) | 7021 | if (!sched_group_nodes) |
6786 | continue; | 7022 | continue; |
6787 | 7023 | ||
6788 | for (i = 0; i < MAX_NUMNODES; i++) { | 7024 | for (i = 0; i < nr_node_ids; i++) { |
6789 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7025 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6790 | 7026 | ||
6791 | *nodemask = node_to_cpumask(i); | 7027 | *nodemask = node_to_cpumask(i); |
@@ -6807,11 +7043,11 @@ next_sg: | |||
6807 | sched_group_nodes_bycpu[cpu] = NULL; | 7043 | sched_group_nodes_bycpu[cpu] = NULL; |
6808 | } | 7044 | } |
6809 | } | 7045 | } |
6810 | #else | 7046 | #else /* !CONFIG_NUMA */ |
6811 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | 7047 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6812 | { | 7048 | { |
6813 | } | 7049 | } |
6814 | #endif | 7050 | #endif /* CONFIG_NUMA */ |
6815 | 7051 | ||
6816 | /* | 7052 | /* |
6817 | * Initialize sched groups cpu_power. | 7053 | * Initialize sched groups cpu_power. |
@@ -6978,7 +7214,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
6978 | /* | 7214 | /* |
6979 | * Allocate the per-node list of sched groups | 7215 | * Allocate the per-node list of sched groups |
6980 | */ | 7216 | */ |
6981 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), | 7217 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), |
6982 | GFP_KERNEL); | 7218 | GFP_KERNEL); |
6983 | if (!sched_group_nodes) { | 7219 | if (!sched_group_nodes) { |
6984 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7220 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -7117,7 +7353,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7117 | #endif | 7353 | #endif |
7118 | 7354 | ||
7119 | /* Set up physical groups */ | 7355 | /* Set up physical groups */ |
7120 | for (i = 0; i < MAX_NUMNODES; i++) { | 7356 | for (i = 0; i < nr_node_ids; i++) { |
7121 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7357 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
7122 | SCHED_CPUMASK_VAR(send_covered, allmasks); | 7358 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
7123 | 7359 | ||
@@ -7141,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7141 | send_covered, tmpmask); | 7377 | send_covered, tmpmask); |
7142 | } | 7378 | } |
7143 | 7379 | ||
7144 | for (i = 0; i < MAX_NUMNODES; i++) { | 7380 | for (i = 0; i < nr_node_ids; i++) { |
7145 | /* Set up node groups */ | 7381 | /* Set up node groups */ |
7146 | struct sched_group *sg, *prev; | 7382 | struct sched_group *sg, *prev; |
7147 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7383 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
@@ -7180,9 +7416,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7180 | cpus_or(*covered, *covered, *nodemask); | 7416 | cpus_or(*covered, *covered, *nodemask); |
7181 | prev = sg; | 7417 | prev = sg; |
7182 | 7418 | ||
7183 | for (j = 0; j < MAX_NUMNODES; j++) { | 7419 | for (j = 0; j < nr_node_ids; j++) { |
7184 | SCHED_CPUMASK_VAR(notcovered, allmasks); | 7420 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
7185 | int n = (i + j) % MAX_NUMNODES; | 7421 | int n = (i + j) % nr_node_ids; |
7186 | node_to_cpumask_ptr(pnodemask, n); | 7422 | node_to_cpumask_ptr(pnodemask, n); |
7187 | 7423 | ||
7188 | cpus_complement(*notcovered, *covered); | 7424 | cpus_complement(*notcovered, *covered); |
@@ -7235,7 +7471,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7235 | } | 7471 | } |
7236 | 7472 | ||
7237 | #ifdef CONFIG_NUMA | 7473 | #ifdef CONFIG_NUMA |
7238 | for (i = 0; i < MAX_NUMNODES; i++) | 7474 | for (i = 0; i < nr_node_ids; i++) |
7239 | init_numa_sched_groups_power(sched_group_nodes[i]); | 7475 | init_numa_sched_groups_power(sched_group_nodes[i]); |
7240 | 7476 | ||
7241 | if (sd_allnodes) { | 7477 | if (sd_allnodes) { |
@@ -7520,7 +7756,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7520 | #endif | 7756 | #endif |
7521 | return err; | 7757 | return err; |
7522 | } | 7758 | } |
7523 | #endif | 7759 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7524 | 7760 | ||
7525 | /* | 7761 | /* |
7526 | * Force a reinitialization of the sched domains hierarchy. The domains | 7762 | * Force a reinitialization of the sched domains hierarchy. The domains |
@@ -7531,21 +7767,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7531 | static int update_sched_domains(struct notifier_block *nfb, | 7767 | static int update_sched_domains(struct notifier_block *nfb, |
7532 | unsigned long action, void *hcpu) | 7768 | unsigned long action, void *hcpu) |
7533 | { | 7769 | { |
7770 | int cpu = (int)(long)hcpu; | ||
7771 | |||
7534 | switch (action) { | 7772 | switch (action) { |
7535 | case CPU_UP_PREPARE: | ||
7536 | case CPU_UP_PREPARE_FROZEN: | ||
7537 | case CPU_DOWN_PREPARE: | 7773 | case CPU_DOWN_PREPARE: |
7538 | case CPU_DOWN_PREPARE_FROZEN: | 7774 | case CPU_DOWN_PREPARE_FROZEN: |
7775 | disable_runtime(cpu_rq(cpu)); | ||
7776 | /* fall-through */ | ||
7777 | case CPU_UP_PREPARE: | ||
7778 | case CPU_UP_PREPARE_FROZEN: | ||
7539 | detach_destroy_domains(&cpu_online_map); | 7779 | detach_destroy_domains(&cpu_online_map); |
7540 | free_sched_domains(); | 7780 | free_sched_domains(); |
7541 | return NOTIFY_OK; | 7781 | return NOTIFY_OK; |
7542 | 7782 | ||
7543 | case CPU_UP_CANCELED: | 7783 | |
7544 | case CPU_UP_CANCELED_FROZEN: | ||
7545 | case CPU_DOWN_FAILED: | 7784 | case CPU_DOWN_FAILED: |
7546 | case CPU_DOWN_FAILED_FROZEN: | 7785 | case CPU_DOWN_FAILED_FROZEN: |
7547 | case CPU_ONLINE: | 7786 | case CPU_ONLINE: |
7548 | case CPU_ONLINE_FROZEN: | 7787 | case CPU_ONLINE_FROZEN: |
7788 | enable_runtime(cpu_rq(cpu)); | ||
7789 | /* fall-through */ | ||
7790 | case CPU_UP_CANCELED: | ||
7791 | case CPU_UP_CANCELED_FROZEN: | ||
7549 | case CPU_DEAD: | 7792 | case CPU_DEAD: |
7550 | case CPU_DEAD_FROZEN: | 7793 | case CPU_DEAD_FROZEN: |
7551 | /* | 7794 | /* |
@@ -7745,8 +7988,8 @@ void __init sched_init(void) | |||
7745 | 7988 | ||
7746 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7989 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7747 | ptr += nr_cpu_ids * sizeof(void **); | 7990 | ptr += nr_cpu_ids * sizeof(void **); |
7748 | #endif | 7991 | #endif /* CONFIG_USER_SCHED */ |
7749 | #endif | 7992 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7750 | #ifdef CONFIG_RT_GROUP_SCHED | 7993 | #ifdef CONFIG_RT_GROUP_SCHED |
7751 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7994 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7752 | ptr += nr_cpu_ids * sizeof(void **); | 7995 | ptr += nr_cpu_ids * sizeof(void **); |
@@ -7760,8 +8003,8 @@ void __init sched_init(void) | |||
7760 | 8003 | ||
7761 | root_task_group.rt_rq = (struct rt_rq **)ptr; | 8004 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7762 | ptr += nr_cpu_ids * sizeof(void **); | 8005 | ptr += nr_cpu_ids * sizeof(void **); |
7763 | #endif | 8006 | #endif /* CONFIG_USER_SCHED */ |
7764 | #endif | 8007 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7765 | } | 8008 | } |
7766 | 8009 | ||
7767 | #ifdef CONFIG_SMP | 8010 | #ifdef CONFIG_SMP |
@@ -7777,8 +8020,8 @@ void __init sched_init(void) | |||
7777 | #ifdef CONFIG_USER_SCHED | 8020 | #ifdef CONFIG_USER_SCHED |
7778 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 8021 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7779 | global_rt_period(), RUNTIME_INF); | 8022 | global_rt_period(), RUNTIME_INF); |
7780 | #endif | 8023 | #endif /* CONFIG_USER_SCHED */ |
7781 | #endif | 8024 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7782 | 8025 | ||
7783 | #ifdef CONFIG_GROUP_SCHED | 8026 | #ifdef CONFIG_GROUP_SCHED |
7784 | list_add(&init_task_group.list, &task_groups); | 8027 | list_add(&init_task_group.list, &task_groups); |
@@ -7788,8 +8031,8 @@ void __init sched_init(void) | |||
7788 | INIT_LIST_HEAD(&root_task_group.children); | 8031 | INIT_LIST_HEAD(&root_task_group.children); |
7789 | init_task_group.parent = &root_task_group; | 8032 | init_task_group.parent = &root_task_group; |
7790 | list_add(&init_task_group.siblings, &root_task_group.children); | 8033 | list_add(&init_task_group.siblings, &root_task_group.children); |
7791 | #endif | 8034 | #endif /* CONFIG_USER_SCHED */ |
7792 | #endif | 8035 | #endif /* CONFIG_GROUP_SCHED */ |
7793 | 8036 | ||
7794 | for_each_possible_cpu(i) { | 8037 | for_each_possible_cpu(i) { |
7795 | struct rq *rq; | 8038 | struct rq *rq; |
@@ -7869,6 +8112,7 @@ void __init sched_init(void) | |||
7869 | rq->next_balance = jiffies; | 8112 | rq->next_balance = jiffies; |
7870 | rq->push_cpu = 0; | 8113 | rq->push_cpu = 0; |
7871 | rq->cpu = i; | 8114 | rq->cpu = i; |
8115 | rq->online = 0; | ||
7872 | rq->migration_thread = NULL; | 8116 | rq->migration_thread = NULL; |
7873 | INIT_LIST_HEAD(&rq->migration_queue); | 8117 | INIT_LIST_HEAD(&rq->migration_queue); |
7874 | rq_attach_root(rq, &def_root_domain); | 8118 | rq_attach_root(rq, &def_root_domain); |
@@ -8108,7 +8352,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
8108 | { | 8352 | { |
8109 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8353 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); |
8110 | } | 8354 | } |
8111 | #else | 8355 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8112 | static inline void free_fair_sched_group(struct task_group *tg) | 8356 | static inline void free_fair_sched_group(struct task_group *tg) |
8113 | { | 8357 | { |
8114 | } | 8358 | } |
@@ -8126,7 +8370,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) | |||
8126 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8370 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8127 | { | 8371 | { |
8128 | } | 8372 | } |
8129 | #endif | 8373 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8130 | 8374 | ||
8131 | #ifdef CONFIG_RT_GROUP_SCHED | 8375 | #ifdef CONFIG_RT_GROUP_SCHED |
8132 | static void free_rt_sched_group(struct task_group *tg) | 8376 | static void free_rt_sched_group(struct task_group *tg) |
@@ -8197,7 +8441,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
8197 | { | 8441 | { |
8198 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | 8442 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); |
8199 | } | 8443 | } |
8200 | #else | 8444 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8201 | static inline void free_rt_sched_group(struct task_group *tg) | 8445 | static inline void free_rt_sched_group(struct task_group *tg) |
8202 | { | 8446 | { |
8203 | } | 8447 | } |
@@ -8215,7 +8459,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) | |||
8215 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | 8459 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) |
8216 | { | 8460 | { |
8217 | } | 8461 | } |
8218 | #endif | 8462 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8219 | 8463 | ||
8220 | #ifdef CONFIG_GROUP_SCHED | 8464 | #ifdef CONFIG_GROUP_SCHED |
8221 | static void free_sched_group(struct task_group *tg) | 8465 | static void free_sched_group(struct task_group *tg) |
@@ -8326,17 +8570,14 @@ void sched_move_task(struct task_struct *tsk) | |||
8326 | 8570 | ||
8327 | task_rq_unlock(rq, &flags); | 8571 | task_rq_unlock(rq, &flags); |
8328 | } | 8572 | } |
8329 | #endif | 8573 | #endif /* CONFIG_GROUP_SCHED */ |
8330 | 8574 | ||
8331 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8575 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8332 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8576 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
8333 | { | 8577 | { |
8334 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8578 | struct cfs_rq *cfs_rq = se->cfs_rq; |
8335 | struct rq *rq = cfs_rq->rq; | ||
8336 | int on_rq; | 8579 | int on_rq; |
8337 | 8580 | ||
8338 | spin_lock_irq(&rq->lock); | ||
8339 | |||
8340 | on_rq = se->on_rq; | 8581 | on_rq = se->on_rq; |
8341 | if (on_rq) | 8582 | if (on_rq) |
8342 | dequeue_entity(cfs_rq, se, 0); | 8583 | dequeue_entity(cfs_rq, se, 0); |
@@ -8346,8 +8587,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
8346 | 8587 | ||
8347 | if (on_rq) | 8588 | if (on_rq) |
8348 | enqueue_entity(cfs_rq, se, 0); | 8589 | enqueue_entity(cfs_rq, se, 0); |
8590 | } | ||
8349 | 8591 | ||
8350 | spin_unlock_irq(&rq->lock); | 8592 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
8593 | { | ||
8594 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8595 | struct rq *rq = cfs_rq->rq; | ||
8596 | unsigned long flags; | ||
8597 | |||
8598 | spin_lock_irqsave(&rq->lock, flags); | ||
8599 | __set_se_shares(se, shares); | ||
8600 | spin_unlock_irqrestore(&rq->lock, flags); | ||
8351 | } | 8601 | } |
8352 | 8602 | ||
8353 | static DEFINE_MUTEX(shares_mutex); | 8603 | static DEFINE_MUTEX(shares_mutex); |
@@ -8386,8 +8636,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8386 | * w/o tripping rebalance_share or load_balance_fair. | 8636 | * w/o tripping rebalance_share or load_balance_fair. |
8387 | */ | 8637 | */ |
8388 | tg->shares = shares; | 8638 | tg->shares = shares; |
8389 | for_each_possible_cpu(i) | 8639 | for_each_possible_cpu(i) { |
8640 | /* | ||
8641 | * force a rebalance | ||
8642 | */ | ||
8643 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
8390 | set_se_shares(tg->se[i], shares); | 8644 | set_se_shares(tg->se[i], shares); |
8645 | } | ||
8391 | 8646 | ||
8392 | /* | 8647 | /* |
8393 | * Enable load balance activity on this group, by inserting it back on | 8648 | * Enable load balance activity on this group, by inserting it back on |
@@ -8426,7 +8681,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8426 | #ifdef CONFIG_CGROUP_SCHED | 8681 | #ifdef CONFIG_CGROUP_SCHED |
8427 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8682 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8428 | { | 8683 | { |
8429 | struct task_group *tgi, *parent = tg ? tg->parent : NULL; | 8684 | struct task_group *tgi, *parent = tg->parent; |
8430 | unsigned long total = 0; | 8685 | unsigned long total = 0; |
8431 | 8686 | ||
8432 | if (!parent) { | 8687 | if (!parent) { |
@@ -8450,7 +8705,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | |||
8450 | } | 8705 | } |
8451 | rcu_read_unlock(); | 8706 | rcu_read_unlock(); |
8452 | 8707 | ||
8453 | return total + to_ratio(period, runtime) < | 8708 | return total + to_ratio(period, runtime) <= |
8454 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8709 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), |
8455 | parent->rt_bandwidth.rt_runtime); | 8710 | parent->rt_bandwidth.rt_runtime); |
8456 | } | 8711 | } |
@@ -8570,16 +8825,21 @@ long sched_group_rt_period(struct task_group *tg) | |||
8570 | 8825 | ||
8571 | static int sched_rt_global_constraints(void) | 8826 | static int sched_rt_global_constraints(void) |
8572 | { | 8827 | { |
8828 | struct task_group *tg = &root_task_group; | ||
8829 | u64 rt_runtime, rt_period; | ||
8573 | int ret = 0; | 8830 | int ret = 0; |
8574 | 8831 | ||
8832 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8833 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8834 | |||
8575 | mutex_lock(&rt_constraints_mutex); | 8835 | mutex_lock(&rt_constraints_mutex); |
8576 | if (!__rt_schedulable(NULL, 1, 0)) | 8836 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) |
8577 | ret = -EINVAL; | 8837 | ret = -EINVAL; |
8578 | mutex_unlock(&rt_constraints_mutex); | 8838 | mutex_unlock(&rt_constraints_mutex); |
8579 | 8839 | ||
8580 | return ret; | 8840 | return ret; |
8581 | } | 8841 | } |
8582 | #else | 8842 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8583 | static int sched_rt_global_constraints(void) | 8843 | static int sched_rt_global_constraints(void) |
8584 | { | 8844 | { |
8585 | unsigned long flags; | 8845 | unsigned long flags; |
@@ -8597,7 +8857,7 @@ static int sched_rt_global_constraints(void) | |||
8597 | 8857 | ||
8598 | return 0; | 8858 | return 0; |
8599 | } | 8859 | } |
8600 | #endif | 8860 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8601 | 8861 | ||
8602 | int sched_rt_handler(struct ctl_table *table, int write, | 8862 | int sched_rt_handler(struct ctl_table *table, int write, |
8603 | struct file *filp, void __user *buffer, size_t *lenp, | 8863 | struct file *filp, void __user *buffer, size_t *lenp, |
@@ -8705,7 +8965,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
8705 | 8965 | ||
8706 | return (u64) tg->shares; | 8966 | return (u64) tg->shares; |
8707 | } | 8967 | } |
8708 | #endif | 8968 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8709 | 8969 | ||
8710 | #ifdef CONFIG_RT_GROUP_SCHED | 8970 | #ifdef CONFIG_RT_GROUP_SCHED |
8711 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 8971 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -8729,7 +8989,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
8729 | { | 8989 | { |
8730 | return sched_group_rt_period(cgroup_tg(cgrp)); | 8990 | return sched_group_rt_period(cgroup_tg(cgrp)); |
8731 | } | 8991 | } |
8732 | #endif | 8992 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8733 | 8993 | ||
8734 | static struct cftype cpu_files[] = { | 8994 | static struct cftype cpu_files[] = { |
8735 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8995 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219ab..22ed55d1167f 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -3,6 +3,9 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
5 | * | 5 | * |
6 | * Updates and enhancements: | ||
7 | * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> | ||
8 | * | ||
6 | * Based on code by: | 9 | * Based on code by: |
7 | * Ingo Molnar <mingo@redhat.com> | 10 | * Ingo Molnar <mingo@redhat.com> |
8 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
@@ -32,6 +35,11 @@ | |||
32 | 35 | ||
33 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 36 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
34 | 37 | ||
38 | #define MULTI_SHIFT 15 | ||
39 | /* Max is double, Min is 1/2 */ | ||
40 | #define MAX_MULTI (2LL << MULTI_SHIFT) | ||
41 | #define MIN_MULTI (1LL << (MULTI_SHIFT-1)) | ||
42 | |||
35 | struct sched_clock_data { | 43 | struct sched_clock_data { |
36 | /* | 44 | /* |
37 | * Raw spinlock - this is a special case: this might be called | 45 | * Raw spinlock - this is a special case: this might be called |
@@ -40,11 +48,15 @@ struct sched_clock_data { | |||
40 | */ | 48 | */ |
41 | raw_spinlock_t lock; | 49 | raw_spinlock_t lock; |
42 | 50 | ||
43 | unsigned long prev_jiffies; | 51 | unsigned long tick_jiffies; |
44 | u64 prev_raw; | 52 | u64 prev_raw; |
45 | u64 tick_raw; | 53 | u64 tick_raw; |
46 | u64 tick_gtod; | 54 | u64 tick_gtod; |
47 | u64 clock; | 55 | u64 clock; |
56 | s64 multi; | ||
57 | #ifdef CONFIG_NO_HZ | ||
58 | int check_max; | ||
59 | #endif | ||
48 | }; | 60 | }; |
49 | 61 | ||
50 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | 62 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); |
@@ -71,41 +83,91 @@ void sched_clock_init(void) | |||
71 | struct sched_clock_data *scd = cpu_sdc(cpu); | 83 | struct sched_clock_data *scd = cpu_sdc(cpu); |
72 | 84 | ||
73 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | 85 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; |
74 | scd->prev_jiffies = now_jiffies; | 86 | scd->tick_jiffies = now_jiffies; |
75 | scd->prev_raw = 0; | 87 | scd->prev_raw = 0; |
76 | scd->tick_raw = 0; | 88 | scd->tick_raw = 0; |
77 | scd->tick_gtod = ktime_now; | 89 | scd->tick_gtod = ktime_now; |
78 | scd->clock = ktime_now; | 90 | scd->clock = ktime_now; |
91 | scd->multi = 1 << MULTI_SHIFT; | ||
92 | #ifdef CONFIG_NO_HZ | ||
93 | scd->check_max = 1; | ||
94 | #endif | ||
79 | } | 95 | } |
80 | 96 | ||
81 | sched_clock_running = 1; | 97 | sched_clock_running = 1; |
82 | } | 98 | } |
83 | 99 | ||
100 | #ifdef CONFIG_NO_HZ | ||
101 | /* | ||
102 | * Dynamic ticks make the delta jiffies inaccurate. This | ||
103 | * prevents us from checking the maximum time update. | ||
104 | * Disable the maximum check during stopped ticks. | ||
105 | */ | ||
106 | void sched_clock_tick_stop(int cpu) | ||
107 | { | ||
108 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
109 | |||
110 | scd->check_max = 0; | ||
111 | } | ||
112 | |||
113 | void sched_clock_tick_start(int cpu) | ||
114 | { | ||
115 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
116 | |||
117 | scd->check_max = 1; | ||
118 | } | ||
119 | |||
120 | static int check_max(struct sched_clock_data *scd) | ||
121 | { | ||
122 | return scd->check_max; | ||
123 | } | ||
124 | #else | ||
125 | static int check_max(struct sched_clock_data *scd) | ||
126 | { | ||
127 | return 1; | ||
128 | } | ||
129 | #endif /* CONFIG_NO_HZ */ | ||
130 | |||
84 | /* | 131 | /* |
85 | * update the percpu scd from the raw @now value | 132 | * update the percpu scd from the raw @now value |
86 | * | 133 | * |
87 | * - filter out backward motion | 134 | * - filter out backward motion |
88 | * - use jiffies to generate a min,max window to clip the raw values | 135 | * - use jiffies to generate a min,max window to clip the raw values |
89 | */ | 136 | */ |
90 | static void __update_sched_clock(struct sched_clock_data *scd, u64 now) | 137 | static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) |
91 | { | 138 | { |
92 | unsigned long now_jiffies = jiffies; | 139 | unsigned long now_jiffies = jiffies; |
93 | long delta_jiffies = now_jiffies - scd->prev_jiffies; | 140 | long delta_jiffies = now_jiffies - scd->tick_jiffies; |
94 | u64 clock = scd->clock; | 141 | u64 clock = scd->clock; |
95 | u64 min_clock, max_clock; | 142 | u64 min_clock, max_clock; |
96 | s64 delta = now - scd->prev_raw; | 143 | s64 delta = now - scd->prev_raw; |
97 | 144 | ||
98 | WARN_ON_ONCE(!irqs_disabled()); | 145 | WARN_ON_ONCE(!irqs_disabled()); |
99 | min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; | 146 | |
147 | /* | ||
148 | * At schedule tick the clock can be just under the gtod. We don't | ||
149 | * want to push it too prematurely. | ||
150 | */ | ||
151 | min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); | ||
152 | if (min_clock > TICK_NSEC) | ||
153 | min_clock -= TICK_NSEC / 2; | ||
100 | 154 | ||
101 | if (unlikely(delta < 0)) { | 155 | if (unlikely(delta < 0)) { |
102 | clock++; | 156 | clock++; |
103 | goto out; | 157 | goto out; |
104 | } | 158 | } |
105 | 159 | ||
106 | max_clock = min_clock + TICK_NSEC; | 160 | /* |
161 | * The clock must stay within a jiffy of the gtod. | ||
162 | * But since we may be at the start of a jiffy or the end of one | ||
163 | * we add another jiffy buffer. | ||
164 | */ | ||
165 | max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; | ||
166 | |||
167 | delta *= scd->multi; | ||
168 | delta >>= MULTI_SHIFT; | ||
107 | 169 | ||
108 | if (unlikely(clock + delta > max_clock)) { | 170 | if (unlikely(clock + delta > max_clock) && check_max(scd)) { |
109 | if (clock < max_clock) | 171 | if (clock < max_clock) |
110 | clock = max_clock; | 172 | clock = max_clock; |
111 | else | 173 | else |
@@ -118,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) | |||
118 | if (unlikely(clock < min_clock)) | 180 | if (unlikely(clock < min_clock)) |
119 | clock = min_clock; | 181 | clock = min_clock; |
120 | 182 | ||
121 | scd->prev_raw = now; | 183 | if (time) |
122 | scd->prev_jiffies = now_jiffies; | 184 | *time = clock; |
123 | scd->clock = clock; | 185 | else { |
186 | scd->prev_raw = now; | ||
187 | scd->clock = clock; | ||
188 | } | ||
124 | } | 189 | } |
125 | 190 | ||
126 | static void lock_double_clock(struct sched_clock_data *data1, | 191 | static void lock_double_clock(struct sched_clock_data *data1, |
@@ -160,25 +225,30 @@ u64 sched_clock_cpu(int cpu) | |||
160 | now -= my_scd->tick_raw; | 225 | now -= my_scd->tick_raw; |
161 | now += scd->tick_raw; | 226 | now += scd->tick_raw; |
162 | 227 | ||
163 | now -= my_scd->tick_gtod; | 228 | now += my_scd->tick_gtod; |
164 | now += scd->tick_gtod; | 229 | now -= scd->tick_gtod; |
165 | 230 | ||
166 | __raw_spin_unlock(&my_scd->lock); | 231 | __raw_spin_unlock(&my_scd->lock); |
232 | |||
233 | __update_sched_clock(scd, now, &clock); | ||
234 | |||
235 | __raw_spin_unlock(&scd->lock); | ||
236 | |||
167 | } else { | 237 | } else { |
168 | __raw_spin_lock(&scd->lock); | 238 | __raw_spin_lock(&scd->lock); |
239 | __update_sched_clock(scd, now, NULL); | ||
240 | clock = scd->clock; | ||
241 | __raw_spin_unlock(&scd->lock); | ||
169 | } | 242 | } |
170 | 243 | ||
171 | __update_sched_clock(scd, now); | ||
172 | clock = scd->clock; | ||
173 | |||
174 | __raw_spin_unlock(&scd->lock); | ||
175 | |||
176 | return clock; | 244 | return clock; |
177 | } | 245 | } |
178 | 246 | ||
179 | void sched_clock_tick(void) | 247 | void sched_clock_tick(void) |
180 | { | 248 | { |
181 | struct sched_clock_data *scd = this_scd(); | 249 | struct sched_clock_data *scd = this_scd(); |
250 | unsigned long now_jiffies = jiffies; | ||
251 | s64 mult, delta_gtod, delta_raw; | ||
182 | u64 now, now_gtod; | 252 | u64 now, now_gtod; |
183 | 253 | ||
184 | if (unlikely(!sched_clock_running)) | 254 | if (unlikely(!sched_clock_running)) |
@@ -186,18 +256,33 @@ void sched_clock_tick(void) | |||
186 | 256 | ||
187 | WARN_ON_ONCE(!irqs_disabled()); | 257 | WARN_ON_ONCE(!irqs_disabled()); |
188 | 258 | ||
189 | now = sched_clock(); | ||
190 | now_gtod = ktime_to_ns(ktime_get()); | 259 | now_gtod = ktime_to_ns(ktime_get()); |
260 | now = sched_clock(); | ||
191 | 261 | ||
192 | __raw_spin_lock(&scd->lock); | 262 | __raw_spin_lock(&scd->lock); |
193 | __update_sched_clock(scd, now); | 263 | __update_sched_clock(scd, now, NULL); |
194 | /* | 264 | /* |
195 | * update tick_gtod after __update_sched_clock() because that will | 265 | * update tick_gtod after __update_sched_clock() because that will |
196 | * already observe 1 new jiffy; adding a new tick_gtod to that would | 266 | * already observe 1 new jiffy; adding a new tick_gtod to that would |
197 | * increase the clock 2 jiffies. | 267 | * increase the clock 2 jiffies. |
198 | */ | 268 | */ |
269 | delta_gtod = now_gtod - scd->tick_gtod; | ||
270 | delta_raw = now - scd->tick_raw; | ||
271 | |||
272 | if ((long)delta_raw > 0) { | ||
273 | mult = delta_gtod << MULTI_SHIFT; | ||
274 | do_div(mult, delta_raw); | ||
275 | scd->multi = mult; | ||
276 | if (scd->multi > MAX_MULTI) | ||
277 | scd->multi = MAX_MULTI; | ||
278 | else if (scd->multi < MIN_MULTI) | ||
279 | scd->multi = MIN_MULTI; | ||
280 | } else | ||
281 | scd->multi = 1 << MULTI_SHIFT; | ||
282 | |||
199 | scd->tick_raw = now; | 283 | scd->tick_raw = now; |
200 | scd->tick_gtod = now_gtod; | 284 | scd->tick_gtod = now_gtod; |
285 | scd->tick_jiffies = now_jiffies; | ||
201 | __raw_spin_unlock(&scd->lock); | 286 | __raw_spin_unlock(&scd->lock); |
202 | } | 287 | } |
203 | 288 | ||
@@ -227,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
227 | __raw_spin_lock(&scd->lock); | 312 | __raw_spin_lock(&scd->lock); |
228 | scd->prev_raw = now; | 313 | scd->prev_raw = now; |
229 | scd->clock += delta_ns; | 314 | scd->clock += delta_ns; |
315 | scd->multi = 1 << MULTI_SHIFT; | ||
230 | __raw_spin_unlock(&scd->lock); | 316 | __raw_spin_unlock(&scd->lock); |
231 | 317 | ||
232 | touch_softlockup_watchdog(); | 318 | touch_softlockup_watchdog(); |
@@ -244,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
244 | { | 330 | { |
245 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); | 331 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); |
246 | } | 332 | } |
333 | |||
334 | unsigned long long cpu_clock(int cpu) | ||
335 | { | ||
336 | unsigned long long clock; | ||
337 | unsigned long flags; | ||
338 | |||
339 | local_irq_save(flags); | ||
340 | clock = sched_clock_cpu(cpu); | ||
341 | local_irq_restore(flags); | ||
342 | |||
343 | return clock; | ||
344 | } | ||
345 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 000000000000..52154fefab7e --- /dev/null +++ b/kernel/sched_cpupri.c | |||
@@ -0,0 +1,174 @@ | |||
1 | /* | ||
2 | * kernel/sched_cpupri.c | ||
3 | * | ||
4 | * CPU priority management | ||
5 | * | ||
6 | * Copyright (C) 2007-2008 Novell | ||
7 | * | ||
8 | * Author: Gregory Haskins <ghaskins@novell.com> | ||
9 | * | ||
10 | * This code tracks the priority of each CPU so that global migration | ||
11 | * decisions are easy to calculate. Each CPU can be in a state as follows: | ||
12 | * | ||
13 | * (INVALID), IDLE, NORMAL, RT1, ... RT99 | ||
14 | * | ||
15 | * going from the lowest priority to the highest. CPUs in the INVALID state | ||
16 | * are not eligible for routing. The system maintains this state with | ||
17 | * a 2 dimensional bitmap (the first for priority class, the second for cpus | ||
18 | * in that class). Therefore a typical application without affinity | ||
19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | ||
20 | * searches). For tasks with affinity restrictions, the algorithm has a | ||
21 | * worst case complexity of O(min(102, nr_domcpus)), though the scenario that | ||
22 | * yields the worst case search is fairly contrived. | ||
23 | * | ||
24 | * This program is free software; you can redistribute it and/or | ||
25 | * modify it under the terms of the GNU General Public License | ||
26 | * as published by the Free Software Foundation; version 2 | ||
27 | * of the License. | ||
28 | */ | ||
29 | |||
30 | #include "sched_cpupri.h" | ||
31 | |||
32 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | ||
33 | static int convert_prio(int prio) | ||
34 | { | ||
35 | int cpupri; | ||
36 | |||
37 | if (prio == CPUPRI_INVALID) | ||
38 | cpupri = CPUPRI_INVALID; | ||
39 | else if (prio == MAX_PRIO) | ||
40 | cpupri = CPUPRI_IDLE; | ||
41 | else if (prio >= MAX_RT_PRIO) | ||
42 | cpupri = CPUPRI_NORMAL; | ||
43 | else | ||
44 | cpupri = MAX_RT_PRIO - prio + 1; | ||
45 | |||
46 | return cpupri; | ||
47 | } | ||
48 | |||
49 | #define for_each_cpupri_active(array, idx) \ | ||
50 | for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ | ||
51 | idx < CPUPRI_NR_PRIORITIES; \ | ||
52 | idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) | ||
53 | |||
54 | /** | ||
55 | * cpupri_find - find the best (lowest-pri) CPU in the system | ||
56 | * @cp: The cpupri context | ||
57 | * @p: The task | ||
58 | * @lowest_mask: A mask to fill in with selected CPUs | ||
59 | * | ||
60 | * Note: This function returns the recommended CPUs as calculated during the | ||
61 | * current invocation. By the time the call returns, the CPUs may have in | ||
62 | * fact changed priorities any number of times. While not ideal, it is not | ||
63 | * an issue of correctness since the normal rebalancer logic will correct | ||
64 | * any discrepancies created by racing against the uncertainty of the current | ||
65 | * priority configuration. | ||
66 | * | ||
67 | * Returns: (int)bool - CPUs were found | ||
68 | */ | ||
69 | int cpupri_find(struct cpupri *cp, struct task_struct *p, | ||
70 | cpumask_t *lowest_mask) | ||
71 | { | ||
72 | int idx = 0; | ||
73 | int task_pri = convert_prio(p->prio); | ||
74 | |||
75 | for_each_cpupri_active(cp->pri_active, idx) { | ||
76 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | ||
77 | cpumask_t mask; | ||
78 | |||
79 | if (idx >= task_pri) | ||
80 | break; | ||
81 | |||
82 | cpus_and(mask, p->cpus_allowed, vec->mask); | ||
83 | |||
84 | if (cpus_empty(mask)) | ||
85 | continue; | ||
86 | |||
87 | *lowest_mask = mask; | ||
88 | return 1; | ||
89 | } | ||
90 | |||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | /** | ||
95 | * cpupri_set - update the cpu priority setting | ||
96 | * @cp: The cpupri context | ||
97 | * @cpu: The target cpu | ||
98 | * @newpri: The priority (INVALID-RT99) to assign to this CPU | ||
99 | * | ||
100 | * Note: Assumes cpu_rq(cpu)->lock is locked | ||
101 | * | ||
102 | * Returns: (void) | ||
103 | */ | ||
104 | void cpupri_set(struct cpupri *cp, int cpu, int newpri) | ||
105 | { | ||
106 | int *currpri = &cp->cpu_to_pri[cpu]; | ||
107 | int oldpri = *currpri; | ||
108 | unsigned long flags; | ||
109 | |||
110 | newpri = convert_prio(newpri); | ||
111 | |||
112 | BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); | ||
113 | |||
114 | if (newpri == oldpri) | ||
115 | return; | ||
116 | |||
117 | /* | ||
118 | * If the cpu was currently mapped to a different value, we | ||
119 | * first need to unmap the old value | ||
120 | */ | ||
121 | if (likely(oldpri != CPUPRI_INVALID)) { | ||
122 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | ||
123 | |||
124 | spin_lock_irqsave(&vec->lock, flags); | ||
125 | |||
126 | vec->count--; | ||
127 | if (!vec->count) | ||
128 | clear_bit(oldpri, cp->pri_active); | ||
129 | cpu_clear(cpu, vec->mask); | ||
130 | |||
131 | spin_unlock_irqrestore(&vec->lock, flags); | ||
132 | } | ||
133 | |||
134 | if (likely(newpri != CPUPRI_INVALID)) { | ||
135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | ||
136 | |||
137 | spin_lock_irqsave(&vec->lock, flags); | ||
138 | |||
139 | cpu_set(cpu, vec->mask); | ||
140 | vec->count++; | ||
141 | if (vec->count == 1) | ||
142 | set_bit(newpri, cp->pri_active); | ||
143 | |||
144 | spin_unlock_irqrestore(&vec->lock, flags); | ||
145 | } | ||
146 | |||
147 | *currpri = newpri; | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * cpupri_init - initialize the cpupri structure | ||
152 | * @cp: The cpupri context | ||
153 | * | ||
154 | * Returns: (void) | ||
155 | */ | ||
156 | void cpupri_init(struct cpupri *cp) | ||
157 | { | ||
158 | int i; | ||
159 | |||
160 | memset(cp, 0, sizeof(*cp)); | ||
161 | |||
162 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | ||
163 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | ||
164 | |||
165 | spin_lock_init(&vec->lock); | ||
166 | vec->count = 0; | ||
167 | cpus_clear(vec->mask); | ||
168 | } | ||
169 | |||
170 | for_each_possible_cpu(i) | ||
171 | cp->cpu_to_pri[i] = CPUPRI_INVALID; | ||
172 | } | ||
173 | |||
174 | |||
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 000000000000..f25811b0f931 --- /dev/null +++ b/kernel/sched_cpupri.h | |||
@@ -0,0 +1,36 @@ | |||
1 | #ifndef _LINUX_CPUPRI_H | ||
2 | #define _LINUX_CPUPRI_H | ||
3 | |||
4 | #include <linux/sched.h> | ||
5 | |||
6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | ||
7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
8 | |||
9 | #define CPUPRI_INVALID -1 | ||
10 | #define CPUPRI_IDLE 0 | ||
11 | #define CPUPRI_NORMAL 1 | ||
12 | /* values 2-101 are RT priorities 0-99 */ | ||
13 | |||
14 | struct cpupri_vec { | ||
15 | spinlock_t lock; | ||
16 | int count; | ||
17 | cpumask_t mask; | ||
18 | }; | ||
19 | |||
20 | struct cpupri { | ||
21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | ||
22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
23 | int cpu_to_pri[NR_CPUS]; | ||
24 | }; | ||
25 | |||
26 | #ifdef CONFIG_SMP | ||
27 | int cpupri_find(struct cpupri *cp, | ||
28 | struct task_struct *p, cpumask_t *lowest_mask); | ||
29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | ||
30 | void cpupri_init(struct cpupri *cp); | ||
31 | #else | ||
32 | #define cpupri_set(cp, cpu, pri) do { } while (0) | ||
33 | #define cpupri_init() do { } while (0) | ||
34 | #endif | ||
35 | |||
36 | #endif /* _LINUX_CPUPRI_H */ | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8bb713040ac9..bbe6b31c3c56 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
119 | struct sched_entity *last; | 119 | struct sched_entity *last; |
120 | unsigned long flags; | 120 | unsigned long flags; |
121 | 121 | ||
122 | #if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) | 122 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) |
123 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
124 | #else | ||
125 | char path[128] = ""; | 123 | char path[128] = ""; |
126 | struct cgroup *cgroup = NULL; | 124 | struct cgroup *cgroup = NULL; |
127 | struct task_group *tg = cfs_rq->tg; | 125 | struct task_group *tg = cfs_rq->tg; |
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
133 | cgroup_path(cgroup, path, sizeof(path)); | 131 | cgroup_path(cgroup, path, sizeof(path)); |
134 | 132 | ||
135 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | 133 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); |
134 | #else | ||
135 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
136 | #endif | 136 | #endif |
137 | 137 | ||
138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
162 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 162 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); |
163 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 163 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
164 | #ifdef CONFIG_SCHEDSTATS | 164 | #ifdef CONFIG_SCHEDSTATS |
165 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", | 165 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); |
166 | rq->bkl_count); | 166 | |
167 | P(yld_exp_empty); | ||
168 | P(yld_act_empty); | ||
169 | P(yld_both_empty); | ||
170 | P(yld_count); | ||
171 | |||
172 | P(sched_switch); | ||
173 | P(sched_count); | ||
174 | P(sched_goidle); | ||
175 | |||
176 | P(ttwu_count); | ||
177 | P(ttwu_local); | ||
178 | |||
179 | P(bkl_count); | ||
180 | |||
181 | #undef P | ||
167 | #endif | 182 | #endif |
168 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | 183 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", |
169 | cfs_rq->nr_spread_over); | 184 | cfs_rq->nr_spread_over); |
185 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
186 | #ifdef CONFIG_SMP | ||
187 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | ||
188 | #endif | ||
189 | #endif | ||
190 | } | ||
191 | |||
192 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | ||
193 | { | ||
194 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) | ||
195 | char path[128] = ""; | ||
196 | struct cgroup *cgroup = NULL; | ||
197 | struct task_group *tg = rt_rq->tg; | ||
198 | |||
199 | if (tg) | ||
200 | cgroup = tg->css.cgroup; | ||
201 | |||
202 | if (cgroup) | ||
203 | cgroup_path(cgroup, path, sizeof(path)); | ||
204 | |||
205 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); | ||
206 | #else | ||
207 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | ||
208 | #endif | ||
209 | |||
210 | |||
211 | #define P(x) \ | ||
212 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | ||
213 | #define PN(x) \ | ||
214 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) | ||
215 | |||
216 | P(rt_nr_running); | ||
217 | P(rt_throttled); | ||
218 | PN(rt_time); | ||
219 | PN(rt_runtime); | ||
220 | |||
221 | #undef PN | ||
222 | #undef P | ||
170 | } | 223 | } |
171 | 224 | ||
172 | static void print_cpu(struct seq_file *m, int cpu) | 225 | static void print_cpu(struct seq_file *m, int cpu) |
@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
208 | #undef PN | 261 | #undef PN |
209 | 262 | ||
210 | print_cfs_stats(m, cpu); | 263 | print_cfs_stats(m, cpu); |
264 | print_rt_stats(m, cpu); | ||
211 | 265 | ||
212 | print_rq(m, rq, cpu); | 266 | print_rq(m, rq, cpu); |
213 | } | 267 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ae848b71d4..f2aa987027d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
63 | 63 | ||
64 | /* | 64 | /* |
65 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) |
67 | * | 67 | * |
68 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
69 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
70 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
71 | */ | 71 | */ |
72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; |
73 | 73 | ||
74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
75 | 75 | ||
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
334 | #endif | 334 | #endif |
335 | 335 | ||
336 | /* | 336 | /* |
337 | * delta *= w / rw | ||
338 | */ | ||
339 | static inline unsigned long | ||
340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
341 | { | ||
342 | for_each_sched_entity(se) { | ||
343 | delta = calc_delta_mine(delta, | ||
344 | se->load.weight, &cfs_rq_of(se)->load); | ||
345 | } | ||
346 | |||
347 | return delta; | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * delta *= rw / w | ||
352 | */ | ||
353 | static inline unsigned long | ||
354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
355 | { | ||
356 | for_each_sched_entity(se) { | ||
357 | delta = calc_delta_mine(delta, | ||
358 | cfs_rq_of(se)->load.weight, &se->load); | ||
359 | } | ||
360 | |||
361 | return delta; | ||
362 | } | ||
363 | |||
364 | /* | ||
337 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
338 | * | 366 | * |
339 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running) | |||
362 | */ | 390 | */ |
363 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
364 | { | 392 | { |
365 | u64 slice = __sched_period(cfs_rq->nr_running); | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
366 | |||
367 | for_each_sched_entity(se) { | ||
368 | cfs_rq = cfs_rq_of(se); | ||
369 | |||
370 | slice *= se->load.weight; | ||
371 | do_div(slice, cfs_rq->load.weight); | ||
372 | } | ||
373 | |||
374 | |||
375 | return slice; | ||
376 | } | 394 | } |
377 | 395 | ||
378 | /* | 396 | /* |
379 | * We calculate the vruntime slice of a to be inserted task | 397 | * We calculate the vruntime slice of a to be inserted task |
380 | * | 398 | * |
381 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
382 | */ | 400 | */ |
383 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
384 | { | 402 | { |
385 | unsigned long nr_running = cfs_rq->nr_running; | 403 | unsigned long nr_running = cfs_rq->nr_running; |
386 | unsigned long weight; | ||
387 | u64 vslice; | ||
388 | 404 | ||
389 | if (!se->on_rq) | 405 | if (!se->on_rq) |
390 | nr_running++; | 406 | nr_running++; |
391 | 407 | ||
392 | vslice = __sched_period(nr_running); | 408 | return __sched_period(nr_running); |
409 | } | ||
410 | |||
411 | /* | ||
412 | * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in | ||
413 | * that it favours >=0 over <0. | ||
414 | * | ||
415 | * -20 | | ||
416 | * | | ||
417 | * 0 --------+------- | ||
418 | * .' | ||
419 | * 19 .' | ||
420 | * | ||
421 | */ | ||
422 | static unsigned long | ||
423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
424 | { | ||
425 | struct load_weight lw = { | ||
426 | .weight = NICE_0_LOAD, | ||
427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
428 | }; | ||
393 | 429 | ||
394 | for_each_sched_entity(se) { | 430 | for_each_sched_entity(se) { |
395 | cfs_rq = cfs_rq_of(se); | 431 | struct load_weight *se_lw = &se->load; |
432 | unsigned long rw = cfs_rq_of(se)->load.weight; | ||
433 | |||
434 | #ifdef CONFIG_FAIR_SCHED_GROUP | ||
435 | struct cfs_rq *cfs_rq = se->my_q; | ||
436 | struct task_group *tg = NULL | ||
437 | |||
438 | if (cfs_rq) | ||
439 | tg = cfs_rq->tg; | ||
440 | |||
441 | if (tg && tg->shares < NICE_0_LOAD) { | ||
442 | /* | ||
443 | * scale shares to what it would have been had | ||
444 | * tg->weight been NICE_0_LOAD: | ||
445 | * | ||
446 | * weight = 1024 * shares / tg->weight | ||
447 | */ | ||
448 | lw.weight *= se->load.weight; | ||
449 | lw.weight /= tg->shares; | ||
450 | |||
451 | lw.inv_weight = 0; | ||
452 | |||
453 | se_lw = &lw; | ||
454 | rw += lw.weight - se->load.weight; | ||
455 | } else | ||
456 | #endif | ||
396 | 457 | ||
397 | weight = cfs_rq->load.weight; | 458 | if (se->load.weight < NICE_0_LOAD) { |
398 | if (!se->on_rq) | 459 | se_lw = &lw; |
399 | weight += se->load.weight; | 460 | rw += NICE_0_LOAD - se->load.weight; |
461 | } | ||
400 | 462 | ||
401 | vslice *= NICE_0_LOAD; | 463 | delta = calc_delta_mine(delta, rw, se_lw); |
402 | do_div(vslice, weight); | ||
403 | } | 464 | } |
404 | 465 | ||
405 | return vslice; | 466 | return delta; |
406 | } | 467 | } |
407 | 468 | ||
408 | /* | 469 | /* |
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
419 | 480 | ||
420 | curr->sum_exec_runtime += delta_exec; | 481 | curr->sum_exec_runtime += delta_exec; |
421 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 482 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
422 | delta_exec_weighted = delta_exec; | 483 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
423 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
424 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
425 | &curr->load); | ||
426 | } | ||
427 | curr->vruntime += delta_exec_weighted; | 484 | curr->vruntime += delta_exec_weighted; |
428 | } | 485 | } |
429 | 486 | ||
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
510 | * Scheduling class queueing methods: | 567 | * Scheduling class queueing methods: |
511 | */ | 568 | */ |
512 | 569 | ||
570 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
571 | static void | ||
572 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
573 | { | ||
574 | cfs_rq->task_weight += weight; | ||
575 | } | ||
576 | #else | ||
577 | static inline void | ||
578 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
579 | { | ||
580 | } | ||
581 | #endif | ||
582 | |||
513 | static void | 583 | static void |
514 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 584 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
515 | { | 585 | { |
516 | update_load_add(&cfs_rq->load, se->load.weight); | 586 | update_load_add(&cfs_rq->load, se->load.weight); |
587 | if (!parent_entity(se)) | ||
588 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
589 | if (entity_is_task(se)) | ||
590 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
517 | cfs_rq->nr_running++; | 591 | cfs_rq->nr_running++; |
518 | se->on_rq = 1; | 592 | se->on_rq = 1; |
519 | list_add(&se->group_node, &cfs_rq->tasks); | 593 | list_add(&se->group_node, &cfs_rq->tasks); |
@@ -523,6 +597,10 @@ static void | |||
523 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 597 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
524 | { | 598 | { |
525 | update_load_sub(&cfs_rq->load, se->load.weight); | 599 | update_load_sub(&cfs_rq->load, se->load.weight); |
600 | if (!parent_entity(se)) | ||
601 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
602 | if (entity_is_task(se)) | ||
603 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
526 | cfs_rq->nr_running--; | 604 | cfs_rq->nr_running--; |
527 | se->on_rq = 0; | 605 | se->on_rq = 0; |
528 | list_del_init(&se->group_node); | 606 | list_del_init(&se->group_node); |
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
609 | 687 | ||
610 | if (!initial) { | 688 | if (!initial) { |
611 | /* sleeps up to a single latency don't count. */ | 689 | /* sleeps up to a single latency don't count. */ |
612 | if (sched_feat(NEW_FAIR_SLEEPERS)) | 690 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
613 | vruntime -= sysctl_sched_latency; | 691 | unsigned long thresh = sysctl_sched_latency; |
692 | |||
693 | /* | ||
694 | * convert the sleeper threshold into virtual time | ||
695 | */ | ||
696 | if (sched_feat(NORMALIZED_SLEEPER)) | ||
697 | thresh = calc_delta_fair(thresh, se); | ||
698 | |||
699 | vruntime -= thresh; | ||
700 | } | ||
614 | 701 | ||
615 | /* ensure we never gain time by being placed backwards. */ | 702 | /* ensure we never gain time by being placed backwards. */ |
616 | vruntime = max_vruntime(se->vruntime, vruntime); | 703 | vruntime = max_vruntime(se->vruntime, vruntime); |
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | |||
639 | __enqueue_entity(cfs_rq, se); | 726 | __enqueue_entity(cfs_rq, se); |
640 | } | 727 | } |
641 | 728 | ||
642 | static void update_avg(u64 *avg, u64 sample) | ||
643 | { | ||
644 | s64 diff = sample - *avg; | ||
645 | *avg += diff >> 3; | ||
646 | } | ||
647 | |||
648 | static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
649 | { | ||
650 | if (!se->last_wakeup) | ||
651 | return; | ||
652 | |||
653 | update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); | ||
654 | se->last_wakeup = 0; | ||
655 | } | ||
656 | |||
657 | static void | 729 | static void |
658 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 730 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
659 | { | 731 | { |
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
664 | 736 | ||
665 | update_stats_dequeue(cfs_rq, se); | 737 | update_stats_dequeue(cfs_rq, se); |
666 | if (sleep) { | 738 | if (sleep) { |
667 | update_avg_stats(cfs_rq, se); | ||
668 | #ifdef CONFIG_SCHEDSTATS | 739 | #ifdef CONFIG_SCHEDSTATS |
669 | if (entity_is_task(se)) { | 740 | if (entity_is_task(se)) { |
670 | struct task_struct *tsk = task_of(se); | 741 | struct task_struct *tsk = task_of(se); |
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
726 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 797 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
727 | } | 798 | } |
728 | 799 | ||
729 | static int | ||
730 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
731 | |||
732 | static struct sched_entity * | 800 | static struct sched_entity * |
733 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 801 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
734 | { | 802 | { |
735 | if (!cfs_rq->next) | 803 | struct rq *rq = rq_of(cfs_rq); |
736 | return se; | 804 | u64 pair_slice = rq->clock - cfs_rq->pair_start; |
737 | 805 | ||
738 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) | 806 | if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { |
807 | cfs_rq->pair_start = rq->clock; | ||
739 | return se; | 808 | return se; |
809 | } | ||
740 | 810 | ||
741 | return cfs_rq->next; | 811 | return cfs_rq->next; |
742 | } | 812 | } |
@@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
835 | hrtick_start(rq, delta, requeue); | 905 | hrtick_start(rq, delta, requeue); |
836 | } | 906 | } |
837 | } | 907 | } |
838 | #else | 908 | #else /* !CONFIG_SCHED_HRTICK */ |
839 | static inline void | 909 | static inline void |
840 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | 910 | hrtick_start_fair(struct rq *rq, struct task_struct *p) |
841 | { | 911 | { |
@@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
976 | } | 1046 | } |
977 | return cpu; | 1047 | return cpu; |
978 | } | 1048 | } |
979 | #else | 1049 | #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ |
980 | static inline int wake_idle(int cpu, struct task_struct *p) | 1050 | static inline int wake_idle(int cpu, struct task_struct *p) |
981 | { | 1051 | { |
982 | return cpu; | 1052 | return cpu; |
@@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
987 | 1057 | ||
988 | static const struct sched_class fair_sched_class; | 1058 | static const struct sched_class fair_sched_class; |
989 | 1059 | ||
1060 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1061 | /* | ||
1062 | * effective_load() calculates the load change as seen from the root_task_group | ||
1063 | * | ||
1064 | * Adding load to a group doesn't make a group heavier, but can cause movement | ||
1065 | * of group shares between cpus. Assuming the shares were perfectly aligned one | ||
1066 | * can calculate the shift in shares. | ||
1067 | * | ||
1068 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
1069 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
1070 | * this change. | ||
1071 | * | ||
1072 | * We compensate this by not only taking the current delta into account, but | ||
1073 | * also considering the delta between when the shares were last adjusted and | ||
1074 | * now. | ||
1075 | * | ||
1076 | * We still saw a performance dip; some tracing showed us that when balancing | ||
1077 | * between cgroup:/ and cgroup:/foo the number of affine wakeups increased | ||
1078 | * significantly. Therefore try to bias the error in the direction of failing | ||
1079 | * the affine wakeup. | ||
1080 | * | ||
1081 | */ | ||
1082 | static long effective_load(struct task_group *tg, int cpu, | ||
1083 | long wl, long wg) | ||
1084 | { | ||
1085 | struct sched_entity *se = tg->se[cpu]; | ||
1086 | long more_w; | ||
1087 | |||
1088 | if (!tg->parent) | ||
1089 | return wl; | ||
1090 | |||
1091 | /* | ||
1092 | * By not taking the decrease of shares on the other cpu into | ||
1093 | * account our error leans towards reducing the affine wakeups. | ||
1094 | */ | ||
1095 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
1096 | return wl; | ||
1097 | |||
1098 | /* | ||
1099 | * Instead of using this increment, also add the difference | ||
1100 | * between when the shares were last updated and now. | ||
1101 | */ | ||
1102 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1103 | wl += more_w; | ||
1104 | wg += more_w; | ||
1105 | |||
1106 | for_each_sched_entity(se) { | ||
1107 | #define D(n) (likely(n) ? (n) : 1) | ||
1108 | |||
1109 | long S, rw, s, a, b; | ||
1110 | |||
1111 | S = se->my_q->tg->shares; | ||
1112 | s = se->my_q->shares; | ||
1113 | rw = se->my_q->rq_weight; | ||
1114 | |||
1115 | a = S*(rw + wl); | ||
1116 | b = S*rw + s*wg; | ||
1117 | |||
1118 | wl = s*(a-b)/D(b); | ||
1119 | /* | ||
1120 | * Assume the group is already running and will | ||
1121 | * thus already be accounted for in the weight. | ||
1122 | * | ||
1123 | * That is, moving shares between CPUs, does not | ||
1124 | * alter the group weight. | ||
1125 | */ | ||
1126 | wg = 0; | ||
1127 | #undef D | ||
1128 | } | ||
1129 | |||
1130 | return wl; | ||
1131 | } | ||
1132 | |||
1133 | #else | ||
1134 | |||
1135 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | ||
1136 | unsigned long wl, unsigned long wg) | ||
1137 | { | ||
1138 | return wl; | ||
1139 | } | ||
1140 | |||
1141 | #endif | ||
1142 | |||
990 | static int | 1143 | static int |
991 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | 1144 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, |
992 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | 1145 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, |
@@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
994 | unsigned int imbalance) | 1147 | unsigned int imbalance) |
995 | { | 1148 | { |
996 | struct task_struct *curr = this_rq->curr; | 1149 | struct task_struct *curr = this_rq->curr; |
1150 | struct task_group *tg; | ||
997 | unsigned long tl = this_load; | 1151 | unsigned long tl = this_load; |
998 | unsigned long tl_per_task; | 1152 | unsigned long tl_per_task; |
1153 | unsigned long weight; | ||
999 | int balanced; | 1154 | int balanced; |
1000 | 1155 | ||
1001 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) | 1156 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) |
@@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
1006 | * effect of the currently running task from the load | 1161 | * effect of the currently running task from the load |
1007 | * of the current CPU: | 1162 | * of the current CPU: |
1008 | */ | 1163 | */ |
1009 | if (sync) | 1164 | if (sync) { |
1010 | tl -= current->se.load.weight; | 1165 | tg = task_group(current); |
1166 | weight = current->se.load.weight; | ||
1167 | |||
1168 | tl += effective_load(tg, this_cpu, -weight, -weight); | ||
1169 | load += effective_load(tg, prev_cpu, 0, -weight); | ||
1170 | } | ||
1011 | 1171 | ||
1012 | balanced = 100*(tl + p->se.load.weight) <= imbalance*load; | 1172 | tg = task_group(p); |
1173 | weight = p->se.load.weight; | ||
1174 | |||
1175 | balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | ||
1176 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); | ||
1013 | 1177 | ||
1014 | /* | 1178 | /* |
1015 | * If the currently running task will sleep within | 1179 | * If the currently running task will sleep within |
1016 | * a reasonable amount of time then attract this newly | 1180 | * a reasonable amount of time then attract this newly |
1017 | * woken task: | 1181 | * woken task: |
1018 | */ | 1182 | */ |
1019 | if (sync && balanced && curr->sched_class == &fair_sched_class) { | 1183 | if (sync && balanced) { |
1020 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | 1184 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && |
1021 | p->se.avg_overlap < sysctl_sched_migration_cost) | 1185 | p->se.avg_overlap < sysctl_sched_migration_cost) |
1022 | return 1; | 1186 | return 1; |
1023 | } | 1187 | } |
1024 | 1188 | ||
@@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se) | |||
1111 | unsigned long gran = sysctl_sched_wakeup_granularity; | 1275 | unsigned long gran = sysctl_sched_wakeup_granularity; |
1112 | 1276 | ||
1113 | /* | 1277 | /* |
1114 | * More easily preempt - nice tasks, while not making | 1278 | * More easily preempt - nice tasks, while not making it harder for |
1115 | * it harder for + nice tasks. | 1279 | * + nice tasks. |
1116 | */ | 1280 | */ |
1117 | if (unlikely(se->load.weight > NICE_0_LOAD)) | 1281 | if (sched_feat(ASYM_GRAN)) |
1118 | gran = calc_delta_fair(gran, &se->load); | 1282 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); |
1283 | else | ||
1284 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); | ||
1119 | 1285 | ||
1120 | return gran; | 1286 | return gran; |
1121 | } | 1287 | } |
@@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1177 | return; | 1343 | return; |
1178 | } | 1344 | } |
1179 | 1345 | ||
1180 | se->last_wakeup = se->sum_exec_runtime; | ||
1181 | if (unlikely(se == pse)) | 1346 | if (unlikely(se == pse)) |
1182 | return; | 1347 | return; |
1183 | 1348 | ||
@@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | |||
1275 | struct task_struct *p = NULL; | 1440 | struct task_struct *p = NULL; |
1276 | struct sched_entity *se; | 1441 | struct sched_entity *se; |
1277 | 1442 | ||
1278 | if (next == &cfs_rq->tasks) | 1443 | while (next != &cfs_rq->tasks) { |
1279 | return NULL; | ||
1280 | |||
1281 | /* Skip over entities that are not tasks */ | ||
1282 | do { | ||
1283 | se = list_entry(next, struct sched_entity, group_node); | 1444 | se = list_entry(next, struct sched_entity, group_node); |
1284 | next = next->next; | 1445 | next = next->next; |
1285 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
1286 | 1446 | ||
1287 | if (next == &cfs_rq->tasks) | 1447 | /* Skip over entities that are not tasks */ |
1288 | return NULL; | 1448 | if (entity_is_task(se)) { |
1449 | p = task_of(se); | ||
1450 | break; | ||
1451 | } | ||
1452 | } | ||
1289 | 1453 | ||
1290 | cfs_rq->balance_iterator = next; | 1454 | cfs_rq->balance_iterator = next; |
1291 | |||
1292 | if (entity_is_task(se)) | ||
1293 | p = task_of(se); | ||
1294 | |||
1295 | return p; | 1455 | return p; |
1296 | } | 1456 | } |
1297 | 1457 | ||
@@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
1309 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1469 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
1310 | } | 1470 | } |
1311 | 1471 | ||
1312 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1472 | static unsigned long |
1313 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1473 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1474 | unsigned long max_load_move, struct sched_domain *sd, | ||
1475 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
1476 | struct cfs_rq *cfs_rq) | ||
1314 | { | 1477 | { |
1315 | struct sched_entity *curr; | 1478 | struct rq_iterator cfs_rq_iterator; |
1316 | struct task_struct *p; | ||
1317 | |||
1318 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
1319 | return MAX_PRIO; | ||
1320 | |||
1321 | curr = cfs_rq->curr; | ||
1322 | if (!curr) | ||
1323 | curr = __pick_next_entity(cfs_rq); | ||
1324 | 1479 | ||
1325 | p = task_of(curr); | 1480 | cfs_rq_iterator.start = load_balance_start_fair; |
1481 | cfs_rq_iterator.next = load_balance_next_fair; | ||
1482 | cfs_rq_iterator.arg = cfs_rq; | ||
1326 | 1483 | ||
1327 | return p->prio; | 1484 | return balance_tasks(this_rq, this_cpu, busiest, |
1485 | max_load_move, sd, idle, all_pinned, | ||
1486 | this_best_prio, &cfs_rq_iterator); | ||
1328 | } | 1487 | } |
1329 | #endif | ||
1330 | 1488 | ||
1489 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1331 | static unsigned long | 1490 | static unsigned long |
1332 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1491 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1333 | unsigned long max_load_move, | 1492 | unsigned long max_load_move, |
1334 | struct sched_domain *sd, enum cpu_idle_type idle, | 1493 | struct sched_domain *sd, enum cpu_idle_type idle, |
1335 | int *all_pinned, int *this_best_prio) | 1494 | int *all_pinned, int *this_best_prio) |
1336 | { | 1495 | { |
1337 | struct cfs_rq *busy_cfs_rq; | ||
1338 | long rem_load_move = max_load_move; | 1496 | long rem_load_move = max_load_move; |
1339 | struct rq_iterator cfs_rq_iterator; | 1497 | int busiest_cpu = cpu_of(busiest); |
1340 | 1498 | struct task_group *tg; | |
1341 | cfs_rq_iterator.start = load_balance_start_fair; | ||
1342 | cfs_rq_iterator.next = load_balance_next_fair; | ||
1343 | 1499 | ||
1344 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1500 | rcu_read_lock(); |
1345 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1501 | update_h_load(busiest_cpu); |
1346 | struct cfs_rq *this_cfs_rq; | ||
1347 | long imbalance; | ||
1348 | unsigned long maxload; | ||
1349 | 1502 | ||
1350 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1503 | list_for_each_entry(tg, &task_groups, list) { |
1504 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | ||
1505 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | ||
1506 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | ||
1507 | u64 rem_load, moved_load; | ||
1351 | 1508 | ||
1352 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1509 | /* |
1353 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1510 | * empty group |
1354 | if (imbalance <= 0) | 1511 | */ |
1512 | if (!busiest_cfs_rq->task_weight) | ||
1355 | continue; | 1513 | continue; |
1356 | 1514 | ||
1357 | /* Don't pull more than imbalance/2 */ | 1515 | rem_load = (u64)rem_load_move * busiest_weight; |
1358 | imbalance /= 2; | 1516 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
1359 | maxload = min(rem_load_move, imbalance); | ||
1360 | 1517 | ||
1361 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1518 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
1362 | #else | 1519 | rem_load, sd, idle, all_pinned, this_best_prio, |
1363 | # define maxload rem_load_move | 1520 | tg->cfs_rq[busiest_cpu]); |
1364 | #endif | 1521 | |
1365 | /* | 1522 | if (!moved_load) |
1366 | * pass busy_cfs_rq argument into | 1523 | continue; |
1367 | * load_balance_[start|next]_fair iterators | 1524 | |
1368 | */ | 1525 | moved_load *= busiest_h_load; |
1369 | cfs_rq_iterator.arg = busy_cfs_rq; | 1526 | moved_load = div_u64(moved_load, busiest_weight + 1); |
1370 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
1371 | maxload, sd, idle, all_pinned, | ||
1372 | this_best_prio, | ||
1373 | &cfs_rq_iterator); | ||
1374 | 1527 | ||
1375 | if (rem_load_move <= 0) | 1528 | rem_load_move -= moved_load; |
1529 | if (rem_load_move < 0) | ||
1376 | break; | 1530 | break; |
1377 | } | 1531 | } |
1532 | rcu_read_unlock(); | ||
1378 | 1533 | ||
1379 | return max_load_move - rem_load_move; | 1534 | return max_load_move - rem_load_move; |
1380 | } | 1535 | } |
1536 | #else | ||
1537 | static unsigned long | ||
1538 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1539 | unsigned long max_load_move, | ||
1540 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1541 | int *all_pinned, int *this_best_prio) | ||
1542 | { | ||
1543 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
1544 | max_load_move, sd, idle, all_pinned, | ||
1545 | this_best_prio, &busiest->cfs); | ||
1546 | } | ||
1547 | #endif | ||
1381 | 1548 | ||
1382 | static int | 1549 | static int |
1383 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1550 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
@@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1402 | 1569 | ||
1403 | return 0; | 1570 | return 0; |
1404 | } | 1571 | } |
1405 | #endif | 1572 | #endif /* CONFIG_SMP */ |
1406 | 1573 | ||
1407 | /* | 1574 | /* |
1408 | * scheduler tick hitting a task of our scheduling class: | 1575 | * scheduler tick hitting a task of our scheduling class: |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1c7283cb9581..862b06bd560a 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -1,4 +1,5 @@ | |||
1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) |
2 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | ||
2 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | 3 | SCHED_FEAT(WAKEUP_PREEMPT, 1) |
3 | SCHED_FEAT(START_DEBIT, 1) | 4 | SCHED_FEAT(START_DEBIT, 1) |
4 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 5 | SCHED_FEAT(AFFINE_WAKEUPS, 1) |
@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) | |||
6 | SCHED_FEAT(SYNC_WAKEUPS, 1) | 7 | SCHED_FEAT(SYNC_WAKEUPS, 1) |
7 | SCHED_FEAT(HRTICK, 1) | 8 | SCHED_FEAT(HRTICK, 1) |
8 | SCHED_FEAT(DOUBLE_TICK, 0) | 9 | SCHED_FEAT(DOUBLE_TICK, 0) |
9 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | 10 | SCHED_FEAT(ASYM_GRAN, 1) |
10 | SCHED_FEAT(DEADLINE, 1) | 11 | SCHED_FEAT(LB_BIAS, 0) |
12 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) | ||
13 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0f3c19197fa4..47ceac9e8552 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) | |||
12 | 12 | ||
13 | static inline void rt_set_overload(struct rq *rq) | 13 | static inline void rt_set_overload(struct rq *rq) |
14 | { | 14 | { |
15 | if (!rq->online) | ||
16 | return; | ||
17 | |||
15 | cpu_set(rq->cpu, rq->rd->rto_mask); | 18 | cpu_set(rq->cpu, rq->rd->rto_mask); |
16 | /* | 19 | /* |
17 | * Make sure the mask is visible before we set | 20 | * Make sure the mask is visible before we set |
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) | |||
26 | 29 | ||
27 | static inline void rt_clear_overload(struct rq *rq) | 30 | static inline void rt_clear_overload(struct rq *rq) |
28 | { | 31 | { |
32 | if (!rq->online) | ||
33 | return; | ||
34 | |||
29 | /* the order here really doesn't matter */ | 35 | /* the order here really doesn't matter */ |
30 | atomic_dec(&rq->rd->rto_count); | 36 | atomic_dec(&rq->rd->rto_count); |
31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | 37 | cpu_clear(rq->cpu, rq->rd->rto_mask); |
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
155 | return &rt_rq->tg->rt_bandwidth; | 161 | return &rt_rq->tg->rt_bandwidth; |
156 | } | 162 | } |
157 | 163 | ||
158 | #else | 164 | #else /* !CONFIG_RT_GROUP_SCHED */ |
159 | 165 | ||
160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 166 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
161 | { | 167 | { |
@@ -220,49 +226,10 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
220 | return &def_rt_bandwidth; | 226 | return &def_rt_bandwidth; |
221 | } | 227 | } |
222 | 228 | ||
223 | #endif | 229 | #endif /* CONFIG_RT_GROUP_SCHED */ |
224 | |||
225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
226 | { | ||
227 | int i, idle = 1; | ||
228 | cpumask_t span; | ||
229 | |||
230 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
231 | return 1; | ||
232 | |||
233 | span = sched_rt_period_mask(); | ||
234 | for_each_cpu_mask(i, span) { | ||
235 | int enqueue = 0; | ||
236 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
237 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
238 | |||
239 | spin_lock(&rq->lock); | ||
240 | if (rt_rq->rt_time) { | ||
241 | u64 runtime; | ||
242 | |||
243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
244 | runtime = rt_rq->rt_runtime; | ||
245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
247 | rt_rq->rt_throttled = 0; | ||
248 | enqueue = 1; | ||
249 | } | ||
250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
251 | idle = 0; | ||
252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
253 | } else if (rt_rq->rt_nr_running) | ||
254 | idle = 0; | ||
255 | |||
256 | if (enqueue) | ||
257 | sched_rt_rq_enqueue(rt_rq); | ||
258 | spin_unlock(&rq->lock); | ||
259 | } | ||
260 | |||
261 | return idle; | ||
262 | } | ||
263 | 230 | ||
264 | #ifdef CONFIG_SMP | 231 | #ifdef CONFIG_SMP |
265 | static int balance_runtime(struct rt_rq *rt_rq) | 232 | static int do_balance_runtime(struct rt_rq *rt_rq) |
266 | { | 233 | { |
267 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 234 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
268 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | 235 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; |
@@ -281,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq) | |||
281 | continue; | 248 | continue; |
282 | 249 | ||
283 | spin_lock(&iter->rt_runtime_lock); | 250 | spin_lock(&iter->rt_runtime_lock); |
251 | if (iter->rt_runtime == RUNTIME_INF) | ||
252 | goto next; | ||
253 | |||
284 | diff = iter->rt_runtime - iter->rt_time; | 254 | diff = iter->rt_runtime - iter->rt_time; |
285 | if (diff > 0) { | 255 | if (diff > 0) { |
286 | do_div(diff, weight); | 256 | do_div(diff, weight); |
@@ -294,13 +264,163 @@ static int balance_runtime(struct rt_rq *rt_rq) | |||
294 | break; | 264 | break; |
295 | } | 265 | } |
296 | } | 266 | } |
267 | next: | ||
297 | spin_unlock(&iter->rt_runtime_lock); | 268 | spin_unlock(&iter->rt_runtime_lock); |
298 | } | 269 | } |
299 | spin_unlock(&rt_b->rt_runtime_lock); | 270 | spin_unlock(&rt_b->rt_runtime_lock); |
300 | 271 | ||
301 | return more; | 272 | return more; |
302 | } | 273 | } |
303 | #endif | 274 | |
275 | static void __disable_runtime(struct rq *rq) | ||
276 | { | ||
277 | struct root_domain *rd = rq->rd; | ||
278 | struct rt_rq *rt_rq; | ||
279 | |||
280 | if (unlikely(!scheduler_running)) | ||
281 | return; | ||
282 | |||
283 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
284 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
285 | s64 want; | ||
286 | int i; | ||
287 | |||
288 | spin_lock(&rt_b->rt_runtime_lock); | ||
289 | spin_lock(&rt_rq->rt_runtime_lock); | ||
290 | if (rt_rq->rt_runtime == RUNTIME_INF || | ||
291 | rt_rq->rt_runtime == rt_b->rt_runtime) | ||
292 | goto balanced; | ||
293 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
294 | |||
295 | want = rt_b->rt_runtime - rt_rq->rt_runtime; | ||
296 | |||
297 | for_each_cpu_mask(i, rd->span) { | ||
298 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
299 | s64 diff; | ||
300 | |||
301 | if (iter == rt_rq) | ||
302 | continue; | ||
303 | |||
304 | spin_lock(&iter->rt_runtime_lock); | ||
305 | if (want > 0) { | ||
306 | diff = min_t(s64, iter->rt_runtime, want); | ||
307 | iter->rt_runtime -= diff; | ||
308 | want -= diff; | ||
309 | } else { | ||
310 | iter->rt_runtime -= want; | ||
311 | want -= want; | ||
312 | } | ||
313 | spin_unlock(&iter->rt_runtime_lock); | ||
314 | |||
315 | if (!want) | ||
316 | break; | ||
317 | } | ||
318 | |||
319 | spin_lock(&rt_rq->rt_runtime_lock); | ||
320 | BUG_ON(want); | ||
321 | balanced: | ||
322 | rt_rq->rt_runtime = RUNTIME_INF; | ||
323 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
324 | spin_unlock(&rt_b->rt_runtime_lock); | ||
325 | } | ||
326 | } | ||
327 | |||
328 | static void disable_runtime(struct rq *rq) | ||
329 | { | ||
330 | unsigned long flags; | ||
331 | |||
332 | spin_lock_irqsave(&rq->lock, flags); | ||
333 | __disable_runtime(rq); | ||
334 | spin_unlock_irqrestore(&rq->lock, flags); | ||
335 | } | ||
336 | |||
337 | static void __enable_runtime(struct rq *rq) | ||
338 | { | ||
339 | struct rt_rq *rt_rq; | ||
340 | |||
341 | if (unlikely(!scheduler_running)) | ||
342 | return; | ||
343 | |||
344 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
345 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
346 | |||
347 | spin_lock(&rt_b->rt_runtime_lock); | ||
348 | spin_lock(&rt_rq->rt_runtime_lock); | ||
349 | rt_rq->rt_runtime = rt_b->rt_runtime; | ||
350 | rt_rq->rt_time = 0; | ||
351 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
352 | spin_unlock(&rt_b->rt_runtime_lock); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | static void enable_runtime(struct rq *rq) | ||
357 | { | ||
358 | unsigned long flags; | ||
359 | |||
360 | spin_lock_irqsave(&rq->lock, flags); | ||
361 | __enable_runtime(rq); | ||
362 | spin_unlock_irqrestore(&rq->lock, flags); | ||
363 | } | ||
364 | |||
365 | static int balance_runtime(struct rt_rq *rt_rq) | ||
366 | { | ||
367 | int more = 0; | ||
368 | |||
369 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | ||
370 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
371 | more = do_balance_runtime(rt_rq); | ||
372 | spin_lock(&rt_rq->rt_runtime_lock); | ||
373 | } | ||
374 | |||
375 | return more; | ||
376 | } | ||
377 | #else /* !CONFIG_SMP */ | ||
378 | static inline int balance_runtime(struct rt_rq *rt_rq) | ||
379 | { | ||
380 | return 0; | ||
381 | } | ||
382 | #endif /* CONFIG_SMP */ | ||
383 | |||
384 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
385 | { | ||
386 | int i, idle = 1; | ||
387 | cpumask_t span; | ||
388 | |||
389 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
390 | return 1; | ||
391 | |||
392 | span = sched_rt_period_mask(); | ||
393 | for_each_cpu_mask(i, span) { | ||
394 | int enqueue = 0; | ||
395 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
396 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
397 | |||
398 | spin_lock(&rq->lock); | ||
399 | if (rt_rq->rt_time) { | ||
400 | u64 runtime; | ||
401 | |||
402 | spin_lock(&rt_rq->rt_runtime_lock); | ||
403 | if (rt_rq->rt_throttled) | ||
404 | balance_runtime(rt_rq); | ||
405 | runtime = rt_rq->rt_runtime; | ||
406 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
407 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
408 | rt_rq->rt_throttled = 0; | ||
409 | enqueue = 1; | ||
410 | } | ||
411 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
412 | idle = 0; | ||
413 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
414 | } else if (rt_rq->rt_nr_running) | ||
415 | idle = 0; | ||
416 | |||
417 | if (enqueue) | ||
418 | sched_rt_rq_enqueue(rt_rq); | ||
419 | spin_unlock(&rq->lock); | ||
420 | } | ||
421 | |||
422 | return idle; | ||
423 | } | ||
304 | 424 | ||
305 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 425 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
306 | { | 426 | { |
@@ -327,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
327 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | 447 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) |
328 | return 0; | 448 | return 0; |
329 | 449 | ||
330 | #ifdef CONFIG_SMP | 450 | balance_runtime(rt_rq); |
331 | if (rt_rq->rt_time > runtime) { | 451 | runtime = sched_rt_runtime(rt_rq); |
332 | int more; | 452 | if (runtime == RUNTIME_INF) |
333 | 453 | return 0; | |
334 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
335 | more = balance_runtime(rt_rq); | ||
336 | spin_lock(&rt_rq->rt_runtime_lock); | ||
337 | |||
338 | if (more) | ||
339 | runtime = sched_rt_runtime(rt_rq); | ||
340 | } | ||
341 | #endif | ||
342 | 454 | ||
343 | if (rt_rq->rt_time > runtime) { | 455 | if (rt_rq->rt_time > runtime) { |
344 | rt_rq->rt_throttled = 1; | 456 | rt_rq->rt_throttled = 1; |
@@ -392,12 +504,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
392 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | 504 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); |
393 | rt_rq->rt_nr_running++; | 505 | rt_rq->rt_nr_running++; |
394 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 506 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
395 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | 507 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) { |
508 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
509 | |||
396 | rt_rq->highest_prio = rt_se_prio(rt_se); | 510 | rt_rq->highest_prio = rt_se_prio(rt_se); |
511 | #ifdef CONFIG_SMP | ||
512 | if (rq->online) | ||
513 | cpupri_set(&rq->rd->cpupri, rq->cpu, | ||
514 | rt_se_prio(rt_se)); | ||
515 | #endif | ||
516 | } | ||
397 | #endif | 517 | #endif |
398 | #ifdef CONFIG_SMP | 518 | #ifdef CONFIG_SMP |
399 | if (rt_se->nr_cpus_allowed > 1) { | 519 | if (rt_se->nr_cpus_allowed > 1) { |
400 | struct rq *rq = rq_of_rt_rq(rt_rq); | 520 | struct rq *rq = rq_of_rt_rq(rt_rq); |
521 | |||
401 | rq->rt.rt_nr_migratory++; | 522 | rq->rt.rt_nr_migratory++; |
402 | } | 523 | } |
403 | 524 | ||
@@ -417,6 +538,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
417 | static inline | 538 | static inline |
418 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 539 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
419 | { | 540 | { |
541 | #ifdef CONFIG_SMP | ||
542 | int highest_prio = rt_rq->highest_prio; | ||
543 | #endif | ||
544 | |||
420 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | 545 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); |
421 | WARN_ON(!rt_rq->rt_nr_running); | 546 | WARN_ON(!rt_rq->rt_nr_running); |
422 | rt_rq->rt_nr_running--; | 547 | rt_rq->rt_nr_running--; |
@@ -440,6 +565,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
440 | rq->rt.rt_nr_migratory--; | 565 | rq->rt.rt_nr_migratory--; |
441 | } | 566 | } |
442 | 567 | ||
568 | if (rt_rq->highest_prio != highest_prio) { | ||
569 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
570 | |||
571 | if (rq->online) | ||
572 | cpupri_set(&rq->rd->cpupri, rq->cpu, | ||
573 | rt_rq->highest_prio); | ||
574 | } | ||
575 | |||
443 | update_rt_migration(rq_of_rt_rq(rt_rq)); | 576 | update_rt_migration(rq_of_rt_rq(rt_rq)); |
444 | #endif /* CONFIG_SMP */ | 577 | #endif /* CONFIG_SMP */ |
445 | #ifdef CONFIG_RT_GROUP_SCHED | 578 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -455,6 +588,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
455 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 588 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
456 | struct rt_prio_array *array = &rt_rq->active; | 589 | struct rt_prio_array *array = &rt_rq->active; |
457 | struct rt_rq *group_rq = group_rt_rq(rt_se); | 590 | struct rt_rq *group_rq = group_rt_rq(rt_se); |
591 | struct list_head *queue = array->queue + rt_se_prio(rt_se); | ||
458 | 592 | ||
459 | /* | 593 | /* |
460 | * Don't enqueue the group if its throttled, or when empty. | 594 | * Don't enqueue the group if its throttled, or when empty. |
@@ -465,7 +599,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
465 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 599 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
466 | return; | 600 | return; |
467 | 601 | ||
468 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | 602 | if (rt_se->nr_cpus_allowed == 1) |
603 | list_add(&rt_se->run_list, queue); | ||
604 | else | ||
605 | list_add_tail(&rt_se->run_list, queue); | ||
606 | |||
469 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 607 | __set_bit(rt_se_prio(rt_se), array->bitmap); |
470 | 608 | ||
471 | inc_rt_tasks(rt_se, rt_rq); | 609 | inc_rt_tasks(rt_se, rt_rq); |
@@ -532,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
532 | rt_se->timeout = 0; | 670 | rt_se->timeout = 0; |
533 | 671 | ||
534 | enqueue_rt_entity(rt_se); | 672 | enqueue_rt_entity(rt_se); |
673 | |||
674 | inc_cpu_load(rq, p->se.load.weight); | ||
535 | } | 675 | } |
536 | 676 | ||
537 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 677 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
@@ -540,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
540 | 680 | ||
541 | update_curr_rt(rq); | 681 | update_curr_rt(rq); |
542 | dequeue_rt_entity(rt_se); | 682 | dequeue_rt_entity(rt_se); |
683 | |||
684 | dec_cpu_load(rq, p->se.load.weight); | ||
543 | } | 685 | } |
544 | 686 | ||
545 | /* | 687 | /* |
@@ -550,10 +692,12 @@ static | |||
550 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | 692 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) |
551 | { | 693 | { |
552 | struct rt_prio_array *array = &rt_rq->active; | 694 | struct rt_prio_array *array = &rt_rq->active; |
553 | struct list_head *queue = array->queue + rt_se_prio(rt_se); | ||
554 | 695 | ||
555 | if (on_rt_rq(rt_se)) | 696 | if (on_rt_rq(rt_se)) { |
556 | list_move_tail(&rt_se->run_list, queue); | 697 | list_del_init(&rt_se->run_list); |
698 | list_add_tail(&rt_se->run_list, | ||
699 | array->queue + rt_se_prio(rt_se)); | ||
700 | } | ||
557 | } | 701 | } |
558 | 702 | ||
559 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 703 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
@@ -616,8 +760,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync) | |||
616 | */ | 760 | */ |
617 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | 761 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) |
618 | { | 762 | { |
619 | if (p->prio < rq->curr->prio) | 763 | if (p->prio < rq->curr->prio) { |
620 | resched_task(rq->curr); | 764 | resched_task(rq->curr); |
765 | return; | ||
766 | } | ||
767 | |||
768 | #ifdef CONFIG_SMP | ||
769 | /* | ||
770 | * If: | ||
771 | * | ||
772 | * - the newly woken task is of equal priority to the current task | ||
773 | * - the newly woken task is non-migratable while current is migratable | ||
774 | * - current will be preempted on the next reschedule | ||
775 | * | ||
776 | * we should check to see if current can readily move to a different | ||
777 | * cpu. If so, we will reschedule to allow the push logic to try | ||
778 | * to move current somewhere else, making room for our non-migratable | ||
779 | * task. | ||
780 | */ | ||
781 | if((p->prio == rq->curr->prio) | ||
782 | && p->rt.nr_cpus_allowed == 1 | ||
783 | && rq->curr->rt.nr_cpus_allowed != 1) { | ||
784 | cpumask_t mask; | ||
785 | |||
786 | if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) | ||
787 | /* | ||
788 | * There appears to be other cpus that can accept | ||
789 | * current, so lets reschedule to try and push it away | ||
790 | */ | ||
791 | resched_task(rq->curr); | ||
792 | } | ||
793 | #endif | ||
621 | } | 794 | } |
622 | 795 | ||
623 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, | 796 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
@@ -720,73 +893,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
720 | 893 | ||
721 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); | 894 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
722 | 895 | ||
723 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) | ||
724 | { | ||
725 | int lowest_prio = -1; | ||
726 | int lowest_cpu = -1; | ||
727 | int count = 0; | ||
728 | int cpu; | ||
729 | |||
730 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); | ||
731 | |||
732 | /* | ||
733 | * Scan each rq for the lowest prio. | ||
734 | */ | ||
735 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
736 | struct rq *rq = cpu_rq(cpu); | ||
737 | |||
738 | /* We look for lowest RT prio or non-rt CPU */ | ||
739 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
740 | /* | ||
741 | * if we already found a low RT queue | ||
742 | * and now we found this non-rt queue | ||
743 | * clear the mask and set our bit. | ||
744 | * Otherwise just return the queue as is | ||
745 | * and the count==1 will cause the algorithm | ||
746 | * to use the first bit found. | ||
747 | */ | ||
748 | if (lowest_cpu != -1) { | ||
749 | cpus_clear(*lowest_mask); | ||
750 | cpu_set(rq->cpu, *lowest_mask); | ||
751 | } | ||
752 | return 1; | ||
753 | } | ||
754 | |||
755 | /* no locking for now */ | ||
756 | if ((rq->rt.highest_prio > task->prio) | ||
757 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
758 | if (rq->rt.highest_prio > lowest_prio) { | ||
759 | /* new low - clear old data */ | ||
760 | lowest_prio = rq->rt.highest_prio; | ||
761 | lowest_cpu = cpu; | ||
762 | count = 0; | ||
763 | } | ||
764 | count++; | ||
765 | } else | ||
766 | cpu_clear(cpu, *lowest_mask); | ||
767 | } | ||
768 | |||
769 | /* | ||
770 | * Clear out all the set bits that represent | ||
771 | * runqueues that were of higher prio than | ||
772 | * the lowest_prio. | ||
773 | */ | ||
774 | if (lowest_cpu > 0) { | ||
775 | /* | ||
776 | * Perhaps we could add another cpumask op to | ||
777 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
778 | * Then that could be optimized to use memset and such. | ||
779 | */ | ||
780 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
781 | if (cpu >= lowest_cpu) | ||
782 | break; | ||
783 | cpu_clear(cpu, *lowest_mask); | ||
784 | } | ||
785 | } | ||
786 | |||
787 | return count; | ||
788 | } | ||
789 | |||
790 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) | 896 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
791 | { | 897 | { |
792 | int first; | 898 | int first; |
@@ -808,17 +914,12 @@ static int find_lowest_rq(struct task_struct *task) | |||
808 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | 914 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); |
809 | int this_cpu = smp_processor_id(); | 915 | int this_cpu = smp_processor_id(); |
810 | int cpu = task_cpu(task); | 916 | int cpu = task_cpu(task); |
811 | int count = find_lowest_cpus(task, lowest_mask); | ||
812 | 917 | ||
813 | if (!count) | 918 | if (task->rt.nr_cpus_allowed == 1) |
814 | return -1; /* No targets found */ | 919 | return -1; /* No other targets possible */ |
815 | 920 | ||
816 | /* | 921 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
817 | * There is no sense in performing an optimal search if only one | 922 | return -1; /* No targets found */ |
818 | * target is found. | ||
819 | */ | ||
820 | if (count == 1) | ||
821 | return first_cpu(*lowest_mask); | ||
822 | 923 | ||
823 | /* | 924 | /* |
824 | * At this point we have built a mask of cpus representing the | 925 | * At this point we have built a mask of cpus representing the |
@@ -1163,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1163 | } | 1264 | } |
1164 | 1265 | ||
1165 | /* Assumes rq->lock is held */ | 1266 | /* Assumes rq->lock is held */ |
1166 | static void join_domain_rt(struct rq *rq) | 1267 | static void rq_online_rt(struct rq *rq) |
1167 | { | 1268 | { |
1168 | if (rq->rt.overloaded) | 1269 | if (rq->rt.overloaded) |
1169 | rt_set_overload(rq); | 1270 | rt_set_overload(rq); |
1271 | |||
1272 | __enable_runtime(rq); | ||
1273 | |||
1274 | cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); | ||
1170 | } | 1275 | } |
1171 | 1276 | ||
1172 | /* Assumes rq->lock is held */ | 1277 | /* Assumes rq->lock is held */ |
1173 | static void leave_domain_rt(struct rq *rq) | 1278 | static void rq_offline_rt(struct rq *rq) |
1174 | { | 1279 | { |
1175 | if (rq->rt.overloaded) | 1280 | if (rq->rt.overloaded) |
1176 | rt_clear_overload(rq); | 1281 | rt_clear_overload(rq); |
1282 | |||
1283 | __disable_runtime(rq); | ||
1284 | |||
1285 | cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); | ||
1177 | } | 1286 | } |
1178 | 1287 | ||
1179 | /* | 1288 | /* |
@@ -1336,8 +1445,8 @@ static const struct sched_class rt_sched_class = { | |||
1336 | .load_balance = load_balance_rt, | 1445 | .load_balance = load_balance_rt, |
1337 | .move_one_task = move_one_task_rt, | 1446 | .move_one_task = move_one_task_rt, |
1338 | .set_cpus_allowed = set_cpus_allowed_rt, | 1447 | .set_cpus_allowed = set_cpus_allowed_rt, |
1339 | .join_domain = join_domain_rt, | 1448 | .rq_online = rq_online_rt, |
1340 | .leave_domain = leave_domain_rt, | 1449 | .rq_offline = rq_offline_rt, |
1341 | .pre_schedule = pre_schedule_rt, | 1450 | .pre_schedule = pre_schedule_rt, |
1342 | .post_schedule = post_schedule_rt, | 1451 | .post_schedule = post_schedule_rt, |
1343 | .task_wake_up = task_wake_up_rt, | 1452 | .task_wake_up = task_wake_up_rt, |
@@ -1350,3 +1459,17 @@ static const struct sched_class rt_sched_class = { | |||
1350 | .prio_changed = prio_changed_rt, | 1459 | .prio_changed = prio_changed_rt, |
1351 | .switched_to = switched_to_rt, | 1460 | .switched_to = switched_to_rt, |
1352 | }; | 1461 | }; |
1462 | |||
1463 | #ifdef CONFIG_SCHED_DEBUG | ||
1464 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | ||
1465 | |||
1466 | static void print_rt_stats(struct seq_file *m, int cpu) | ||
1467 | { | ||
1468 | struct rt_rq *rt_rq; | ||
1469 | |||
1470 | rcu_read_lock(); | ||
1471 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | ||
1472 | print_rt_rq(m, cpu, rt_rq); | ||
1473 | rcu_read_unlock(); | ||
1474 | } | ||
1475 | #endif /* CONFIG_SCHED_DEBUG */ | ||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 80179ef7450e..8385d43987e2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
118 | if (rq) | 118 | if (rq) |
119 | rq->rq_sched_info.cpu_time += delta; | 119 | rq->rq_sched_info.cpu_time += delta; |
120 | } | 120 | } |
121 | |||
122 | static inline void | ||
123 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | ||
124 | { | ||
125 | if (rq) | ||
126 | rq->rq_sched_info.run_delay += delta; | ||
127 | } | ||
121 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 128 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
122 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 129 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
123 | # define schedstat_set(var, val) do { var = (val); } while (0) | 130 | # define schedstat_set(var, val) do { var = (val); } while (0) |
@@ -126,6 +133,9 @@ static inline void | |||
126 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 133 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
127 | {} | 134 | {} |
128 | static inline void | 135 | static inline void |
136 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | ||
137 | {} | ||
138 | static inline void | ||
129 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 139 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
130 | {} | 140 | {} |
131 | # define schedstat_inc(rq, field) do { } while (0) | 141 | # define schedstat_inc(rq, field) do { } while (0) |
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
134 | #endif | 144 | #endif |
135 | 145 | ||
136 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 146 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
147 | static inline void sched_info_reset_dequeued(struct task_struct *t) | ||
148 | { | ||
149 | t->sched_info.last_queued = 0; | ||
150 | } | ||
151 | |||
137 | /* | 152 | /* |
138 | * Called when a process is dequeued from the active array and given | 153 | * Called when a process is dequeued from the active array and given |
139 | * the cpu. We should note that with the exception of interactive | 154 | * the cpu. We should note that with the exception of interactive |
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
143 | * active queue, thus delaying tasks in the expired queue from running; | 158 | * active queue, thus delaying tasks in the expired queue from running; |
144 | * see scheduler_tick()). | 159 | * see scheduler_tick()). |
145 | * | 160 | * |
146 | * This function is only called from sched_info_arrive(), rather than | 161 | * Though we are interested in knowing how long it was from the *first* time a |
147 | * dequeue_task(). Even though a task may be queued and dequeued multiple | 162 | * task was queued to the time that it finally hit a cpu, we call this routine |
148 | * times as it is shuffled about, we're really interested in knowing how | 163 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
149 | * long it was from the *first* time it was queued to the time that it | 164 | * delta taken on each cpu would annul the skew. |
150 | * finally hit a cpu. | ||
151 | */ | 165 | */ |
152 | static inline void sched_info_dequeued(struct task_struct *t) | 166 | static inline void sched_info_dequeued(struct task_struct *t) |
153 | { | 167 | { |
154 | t->sched_info.last_queued = 0; | 168 | unsigned long long now = task_rq(t)->clock, delta = 0; |
169 | |||
170 | if (unlikely(sched_info_on())) | ||
171 | if (t->sched_info.last_queued) | ||
172 | delta = now - t->sched_info.last_queued; | ||
173 | sched_info_reset_dequeued(t); | ||
174 | t->sched_info.run_delay += delta; | ||
175 | |||
176 | rq_sched_info_dequeued(task_rq(t), delta); | ||
155 | } | 177 | } |
156 | 178 | ||
157 | /* | 179 | /* |
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
165 | 187 | ||
166 | if (t->sched_info.last_queued) | 188 | if (t->sched_info.last_queued) |
167 | delta = now - t->sched_info.last_queued; | 189 | delta = now - t->sched_info.last_queued; |
168 | sched_info_dequeued(t); | 190 | sched_info_reset_dequeued(t); |
169 | t->sched_info.run_delay += delta; | 191 | t->sched_info.run_delay += delta; |
170 | t->sched_info.last_arrival = now; | 192 | t->sched_info.last_arrival = now; |
171 | t->sched_info.pcount++; | 193 | t->sched_info.pcount++; |
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
242 | __sched_info_switch(prev, next); | 264 | __sched_info_switch(prev, next); |
243 | } | 265 | } |
244 | #else | 266 | #else |
245 | #define sched_info_queued(t) do { } while (0) | 267 | #define sched_info_queued(t) do { } while (0) |
246 | #define sched_info_switch(t, next) do { } while (0) | 268 | #define sched_info_reset_dequeued(t) do { } while (0) |
269 | #define sched_info_dequeued(t) do { } while (0) | ||
270 | #define sched_info_switch(t, next) do { } while (0) | ||
247 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 271 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
248 | 272 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index efaf7c5500e9..18943985ddee 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -267,6 +267,14 @@ static struct ctl_table kern_table[] = { | |||
267 | }, | 267 | }, |
268 | { | 268 | { |
269 | .ctl_name = CTL_UNNUMBERED, | 269 | .ctl_name = CTL_UNNUMBERED, |
270 | .procname = "sched_shares_ratelimit", | ||
271 | .data = &sysctl_sched_shares_ratelimit, | ||
272 | .maxlen = sizeof(unsigned int), | ||
273 | .mode = 0644, | ||
274 | .proc_handler = &proc_dointvec, | ||
275 | }, | ||
276 | { | ||
277 | .ctl_name = CTL_UNNUMBERED, | ||
270 | .procname = "sched_child_runs_first", | 278 | .procname = "sched_child_runs_first", |
271 | .data = &sysctl_sched_child_runs_first, | 279 | .data = &sysctl_sched_child_runs_first, |
272 | .maxlen = sizeof(unsigned int), | 280 | .maxlen = sizeof(unsigned int), |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02e5ec0..67f80c261709 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -30,6 +30,7 @@ | |||
30 | struct tick_device tick_broadcast_device; | 30 | struct tick_device tick_broadcast_device; |
31 | static cpumask_t tick_broadcast_mask; | 31 | static cpumask_t tick_broadcast_mask; |
32 | static DEFINE_SPINLOCK(tick_broadcast_lock); | 32 | static DEFINE_SPINLOCK(tick_broadcast_lock); |
33 | static int tick_broadcast_force; | ||
33 | 34 | ||
34 | #ifdef CONFIG_TICK_ONESHOT | 35 | #ifdef CONFIG_TICK_ONESHOT |
35 | static void tick_broadcast_clear_oneshot(int cpu); | 36 | static void tick_broadcast_clear_oneshot(int cpu); |
@@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why) | |||
232 | CLOCK_EVT_MODE_SHUTDOWN); | 233 | CLOCK_EVT_MODE_SHUTDOWN); |
233 | } | 234 | } |
234 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | 235 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) |
235 | dev->features |= CLOCK_EVT_FEAT_DUMMY; | 236 | tick_broadcast_force = 1; |
236 | break; | 237 | break; |
237 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 238 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: |
238 | if (cpu_isset(cpu, tick_broadcast_mask)) { | 239 | if (!tick_broadcast_force && |
240 | cpu_isset(cpu, tick_broadcast_mask)) { | ||
239 | cpu_clear(cpu, tick_broadcast_mask); | 241 | cpu_clear(cpu, tick_broadcast_mask); |
240 | if (td->mode == TICKDEV_MODE_PERIODIC) | 242 | if (td->mode == TICKDEV_MODE_PERIODIC) |
241 | tick_setup_periodic(dev, 0); | 243 | tick_setup_periodic(dev, 0); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..d63008b09a4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) | |||
276 | ts->tick_stopped = 1; | 276 | ts->tick_stopped = 1; |
277 | ts->idle_jiffies = last_jiffies; | 277 | ts->idle_jiffies = last_jiffies; |
278 | rcu_enter_nohz(); | 278 | rcu_enter_nohz(); |
279 | sched_clock_tick_stop(cpu); | ||
279 | } | 280 | } |
280 | 281 | ||
281 | /* | 282 | /* |
@@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) | |||
375 | select_nohz_load_balancer(0); | 376 | select_nohz_load_balancer(0); |
376 | now = ktime_get(); | 377 | now = ktime_get(); |
377 | tick_do_update_jiffies64(now); | 378 | tick_do_update_jiffies64(now); |
379 | sched_clock_tick_start(cpu); | ||
378 | cpu_clear(cpu, nohz_cpu_mask); | 380 | cpu_clear(cpu, nohz_cpu_mask); |
379 | 381 | ||
380 | /* | 382 | /* |