diff options
-rw-r--r-- | Documentation/sched-design-CFS.txt | 67 | ||||
-rw-r--r-- | arch/i386/Kconfig | 11 | ||||
-rw-r--r-- | drivers/kvm/kvm.h | 10 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 2 | ||||
-rw-r--r-- | fs/pipe.c | 9 | ||||
-rw-r--r-- | fs/proc/array.c | 17 | ||||
-rw-r--r-- | fs/proc/base.c | 2 | ||||
-rw-r--r-- | fs/proc/proc_misc.c | 15 | ||||
-rw-r--r-- | include/linux/kernel_stat.h | 1 | ||||
-rw-r--r-- | include/linux/sched.h | 99 | ||||
-rw-r--r-- | include/linux/topology.h | 5 | ||||
-rw-r--r-- | init/Kconfig | 21 | ||||
-rw-r--r-- | kernel/delayacct.c | 2 | ||||
-rw-r--r-- | kernel/exit.c | 6 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/ksysfs.c | 8 | ||||
-rw-r--r-- | kernel/sched.c | 1444 | ||||
-rw-r--r-- | kernel/sched_debug.c | 282 | ||||
-rw-r--r-- | kernel/sched_fair.c | 811 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 8 | ||||
-rw-r--r-- | kernel/sched_rt.c | 19 | ||||
-rw-r--r-- | kernel/sched_stats.h | 28 | ||||
-rw-r--r-- | kernel/sysctl.c | 37 | ||||
-rw-r--r-- | kernel/user.c | 249 | ||||
-rw-r--r-- | net/unix/af_unix.c | 4 |
25 files changed, 1872 insertions, 1288 deletions
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 84901e7c0508..88bcb8767335 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt | |||
@@ -117,3 +117,70 @@ Some implementation details: | |||
117 | iterators of the scheduling modules are used. The balancing code got | 117 | iterators of the scheduling modules are used. The balancing code got |
118 | quite a bit simpler as a result. | 118 | quite a bit simpler as a result. |
119 | 119 | ||
120 | |||
121 | Group scheduler extension to CFS | ||
122 | ================================ | ||
123 | |||
124 | Normally the scheduler operates on individual tasks and strives to provide | ||
125 | fair CPU time to each task. Sometimes, it may be desirable to group tasks | ||
126 | and provide fair CPU time to each such task group. For example, it may | ||
127 | be desirable to first provide fair CPU time to each user on the system | ||
128 | and then to each task belonging to a user. | ||
129 | |||
130 | CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets | ||
131 | SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such | ||
132 | groups. At present, there are two (mutually exclusive) mechanisms to group | ||
133 | tasks for CPU bandwidth control purposes: | ||
134 | |||
135 | - Based on user id (CONFIG_FAIR_USER_SCHED) | ||
136 | In this option, tasks are grouped according to their user id. | ||
137 | - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) | ||
138 | This option lets the administrator create arbitrary groups | ||
139 | of tasks, using the "cgroup" pseudo filesystem. See | ||
140 | Documentation/cgroups.txt for more information about this | ||
141 | filesystem. | ||
142 | |||
143 | Only one of these two task-grouping options can be chosen, not both. | ||
144 | |||
145 | Group scheduler tunables: | ||
146 | |||
147 | When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for | ||
148 | each new user and a "cpu_share" file is added in that directory. | ||
149 | |||
150 | # cd /sys/kernel/uids | ||
151 | # cat 512/cpu_share # Display user 512's CPU share | ||
152 | 1024 | ||
153 | # echo 2048 > 512/cpu_share # Modify user 512's CPU share | ||
154 | # cat 512/cpu_share # Display user 512's CPU share | ||
155 | 2048 | ||
156 | # | ||
157 | |||
158 | CPU bandwidth between two users is divided in the ratio of their CPU shares. | ||
159 | For example: if you would like user "root" to get twice the bandwidth of user | ||
160 | "guest", then set the cpu_share for both the users such that "root"'s | ||
161 | cpu_share is twice "guest"'s cpu_share. | ||
162 | |||
163 | |||
164 | When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created | ||
165 | for each group created using the pseudo filesystem. See example steps | ||
166 | below to create task groups and modify their CPU share using the "cgroups" | ||
167 | pseudo filesystem: | ||
168 | |||
169 | # mkdir /dev/cpuctl | ||
170 | # mount -t cgroup -ocpu none /dev/cpuctl | ||
171 | # cd /dev/cpuctl | ||
172 | |||
173 | # mkdir multimedia # create "multimedia" group of tasks | ||
174 | # mkdir browser # create "browser" group of tasks | ||
175 | |||
176 | # #Configure the multimedia group to receive twice the CPU bandwidth | ||
177 | # #of the browser group | ||
178 | |||
179 | # echo 2048 > multimedia/cpu.shares | ||
180 | # echo 1024 > browser/cpu.shares | ||
181 | |||
182 | # firefox & # Launch firefox and move it to "browser" group | ||
183 | # echo <firefox_pid> > browser/tasks | ||
184 | |||
185 | # #Launch gmplayer (or your favourite movie player) | ||
186 | # echo <movie_player_pid> > multimedia/tasks | ||
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f1486f8a3e6d..bf9aafad4978 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig | |||
@@ -214,6 +214,17 @@ config X86_ES7000 | |||
214 | 214 | ||
215 | endchoice | 215 | endchoice |
216 | 216 | ||
217 | config SCHED_NO_NO_OMIT_FRAME_POINTER | ||
218 | bool "Single-depth WCHAN output" | ||
219 | default y | ||
220 | help | ||
221 | Calculate simpler /proc/<PID>/wchan values. If this option | ||
222 | is disabled then wchan values will recurse back to the | ||
223 | caller function. This provides more accurate wchan values, | ||
224 | at the expense of slightly more scheduling overhead. | ||
225 | |||
226 | If in doubt, say "Y". | ||
227 | |||
217 | config PARAVIRT | 228 | config PARAVIRT |
218 | bool "Paravirtualization support (EXPERIMENTAL)" | 229 | bool "Paravirtualization support (EXPERIMENTAL)" |
219 | depends on EXPERIMENTAL | 230 | depends on EXPERIMENTAL |
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index ad0813843adc..3b0bc4bda5f2 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h | |||
@@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); | |||
624 | 624 | ||
625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); | 625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); |
626 | 626 | ||
627 | static inline void kvm_guest_enter(void) | ||
628 | { | ||
629 | current->flags |= PF_VCPU; | ||
630 | } | ||
631 | |||
632 | static inline void kvm_guest_exit(void) | ||
633 | { | ||
634 | current->flags &= ~PF_VCPU; | ||
635 | } | ||
636 | |||
627 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 637 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
628 | u32 error_code) | 638 | u32 error_code) |
629 | { | 639 | { |
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 353e58527d15..af2d288c881d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c | |||
@@ -2046,6 +2046,7 @@ again: | |||
2046 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | 2046 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); |
2047 | 2047 | ||
2048 | vcpu->guest_mode = 1; | 2048 | vcpu->guest_mode = 1; |
2049 | kvm_guest_enter(); | ||
2049 | 2050 | ||
2050 | if (vcpu->requests) | 2051 | if (vcpu->requests) |
2051 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) | 2052 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) |
@@ -2053,6 +2054,7 @@ again: | |||
2053 | 2054 | ||
2054 | kvm_x86_ops->run(vcpu, kvm_run); | 2055 | kvm_x86_ops->run(vcpu, kvm_run); |
2055 | 2056 | ||
2057 | kvm_guest_exit(); | ||
2056 | vcpu->guest_mode = 0; | 2058 | vcpu->guest_mode = 0; |
2057 | local_irq_enable(); | 2059 | local_irq_enable(); |
2058 | 2060 | ||
@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe) | |||
45 | * Pipes are system-local resources, so sleeping on them | 45 | * Pipes are system-local resources, so sleeping on them |
46 | * is considered a noninteractive wait: | 46 | * is considered a noninteractive wait: |
47 | */ | 47 | */ |
48 | prepare_to_wait(&pipe->wait, &wait, | 48 | prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); |
49 | TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); | ||
50 | if (pipe->inode) | 49 | if (pipe->inode) |
51 | mutex_unlock(&pipe->inode->i_mutex); | 50 | mutex_unlock(&pipe->inode->i_mutex); |
52 | schedule(); | 51 | schedule(); |
@@ -383,7 +382,7 @@ redo: | |||
383 | 382 | ||
384 | /* Signal writers asynchronously that there is more room. */ | 383 | /* Signal writers asynchronously that there is more room. */ |
385 | if (do_wakeup) { | 384 | if (do_wakeup) { |
386 | wake_up_interruptible(&pipe->wait); | 385 | wake_up_interruptible_sync(&pipe->wait); |
387 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 386 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
388 | } | 387 | } |
389 | if (ret > 0) | 388 | if (ret > 0) |
@@ -556,7 +555,7 @@ redo2: | |||
556 | out: | 555 | out: |
557 | mutex_unlock(&inode->i_mutex); | 556 | mutex_unlock(&inode->i_mutex); |
558 | if (do_wakeup) { | 557 | if (do_wakeup) { |
559 | wake_up_interruptible(&pipe->wait); | 558 | wake_up_interruptible_sync(&pipe->wait); |
560 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 559 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
561 | } | 560 | } |
562 | if (ret > 0) | 561 | if (ret > 0) |
@@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw) | |||
650 | if (!pipe->readers && !pipe->writers) { | 649 | if (!pipe->readers && !pipe->writers) { |
651 | free_pipe_info(inode); | 650 | free_pipe_info(inode); |
652 | } else { | 651 | } else { |
653 | wake_up_interruptible(&pipe->wait); | 652 | wake_up_interruptible_sync(&pipe->wait); |
654 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 653 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
655 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 654 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
656 | } | 655 | } |
diff --git a/fs/proc/array.c b/fs/proc/array.c index ee4814dd98f9..27b59f5f3bd1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p) | |||
370 | } | 370 | } |
371 | #endif | 371 | #endif |
372 | 372 | ||
373 | static cputime_t task_gtime(struct task_struct *p) | ||
374 | { | ||
375 | return p->gtime; | ||
376 | } | ||
377 | |||
373 | static int do_task_stat(struct task_struct *task, char *buffer, int whole) | 378 | static int do_task_stat(struct task_struct *task, char *buffer, int whole) |
374 | { | 379 | { |
375 | unsigned long vsize, eip, esp, wchan = ~0UL; | 380 | unsigned long vsize, eip, esp, wchan = ~0UL; |
@@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
385 | unsigned long cmin_flt = 0, cmaj_flt = 0; | 390 | unsigned long cmin_flt = 0, cmaj_flt = 0; |
386 | unsigned long min_flt = 0, maj_flt = 0; | 391 | unsigned long min_flt = 0, maj_flt = 0; |
387 | cputime_t cutime, cstime, utime, stime; | 392 | cputime_t cutime, cstime, utime, stime; |
393 | cputime_t cgtime, gtime; | ||
388 | unsigned long rsslim = 0; | 394 | unsigned long rsslim = 0; |
389 | char tcomm[sizeof(task->comm)]; | 395 | char tcomm[sizeof(task->comm)]; |
390 | unsigned long flags; | 396 | unsigned long flags; |
@@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
403 | sigemptyset(&sigign); | 409 | sigemptyset(&sigign); |
404 | sigemptyset(&sigcatch); | 410 | sigemptyset(&sigcatch); |
405 | cutime = cstime = utime = stime = cputime_zero; | 411 | cutime = cstime = utime = stime = cputime_zero; |
412 | cgtime = gtime = cputime_zero; | ||
406 | 413 | ||
407 | rcu_read_lock(); | 414 | rcu_read_lock(); |
408 | if (lock_task_sighand(task, &flags)) { | 415 | if (lock_task_sighand(task, &flags)) { |
@@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
420 | cmaj_flt = sig->cmaj_flt; | 427 | cmaj_flt = sig->cmaj_flt; |
421 | cutime = sig->cutime; | 428 | cutime = sig->cutime; |
422 | cstime = sig->cstime; | 429 | cstime = sig->cstime; |
430 | cgtime = sig->cgtime; | ||
423 | rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; | 431 | rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; |
424 | 432 | ||
425 | /* add up live thread stats at the group level */ | 433 | /* add up live thread stats at the group level */ |
@@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
430 | maj_flt += t->maj_flt; | 438 | maj_flt += t->maj_flt; |
431 | utime = cputime_add(utime, task_utime(t)); | 439 | utime = cputime_add(utime, task_utime(t)); |
432 | stime = cputime_add(stime, task_stime(t)); | 440 | stime = cputime_add(stime, task_stime(t)); |
441 | gtime = cputime_add(gtime, task_gtime(t)); | ||
433 | t = next_thread(t); | 442 | t = next_thread(t); |
434 | } while (t != task); | 443 | } while (t != task); |
435 | 444 | ||
@@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
437 | maj_flt += sig->maj_flt; | 446 | maj_flt += sig->maj_flt; |
438 | utime = cputime_add(utime, sig->utime); | 447 | utime = cputime_add(utime, sig->utime); |
439 | stime = cputime_add(stime, sig->stime); | 448 | stime = cputime_add(stime, sig->stime); |
449 | gtime = cputime_add(gtime, sig->gtime); /* assign, not +=: cputime_add() already includes gtime, += would count it twice */ | ||
440 | } | 450 | } |
441 | 451 | ||
442 | sid = signal_session(sig); | 452 | sid = signal_session(sig); |
@@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
454 | maj_flt = task->maj_flt; | 464 | maj_flt = task->maj_flt; |
455 | utime = task_utime(task); | 465 | utime = task_utime(task); |
456 | stime = task_stime(task); | 466 | stime = task_stime(task); |
467 | gtime = task_gtime(task); | ||
457 | } | 468 | } |
458 | 469 | ||
459 | /* scale priority and nice values from timeslices to -20..20 */ | 470 | /* scale priority and nice values from timeslices to -20..20 */ |
@@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
471 | 482 | ||
472 | res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ | 483 | res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ |
473 | %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ | 484 | %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ |
474 | %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", | 485 | %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", |
475 | task->pid, | 486 | task->pid, |
476 | tcomm, | 487 | tcomm, |
477 | state, | 488 | state, |
@@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
516 | task_cpu(task), | 527 | task_cpu(task), |
517 | task->rt_priority, | 528 | task->rt_priority, |
518 | task->policy, | 529 | task->policy, |
519 | (unsigned long long)delayacct_blkio_ticks(task)); | 530 | (unsigned long long)delayacct_blkio_ticks(task), |
531 | cputime_to_clock_t(gtime), | ||
532 | cputime_to_clock_t(cgtime)); | ||
520 | if (mm) | 533 | if (mm) |
521 | mmput(mm); | 534 | mmput(mm); |
522 | return res; | 535 | return res; |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 19489b0d5554..e5d0953d4db1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) | |||
304 | return sprintf(buffer, "%llu %llu %lu\n", | 304 | return sprintf(buffer, "%llu %llu %lu\n", |
305 | task->sched_info.cpu_time, | 305 | task->sched_info.cpu_time, |
306 | task->sched_info.run_delay, | 306 | task->sched_info.run_delay, |
307 | task->sched_info.pcnt); | 307 | task->sched_info.pcount); |
308 | } | 308 | } |
309 | #endif | 309 | #endif |
310 | 310 | ||
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index bee251cb87c8..b872a01ad3af 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c | |||
@@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
443 | int i; | 443 | int i; |
444 | unsigned long jif; | 444 | unsigned long jif; |
445 | cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; | 445 | cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; |
446 | cputime64_t guest; | ||
446 | u64 sum = 0; | 447 | u64 sum = 0; |
447 | struct timespec boottime; | 448 | struct timespec boottime; |
448 | unsigned int *per_irq_sum; | 449 | unsigned int *per_irq_sum; |
@@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
453 | 454 | ||
454 | user = nice = system = idle = iowait = | 455 | user = nice = system = idle = iowait = |
455 | irq = softirq = steal = cputime64_zero; | 456 | irq = softirq = steal = cputime64_zero; |
457 | guest = cputime64_zero; | ||
456 | getboottime(&boottime); | 458 | getboottime(&boottime); |
457 | jif = boottime.tv_sec; | 459 | jif = boottime.tv_sec; |
458 | 460 | ||
@@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
467 | irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); | 469 | irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); |
468 | softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); | 470 | softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); |
469 | steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); | 471 | steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); |
472 | guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); | ||
470 | for (j = 0; j < NR_IRQS; j++) { | 473 | for (j = 0; j < NR_IRQS; j++) { |
471 | unsigned int temp = kstat_cpu(i).irqs[j]; | 474 | unsigned int temp = kstat_cpu(i).irqs[j]; |
472 | sum += temp; | 475 | sum += temp; |
@@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
474 | } | 477 | } |
475 | } | 478 | } |
476 | 479 | ||
477 | seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", | 480 | seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", |
478 | (unsigned long long)cputime64_to_clock_t(user), | 481 | (unsigned long long)cputime64_to_clock_t(user), |
479 | (unsigned long long)cputime64_to_clock_t(nice), | 482 | (unsigned long long)cputime64_to_clock_t(nice), |
480 | (unsigned long long)cputime64_to_clock_t(system), | 483 | (unsigned long long)cputime64_to_clock_t(system), |
@@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v) | |||
482 | (unsigned long long)cputime64_to_clock_t(iowait), | 485 | (unsigned long long)cputime64_to_clock_t(iowait), |
483 | (unsigned long long)cputime64_to_clock_t(irq), | 486 | (unsigned long long)cputime64_to_clock_t(irq), |
484 | (unsigned long long)cputime64_to_clock_t(softirq), | 487 | (unsigned long long)cputime64_to_clock_t(softirq), |
485 | (unsigned long long)cputime64_to_clock_t(steal)); | 488 | (unsigned long long)cputime64_to_clock_t(steal), |
489 | (unsigned long long)cputime64_to_clock_t(guest)); | ||
486 | for_each_online_cpu(i) { | 490 | for_each_online_cpu(i) { |
487 | 491 | ||
488 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ | 492 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ |
@@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v) | |||
494 | irq = kstat_cpu(i).cpustat.irq; | 498 | irq = kstat_cpu(i).cpustat.irq; |
495 | softirq = kstat_cpu(i).cpustat.softirq; | 499 | softirq = kstat_cpu(i).cpustat.softirq; |
496 | steal = kstat_cpu(i).cpustat.steal; | 500 | steal = kstat_cpu(i).cpustat.steal; |
497 | seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", | 501 | guest = kstat_cpu(i).cpustat.guest; |
502 | seq_printf(p, | ||
503 | "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", | ||
498 | i, | 504 | i, |
499 | (unsigned long long)cputime64_to_clock_t(user), | 505 | (unsigned long long)cputime64_to_clock_t(user), |
500 | (unsigned long long)cputime64_to_clock_t(nice), | 506 | (unsigned long long)cputime64_to_clock_t(nice), |
@@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v) | |||
503 | (unsigned long long)cputime64_to_clock_t(iowait), | 509 | (unsigned long long)cputime64_to_clock_t(iowait), |
504 | (unsigned long long)cputime64_to_clock_t(irq), | 510 | (unsigned long long)cputime64_to_clock_t(irq), |
505 | (unsigned long long)cputime64_to_clock_t(softirq), | 511 | (unsigned long long)cputime64_to_clock_t(softirq), |
506 | (unsigned long long)cputime64_to_clock_t(steal)); | 512 | (unsigned long long)cputime64_to_clock_t(steal), |
513 | (unsigned long long)cputime64_to_clock_t(guest)); | ||
507 | } | 514 | } |
508 | seq_printf(p, "intr %llu", (unsigned long long)sum); | 515 | seq_printf(p, "intr %llu", (unsigned long long)sum); |
509 | 516 | ||
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 43e895f1cabe..12bf44f083f5 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h | |||
@@ -23,6 +23,7 @@ struct cpu_usage_stat { | |||
23 | cputime64_t idle; | 23 | cputime64_t idle; |
24 | cputime64_t iowait; | 24 | cputime64_t iowait; |
25 | cputime64_t steal; | 25 | cputime64_t steal; |
26 | cputime64_t guest; | ||
26 | }; | 27 | }; |
27 | 28 | ||
28 | struct kernel_stat { | 29 | struct kernel_stat { |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 833f7dc2b8de..228e0a8ce248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -87,6 +87,7 @@ struct sched_param { | |||
87 | #include <linux/timer.h> | 87 | #include <linux/timer.h> |
88 | #include <linux/hrtimer.h> | 88 | #include <linux/hrtimer.h> |
89 | #include <linux/task_io_accounting.h> | 89 | #include <linux/task_io_accounting.h> |
90 | #include <linux/kobject.h> | ||
90 | 91 | ||
91 | #include <asm/processor.h> | 92 | #include <asm/processor.h> |
92 | 93 | ||
@@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu); | |||
136 | 137 | ||
137 | struct seq_file; | 138 | struct seq_file; |
138 | struct cfs_rq; | 139 | struct cfs_rq; |
140 | struct task_group; | ||
139 | #ifdef CONFIG_SCHED_DEBUG | 141 | #ifdef CONFIG_SCHED_DEBUG |
140 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); | 142 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); |
141 | extern void proc_sched_set_task(struct task_struct *p); | 143 | extern void proc_sched_set_task(struct task_struct *p); |
@@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
174 | #define EXIT_ZOMBIE 16 | 176 | #define EXIT_ZOMBIE 16 |
175 | #define EXIT_DEAD 32 | 177 | #define EXIT_DEAD 32 |
176 | /* in tsk->state again */ | 178 | /* in tsk->state again */ |
177 | #define TASK_NONINTERACTIVE 64 | 179 | #define TASK_DEAD 64 |
178 | #define TASK_DEAD 128 | ||
179 | 180 | ||
180 | #define __set_task_state(tsk, state_value) \ | 181 | #define __set_task_state(tsk, state_value) \ |
181 | do { (tsk)->state = (state_value); } while (0) | 182 | do { (tsk)->state = (state_value); } while (0) |
@@ -516,6 +517,8 @@ struct signal_struct { | |||
516 | * in __exit_signal, except for the group leader. | 517 | * in __exit_signal, except for the group leader. |
517 | */ | 518 | */ |
518 | cputime_t utime, stime, cutime, cstime; | 519 | cputime_t utime, stime, cutime, cstime; |
520 | cputime_t gtime; | ||
521 | cputime_t cgtime; | ||
519 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 522 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
520 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 523 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
521 | unsigned long inblock, oublock, cinblock, coublock; | 524 | unsigned long inblock, oublock, cinblock, coublock; |
@@ -596,8 +599,21 @@ struct user_struct { | |||
596 | /* Hash table maintenance information */ | 599 | /* Hash table maintenance information */ |
597 | struct hlist_node uidhash_node; | 600 | struct hlist_node uidhash_node; |
598 | uid_t uid; | 601 | uid_t uid; |
602 | |||
603 | #ifdef CONFIG_FAIR_USER_SCHED | ||
604 | struct task_group *tg; | ||
605 | struct kset kset; | ||
606 | struct subsys_attribute user_attr; | ||
607 | struct work_struct work; | ||
608 | #endif | ||
599 | }; | 609 | }; |
600 | 610 | ||
611 | #ifdef CONFIG_FAIR_USER_SCHED | ||
612 | extern int uids_kobject_init(void); | ||
613 | #else | ||
614 | static inline int uids_kobject_init(void) { return 0; } | ||
615 | #endif | ||
616 | |||
601 | extern struct user_struct *find_user(uid_t); | 617 | extern struct user_struct *find_user(uid_t); |
602 | 618 | ||
603 | extern struct user_struct root_user; | 619 | extern struct user_struct root_user; |
@@ -609,13 +625,17 @@ struct reclaim_state; | |||
609 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 625 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
610 | struct sched_info { | 626 | struct sched_info { |
611 | /* cumulative counters */ | 627 | /* cumulative counters */ |
612 | unsigned long pcnt; /* # of times run on this cpu */ | 628 | unsigned long pcount; /* # of times run on this cpu */ |
613 | unsigned long long cpu_time, /* time spent on the cpu */ | 629 | unsigned long long cpu_time, /* time spent on the cpu */ |
614 | run_delay; /* time spent waiting on a runqueue */ | 630 | run_delay; /* time spent waiting on a runqueue */ |
615 | 631 | ||
616 | /* timestamps */ | 632 | /* timestamps */ |
617 | unsigned long long last_arrival,/* when we last ran on a cpu */ | 633 | unsigned long long last_arrival,/* when we last ran on a cpu */ |
618 | last_queued; /* when we were last queued to run */ | 634 | last_queued; /* when we were last queued to run */ |
635 | #ifdef CONFIG_SCHEDSTATS | ||
636 | /* BKL stats */ | ||
637 | unsigned long bkl_count; | ||
638 | #endif | ||
619 | }; | 639 | }; |
620 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ | 640 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ |
621 | 641 | ||
@@ -750,7 +770,7 @@ struct sched_domain { | |||
750 | 770 | ||
751 | #ifdef CONFIG_SCHEDSTATS | 771 | #ifdef CONFIG_SCHEDSTATS |
752 | /* load_balance() stats */ | 772 | /* load_balance() stats */ |
753 | unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; | 773 | unsigned long lb_count[CPU_MAX_IDLE_TYPES]; |
754 | unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; | 774 | unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; |
755 | unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; | 775 | unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; |
756 | unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; | 776 | unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; |
@@ -760,17 +780,17 @@ struct sched_domain { | |||
760 | unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; | 780 | unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; |
761 | 781 | ||
762 | /* Active load balancing */ | 782 | /* Active load balancing */ |
763 | unsigned long alb_cnt; | 783 | unsigned long alb_count; |
764 | unsigned long alb_failed; | 784 | unsigned long alb_failed; |
765 | unsigned long alb_pushed; | 785 | unsigned long alb_pushed; |
766 | 786 | ||
767 | /* SD_BALANCE_EXEC stats */ | 787 | /* SD_BALANCE_EXEC stats */ |
768 | unsigned long sbe_cnt; | 788 | unsigned long sbe_count; |
769 | unsigned long sbe_balanced; | 789 | unsigned long sbe_balanced; |
770 | unsigned long sbe_pushed; | 790 | unsigned long sbe_pushed; |
771 | 791 | ||
772 | /* SD_BALANCE_FORK stats */ | 792 | /* SD_BALANCE_FORK stats */ |
773 | unsigned long sbf_cnt; | 793 | unsigned long sbf_count; |
774 | unsigned long sbf_balanced; | 794 | unsigned long sbf_balanced; |
775 | unsigned long sbf_pushed; | 795 | unsigned long sbf_pushed; |
776 | 796 | ||
@@ -854,11 +874,11 @@ struct rq; | |||
854 | struct sched_domain; | 874 | struct sched_domain; |
855 | 875 | ||
856 | struct sched_class { | 876 | struct sched_class { |
857 | struct sched_class *next; | 877 | const struct sched_class *next; |
858 | 878 | ||
859 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); | 879 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); |
860 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); | 880 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); |
861 | void (*yield_task) (struct rq *rq, struct task_struct *p); | 881 | void (*yield_task) (struct rq *rq); |
862 | 882 | ||
863 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); | 883 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); |
864 | 884 | ||
@@ -888,31 +908,22 @@ struct load_weight { | |||
888 | * 4 se->block_start | 908 | * 4 se->block_start |
889 | * 4 se->run_node | 909 | * 4 se->run_node |
890 | * 4 se->sleep_start | 910 | * 4 se->sleep_start |
891 | * 4 se->sleep_start_fair | ||
892 | * 6 se->load.weight | 911 | * 6 se->load.weight |
893 | * 7 se->delta_fair | ||
894 | * 15 se->wait_runtime | ||
895 | */ | 912 | */ |
896 | struct sched_entity { | 913 | struct sched_entity { |
897 | long wait_runtime; | ||
898 | unsigned long delta_fair_run; | ||
899 | unsigned long delta_fair_sleep; | ||
900 | unsigned long delta_exec; | ||
901 | s64 fair_key; | ||
902 | struct load_weight load; /* for load-balancing */ | 914 | struct load_weight load; /* for load-balancing */ |
903 | struct rb_node run_node; | 915 | struct rb_node run_node; |
904 | unsigned int on_rq; | 916 | unsigned int on_rq; |
917 | int peer_preempt; | ||
905 | 918 | ||
906 | u64 exec_start; | 919 | u64 exec_start; |
907 | u64 sum_exec_runtime; | 920 | u64 sum_exec_runtime; |
921 | u64 vruntime; | ||
908 | u64 prev_sum_exec_runtime; | 922 | u64 prev_sum_exec_runtime; |
909 | u64 wait_start_fair; | ||
910 | u64 sleep_start_fair; | ||
911 | 923 | ||
912 | #ifdef CONFIG_SCHEDSTATS | 924 | #ifdef CONFIG_SCHEDSTATS |
913 | u64 wait_start; | 925 | u64 wait_start; |
914 | u64 wait_max; | 926 | u64 wait_max; |
915 | s64 sum_wait_runtime; | ||
916 | 927 | ||
917 | u64 sleep_start; | 928 | u64 sleep_start; |
918 | u64 sleep_max; | 929 | u64 sleep_max; |
@@ -921,9 +932,25 @@ struct sched_entity { | |||
921 | u64 block_start; | 932 | u64 block_start; |
922 | u64 block_max; | 933 | u64 block_max; |
923 | u64 exec_max; | 934 | u64 exec_max; |
924 | 935 | u64 slice_max; | |
925 | unsigned long wait_runtime_overruns; | 936 | |
926 | unsigned long wait_runtime_underruns; | 937 | u64 nr_migrations; |
938 | u64 nr_migrations_cold; | ||
939 | u64 nr_failed_migrations_affine; | ||
940 | u64 nr_failed_migrations_running; | ||
941 | u64 nr_failed_migrations_hot; | ||
942 | u64 nr_forced_migrations; | ||
943 | u64 nr_forced2_migrations; | ||
944 | |||
945 | u64 nr_wakeups; | ||
946 | u64 nr_wakeups_sync; | ||
947 | u64 nr_wakeups_migrate; | ||
948 | u64 nr_wakeups_local; | ||
949 | u64 nr_wakeups_remote; | ||
950 | u64 nr_wakeups_affine; | ||
951 | u64 nr_wakeups_affine_attempts; | ||
952 | u64 nr_wakeups_passive; | ||
953 | u64 nr_wakeups_idle; | ||
927 | #endif | 954 | #endif |
928 | 955 | ||
929 | #ifdef CONFIG_FAIR_GROUP_SCHED | 956 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -952,7 +979,7 @@ struct task_struct { | |||
952 | 979 | ||
953 | int prio, static_prio, normal_prio; | 980 | int prio, static_prio, normal_prio; |
954 | struct list_head run_list; | 981 | struct list_head run_list; |
955 | struct sched_class *sched_class; | 982 | const struct sched_class *sched_class; |
956 | struct sched_entity se; | 983 | struct sched_entity se; |
957 | 984 | ||
958 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 985 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1023,6 +1050,7 @@ struct task_struct { | |||
1023 | 1050 | ||
1024 | unsigned int rt_priority; | 1051 | unsigned int rt_priority; |
1025 | cputime_t utime, stime; | 1052 | cputime_t utime, stime; |
1053 | cputime_t gtime; | ||
1026 | unsigned long nvcsw, nivcsw; /* context switch counts */ | 1054 | unsigned long nvcsw, nivcsw; /* context switch counts */ |
1027 | struct timespec start_time; /* monotonic time */ | 1055 | struct timespec start_time; /* monotonic time */ |
1028 | struct timespec real_start_time; /* boot based time */ | 1056 | struct timespec real_start_time; /* boot based time */ |
@@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t) | |||
1314 | #define PF_STARTING 0x00000002 /* being created */ | 1342 | #define PF_STARTING 0x00000002 /* being created */ |
1315 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1343 | #define PF_EXITING 0x00000004 /* getting shut down */ |
1316 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | 1344 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ |
1345 | #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ | ||
1317 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | 1346 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ |
1318 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | 1347 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ |
1319 | #define PF_DUMPCORE 0x00000200 /* dumped core */ | 1348 | #define PF_DUMPCORE 0x00000200 /* dumped core */ |
@@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {} | |||
1401 | 1430 | ||
1402 | extern void sched_idle_next(void); | 1431 | extern void sched_idle_next(void); |
1403 | 1432 | ||
1433 | #ifdef CONFIG_SCHED_DEBUG | ||
1404 | extern unsigned int sysctl_sched_latency; | 1434 | extern unsigned int sysctl_sched_latency; |
1405 | extern unsigned int sysctl_sched_min_granularity; | 1435 | extern unsigned int sysctl_sched_nr_latency; |
1406 | extern unsigned int sysctl_sched_wakeup_granularity; | 1436 | extern unsigned int sysctl_sched_wakeup_granularity; |
1407 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | 1437 | extern unsigned int sysctl_sched_batch_wakeup_granularity; |
1408 | extern unsigned int sysctl_sched_stat_granularity; | ||
1409 | extern unsigned int sysctl_sched_runtime_limit; | ||
1410 | extern unsigned int sysctl_sched_compat_yield; | ||
1411 | extern unsigned int sysctl_sched_child_runs_first; | 1438 | extern unsigned int sysctl_sched_child_runs_first; |
1412 | extern unsigned int sysctl_sched_features; | 1439 | extern unsigned int sysctl_sched_features; |
1440 | extern unsigned int sysctl_sched_migration_cost; | ||
1441 | #endif | ||
1442 | |||
1443 | extern unsigned int sysctl_sched_compat_yield; | ||
1413 | 1444 | ||
1414 | #ifdef CONFIG_RT_MUTEXES | 1445 | #ifdef CONFIG_RT_MUTEXES |
1415 | extern int rt_mutex_getprio(struct task_struct *p); | 1446 | extern int rt_mutex_getprio(struct task_struct *p); |
@@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings; | |||
1843 | 1874 | ||
1844 | extern void normalize_rt_tasks(void); | 1875 | extern void normalize_rt_tasks(void); |
1845 | 1876 | ||
1877 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1878 | |||
1879 | extern struct task_group init_task_group; | ||
1880 | |||
1881 | extern struct task_group *sched_create_group(void); | ||
1882 | extern void sched_destroy_group(struct task_group *tg); | ||
1883 | extern void sched_move_task(struct task_struct *tsk); | ||
1884 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
1885 | extern unsigned long sched_group_shares(struct task_group *tg); | ||
1886 | |||
1887 | #endif | ||
1888 | |||
1846 | #ifdef CONFIG_TASK_XACCT | 1889 | #ifdef CONFIG_TASK_XACCT |
1847 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 1890 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
1848 | { | 1891 | { |
diff --git a/include/linux/topology.h b/include/linux/topology.h index 525d437b1253..47729f18bfdf 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -159,15 +159,14 @@ | |||
159 | .imbalance_pct = 125, \ | 159 | .imbalance_pct = 125, \ |
160 | .cache_nice_tries = 1, \ | 160 | .cache_nice_tries = 1, \ |
161 | .busy_idx = 2, \ | 161 | .busy_idx = 2, \ |
162 | .idle_idx = 0, \ | 162 | .idle_idx = 1, \ |
163 | .newidle_idx = 0, \ | 163 | .newidle_idx = 2, \ |
164 | .wake_idx = 1, \ | 164 | .wake_idx = 1, \ |
165 | .forkexec_idx = 1, \ | 165 | .forkexec_idx = 1, \ |
166 | .flags = SD_LOAD_BALANCE \ | 166 | .flags = SD_LOAD_BALANCE \ |
167 | | SD_BALANCE_NEWIDLE \ | 167 | | SD_BALANCE_NEWIDLE \ |
168 | | SD_BALANCE_EXEC \ | 168 | | SD_BALANCE_EXEC \ |
169 | | SD_WAKE_AFFINE \ | 169 | | SD_WAKE_AFFINE \ |
170 | | SD_WAKE_IDLE \ | ||
171 | | BALANCE_FOR_PKG_POWER,\ | 170 | | BALANCE_FOR_PKG_POWER,\ |
172 | .last_balance = jiffies, \ | 171 | .last_balance = jiffies, \ |
173 | .balance_interval = 1, \ | 172 | .balance_interval = 1, \ |
diff --git a/init/Kconfig b/init/Kconfig index d54d0cadcc06..54f31a191b88 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -281,6 +281,27 @@ config CPUSETS | |||
281 | 281 | ||
282 | Say N if unsure. | 282 | Say N if unsure. |
283 | 283 | ||
284 | config FAIR_GROUP_SCHED | ||
285 | bool "Fair group CPU scheduler" | ||
286 | default y | ||
287 | depends on EXPERIMENTAL | ||
288 | help | ||
289 | This feature lets the CPU scheduler recognize task groups and control CPU | ||
290 | bandwidth allocation to such task groups. | ||
291 | |||
292 | choice | ||
293 | depends on FAIR_GROUP_SCHED | ||
294 | prompt "Basis for grouping tasks" | ||
295 | default FAIR_USER_SCHED | ||
296 | |||
297 | config FAIR_USER_SCHED | ||
298 | bool "user id" | ||
299 | help | ||
300 | This option will choose userid as the basis for grouping | ||
301 | tasks, thus providing equal CPU bandwidth to each user. | ||
302 | |||
303 | endchoice | ||
304 | |||
284 | config SYSFS_DEPRECATED | 305 | config SYSFS_DEPRECATED |
285 | bool "Create deprecated sysfs files" | 306 | bool "Create deprecated sysfs files" |
286 | default y | 307 | default y |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 81e697829633..09e9574eeb26 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
119 | * No locking available for sched_info (and too expensive to add one) | 119 | * No locking available for sched_info (and too expensive to add one) |
120 | * Mitigate by taking snapshot of values | 120 | * Mitigate by taking snapshot of values |
121 | */ | 121 | */ |
122 | t1 = tsk->sched_info.pcnt; | 122 | t1 = tsk->sched_info.pcount; |
123 | t2 = tsk->sched_info.run_delay; | 123 | t2 = tsk->sched_info.run_delay; |
124 | t3 = tsk->sched_info.cpu_time; | 124 | t3 = tsk->sched_info.cpu_time; |
125 | 125 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 993369ee94d1..7f7959de4a87 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
111 | */ | 111 | */ |
112 | sig->utime = cputime_add(sig->utime, tsk->utime); | 112 | sig->utime = cputime_add(sig->utime, tsk->utime); |
113 | sig->stime = cputime_add(sig->stime, tsk->stime); | 113 | sig->stime = cputime_add(sig->stime, tsk->stime); |
114 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | ||
114 | sig->min_flt += tsk->min_flt; | 115 | sig->min_flt += tsk->min_flt; |
115 | sig->maj_flt += tsk->maj_flt; | 116 | sig->maj_flt += tsk->maj_flt; |
116 | sig->nvcsw += tsk->nvcsw; | 117 | sig->nvcsw += tsk->nvcsw; |
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, | |||
1242 | cputime_add(p->stime, | 1243 | cputime_add(p->stime, |
1243 | cputime_add(sig->stime, | 1244 | cputime_add(sig->stime, |
1244 | sig->cstime))); | 1245 | sig->cstime))); |
1246 | psig->cgtime = | ||
1247 | cputime_add(psig->cgtime, | ||
1248 | cputime_add(p->gtime, | ||
1249 | cputime_add(sig->gtime, | ||
1250 | sig->cgtime))); | ||
1245 | psig->cmin_flt += | 1251 | psig->cmin_flt += |
1246 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1252 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1247 | psig->cmaj_flt += | 1253 | psig->cmaj_flt += |
diff --git a/kernel/fork.c b/kernel/fork.c index 5e67f90a1694..3fc3c1383912 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
877 | sig->tty_old_pgrp = NULL; | 877 | sig->tty_old_pgrp = NULL; |
878 | 878 | ||
879 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | 879 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; |
880 | sig->gtime = cputime_zero; | ||
881 | sig->cgtime = cputime_zero; | ||
880 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | 882 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; |
881 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | 883 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; |
882 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | 884 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; |
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1045 | 1047 | ||
1046 | p->utime = cputime_zero; | 1048 | p->utime = cputime_zero; |
1047 | p->stime = cputime_zero; | 1049 | p->stime = cputime_zero; |
1050 | p->gtime = cputime_zero; | ||
1048 | 1051 | ||
1049 | #ifdef CONFIG_TASK_XACCT | 1052 | #ifdef CONFIG_TASK_XACCT |
1050 | p->rchar = 0; /* I/O counter: bytes read */ | 1053 | p->rchar = 0; /* I/O counter: bytes read */ |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d0e5c48e18c7..6046939d0804 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
17 | #include <linux/sched.h> | ||
17 | 18 | ||
18 | #define KERNEL_ATTR_RO(_name) \ | 19 | #define KERNEL_ATTR_RO(_name) \ |
19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 20 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void) | |||
116 | &notes_attr); | 117 | &notes_attr); |
117 | } | 118 | } |
118 | 119 | ||
120 | /* | ||
121 | * Create "/sys/kernel/uids" directory and corresponding root user's | ||
122 | * directory under it. | ||
123 | */ | ||
124 | if (!error) | ||
125 | error = uids_kobject_init(); | ||
126 | |||
119 | return error; | 127 | return error; |
120 | } | 128 | } |
121 | 129 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 6c10fa796ca0..bba57adb9504 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
96 | /* | 96 | /* |
97 | * Some helpers for converting nanosecond timing to jiffy resolution | 97 | * Some helpers for converting nanosecond timing to jiffy resolution |
98 | */ | 98 | */ |
99 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 99 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) |
100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
101 | 101 | ||
102 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 102 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
@@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
105 | /* | 105 | /* |
106 | * These are the 'tuning knobs' of the scheduler: | 106 | * These are the 'tuning knobs' of the scheduler: |
107 | * | 107 | * |
108 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 108 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
109 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
110 | * Timeslices get refilled after they expire. | 109 | * Timeslices get refilled after they expire. |
111 | */ | 110 | */ |
112 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
113 | #define DEF_TIMESLICE (100 * HZ / 1000) | 111 | #define DEF_TIMESLICE (100 * HZ / 1000) |
114 | 112 | ||
115 | #ifdef CONFIG_SMP | 113 | #ifdef CONFIG_SMP |
@@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | |||
133 | } | 131 | } |
134 | #endif | 132 | #endif |
135 | 133 | ||
136 | #define SCALE_PRIO(x, prio) \ | ||
137 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
138 | |||
139 | /* | ||
140 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
141 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
142 | */ | ||
143 | static unsigned int static_prio_timeslice(int static_prio) | ||
144 | { | ||
145 | if (static_prio == NICE_TO_PRIO(19)) | ||
146 | return 1; | ||
147 | |||
148 | if (static_prio < NICE_TO_PRIO(0)) | ||
149 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
150 | else | ||
151 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
152 | } | ||
153 | |||
154 | static inline int rt_policy(int policy) | 134 | static inline int rt_policy(int policy) |
155 | { | 135 | { |
156 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 136 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) |
@@ -171,31 +151,91 @@ struct rt_prio_array { | |||
171 | struct list_head queue[MAX_RT_PRIO]; | 151 | struct list_head queue[MAX_RT_PRIO]; |
172 | }; | 152 | }; |
173 | 153 | ||
174 | struct load_stat { | 154 | #ifdef CONFIG_FAIR_GROUP_SCHED |
175 | struct load_weight load; | 155 | |
176 | u64 load_update_start, load_update_last; | 156 | struct cfs_rq; |
177 | unsigned long delta_fair, delta_exec, delta_stat; | 157 | |
158 | /* task group related information */ | ||
159 | struct task_group { | ||
160 | /* schedulable entities of this group on each cpu */ | ||
161 | struct sched_entity **se; | ||
162 | /* runqueue "owned" by this group on each cpu */ | ||
163 | struct cfs_rq **cfs_rq; | ||
164 | unsigned long shares; | ||
165 | /* spinlock to serialize modification to shares */ | ||
166 | spinlock_t lock; | ||
167 | }; | ||
168 | |||
169 | /* Default task group's sched entity on each cpu */ | ||
170 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
171 | /* Default task group's cfs_rq on each cpu */ | ||
172 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | ||
173 | |||
174 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
175 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
176 | |||
177 | /* Default task group. | ||
178 | * Every task in the system belongs to this group at bootup. | ||
179 | */ | ||
180 | struct task_group init_task_group = { | ||
181 | .se = init_sched_entity_p, | ||
182 | .cfs_rq = init_cfs_rq_p, | ||
178 | }; | 183 | }; |
179 | 184 | ||
185 | #ifdef CONFIG_FAIR_USER_SCHED | ||
186 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | ||
187 | #else | ||
188 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | ||
189 | #endif | ||
190 | |||
191 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | ||
192 | |||
193 | /* return group to which a task belongs */ | ||
194 | static inline struct task_group *task_group(struct task_struct *p) | ||
195 | { | ||
196 | struct task_group *tg; | ||
197 | |||
198 | #ifdef CONFIG_FAIR_USER_SCHED | ||
199 | tg = p->user->tg; | ||
200 | #else | ||
201 | tg = &init_task_group; | ||
202 | #endif | ||
203 | |||
204 | return tg; | ||
205 | } | ||
206 | |||
207 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
208 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
209 | { | ||
210 | p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; | ||
211 | p->se.parent = task_group(p)->se[task_cpu(p)]; | ||
212 | } | ||
213 | |||
214 | #else | ||
215 | |||
216 | static inline void set_task_cfs_rq(struct task_struct *p) { } | ||
217 | |||
218 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
219 | |||
180 | /* CFS-related fields in a runqueue */ | 220 | /* CFS-related fields in a runqueue */ |
181 | struct cfs_rq { | 221 | struct cfs_rq { |
182 | struct load_weight load; | 222 | struct load_weight load; |
183 | unsigned long nr_running; | 223 | unsigned long nr_running; |
184 | 224 | ||
185 | s64 fair_clock; | ||
186 | u64 exec_clock; | 225 | u64 exec_clock; |
187 | s64 wait_runtime; | 226 | u64 min_vruntime; |
188 | u64 sleeper_bonus; | ||
189 | unsigned long wait_runtime_overruns, wait_runtime_underruns; | ||
190 | 227 | ||
191 | struct rb_root tasks_timeline; | 228 | struct rb_root tasks_timeline; |
192 | struct rb_node *rb_leftmost; | 229 | struct rb_node *rb_leftmost; |
193 | struct rb_node *rb_load_balance_curr; | 230 | struct rb_node *rb_load_balance_curr; |
194 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
195 | /* 'curr' points to currently running entity on this cfs_rq. | 231 | /* 'curr' points to currently running entity on this cfs_rq. |
196 | * It is set to NULL otherwise (i.e when none are currently running). | 232 | * It is set to NULL otherwise (i.e when none are currently running). |
197 | */ | 233 | */ |
198 | struct sched_entity *curr; | 234 | struct sched_entity *curr; |
235 | |||
236 | unsigned long nr_spread_over; | ||
237 | |||
238 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
199 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 239 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
200 | 240 | ||
201 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 241 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
@@ -206,6 +246,8 @@ struct cfs_rq { | |||
206 | * list is used during load balance. | 246 | * list is used during load balance. |
207 | */ | 247 | */ |
208 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | 248 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ |
249 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
250 | struct rcu_head rcu; | ||
209 | #endif | 251 | #endif |
210 | }; | 252 | }; |
211 | 253 | ||
@@ -237,7 +279,7 @@ struct rq { | |||
237 | #ifdef CONFIG_NO_HZ | 279 | #ifdef CONFIG_NO_HZ |
238 | unsigned char in_nohz_recently; | 280 | unsigned char in_nohz_recently; |
239 | #endif | 281 | #endif |
240 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ | 282 | struct load_weight load; /* capture load from *all* tasks on this cpu */ |
241 | unsigned long nr_load_updates; | 283 | unsigned long nr_load_updates; |
242 | u64 nr_switches; | 284 | u64 nr_switches; |
243 | 285 | ||
@@ -289,16 +331,19 @@ struct rq { | |||
289 | unsigned long yld_exp_empty; | 331 | unsigned long yld_exp_empty; |
290 | unsigned long yld_act_empty; | 332 | unsigned long yld_act_empty; |
291 | unsigned long yld_both_empty; | 333 | unsigned long yld_both_empty; |
292 | unsigned long yld_cnt; | 334 | unsigned long yld_count; |
293 | 335 | ||
294 | /* schedule() stats */ | 336 | /* schedule() stats */ |
295 | unsigned long sched_switch; | 337 | unsigned long sched_switch; |
296 | unsigned long sched_cnt; | 338 | unsigned long sched_count; |
297 | unsigned long sched_goidle; | 339 | unsigned long sched_goidle; |
298 | 340 | ||
299 | /* try_to_wake_up() stats */ | 341 | /* try_to_wake_up() stats */ |
300 | unsigned long ttwu_cnt; | 342 | unsigned long ttwu_count; |
301 | unsigned long ttwu_local; | 343 | unsigned long ttwu_local; |
344 | |||
345 | /* BKL stats */ | ||
346 | unsigned long bkl_count; | ||
302 | #endif | 347 | #endif |
303 | struct lock_class_key rq_lock_key; | 348 | struct lock_class_key rq_lock_key; |
304 | }; | 349 | }; |
@@ -383,6 +428,37 @@ static void update_rq_clock(struct rq *rq) | |||
383 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 428 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
384 | 429 | ||
385 | /* | 430 | /* |
431 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
432 | */ | ||
433 | #ifdef CONFIG_SCHED_DEBUG | ||
434 | # define const_debug __read_mostly | ||
435 | #else | ||
436 | # define const_debug static const | ||
437 | #endif | ||
438 | |||
439 | /* | ||
440 | * Debugging: various feature bits | ||
441 | */ | ||
442 | enum { | ||
443 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | ||
444 | SCHED_FEAT_START_DEBIT = 2, | ||
445 | SCHED_FEAT_TREE_AVG = 4, | ||
446 | SCHED_FEAT_APPROX_AVG = 8, | ||
447 | SCHED_FEAT_WAKEUP_PREEMPT = 16, | ||
448 | SCHED_FEAT_PREEMPT_RESTRICT = 32, | ||
449 | }; | ||
450 | |||
451 | const_debug unsigned int sysctl_sched_features = | ||
452 | SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | | ||
453 | SCHED_FEAT_START_DEBIT *1 | | ||
454 | SCHED_FEAT_TREE_AVG *0 | | ||
455 | SCHED_FEAT_APPROX_AVG *0 | | ||
456 | SCHED_FEAT_WAKEUP_PREEMPT *1 | | ||
457 | SCHED_FEAT_PREEMPT_RESTRICT *1; | ||
458 | |||
459 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | ||
460 | |||
461 | /* | ||
386 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 462 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
387 | * clock constructed from sched_clock(): | 463 | * clock constructed from sched_clock(): |
388 | */ | 464 | */ |
@@ -400,18 +476,7 @@ unsigned long long cpu_clock(int cpu) | |||
400 | 476 | ||
401 | return now; | 477 | return now; |
402 | } | 478 | } |
403 | 479 | EXPORT_SYMBOL_GPL(cpu_clock); | |
404 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
405 | /* Change a task's ->cfs_rq if it moves across CPUs */ | ||
406 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
407 | { | ||
408 | p->se.cfs_rq = &task_rq(p)->cfs; | ||
409 | } | ||
410 | #else | ||
411 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
412 | { | ||
413 | } | ||
414 | #endif | ||
415 | 480 | ||
416 | #ifndef prepare_arch_switch | 481 | #ifndef prepare_arch_switch |
417 | # define prepare_arch_switch(next) do { } while (0) | 482 | # define prepare_arch_switch(next) do { } while (0) |
@@ -497,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
497 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 562 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
498 | __acquires(rq->lock) | 563 | __acquires(rq->lock) |
499 | { | 564 | { |
500 | struct rq *rq; | 565 | for (;;) { |
501 | 566 | struct rq *rq = task_rq(p); | |
502 | repeat_lock_task: | 567 | spin_lock(&rq->lock); |
503 | rq = task_rq(p); | 568 | if (likely(rq == task_rq(p))) |
504 | spin_lock(&rq->lock); | 569 | return rq; |
505 | if (unlikely(rq != task_rq(p))) { | ||
506 | spin_unlock(&rq->lock); | 570 | spin_unlock(&rq->lock); |
507 | goto repeat_lock_task; | ||
508 | } | 571 | } |
509 | return rq; | ||
510 | } | 572 | } |
511 | 573 | ||
512 | /* | 574 | /* |
@@ -519,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
519 | { | 581 | { |
520 | struct rq *rq; | 582 | struct rq *rq; |
521 | 583 | ||
522 | repeat_lock_task: | 584 | for (;;) { |
523 | local_irq_save(*flags); | 585 | local_irq_save(*flags); |
524 | rq = task_rq(p); | 586 | rq = task_rq(p); |
525 | spin_lock(&rq->lock); | 587 | spin_lock(&rq->lock); |
526 | if (unlikely(rq != task_rq(p))) { | 588 | if (likely(rq == task_rq(p))) |
589 | return rq; | ||
527 | spin_unlock_irqrestore(&rq->lock, *flags); | 590 | spin_unlock_irqrestore(&rq->lock, *flags); |
528 | goto repeat_lock_task; | ||
529 | } | 591 | } |
530 | return rq; | ||
531 | } | 592 | } |
532 | 593 | ||
533 | static inline void __task_rq_unlock(struct rq *rq) | 594 | static void __task_rq_unlock(struct rq *rq) |
534 | __releases(rq->lock) | 595 | __releases(rq->lock) |
535 | { | 596 | { |
536 | spin_unlock(&rq->lock); | 597 | spin_unlock(&rq->lock); |
@@ -545,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
545 | /* | 606 | /* |
546 | * this_rq_lock - lock this runqueue and disable interrupts. | 607 | * this_rq_lock - lock this runqueue and disable interrupts. |
547 | */ | 608 | */ |
548 | static inline struct rq *this_rq_lock(void) | 609 | static struct rq *this_rq_lock(void) |
549 | __acquires(rq->lock) | 610 | __acquires(rq->lock) |
550 | { | 611 | { |
551 | struct rq *rq; | 612 | struct rq *rq; |
@@ -645,19 +706,6 @@ static inline void resched_task(struct task_struct *p) | |||
645 | } | 706 | } |
646 | #endif | 707 | #endif |
647 | 708 | ||
648 | static u64 div64_likely32(u64 divident, unsigned long divisor) | ||
649 | { | ||
650 | #if BITS_PER_LONG == 32 | ||
651 | if (likely(divident <= 0xffffffffULL)) | ||
652 | return (u32)divident / divisor; | ||
653 | do_div(divident, divisor); | ||
654 | |||
655 | return divident; | ||
656 | #else | ||
657 | return divident / divisor; | ||
658 | #endif | ||
659 | } | ||
660 | |||
661 | #if BITS_PER_LONG == 32 | 709 | #if BITS_PER_LONG == 32 |
662 | # define WMULT_CONST (~0UL) | 710 | # define WMULT_CONST (~0UL) |
663 | #else | 711 | #else |
@@ -699,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | |||
699 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | 747 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
700 | } | 748 | } |
701 | 749 | ||
702 | static void update_load_add(struct load_weight *lw, unsigned long inc) | 750 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
703 | { | 751 | { |
704 | lw->weight += inc; | 752 | lw->weight += inc; |
705 | lw->inv_weight = 0; | ||
706 | } | 753 | } |
707 | 754 | ||
708 | static void update_load_sub(struct load_weight *lw, unsigned long dec) | 755 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
709 | { | 756 | { |
710 | lw->weight -= dec; | 757 | lw->weight -= dec; |
711 | lw->inv_weight = 0; | ||
712 | } | 758 | } |
713 | 759 | ||
714 | /* | 760 | /* |
@@ -784,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
784 | int *this_best_prio, struct rq_iterator *iterator); | 830 | int *this_best_prio, struct rq_iterator *iterator); |
785 | 831 | ||
786 | #include "sched_stats.h" | 832 | #include "sched_stats.h" |
787 | #include "sched_rt.c" | ||
788 | #include "sched_fair.c" | ||
789 | #include "sched_idletask.c" | 833 | #include "sched_idletask.c" |
834 | #include "sched_fair.c" | ||
835 | #include "sched_rt.c" | ||
790 | #ifdef CONFIG_SCHED_DEBUG | 836 | #ifdef CONFIG_SCHED_DEBUG |
791 | # include "sched_debug.c" | 837 | # include "sched_debug.c" |
792 | #endif | 838 | #endif |
793 | 839 | ||
794 | #define sched_class_highest (&rt_sched_class) | 840 | #define sched_class_highest (&rt_sched_class) |
795 | 841 | ||
796 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | ||
797 | { | ||
798 | if (rq->curr != rq->idle && ls->load.weight) { | ||
799 | ls->delta_exec += ls->delta_stat; | ||
800 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | ||
801 | ls->delta_stat = 0; | ||
802 | } | ||
803 | } | ||
804 | |||
805 | /* | 842 | /* |
806 | * Update delta_exec, delta_fair fields for rq. | 843 | * Update delta_exec, delta_fair fields for rq. |
807 | * | 844 | * |
808 | * delta_fair clock advances at a rate inversely proportional to | 845 | * delta_fair clock advances at a rate inversely proportional to |
809 | * total load (rq->ls.load.weight) on the runqueue, while | 846 | * total load (rq->load.weight) on the runqueue, while |
810 | * delta_exec advances at the same rate as wall-clock (provided | 847 | * delta_exec advances at the same rate as wall-clock (provided |
811 | * cpu is not idle). | 848 | * cpu is not idle). |
812 | * | 849 | * |
@@ -814,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) | |||
814 | * runqueue over any given interval. This (smoothened) load is used | 851 | * runqueue over any given interval. This (smoothened) load is used |
815 | * during load balance. | 852 | * during load balance. |
816 | * | 853 | * |
817 | * This function is called /before/ updating rq->ls.load | 854 | * This function is called /before/ updating rq->load |
818 | * and when switching tasks. | 855 | * and when switching tasks. |
819 | */ | 856 | */ |
820 | static void update_curr_load(struct rq *rq) | ||
821 | { | ||
822 | struct load_stat *ls = &rq->ls; | ||
823 | u64 start; | ||
824 | |||
825 | start = ls->load_update_start; | ||
826 | ls->load_update_start = rq->clock; | ||
827 | ls->delta_stat += rq->clock - start; | ||
828 | /* | ||
829 | * Stagger updates to ls->delta_fair. Very frequent updates | ||
830 | * can be expensive. | ||
831 | */ | ||
832 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
833 | __update_curr_load(rq, ls); | ||
834 | } | ||
835 | |||
836 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 857 | static inline void inc_load(struct rq *rq, const struct task_struct *p) |
837 | { | 858 | { |
838 | update_curr_load(rq); | 859 | update_load_add(&rq->load, p->se.load.weight); |
839 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
840 | } | 860 | } |
841 | 861 | ||
842 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | 862 | static inline void dec_load(struct rq *rq, const struct task_struct *p) |
843 | { | 863 | { |
844 | update_curr_load(rq); | 864 | update_load_sub(&rq->load, p->se.load.weight); |
845 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
846 | } | 865 | } |
847 | 866 | ||
848 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 867 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
@@ -859,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) | |||
859 | 878 | ||
860 | static void set_load_weight(struct task_struct *p) | 879 | static void set_load_weight(struct task_struct *p) |
861 | { | 880 | { |
862 | p->se.wait_runtime = 0; | ||
863 | |||
864 | if (task_has_rt_policy(p)) { | 881 | if (task_has_rt_policy(p)) { |
865 | p->se.load.weight = prio_to_weight[0] * 2; | 882 | p->se.load.weight = prio_to_weight[0] * 2; |
866 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | 883 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
@@ -952,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
952 | } | 969 | } |
953 | 970 | ||
954 | /* | 971 | /* |
955 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
956 | */ | ||
957 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
958 | { | ||
959 | update_rq_clock(rq); | ||
960 | |||
961 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
962 | rq->nr_uninterruptible--; | ||
963 | |||
964 | enqueue_task(rq, p, 0); | ||
965 | inc_nr_running(p, rq); | ||
966 | } | ||
967 | |||
968 | /* | ||
969 | * deactivate_task - remove a task from the runqueue. | 972 | * deactivate_task - remove a task from the runqueue. |
970 | */ | 973 | */ |
971 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 974 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
@@ -989,32 +992,50 @@ inline int task_curr(const struct task_struct *p) | |||
989 | /* Used instead of source_load when we know the type == 0 */ | 992 | /* Used instead of source_load when we know the type == 0 */ |
990 | unsigned long weighted_cpuload(const int cpu) | 993 | unsigned long weighted_cpuload(const int cpu) |
991 | { | 994 | { |
992 | return cpu_rq(cpu)->ls.load.weight; | 995 | return cpu_rq(cpu)->load.weight; |
993 | } | 996 | } |
994 | 997 | ||
995 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 998 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
996 | { | 999 | { |
997 | #ifdef CONFIG_SMP | 1000 | #ifdef CONFIG_SMP |
998 | task_thread_info(p)->cpu = cpu; | 1001 | task_thread_info(p)->cpu = cpu; |
999 | set_task_cfs_rq(p); | ||
1000 | #endif | 1002 | #endif |
1003 | set_task_cfs_rq(p); | ||
1001 | } | 1004 | } |
1002 | 1005 | ||
1003 | #ifdef CONFIG_SMP | 1006 | #ifdef CONFIG_SMP |
1004 | 1007 | ||
1008 | /* | ||
1009 | * Is this task likely cache-hot: | ||
1010 | */ | ||
1011 | static inline int | ||
1012 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
1013 | { | ||
1014 | s64 delta; | ||
1015 | |||
1016 | if (p->sched_class != &fair_sched_class) | ||
1017 | return 0; | ||
1018 | |||
1019 | if (sysctl_sched_migration_cost == -1) | ||
1020 | return 1; | ||
1021 | if (sysctl_sched_migration_cost == 0) | ||
1022 | return 0; | ||
1023 | |||
1024 | delta = now - p->se.exec_start; | ||
1025 | |||
1026 | return delta < (s64)sysctl_sched_migration_cost; | ||
1027 | } | ||
1028 | |||
1029 | |||
1005 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1030 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1006 | { | 1031 | { |
1007 | int old_cpu = task_cpu(p); | 1032 | int old_cpu = task_cpu(p); |
1008 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 1033 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); |
1009 | u64 clock_offset, fair_clock_offset; | 1034 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), |
1035 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | ||
1036 | u64 clock_offset; | ||
1010 | 1037 | ||
1011 | clock_offset = old_rq->clock - new_rq->clock; | 1038 | clock_offset = old_rq->clock - new_rq->clock; |
1012 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; | ||
1013 | |||
1014 | if (p->se.wait_start_fair) | ||
1015 | p->se.wait_start_fair -= fair_clock_offset; | ||
1016 | if (p->se.sleep_start_fair) | ||
1017 | p->se.sleep_start_fair -= fair_clock_offset; | ||
1018 | 1039 | ||
1019 | #ifdef CONFIG_SCHEDSTATS | 1040 | #ifdef CONFIG_SCHEDSTATS |
1020 | if (p->se.wait_start) | 1041 | if (p->se.wait_start) |
@@ -1023,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1023 | p->se.sleep_start -= clock_offset; | 1044 | p->se.sleep_start -= clock_offset; |
1024 | if (p->se.block_start) | 1045 | if (p->se.block_start) |
1025 | p->se.block_start -= clock_offset; | 1046 | p->se.block_start -= clock_offset; |
1047 | if (old_cpu != new_cpu) { | ||
1048 | schedstat_inc(p, se.nr_migrations); | ||
1049 | if (task_hot(p, old_rq->clock, NULL)) | ||
1050 | schedstat_inc(p, se.nr_forced2_migrations); | ||
1051 | } | ||
1026 | #endif | 1052 | #endif |
1053 | p->se.vruntime -= old_cfsrq->min_vruntime - | ||
1054 | new_cfsrq->min_vruntime; | ||
1027 | 1055 | ||
1028 | __set_task_cpu(p, new_cpu); | 1056 | __set_task_cpu(p, new_cpu); |
1029 | } | 1057 | } |
@@ -1078,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p) | |||
1078 | int running, on_rq; | 1106 | int running, on_rq; |
1079 | struct rq *rq; | 1107 | struct rq *rq; |
1080 | 1108 | ||
1081 | repeat: | 1109 | for (;;) { |
1082 | /* | 1110 | /* |
1083 | * We do the initial early heuristics without holding | 1111 | * We do the initial early heuristics without holding |
1084 | * any task-queue locks at all. We'll only try to get | 1112 | * any task-queue locks at all. We'll only try to get |
1085 | * the runqueue lock when things look like they will | 1113 | * the runqueue lock when things look like they will |
1086 | * work out! | 1114 | * work out! |
1087 | */ | 1115 | */ |
1088 | rq = task_rq(p); | 1116 | rq = task_rq(p); |
1089 | 1117 | ||
1090 | /* | 1118 | /* |
1091 | * If the task is actively running on another CPU | 1119 | * If the task is actively running on another CPU |
1092 | * still, just relax and busy-wait without holding | 1120 | * still, just relax and busy-wait without holding |
1093 | * any locks. | 1121 | * any locks. |
1094 | * | 1122 | * |
1095 | * NOTE! Since we don't hold any locks, it's not | 1123 | * NOTE! Since we don't hold any locks, it's not |
1096 | * even sure that "rq" stays as the right runqueue! | 1124 | * even sure that "rq" stays as the right runqueue! |
1097 | * But we don't care, since "task_running()" will | 1125 | * But we don't care, since "task_running()" will |
1098 | * return false if the runqueue has changed and p | 1126 | * return false if the runqueue has changed and p |
1099 | * is actually now running somewhere else! | 1127 | * is actually now running somewhere else! |
1100 | */ | 1128 | */ |
1101 | while (task_running(rq, p)) | 1129 | while (task_running(rq, p)) |
1102 | cpu_relax(); | 1130 | cpu_relax(); |
1103 | 1131 | ||
1104 | /* | 1132 | /* |
1105 | * Ok, time to look more closely! We need the rq | 1133 | * Ok, time to look more closely! We need the rq |
1106 | * lock now, to be *sure*. If we're wrong, we'll | 1134 | * lock now, to be *sure*. If we're wrong, we'll |
1107 | * just go back and repeat. | 1135 | * just go back and repeat. |
1108 | */ | 1136 | */ |
1109 | rq = task_rq_lock(p, &flags); | 1137 | rq = task_rq_lock(p, &flags); |
1110 | running = task_running(rq, p); | 1138 | running = task_running(rq, p); |
1111 | on_rq = p->se.on_rq; | 1139 | on_rq = p->se.on_rq; |
1112 | task_rq_unlock(rq, &flags); | 1140 | task_rq_unlock(rq, &flags); |
1113 | 1141 | ||
1114 | /* | 1142 | /* |
1115 | * Was it really running after all now that we | 1143 | * Was it really running after all now that we |
1116 | * checked with the proper locks actually held? | 1144 | * checked with the proper locks actually held? |
1117 | * | 1145 | * |
1118 | * Oops. Go back and try again.. | 1146 | * Oops. Go back and try again.. |
1119 | */ | 1147 | */ |
1120 | if (unlikely(running)) { | 1148 | if (unlikely(running)) { |
1121 | cpu_relax(); | 1149 | cpu_relax(); |
1122 | goto repeat; | 1150 | continue; |
1123 | } | 1151 | } |
1124 | 1152 | ||
1125 | /* | 1153 | /* |
1126 | * It's not enough that it's not actively running, | 1154 | * It's not enough that it's not actively running, |
1127 | * it must be off the runqueue _entirely_, and not | 1155 | * it must be off the runqueue _entirely_, and not |
1128 | * preempted! | 1156 | * preempted! |
1129 | * | 1157 | * |
1130 | * So if it wa still runnable (but just not actively | 1158 | * So if it wa still runnable (but just not actively |
1131 | * running right now), it's preempted, and we should | 1159 | * running right now), it's preempted, and we should |
1132 | * yield - it could be a while. | 1160 | * yield - it could be a while. |
1133 | */ | 1161 | */ |
1134 | if (unlikely(on_rq)) { | 1162 | if (unlikely(on_rq)) { |
1135 | yield(); | 1163 | schedule_timeout_uninterruptible(1); |
1136 | goto repeat; | 1164 | continue; |
1137 | } | 1165 | } |
1138 | 1166 | ||
1139 | /* | 1167 | /* |
1140 | * Ahh, all good. It wasn't running, and it wasn't | 1168 | * Ahh, all good. It wasn't running, and it wasn't |
1141 | * runnable, which means that it will never become | 1169 | * runnable, which means that it will never become |
1142 | * running in the future either. We're all done! | 1170 | * running in the future either. We're all done! |
1143 | */ | 1171 | */ |
1172 | break; | ||
1173 | } | ||
1144 | } | 1174 | } |
1145 | 1175 | ||
1146 | /*** | 1176 | /*** |
@@ -1174,7 +1204,7 @@ void kick_process(struct task_struct *p) | |||
1174 | * We want to under-estimate the load of migration sources, to | 1204 | * We want to under-estimate the load of migration sources, to |
1175 | * balance conservatively. | 1205 | * balance conservatively. |
1176 | */ | 1206 | */ |
1177 | static inline unsigned long source_load(int cpu, int type) | 1207 | static unsigned long source_load(int cpu, int type) |
1178 | { | 1208 | { |
1179 | struct rq *rq = cpu_rq(cpu); | 1209 | struct rq *rq = cpu_rq(cpu); |
1180 | unsigned long total = weighted_cpuload(cpu); | 1210 | unsigned long total = weighted_cpuload(cpu); |
@@ -1189,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type) | |||
1189 | * Return a high guess at the load of a migration-target cpu weighted | 1219 | * Return a high guess at the load of a migration-target cpu weighted |
1190 | * according to the scheduling class and "nice" value. | 1220 | * according to the scheduling class and "nice" value. |
1191 | */ | 1221 | */ |
1192 | static inline unsigned long target_load(int cpu, int type) | 1222 | static unsigned long target_load(int cpu, int type) |
1193 | { | 1223 | { |
1194 | struct rq *rq = cpu_rq(cpu); | 1224 | struct rq *rq = cpu_rq(cpu); |
1195 | unsigned long total = weighted_cpuload(cpu); | 1225 | unsigned long total = weighted_cpuload(cpu); |
@@ -1231,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1231 | 1261 | ||
1232 | /* Skip over this group if it has no CPUs allowed */ | 1262 | /* Skip over this group if it has no CPUs allowed */ |
1233 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1263 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
1234 | goto nextgroup; | 1264 | continue; |
1235 | 1265 | ||
1236 | local_group = cpu_isset(this_cpu, group->cpumask); | 1266 | local_group = cpu_isset(this_cpu, group->cpumask); |
1237 | 1267 | ||
@@ -1259,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1259 | min_load = avg_load; | 1289 | min_load = avg_load; |
1260 | idlest = group; | 1290 | idlest = group; |
1261 | } | 1291 | } |
1262 | nextgroup: | 1292 | } while (group = group->next, group != sd->groups); |
1263 | group = group->next; | ||
1264 | } while (group != sd->groups); | ||
1265 | 1293 | ||
1266 | if (!idlest || 100*this_load < imbalance*min_load) | 1294 | if (!idlest || 100*this_load < imbalance*min_load) |
1267 | return NULL; | 1295 | return NULL; |
@@ -1393,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
1393 | if (sd->flags & SD_WAKE_IDLE) { | 1421 | if (sd->flags & SD_WAKE_IDLE) { |
1394 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1422 | cpus_and(tmp, sd->span, p->cpus_allowed); |
1395 | for_each_cpu_mask(i, tmp) { | 1423 | for_each_cpu_mask(i, tmp) { |
1396 | if (idle_cpu(i)) | 1424 | if (idle_cpu(i)) { |
1425 | if (i != task_cpu(p)) { | ||
1426 | schedstat_inc(p, | ||
1427 | se.nr_wakeups_idle); | ||
1428 | } | ||
1397 | return i; | 1429 | return i; |
1430 | } | ||
1398 | } | 1431 | } |
1399 | } else { | 1432 | } else { |
1400 | break; | 1433 | break; |
@@ -1425,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
1425 | */ | 1458 | */ |
1426 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1459 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1427 | { | 1460 | { |
1428 | int cpu, this_cpu, success = 0; | 1461 | int cpu, orig_cpu, this_cpu, success = 0; |
1429 | unsigned long flags; | 1462 | unsigned long flags; |
1430 | long old_state; | 1463 | long old_state; |
1431 | struct rq *rq; | 1464 | struct rq *rq; |
@@ -1444,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1444 | goto out_running; | 1477 | goto out_running; |
1445 | 1478 | ||
1446 | cpu = task_cpu(p); | 1479 | cpu = task_cpu(p); |
1480 | orig_cpu = cpu; | ||
1447 | this_cpu = smp_processor_id(); | 1481 | this_cpu = smp_processor_id(); |
1448 | 1482 | ||
1449 | #ifdef CONFIG_SMP | 1483 | #ifdef CONFIG_SMP |
@@ -1452,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1452 | 1486 | ||
1453 | new_cpu = cpu; | 1487 | new_cpu = cpu; |
1454 | 1488 | ||
1455 | schedstat_inc(rq, ttwu_cnt); | 1489 | schedstat_inc(rq, ttwu_count); |
1456 | if (cpu == this_cpu) { | 1490 | if (cpu == this_cpu) { |
1457 | schedstat_inc(rq, ttwu_local); | 1491 | schedstat_inc(rq, ttwu_local); |
1458 | goto out_set_cpu; | 1492 | goto out_set_cpu; |
@@ -1487,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1487 | unsigned long tl = this_load; | 1521 | unsigned long tl = this_load; |
1488 | unsigned long tl_per_task; | 1522 | unsigned long tl_per_task; |
1489 | 1523 | ||
1524 | /* | ||
1525 | * Attract cache-cold tasks on sync wakeups: | ||
1526 | */ | ||
1527 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1528 | goto out_set_cpu; | ||
1529 | |||
1530 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1490 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1531 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1491 | 1532 | ||
1492 | /* | 1533 | /* |
@@ -1506,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1506 | * there is no bad imbalance. | 1547 | * there is no bad imbalance. |
1507 | */ | 1548 | */ |
1508 | schedstat_inc(this_sd, ttwu_move_affine); | 1549 | schedstat_inc(this_sd, ttwu_move_affine); |
1550 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1509 | goto out_set_cpu; | 1551 | goto out_set_cpu; |
1510 | } | 1552 | } |
1511 | } | 1553 | } |
@@ -1517,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1517 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1559 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1518 | if (imbalance*this_load <= 100*load) { | 1560 | if (imbalance*this_load <= 100*load) { |
1519 | schedstat_inc(this_sd, ttwu_move_balance); | 1561 | schedstat_inc(this_sd, ttwu_move_balance); |
1562 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1520 | goto out_set_cpu; | 1563 | goto out_set_cpu; |
1521 | } | 1564 | } |
1522 | } | 1565 | } |
@@ -1542,18 +1585,18 @@ out_set_cpu: | |||
1542 | 1585 | ||
1543 | out_activate: | 1586 | out_activate: |
1544 | #endif /* CONFIG_SMP */ | 1587 | #endif /* CONFIG_SMP */ |
1588 | schedstat_inc(p, se.nr_wakeups); | ||
1589 | if (sync) | ||
1590 | schedstat_inc(p, se.nr_wakeups_sync); | ||
1591 | if (orig_cpu != cpu) | ||
1592 | schedstat_inc(p, se.nr_wakeups_migrate); | ||
1593 | if (cpu == this_cpu) | ||
1594 | schedstat_inc(p, se.nr_wakeups_local); | ||
1595 | else | ||
1596 | schedstat_inc(p, se.nr_wakeups_remote); | ||
1545 | update_rq_clock(rq); | 1597 | update_rq_clock(rq); |
1546 | activate_task(rq, p, 1); | 1598 | activate_task(rq, p, 1); |
1547 | /* | 1599 | check_preempt_curr(rq, p); |
1548 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
1549 | * has indicated that it will leave the CPU in short order) | ||
1550 | * don't trigger a preemption, if the woken up task will run on | ||
1551 | * this cpu. (in this case the 'I will reschedule' promise of | ||
1552 | * the waker guarantees that the freshly woken up task is going | ||
1553 | * to be considered on this CPU.) | ||
1554 | */ | ||
1555 | if (!sync || cpu != this_cpu) | ||
1556 | check_preempt_curr(rq, p); | ||
1557 | success = 1; | 1600 | success = 1; |
1558 | 1601 | ||
1559 | out_running: | 1602 | out_running: |
@@ -1584,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
1584 | */ | 1627 | */ |
1585 | static void __sched_fork(struct task_struct *p) | 1628 | static void __sched_fork(struct task_struct *p) |
1586 | { | 1629 | { |
1587 | p->se.wait_start_fair = 0; | ||
1588 | p->se.exec_start = 0; | 1630 | p->se.exec_start = 0; |
1589 | p->se.sum_exec_runtime = 0; | 1631 | p->se.sum_exec_runtime = 0; |
1590 | p->se.prev_sum_exec_runtime = 0; | 1632 | p->se.prev_sum_exec_runtime = 0; |
1591 | p->se.delta_exec = 0; | ||
1592 | p->se.delta_fair_run = 0; | ||
1593 | p->se.delta_fair_sleep = 0; | ||
1594 | p->se.wait_runtime = 0; | ||
1595 | p->se.sleep_start_fair = 0; | ||
1596 | 1633 | ||
1597 | #ifdef CONFIG_SCHEDSTATS | 1634 | #ifdef CONFIG_SCHEDSTATS |
1598 | p->se.wait_start = 0; | 1635 | p->se.wait_start = 0; |
1599 | p->se.sum_wait_runtime = 0; | ||
1600 | p->se.sum_sleep_runtime = 0; | 1636 | p->se.sum_sleep_runtime = 0; |
1601 | p->se.sleep_start = 0; | 1637 | p->se.sleep_start = 0; |
1602 | p->se.block_start = 0; | 1638 | p->se.block_start = 0; |
1603 | p->se.sleep_max = 0; | 1639 | p->se.sleep_max = 0; |
1604 | p->se.block_max = 0; | 1640 | p->se.block_max = 0; |
1605 | p->se.exec_max = 0; | 1641 | p->se.exec_max = 0; |
1642 | p->se.slice_max = 0; | ||
1606 | p->se.wait_max = 0; | 1643 | p->se.wait_max = 0; |
1607 | p->se.wait_runtime_overruns = 0; | ||
1608 | p->se.wait_runtime_underruns = 0; | ||
1609 | #endif | 1644 | #endif |
1610 | 1645 | ||
1611 | INIT_LIST_HEAD(&p->run_list); | 1646 | INIT_LIST_HEAD(&p->run_list); |
@@ -1636,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
1636 | #ifdef CONFIG_SMP | 1671 | #ifdef CONFIG_SMP |
1637 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1672 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
1638 | #endif | 1673 | #endif |
1639 | __set_task_cpu(p, cpu); | 1674 | set_task_cpu(p, cpu); |
1640 | 1675 | ||
1641 | /* | 1676 | /* |
1642 | * Make sure we do not leak PI boosting priority to the child: | 1677 | * Make sure we do not leak PI boosting priority to the child: |
1643 | */ | 1678 | */ |
1644 | p->prio = current->normal_prio; | 1679 | p->prio = current->normal_prio; |
1680 | if (!rt_prio(p->prio)) | ||
1681 | p->sched_class = &fair_sched_class; | ||
1645 | 1682 | ||
1646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1683 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1647 | if (likely(sched_info_on())) | 1684 | if (likely(sched_info_on())) |
@@ -1658,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
1658 | } | 1695 | } |
1659 | 1696 | ||
1660 | /* | 1697 | /* |
1661 | * After fork, child runs first. (default) If set to 0 then | ||
1662 | * parent will (try to) run first. | ||
1663 | */ | ||
1664 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
1665 | |||
1666 | /* | ||
1667 | * wake_up_new_task - wake up a newly created task for the first time. | 1698 | * wake_up_new_task - wake up a newly created task for the first time. |
1668 | * | 1699 | * |
1669 | * This function will do some initial scheduler statistics housekeeping | 1700 | * This function will do some initial scheduler statistics housekeeping |
@@ -1674,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1674 | { | 1705 | { |
1675 | unsigned long flags; | 1706 | unsigned long flags; |
1676 | struct rq *rq; | 1707 | struct rq *rq; |
1677 | int this_cpu; | ||
1678 | 1708 | ||
1679 | rq = task_rq_lock(p, &flags); | 1709 | rq = task_rq_lock(p, &flags); |
1680 | BUG_ON(p->state != TASK_RUNNING); | 1710 | BUG_ON(p->state != TASK_RUNNING); |
1681 | this_cpu = smp_processor_id(); /* parent's CPU */ | ||
1682 | update_rq_clock(rq); | 1711 | update_rq_clock(rq); |
1683 | 1712 | ||
1684 | p->prio = effective_prio(p); | 1713 | p->prio = effective_prio(p); |
1685 | 1714 | ||
1686 | if (rt_prio(p->prio)) | 1715 | if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) { |
1687 | p->sched_class = &rt_sched_class; | ||
1688 | else | ||
1689 | p->sched_class = &fair_sched_class; | ||
1690 | |||
1691 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || | ||
1692 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || | ||
1693 | !current->se.on_rq) { | ||
1694 | |||
1695 | activate_task(rq, p, 0); | 1716 | activate_task(rq, p, 0); |
1696 | } else { | 1717 | } else { |
1697 | /* | 1718 | /* |
@@ -1800,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1800 | * with the lock held can cause deadlocks; see schedule() for | 1821 | * with the lock held can cause deadlocks; see schedule() for |
1801 | * details.) | 1822 | * details.) |
1802 | */ | 1823 | */ |
1803 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | 1824 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1804 | __releases(rq->lock) | 1825 | __releases(rq->lock) |
1805 | { | 1826 | { |
1806 | struct mm_struct *mm = rq->prev_mm; | 1827 | struct mm_struct *mm = rq->prev_mm; |
@@ -1982,42 +2003,10 @@ unsigned long nr_active(void) | |||
1982 | */ | 2003 | */ |
1983 | static void update_cpu_load(struct rq *this_rq) | 2004 | static void update_cpu_load(struct rq *this_rq) |
1984 | { | 2005 | { |
1985 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; | 2006 | unsigned long this_load = this_rq->load.weight; |
1986 | unsigned long total_load = this_rq->ls.load.weight; | ||
1987 | unsigned long this_load = total_load; | ||
1988 | struct load_stat *ls = &this_rq->ls; | ||
1989 | int i, scale; | 2007 | int i, scale; |
1990 | 2008 | ||
1991 | this_rq->nr_load_updates++; | 2009 | this_rq->nr_load_updates++; |
1992 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
1993 | goto do_avg; | ||
1994 | |||
1995 | /* Update delta_fair/delta_exec fields first */ | ||
1996 | update_curr_load(this_rq); | ||
1997 | |||
1998 | fair_delta64 = ls->delta_fair + 1; | ||
1999 | ls->delta_fair = 0; | ||
2000 | |||
2001 | exec_delta64 = ls->delta_exec + 1; | ||
2002 | ls->delta_exec = 0; | ||
2003 | |||
2004 | sample_interval64 = this_rq->clock - ls->load_update_last; | ||
2005 | ls->load_update_last = this_rq->clock; | ||
2006 | |||
2007 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
2008 | sample_interval64 = TICK_NSEC; | ||
2009 | |||
2010 | if (exec_delta64 > sample_interval64) | ||
2011 | exec_delta64 = sample_interval64; | ||
2012 | |||
2013 | idle_delta64 = sample_interval64 - exec_delta64; | ||
2014 | |||
2015 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
2016 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
2017 | |||
2018 | this_load = (unsigned long)tmp64; | ||
2019 | |||
2020 | do_avg: | ||
2021 | 2010 | ||
2022 | /* Update our load: */ | 2011 | /* Update our load: */ |
2023 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2012 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2027,7 +2016,13 @@ do_avg: | |||
2027 | 2016 | ||
2028 | old_load = this_rq->cpu_load[i]; | 2017 | old_load = this_rq->cpu_load[i]; |
2029 | new_load = this_load; | 2018 | new_load = this_load; |
2030 | 2019 | /* | |
2020 | * Round up the averaging division if load is increasing. This | ||
2021 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2022 | * example. | ||
2023 | */ | ||
2024 | if (new_load > old_load) | ||
2025 | new_load += scale-1; | ||
2031 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2026 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2032 | } | 2027 | } |
2033 | } | 2028 | } |
@@ -2179,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2179 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2174 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2180 | * 3) are cache-hot on their current CPU. | 2175 | * 3) are cache-hot on their current CPU. |
2181 | */ | 2176 | */ |
2182 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 2177 | if (!cpu_isset(this_cpu, p->cpus_allowed)) { |
2178 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
2183 | return 0; | 2179 | return 0; |
2180 | } | ||
2184 | *all_pinned = 0; | 2181 | *all_pinned = 0; |
2185 | 2182 | ||
2186 | if (task_running(rq, p)) | 2183 | if (task_running(rq, p)) { |
2184 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
2187 | return 0; | 2185 | return 0; |
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * Aggressive migration if: | ||
2190 | * 1) task is cache cold, or | ||
2191 | * 2) too many balance attempts have failed. | ||
2192 | */ | ||
2193 | |||
2194 | if (!task_hot(p, rq->clock, sd) || | ||
2195 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
2196 | #ifdef CONFIG_SCHEDSTATS | ||
2197 | if (task_hot(p, rq->clock, sd)) { | ||
2198 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2199 | schedstat_inc(p, se.nr_forced_migrations); | ||
2200 | } | ||
2201 | #endif | ||
2202 | return 1; | ||
2203 | } | ||
2188 | 2204 | ||
2205 | if (task_hot(p, rq->clock, sd)) { | ||
2206 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
2207 | return 0; | ||
2208 | } | ||
2189 | return 1; | 2209 | return 1; |
2190 | } | 2210 | } |
2191 | 2211 | ||
@@ -2264,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2264 | struct sched_domain *sd, enum cpu_idle_type idle, | 2284 | struct sched_domain *sd, enum cpu_idle_type idle, |
2265 | int *all_pinned) | 2285 | int *all_pinned) |
2266 | { | 2286 | { |
2267 | struct sched_class *class = sched_class_highest; | 2287 | const struct sched_class *class = sched_class_highest; |
2268 | unsigned long total_load_moved = 0; | 2288 | unsigned long total_load_moved = 0; |
2269 | int this_best_prio = this_rq->curr->prio; | 2289 | int this_best_prio = this_rq->curr->prio; |
2270 | 2290 | ||
@@ -2289,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2289 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2309 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2290 | struct sched_domain *sd, enum cpu_idle_type idle) | 2310 | struct sched_domain *sd, enum cpu_idle_type idle) |
2291 | { | 2311 | { |
2292 | struct sched_class *class; | 2312 | const struct sched_class *class; |
2293 | int this_best_prio = MAX_PRIO; | 2313 | int this_best_prio = MAX_PRIO; |
2294 | 2314 | ||
2295 | for (class = sched_class_highest; class; class = class->next) | 2315 | for (class = sched_class_highest; class; class = class->next) |
@@ -2653,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2653 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2673 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2654 | sd_idle = 1; | 2674 | sd_idle = 1; |
2655 | 2675 | ||
2656 | schedstat_inc(sd, lb_cnt[idle]); | 2676 | schedstat_inc(sd, lb_count[idle]); |
2657 | 2677 | ||
2658 | redo: | 2678 | redo: |
2659 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2679 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
@@ -2806,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
2806 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2826 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2807 | sd_idle = 1; | 2827 | sd_idle = 1; |
2808 | 2828 | ||
2809 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); | 2829 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
2810 | redo: | 2830 | redo: |
2811 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 2831 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
2812 | &sd_idle, &cpus, NULL); | 2832 | &sd_idle, &cpus, NULL); |
@@ -2940,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2940 | } | 2960 | } |
2941 | 2961 | ||
2942 | if (likely(sd)) { | 2962 | if (likely(sd)) { |
2943 | schedstat_inc(sd, alb_cnt); | 2963 | schedstat_inc(sd, alb_count); |
2944 | 2964 | ||
2945 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 2965 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
2946 | sd, CPU_IDLE)) | 2966 | sd, CPU_IDLE)) |
@@ -3033,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing); | |||
3033 | * | 3053 | * |
3034 | * Balancing parameters are set up in arch_init_sched_domains. | 3054 | * Balancing parameters are set up in arch_init_sched_domains. |
3035 | */ | 3055 | */ |
3036 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | 3056 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
3037 | { | 3057 | { |
3038 | int balance = 1; | 3058 | int balance = 1; |
3039 | struct rq *rq = cpu_rq(cpu); | 3059 | struct rq *rq = cpu_rq(cpu); |
@@ -3280,6 +3300,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
3280 | } | 3300 | } |
3281 | 3301 | ||
3282 | /* | 3302 | /* |
3303 | * Account guest cpu time to a process. | ||
3304 | * @p: the process that the cpu time gets accounted to | ||
3305 | * @cputime: the cpu time spent in virtual machine since the last update | ||
3306 | */ | ||
3307 | void account_guest_time(struct task_struct *p, cputime_t cputime) | ||
3308 | { | ||
3309 | cputime64_t tmp; | ||
3310 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3311 | |||
3312 | tmp = cputime_to_cputime64(cputime); | ||
3313 | |||
3314 | p->utime = cputime_add(p->utime, cputime); | ||
3315 | p->gtime = cputime_add(p->gtime, cputime); | ||
3316 | |||
3317 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
3318 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
3319 | } | ||
3320 | |||
3321 | /* | ||
3283 | * Account system cpu time to a process. | 3322 | * Account system cpu time to a process. |
3284 | * @p: the process that the cpu time gets accounted to | 3323 | * @p: the process that the cpu time gets accounted to |
3285 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3324 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3292,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3292 | struct rq *rq = this_rq(); | 3331 | struct rq *rq = this_rq(); |
3293 | cputime64_t tmp; | 3332 | cputime64_t tmp; |
3294 | 3333 | ||
3334 | if (p->flags & PF_VCPU) { | ||
3335 | account_guest_time(p, cputime); | ||
3336 | p->flags &= ~PF_VCPU; | ||
3337 | return; | ||
3338 | } | ||
3339 | |||
3295 | p->stime = cputime_add(p->stime, cputime); | 3340 | p->stime = cputime_add(p->stime, cputime); |
3296 | 3341 | ||
3297 | /* Add system time to cpustat. */ | 3342 | /* Add system time to cpustat. */ |
@@ -3430,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3430 | 3475 | ||
3431 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3476 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3432 | 3477 | ||
3433 | schedstat_inc(this_rq(), sched_cnt); | 3478 | schedstat_inc(this_rq(), sched_count); |
3479 | #ifdef CONFIG_SCHEDSTATS | ||
3480 | if (unlikely(prev->lock_depth >= 0)) { | ||
3481 | schedstat_inc(this_rq(), bkl_count); | ||
3482 | schedstat_inc(prev, sched_info.bkl_count); | ||
3483 | } | ||
3484 | #endif | ||
3434 | } | 3485 | } |
3435 | 3486 | ||
3436 | /* | 3487 | /* |
@@ -3439,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3439 | static inline struct task_struct * | 3490 | static inline struct task_struct * |
3440 | pick_next_task(struct rq *rq, struct task_struct *prev) | 3491 | pick_next_task(struct rq *rq, struct task_struct *prev) |
3441 | { | 3492 | { |
3442 | struct sched_class *class; | 3493 | const struct sched_class *class; |
3443 | struct task_struct *p; | 3494 | struct task_struct *p; |
3444 | 3495 | ||
3445 | /* | 3496 | /* |
@@ -3488,9 +3539,13 @@ need_resched_nonpreemptible: | |||
3488 | 3539 | ||
3489 | schedule_debug(prev); | 3540 | schedule_debug(prev); |
3490 | 3541 | ||
3491 | spin_lock_irq(&rq->lock); | 3542 | /* |
3492 | clear_tsk_need_resched(prev); | 3543 | * Do the rq-clock update outside the rq lock: |
3544 | */ | ||
3545 | local_irq_disable(); | ||
3493 | __update_rq_clock(rq); | 3546 | __update_rq_clock(rq); |
3547 | spin_lock(&rq->lock); | ||
3548 | clear_tsk_need_resched(prev); | ||
3494 | 3549 | ||
3495 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3550 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3496 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3551 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
@@ -3550,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void) | |||
3550 | if (likely(ti->preempt_count || irqs_disabled())) | 3605 | if (likely(ti->preempt_count || irqs_disabled())) |
3551 | return; | 3606 | return; |
3552 | 3607 | ||
3553 | need_resched: | 3608 | do { |
3554 | add_preempt_count(PREEMPT_ACTIVE); | 3609 | add_preempt_count(PREEMPT_ACTIVE); |
3555 | /* | 3610 | |
3556 | * We keep the big kernel semaphore locked, but we | 3611 | /* |
3557 | * clear ->lock_depth so that schedule() doesnt | 3612 | * We keep the big kernel semaphore locked, but we |
3558 | * auto-release the semaphore: | 3613 | * clear ->lock_depth so that schedule() doesnt |
3559 | */ | 3614 | * auto-release the semaphore: |
3615 | */ | ||
3560 | #ifdef CONFIG_PREEMPT_BKL | 3616 | #ifdef CONFIG_PREEMPT_BKL |
3561 | saved_lock_depth = task->lock_depth; | 3617 | saved_lock_depth = task->lock_depth; |
3562 | task->lock_depth = -1; | 3618 | task->lock_depth = -1; |
3563 | #endif | 3619 | #endif |
3564 | schedule(); | 3620 | schedule(); |
3565 | #ifdef CONFIG_PREEMPT_BKL | 3621 | #ifdef CONFIG_PREEMPT_BKL |
3566 | task->lock_depth = saved_lock_depth; | 3622 | task->lock_depth = saved_lock_depth; |
3567 | #endif | 3623 | #endif |
3568 | sub_preempt_count(PREEMPT_ACTIVE); | 3624 | sub_preempt_count(PREEMPT_ACTIVE); |
3569 | 3625 | ||
3570 | /* we could miss a preemption opportunity between schedule and now */ | 3626 | /* |
3571 | barrier(); | 3627 | * Check again in case we missed a preemption opportunity |
3572 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3628 | * between schedule and now. |
3573 | goto need_resched; | 3629 | */ |
3630 | barrier(); | ||
3631 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
3574 | } | 3632 | } |
3575 | EXPORT_SYMBOL(preempt_schedule); | 3633 | EXPORT_SYMBOL(preempt_schedule); |
3576 | 3634 | ||
@@ -3590,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3590 | /* Catch callers which need to be fixed */ | 3648 | /* Catch callers which need to be fixed */ |
3591 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3649 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3592 | 3650 | ||
3593 | need_resched: | 3651 | do { |
3594 | add_preempt_count(PREEMPT_ACTIVE); | 3652 | add_preempt_count(PREEMPT_ACTIVE); |
3595 | /* | 3653 | |
3596 | * We keep the big kernel semaphore locked, but we | 3654 | /* |
3597 | * clear ->lock_depth so that schedule() doesnt | 3655 | * We keep the big kernel semaphore locked, but we |
3598 | * auto-release the semaphore: | 3656 | * clear ->lock_depth so that schedule() doesnt |
3599 | */ | 3657 | * auto-release the semaphore: |
3658 | */ | ||
3600 | #ifdef CONFIG_PREEMPT_BKL | 3659 | #ifdef CONFIG_PREEMPT_BKL |
3601 | saved_lock_depth = task->lock_depth; | 3660 | saved_lock_depth = task->lock_depth; |
3602 | task->lock_depth = -1; | 3661 | task->lock_depth = -1; |
3603 | #endif | 3662 | #endif |
3604 | local_irq_enable(); | 3663 | local_irq_enable(); |
3605 | schedule(); | 3664 | schedule(); |
3606 | local_irq_disable(); | 3665 | local_irq_disable(); |
3607 | #ifdef CONFIG_PREEMPT_BKL | 3666 | #ifdef CONFIG_PREEMPT_BKL |
3608 | task->lock_depth = saved_lock_depth; | 3667 | task->lock_depth = saved_lock_depth; |
3609 | #endif | 3668 | #endif |
3610 | sub_preempt_count(PREEMPT_ACTIVE); | 3669 | sub_preempt_count(PREEMPT_ACTIVE); |
3611 | 3670 | ||
3612 | /* we could miss a preemption opportunity between schedule and now */ | 3671 | /* |
3613 | barrier(); | 3672 | * Check again in case we missed a preemption opportunity |
3614 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3673 | * between schedule and now. |
3615 | goto need_resched; | 3674 | */ |
3675 | barrier(); | ||
3676 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
3616 | } | 3677 | } |
3617 | 3678 | ||
3618 | #endif /* CONFIG_PREEMPT */ | 3679 | #endif /* CONFIG_PREEMPT */ |
@@ -3636,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function); | |||
3636 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3697 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
3637 | int nr_exclusive, int sync, void *key) | 3698 | int nr_exclusive, int sync, void *key) |
3638 | { | 3699 | { |
3639 | struct list_head *tmp, *next; | 3700 | wait_queue_t *curr, *next; |
3640 | 3701 | ||
3641 | list_for_each_safe(tmp, next, &q->task_list) { | 3702 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
3642 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | ||
3643 | unsigned flags = curr->flags; | 3703 | unsigned flags = curr->flags; |
3644 | 3704 | ||
3645 | if (curr->func(curr, mode, sync, key) && | 3705 | if (curr->func(curr, mode, sync, key) && |
@@ -3729,206 +3789,116 @@ void fastcall complete_all(struct completion *x) | |||
3729 | } | 3789 | } |
3730 | EXPORT_SYMBOL(complete_all); | 3790 | EXPORT_SYMBOL(complete_all); |
3731 | 3791 | ||
3732 | void fastcall __sched wait_for_completion(struct completion *x) | 3792 | static inline long __sched |
3733 | { | 3793 | do_wait_for_common(struct completion *x, long timeout, int state) |
3734 | might_sleep(); | ||
3735 | |||
3736 | spin_lock_irq(&x->wait.lock); | ||
3737 | if (!x->done) { | ||
3738 | DECLARE_WAITQUEUE(wait, current); | ||
3739 | |||
3740 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3741 | __add_wait_queue_tail(&x->wait, &wait); | ||
3742 | do { | ||
3743 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
3744 | spin_unlock_irq(&x->wait.lock); | ||
3745 | schedule(); | ||
3746 | spin_lock_irq(&x->wait.lock); | ||
3747 | } while (!x->done); | ||
3748 | __remove_wait_queue(&x->wait, &wait); | ||
3749 | } | ||
3750 | x->done--; | ||
3751 | spin_unlock_irq(&x->wait.lock); | ||
3752 | } | ||
3753 | EXPORT_SYMBOL(wait_for_completion); | ||
3754 | |||
3755 | unsigned long fastcall __sched | ||
3756 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
3757 | { | 3794 | { |
3758 | might_sleep(); | ||
3759 | |||
3760 | spin_lock_irq(&x->wait.lock); | ||
3761 | if (!x->done) { | 3795 | if (!x->done) { |
3762 | DECLARE_WAITQUEUE(wait, current); | 3796 | DECLARE_WAITQUEUE(wait, current); |
3763 | 3797 | ||
3764 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3798 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3765 | __add_wait_queue_tail(&x->wait, &wait); | 3799 | __add_wait_queue_tail(&x->wait, &wait); |
3766 | do { | 3800 | do { |
3767 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3801 | if (state == TASK_INTERRUPTIBLE && |
3802 | signal_pending(current)) { | ||
3803 | __remove_wait_queue(&x->wait, &wait); | ||
3804 | return -ERESTARTSYS; | ||
3805 | } | ||
3806 | __set_current_state(state); | ||
3768 | spin_unlock_irq(&x->wait.lock); | 3807 | spin_unlock_irq(&x->wait.lock); |
3769 | timeout = schedule_timeout(timeout); | 3808 | timeout = schedule_timeout(timeout); |
3770 | spin_lock_irq(&x->wait.lock); | 3809 | spin_lock_irq(&x->wait.lock); |
3771 | if (!timeout) { | 3810 | if (!timeout) { |
3772 | __remove_wait_queue(&x->wait, &wait); | 3811 | __remove_wait_queue(&x->wait, &wait); |
3773 | goto out; | 3812 | return timeout; |
3774 | } | 3813 | } |
3775 | } while (!x->done); | 3814 | } while (!x->done); |
3776 | __remove_wait_queue(&x->wait, &wait); | 3815 | __remove_wait_queue(&x->wait, &wait); |
3777 | } | 3816 | } |
3778 | x->done--; | 3817 | x->done--; |
3779 | out: | ||
3780 | spin_unlock_irq(&x->wait.lock); | ||
3781 | return timeout; | 3818 | return timeout; |
3782 | } | 3819 | } |
3783 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
3784 | 3820 | ||
3785 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3821 | static long __sched |
3822 | wait_for_common(struct completion *x, long timeout, int state) | ||
3786 | { | 3823 | { |
3787 | int ret = 0; | ||
3788 | |||
3789 | might_sleep(); | 3824 | might_sleep(); |
3790 | 3825 | ||
3791 | spin_lock_irq(&x->wait.lock); | 3826 | spin_lock_irq(&x->wait.lock); |
3792 | if (!x->done) { | 3827 | timeout = do_wait_for_common(x, timeout, state); |
3793 | DECLARE_WAITQUEUE(wait, current); | ||
3794 | |||
3795 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3796 | __add_wait_queue_tail(&x->wait, &wait); | ||
3797 | do { | ||
3798 | if (signal_pending(current)) { | ||
3799 | ret = -ERESTARTSYS; | ||
3800 | __remove_wait_queue(&x->wait, &wait); | ||
3801 | goto out; | ||
3802 | } | ||
3803 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3804 | spin_unlock_irq(&x->wait.lock); | ||
3805 | schedule(); | ||
3806 | spin_lock_irq(&x->wait.lock); | ||
3807 | } while (!x->done); | ||
3808 | __remove_wait_queue(&x->wait, &wait); | ||
3809 | } | ||
3810 | x->done--; | ||
3811 | out: | ||
3812 | spin_unlock_irq(&x->wait.lock); | 3828 | spin_unlock_irq(&x->wait.lock); |
3829 | return timeout; | ||
3830 | } | ||
3813 | 3831 | ||
3814 | return ret; | 3832 | void fastcall __sched wait_for_completion(struct completion *x) |
3833 | { | ||
3834 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
3815 | } | 3835 | } |
3816 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3836 | EXPORT_SYMBOL(wait_for_completion); |
3817 | 3837 | ||
3818 | unsigned long fastcall __sched | 3838 | unsigned long fastcall __sched |
3819 | wait_for_completion_interruptible_timeout(struct completion *x, | 3839 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
3820 | unsigned long timeout) | ||
3821 | { | 3840 | { |
3822 | might_sleep(); | 3841 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
3823 | |||
3824 | spin_lock_irq(&x->wait.lock); | ||
3825 | if (!x->done) { | ||
3826 | DECLARE_WAITQUEUE(wait, current); | ||
3827 | |||
3828 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3829 | __add_wait_queue_tail(&x->wait, &wait); | ||
3830 | do { | ||
3831 | if (signal_pending(current)) { | ||
3832 | timeout = -ERESTARTSYS; | ||
3833 | __remove_wait_queue(&x->wait, &wait); | ||
3834 | goto out; | ||
3835 | } | ||
3836 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3837 | spin_unlock_irq(&x->wait.lock); | ||
3838 | timeout = schedule_timeout(timeout); | ||
3839 | spin_lock_irq(&x->wait.lock); | ||
3840 | if (!timeout) { | ||
3841 | __remove_wait_queue(&x->wait, &wait); | ||
3842 | goto out; | ||
3843 | } | ||
3844 | } while (!x->done); | ||
3845 | __remove_wait_queue(&x->wait, &wait); | ||
3846 | } | ||
3847 | x->done--; | ||
3848 | out: | ||
3849 | spin_unlock_irq(&x->wait.lock); | ||
3850 | return timeout; | ||
3851 | } | 3842 | } |
3852 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3843 | EXPORT_SYMBOL(wait_for_completion_timeout); |
3853 | 3844 | ||
3854 | static inline void | 3845 | int __sched wait_for_completion_interruptible(struct completion *x) |
3855 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | ||
3856 | { | 3846 | { |
3857 | spin_lock_irqsave(&q->lock, *flags); | 3847 | return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
3858 | __add_wait_queue(q, wait); | ||
3859 | spin_unlock(&q->lock); | ||
3860 | } | 3848 | } |
3849 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
3861 | 3850 | ||
3862 | static inline void | 3851 | unsigned long fastcall __sched |
3863 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | 3852 | wait_for_completion_interruptible_timeout(struct completion *x, |
3853 | unsigned long timeout) | ||
3864 | { | 3854 | { |
3865 | spin_lock_irq(&q->lock); | 3855 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
3866 | __remove_wait_queue(q, wait); | ||
3867 | spin_unlock_irqrestore(&q->lock, *flags); | ||
3868 | } | 3856 | } |
3857 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
3869 | 3858 | ||
3870 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 3859 | static long __sched |
3860 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | ||
3871 | { | 3861 | { |
3872 | unsigned long flags; | 3862 | unsigned long flags; |
3873 | wait_queue_t wait; | 3863 | wait_queue_t wait; |
3874 | 3864 | ||
3875 | init_waitqueue_entry(&wait, current); | 3865 | init_waitqueue_entry(&wait, current); |
3876 | 3866 | ||
3877 | current->state = TASK_INTERRUPTIBLE; | 3867 | __set_current_state(state); |
3878 | 3868 | ||
3879 | sleep_on_head(q, &wait, &flags); | 3869 | spin_lock_irqsave(&q->lock, flags); |
3880 | schedule(); | 3870 | __add_wait_queue(q, &wait); |
3881 | sleep_on_tail(q, &wait, &flags); | 3871 | spin_unlock(&q->lock); |
3872 | timeout = schedule_timeout(timeout); | ||
3873 | spin_lock_irq(&q->lock); | ||
3874 | __remove_wait_queue(q, &wait); | ||
3875 | spin_unlock_irqrestore(&q->lock, flags); | ||
3876 | |||
3877 | return timeout; | ||
3878 | } | ||
3879 | |||
3880 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | ||
3881 | { | ||
3882 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
3882 | } | 3883 | } |
3883 | EXPORT_SYMBOL(interruptible_sleep_on); | 3884 | EXPORT_SYMBOL(interruptible_sleep_on); |
3884 | 3885 | ||
3885 | long __sched | 3886 | long __sched |
3886 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3887 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3887 | { | 3888 | { |
3888 | unsigned long flags; | 3889 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
3889 | wait_queue_t wait; | ||
3890 | |||
3891 | init_waitqueue_entry(&wait, current); | ||
3892 | |||
3893 | current->state = TASK_INTERRUPTIBLE; | ||
3894 | |||
3895 | sleep_on_head(q, &wait, &flags); | ||
3896 | timeout = schedule_timeout(timeout); | ||
3897 | sleep_on_tail(q, &wait, &flags); | ||
3898 | |||
3899 | return timeout; | ||
3900 | } | 3890 | } |
3901 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3891 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3902 | 3892 | ||
3903 | void __sched sleep_on(wait_queue_head_t *q) | 3893 | void __sched sleep_on(wait_queue_head_t *q) |
3904 | { | 3894 | { |
3905 | unsigned long flags; | 3895 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
3906 | wait_queue_t wait; | ||
3907 | |||
3908 | init_waitqueue_entry(&wait, current); | ||
3909 | |||
3910 | current->state = TASK_UNINTERRUPTIBLE; | ||
3911 | |||
3912 | sleep_on_head(q, &wait, &flags); | ||
3913 | schedule(); | ||
3914 | sleep_on_tail(q, &wait, &flags); | ||
3915 | } | 3896 | } |
3916 | EXPORT_SYMBOL(sleep_on); | 3897 | EXPORT_SYMBOL(sleep_on); |
3917 | 3898 | ||
3918 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3899 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3919 | { | 3900 | { |
3920 | unsigned long flags; | 3901 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
3921 | wait_queue_t wait; | ||
3922 | |||
3923 | init_waitqueue_entry(&wait, current); | ||
3924 | |||
3925 | current->state = TASK_UNINTERRUPTIBLE; | ||
3926 | |||
3927 | sleep_on_head(q, &wait, &flags); | ||
3928 | timeout = schedule_timeout(timeout); | ||
3929 | sleep_on_tail(q, &wait, &flags); | ||
3930 | |||
3931 | return timeout; | ||
3932 | } | 3902 | } |
3933 | EXPORT_SYMBOL(sleep_on_timeout); | 3903 | EXPORT_SYMBOL(sleep_on_timeout); |
3934 | 3904 | ||
@@ -3947,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
3947 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3917 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3948 | { | 3918 | { |
3949 | unsigned long flags; | 3919 | unsigned long flags; |
3950 | int oldprio, on_rq; | 3920 | int oldprio, on_rq, running; |
3951 | struct rq *rq; | 3921 | struct rq *rq; |
3952 | 3922 | ||
3953 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3923 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
@@ -3957,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3957 | 3927 | ||
3958 | oldprio = p->prio; | 3928 | oldprio = p->prio; |
3959 | on_rq = p->se.on_rq; | 3929 | on_rq = p->se.on_rq; |
3960 | if (on_rq) | 3930 | running = task_running(rq, p); |
3931 | if (on_rq) { | ||
3961 | dequeue_task(rq, p, 0); | 3932 | dequeue_task(rq, p, 0); |
3933 | if (running) | ||
3934 | p->sched_class->put_prev_task(rq, p); | ||
3935 | } | ||
3962 | 3936 | ||
3963 | if (rt_prio(prio)) | 3937 | if (rt_prio(prio)) |
3964 | p->sched_class = &rt_sched_class; | 3938 | p->sched_class = &rt_sched_class; |
@@ -3968,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3968 | p->prio = prio; | 3942 | p->prio = prio; |
3969 | 3943 | ||
3970 | if (on_rq) { | 3944 | if (on_rq) { |
3945 | if (running) | ||
3946 | p->sched_class->set_curr_task(rq); | ||
3971 | enqueue_task(rq, p, 0); | 3947 | enqueue_task(rq, p, 0); |
3972 | /* | 3948 | /* |
3973 | * Reschedule if we are currently running on this runqueue and | 3949 | * Reschedule if we are currently running on this runqueue and |
3974 | * our priority decreased, or if we are not currently running on | 3950 | * our priority decreased, or if we are not currently running on |
3975 | * this runqueue and our priority is higher than the current's | 3951 | * this runqueue and our priority is higher than the current's |
3976 | */ | 3952 | */ |
3977 | if (task_running(rq, p)) { | 3953 | if (running) { |
3978 | if (p->prio > oldprio) | 3954 | if (p->prio > oldprio) |
3979 | resched_task(rq->curr); | 3955 | resched_task(rq->curr); |
3980 | } else { | 3956 | } else { |
@@ -4138,7 +4114,7 @@ struct task_struct *idle_task(int cpu) | |||
4138 | * find_process_by_pid - find a process with a matching PID value. | 4114 | * find_process_by_pid - find a process with a matching PID value. |
4139 | * @pid: the pid in question. | 4115 | * @pid: the pid in question. |
4140 | */ | 4116 | */ |
4141 | static inline struct task_struct *find_process_by_pid(pid_t pid) | 4117 | static struct task_struct *find_process_by_pid(pid_t pid) |
4142 | { | 4118 | { |
4143 | return pid ? find_task_by_pid(pid) : current; | 4119 | return pid ? find_task_by_pid(pid) : current; |
4144 | } | 4120 | } |
@@ -4180,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
4180 | int sched_setscheduler(struct task_struct *p, int policy, | 4156 | int sched_setscheduler(struct task_struct *p, int policy, |
4181 | struct sched_param *param) | 4157 | struct sched_param *param) |
4182 | { | 4158 | { |
4183 | int retval, oldprio, oldpolicy = -1, on_rq; | 4159 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4184 | unsigned long flags; | 4160 | unsigned long flags; |
4185 | struct rq *rq; | 4161 | struct rq *rq; |
4186 | 4162 | ||
@@ -4262,18 +4238,26 @@ recheck: | |||
4262 | } | 4238 | } |
4263 | update_rq_clock(rq); | 4239 | update_rq_clock(rq); |
4264 | on_rq = p->se.on_rq; | 4240 | on_rq = p->se.on_rq; |
4265 | if (on_rq) | 4241 | running = task_running(rq, p); |
4242 | if (on_rq) { | ||
4266 | deactivate_task(rq, p, 0); | 4243 | deactivate_task(rq, p, 0); |
4244 | if (running) | ||
4245 | p->sched_class->put_prev_task(rq, p); | ||
4246 | } | ||
4247 | |||
4267 | oldprio = p->prio; | 4248 | oldprio = p->prio; |
4268 | __setscheduler(rq, p, policy, param->sched_priority); | 4249 | __setscheduler(rq, p, policy, param->sched_priority); |
4250 | |||
4269 | if (on_rq) { | 4251 | if (on_rq) { |
4252 | if (running) | ||
4253 | p->sched_class->set_curr_task(rq); | ||
4270 | activate_task(rq, p, 0); | 4254 | activate_task(rq, p, 0); |
4271 | /* | 4255 | /* |
4272 | * Reschedule if we are currently running on this runqueue and | 4256 | * Reschedule if we are currently running on this runqueue and |
4273 | * our priority decreased, or if we are not currently running on | 4257 | * our priority decreased, or if we are not currently running on |
4274 | * this runqueue and our priority is higher than the current's | 4258 | * this runqueue and our priority is higher than the current's |
4275 | */ | 4259 | */ |
4276 | if (task_running(rq, p)) { | 4260 | if (running) { |
4277 | if (p->prio > oldprio) | 4261 | if (p->prio > oldprio) |
4278 | resched_task(rq->curr); | 4262 | resched_task(rq->curr); |
4279 | } else { | 4263 | } else { |
@@ -4344,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
4344 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4328 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
4345 | { | 4329 | { |
4346 | struct task_struct *p; | 4330 | struct task_struct *p; |
4347 | int retval = -EINVAL; | 4331 | int retval; |
4348 | 4332 | ||
4349 | if (pid < 0) | 4333 | if (pid < 0) |
4350 | goto out_nounlock; | 4334 | return -EINVAL; |
4351 | 4335 | ||
4352 | retval = -ESRCH; | 4336 | retval = -ESRCH; |
4353 | read_lock(&tasklist_lock); | 4337 | read_lock(&tasklist_lock); |
@@ -4358,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) | |||
4358 | retval = p->policy; | 4342 | retval = p->policy; |
4359 | } | 4343 | } |
4360 | read_unlock(&tasklist_lock); | 4344 | read_unlock(&tasklist_lock); |
4361 | |||
4362 | out_nounlock: | ||
4363 | return retval; | 4345 | return retval; |
4364 | } | 4346 | } |
4365 | 4347 | ||
@@ -4372,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
4372 | { | 4354 | { |
4373 | struct sched_param lp; | 4355 | struct sched_param lp; |
4374 | struct task_struct *p; | 4356 | struct task_struct *p; |
4375 | int retval = -EINVAL; | 4357 | int retval; |
4376 | 4358 | ||
4377 | if (!param || pid < 0) | 4359 | if (!param || pid < 0) |
4378 | goto out_nounlock; | 4360 | return -EINVAL; |
4379 | 4361 | ||
4380 | read_lock(&tasklist_lock); | 4362 | read_lock(&tasklist_lock); |
4381 | p = find_process_by_pid(pid); | 4363 | p = find_process_by_pid(pid); |
@@ -4395,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
4395 | */ | 4377 | */ |
4396 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4378 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
4397 | 4379 | ||
4398 | out_nounlock: | ||
4399 | return retval; | 4380 | return retval; |
4400 | 4381 | ||
4401 | out_unlock: | 4382 | out_unlock: |
@@ -4555,8 +4536,8 @@ asmlinkage long sys_sched_yield(void) | |||
4555 | { | 4536 | { |
4556 | struct rq *rq = this_rq_lock(); | 4537 | struct rq *rq = this_rq_lock(); |
4557 | 4538 | ||
4558 | schedstat_inc(rq, yld_cnt); | 4539 | schedstat_inc(rq, yld_count); |
4559 | current->sched_class->yield_task(rq, current); | 4540 | current->sched_class->yield_task(rq); |
4560 | 4541 | ||
4561 | /* | 4542 | /* |
4562 | * Since we are going to call schedule() anyway, there's | 4543 | * Since we are going to call schedule() anyway, there's |
@@ -4750,11 +4731,12 @@ asmlinkage | |||
4750 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4731 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4751 | { | 4732 | { |
4752 | struct task_struct *p; | 4733 | struct task_struct *p; |
4753 | int retval = -EINVAL; | 4734 | unsigned int time_slice; |
4735 | int retval; | ||
4754 | struct timespec t; | 4736 | struct timespec t; |
4755 | 4737 | ||
4756 | if (pid < 0) | 4738 | if (pid < 0) |
4757 | goto out_nounlock; | 4739 | return -EINVAL; |
4758 | 4740 | ||
4759 | retval = -ESRCH; | 4741 | retval = -ESRCH; |
4760 | read_lock(&tasklist_lock); | 4742 | read_lock(&tasklist_lock); |
@@ -4766,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4766 | if (retval) | 4748 | if (retval) |
4767 | goto out_unlock; | 4749 | goto out_unlock; |
4768 | 4750 | ||
4769 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4751 | if (p->policy == SCHED_FIFO) |
4770 | 0 : static_prio_timeslice(p->static_prio), &t); | 4752 | time_slice = 0; |
4753 | else if (p->policy == SCHED_RR) | ||
4754 | time_slice = DEF_TIMESLICE; | ||
4755 | else { | ||
4756 | struct sched_entity *se = &p->se; | ||
4757 | unsigned long flags; | ||
4758 | struct rq *rq; | ||
4759 | |||
4760 | rq = task_rq_lock(p, &flags); | ||
4761 | time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); | ||
4762 | task_rq_unlock(rq, &flags); | ||
4763 | } | ||
4771 | read_unlock(&tasklist_lock); | 4764 | read_unlock(&tasklist_lock); |
4765 | jiffies_to_timespec(time_slice, &t); | ||
4772 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4766 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4773 | out_nounlock: | ||
4774 | return retval; | 4767 | return retval; |
4768 | |||
4775 | out_unlock: | 4769 | out_unlock: |
4776 | read_unlock(&tasklist_lock); | 4770 | read_unlock(&tasklist_lock); |
4777 | return retval; | 4771 | return retval; |
@@ -4900,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4900 | */ | 4894 | */ |
4901 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4895 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4902 | 4896 | ||
4903 | /* | ||
4904 | * Increase the granularity value when there are more CPUs, | ||
4905 | * because with more CPUs the 'effective latency' as visible | ||
4906 | * to users decreases. But the relationship is not linear, | ||
4907 | * so pick a second-best guess by going with the log2 of the | ||
4908 | * number of CPUs. | ||
4909 | * | ||
4910 | * This idea comes from the SD scheduler of Con Kolivas: | ||
4911 | */ | ||
4912 | static inline void sched_init_granularity(void) | ||
4913 | { | ||
4914 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
4915 | const unsigned long limit = 100000000; | ||
4916 | |||
4917 | sysctl_sched_min_granularity *= factor; | ||
4918 | if (sysctl_sched_min_granularity > limit) | ||
4919 | sysctl_sched_min_granularity = limit; | ||
4920 | |||
4921 | sysctl_sched_latency *= factor; | ||
4922 | if (sysctl_sched_latency > limit) | ||
4923 | sysctl_sched_latency = limit; | ||
4924 | |||
4925 | sysctl_sched_runtime_limit = sysctl_sched_latency; | ||
4926 | sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; | ||
4927 | } | ||
4928 | |||
4929 | #ifdef CONFIG_SMP | 4897 | #ifdef CONFIG_SMP |
4930 | /* | 4898 | /* |
4931 | * This is how migration works: | 4899 | * This is how migration works: |
@@ -5103,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5103 | struct rq *rq; | 5071 | struct rq *rq; |
5104 | int dest_cpu; | 5072 | int dest_cpu; |
5105 | 5073 | ||
5106 | restart: | 5074 | do { |
5107 | /* On same node? */ | 5075 | /* On same node? */ |
5108 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5076 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
5109 | cpus_and(mask, mask, p->cpus_allowed); | 5077 | cpus_and(mask, mask, p->cpus_allowed); |
5110 | dest_cpu = any_online_cpu(mask); | 5078 | dest_cpu = any_online_cpu(mask); |
5111 | 5079 | ||
5112 | /* On any allowed CPU? */ | 5080 | /* On any allowed CPU? */ |
5113 | if (dest_cpu == NR_CPUS) | 5081 | if (dest_cpu == NR_CPUS) |
5114 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5082 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5115 | 5083 | ||
5116 | /* No more Mr. Nice Guy. */ | 5084 | /* No more Mr. Nice Guy. */ |
5117 | if (dest_cpu == NR_CPUS) { | 5085 | if (dest_cpu == NR_CPUS) { |
5118 | rq = task_rq_lock(p, &flags); | 5086 | rq = task_rq_lock(p, &flags); |
5119 | cpus_setall(p->cpus_allowed); | 5087 | cpus_setall(p->cpus_allowed); |
5120 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5088 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5121 | task_rq_unlock(rq, &flags); | 5089 | task_rq_unlock(rq, &flags); |
5122 | 5090 | ||
5123 | /* | 5091 | /* |
5124 | * Don't tell them about moving exiting tasks or | 5092 | * Don't tell them about moving exiting tasks or |
5125 | * kernel threads (both mm NULL), since they never | 5093 | * kernel threads (both mm NULL), since they never |
5126 | * leave kernel. | 5094 | * leave kernel. |
5127 | */ | 5095 | */ |
5128 | if (p->mm && printk_ratelimit()) | 5096 | if (p->mm && printk_ratelimit()) |
5129 | printk(KERN_INFO "process %d (%s) no " | 5097 | printk(KERN_INFO "process %d (%s) no " |
5130 | "longer affine to cpu%d\n", | 5098 | "longer affine to cpu%d\n", |
5131 | p->pid, p->comm, dead_cpu); | 5099 | p->pid, p->comm, dead_cpu); |
5132 | } | 5100 | } |
5133 | if (!__migrate_task(p, dead_cpu, dest_cpu)) | 5101 | } while (!__migrate_task(p, dead_cpu, dest_cpu)); |
5134 | goto restart; | ||
5135 | } | 5102 | } |
5136 | 5103 | ||
5137 | /* | 5104 | /* |
@@ -5173,6 +5140,20 @@ static void migrate_live_tasks(int src_cpu) | |||
5173 | } | 5140 | } |
5174 | 5141 | ||
5175 | /* | 5142 | /* |
5143 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
5144 | */ | ||
5145 | static void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
5146 | { | ||
5147 | update_rq_clock(rq); | ||
5148 | |||
5149 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
5150 | rq->nr_uninterruptible--; | ||
5151 | |||
5152 | enqueue_task(rq, p, 0); | ||
5153 | inc_nr_running(p, rq); | ||
5154 | } | ||
5155 | |||
5156 | /* | ||
5176 | * Schedules idle task to be the next runnable task on current CPU. | 5157 | * Schedules idle task to be the next runnable task on current CPU. |
5177 | * It does so by boosting its priority to highest possible and adding it to | 5158 | * It does so by boosting its priority to highest possible and adding it to |
5178 | * the _front_ of the runqueue. Used by CPU offline code. | 5159 | * the _front_ of the runqueue. Used by CPU offline code. |
@@ -5284,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = { | |||
5284 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5265 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
5285 | { | 5266 | { |
5286 | struct ctl_table *entry = | 5267 | struct ctl_table *entry = |
5287 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); | 5268 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); |
5288 | |||
5289 | BUG_ON(!entry); | ||
5290 | memset(entry, 0, n * sizeof(struct ctl_table)); | ||
5291 | 5269 | ||
5292 | return entry; | 5270 | return entry; |
5293 | } | 5271 | } |
5294 | 5272 | ||
5273 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
5274 | { | ||
5275 | struct ctl_table *entry = *tablep; | ||
5276 | |||
5277 | for (entry = *tablep; entry->procname; entry++) | ||
5278 | if (entry->child) | ||
5279 | sd_free_ctl_entry(&entry->child); | ||
5280 | |||
5281 | kfree(*tablep); | ||
5282 | *tablep = NULL; | ||
5283 | } | ||
5284 | |||
5295 | static void | 5285 | static void |
5296 | set_table_entry(struct ctl_table *entry, | 5286 | set_table_entry(struct ctl_table *entry, |
5297 | const char *procname, void *data, int maxlen, | 5287 | const char *procname, void *data, int maxlen, |
@@ -5307,7 +5297,10 @@ set_table_entry(struct ctl_table *entry, | |||
5307 | static struct ctl_table * | 5297 | static struct ctl_table * |
5308 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 5298 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
5309 | { | 5299 | { |
5310 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 5300 | struct ctl_table *table = sd_alloc_ctl_entry(12); |
5301 | |||
5302 | if (table == NULL) | ||
5303 | return NULL; | ||
5311 | 5304 | ||
5312 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 5305 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
5313 | sizeof(long), 0644, proc_doulongvec_minmax); | 5306 | sizeof(long), 0644, proc_doulongvec_minmax); |
@@ -5327,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5327 | sizeof(int), 0644, proc_dointvec_minmax); | 5320 | sizeof(int), 0644, proc_dointvec_minmax); |
5328 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 5321 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
5329 | sizeof(int), 0644, proc_dointvec_minmax); | 5322 | sizeof(int), 0644, proc_dointvec_minmax); |
5330 | set_table_entry(&table[10], "cache_nice_tries", | 5323 | set_table_entry(&table[9], "cache_nice_tries", |
5331 | &sd->cache_nice_tries, | 5324 | &sd->cache_nice_tries, |
5332 | sizeof(int), 0644, proc_dointvec_minmax); | 5325 | sizeof(int), 0644, proc_dointvec_minmax); |
5333 | set_table_entry(&table[12], "flags", &sd->flags, | 5326 | set_table_entry(&table[10], "flags", &sd->flags, |
5334 | sizeof(int), 0644, proc_dointvec_minmax); | 5327 | sizeof(int), 0644, proc_dointvec_minmax); |
5328 | /* &table[11] is terminator */ | ||
5335 | 5329 | ||
5336 | return table; | 5330 | return table; |
5337 | } | 5331 | } |
@@ -5346,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
5346 | for_each_domain(cpu, sd) | 5340 | for_each_domain(cpu, sd) |
5347 | domain_num++; | 5341 | domain_num++; |
5348 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 5342 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
5343 | if (table == NULL) | ||
5344 | return NULL; | ||
5349 | 5345 | ||
5350 | i = 0; | 5346 | i = 0; |
5351 | for_each_domain(cpu, sd) { | 5347 | for_each_domain(cpu, sd) { |
@@ -5360,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
5360 | } | 5356 | } |
5361 | 5357 | ||
5362 | static struct ctl_table_header *sd_sysctl_header; | 5358 | static struct ctl_table_header *sd_sysctl_header; |
5363 | static void init_sched_domain_sysctl(void) | 5359 | static void register_sched_domain_sysctl(void) |
5364 | { | 5360 | { |
5365 | int i, cpu_num = num_online_cpus(); | 5361 | int i, cpu_num = num_online_cpus(); |
5366 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5362 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
5367 | char buf[32]; | 5363 | char buf[32]; |
5368 | 5364 | ||
5365 | if (entry == NULL) | ||
5366 | return; | ||
5367 | |||
5369 | sd_ctl_dir[0].child = entry; | 5368 | sd_ctl_dir[0].child = entry; |
5370 | 5369 | ||
5371 | for (i = 0; i < cpu_num; i++, entry++) { | 5370 | for_each_online_cpu(i) { |
5372 | snprintf(buf, 32, "cpu%d", i); | 5371 | snprintf(buf, 32, "cpu%d", i); |
5373 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5372 | entry->procname = kstrdup(buf, GFP_KERNEL); |
5374 | entry->mode = 0555; | 5373 | entry->mode = 0555; |
5375 | entry->child = sd_alloc_ctl_cpu_table(i); | 5374 | entry->child = sd_alloc_ctl_cpu_table(i); |
5375 | entry++; | ||
5376 | } | 5376 | } |
5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
5378 | } | 5378 | } |
5379 | |||
5380 | static void unregister_sched_domain_sysctl(void) | ||
5381 | { | ||
5382 | unregister_sysctl_table(sd_sysctl_header); | ||
5383 | sd_sysctl_header = NULL; | ||
5384 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
5385 | } | ||
5379 | #else | 5386 | #else |
5380 | static void init_sched_domain_sysctl(void) | 5387 | static void register_sched_domain_sysctl(void) |
5388 | { | ||
5389 | } | ||
5390 | static void unregister_sched_domain_sysctl(void) | ||
5381 | { | 5391 | { |
5382 | } | 5392 | } |
5383 | #endif | 5393 | #endif |
@@ -5499,8 +5509,7 @@ int __init migration_init(void) | |||
5499 | int nr_cpu_ids __read_mostly = NR_CPUS; | 5509 | int nr_cpu_ids __read_mostly = NR_CPUS; |
5500 | EXPORT_SYMBOL(nr_cpu_ids); | 5510 | EXPORT_SYMBOL(nr_cpu_ids); |
5501 | 5511 | ||
5502 | #undef SCHED_DOMAIN_DEBUG | 5512 | #ifdef CONFIG_SCHED_DEBUG |
5503 | #ifdef SCHED_DOMAIN_DEBUG | ||
5504 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 5513 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
5505 | { | 5514 | { |
5506 | int level = 0; | 5515 | int level = 0; |
@@ -5558,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5558 | printk("\n"); | 5567 | printk("\n"); |
5559 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5568 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5560 | "set\n"); | 5569 | "set\n"); |
5570 | break; | ||
5561 | } | 5571 | } |
5562 | 5572 | ||
5563 | if (!cpus_weight(group->cpumask)) { | 5573 | if (!cpus_weight(group->cpumask)) { |
5564 | printk("\n"); | 5574 | printk("\n"); |
5565 | printk(KERN_ERR "ERROR: empty group\n"); | 5575 | printk(KERN_ERR "ERROR: empty group\n"); |
5576 | break; | ||
5566 | } | 5577 | } |
5567 | 5578 | ||
5568 | if (cpus_intersects(groupmask, group->cpumask)) { | 5579 | if (cpus_intersects(groupmask, group->cpumask)) { |
5569 | printk("\n"); | 5580 | printk("\n"); |
5570 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5581 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5582 | break; | ||
5571 | } | 5583 | } |
5572 | 5584 | ||
5573 | cpus_or(groupmask, groupmask, group->cpumask); | 5585 | cpus_or(groupmask, groupmask, group->cpumask); |
@@ -5701,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str) | |||
5701 | return 1; | 5713 | return 1; |
5702 | } | 5714 | } |
5703 | 5715 | ||
5704 | __setup ("isolcpus=", isolated_cpu_setup); | 5716 | __setup("isolcpus=", isolated_cpu_setup); |
5705 | 5717 | ||
5706 | /* | 5718 | /* |
5707 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 5719 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
@@ -5930,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
5930 | 5942 | ||
5931 | if (!sg) | 5943 | if (!sg) |
5932 | return; | 5944 | return; |
5933 | next_sg: | 5945 | do { |
5934 | for_each_cpu_mask(j, sg->cpumask) { | 5946 | for_each_cpu_mask(j, sg->cpumask) { |
5935 | struct sched_domain *sd; | 5947 | struct sched_domain *sd; |
5936 | 5948 | ||
5937 | sd = &per_cpu(phys_domains, j); | 5949 | sd = &per_cpu(phys_domains, j); |
5938 | if (j != first_cpu(sd->groups->cpumask)) { | 5950 | if (j != first_cpu(sd->groups->cpumask)) { |
5939 | /* | 5951 | /* |
5940 | * Only add "power" once for each | 5952 | * Only add "power" once for each |
5941 | * physical package. | 5953 | * physical package. |
5942 | */ | 5954 | */ |
5943 | continue; | 5955 | continue; |
5944 | } | 5956 | } |
5945 | 5957 | ||
5946 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 5958 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
5947 | } | 5959 | } |
5948 | sg = sg->next; | 5960 | sg = sg->next; |
5949 | if (sg != group_head) | 5961 | } while (sg != group_head); |
5950 | goto next_sg; | ||
5951 | } | 5962 | } |
5952 | #endif | 5963 | #endif |
5953 | 5964 | ||
@@ -6058,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6058 | /* | 6069 | /* |
6059 | * Allocate the per-node list of sched groups | 6070 | * Allocate the per-node list of sched groups |
6060 | */ | 6071 | */ |
6061 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, | 6072 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), |
6062 | GFP_KERNEL); | 6073 | GFP_KERNEL); |
6063 | if (!sched_group_nodes) { | 6074 | if (!sched_group_nodes) { |
6064 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6075 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -6311,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
6311 | 6322 | ||
6312 | err = build_sched_domains(&cpu_default_map); | 6323 | err = build_sched_domains(&cpu_default_map); |
6313 | 6324 | ||
6325 | register_sched_domain_sysctl(); | ||
6326 | |||
6314 | return err; | 6327 | return err; |
6315 | } | 6328 | } |
6316 | 6329 | ||
@@ -6327,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6327 | { | 6340 | { |
6328 | int i; | 6341 | int i; |
6329 | 6342 | ||
6343 | unregister_sched_domain_sysctl(); | ||
6344 | |||
6330 | for_each_cpu_mask(i, *cpu_map) | 6345 | for_each_cpu_mask(i, *cpu_map) |
6331 | cpu_attach_domain(NULL, i); | 6346 | cpu_attach_domain(NULL, i); |
6332 | synchronize_sched(); | 6347 | synchronize_sched(); |
@@ -6357,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6357 | if (!err && !cpus_empty(*partition2)) | 6372 | if (!err && !cpus_empty(*partition2)) |
6358 | err = build_sched_domains(partition2); | 6373 | err = build_sched_domains(partition2); |
6359 | 6374 | ||
6375 | register_sched_domain_sysctl(); | ||
6376 | |||
6360 | return err; | 6377 | return err; |
6361 | } | 6378 | } |
6362 | 6379 | ||
@@ -6488,17 +6505,13 @@ void __init sched_init_smp(void) | |||
6488 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6505 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6489 | hotcpu_notifier(update_sched_domains, 0); | 6506 | hotcpu_notifier(update_sched_domains, 0); |
6490 | 6507 | ||
6491 | init_sched_domain_sysctl(); | ||
6492 | |||
6493 | /* Move init over to a non-isolated CPU */ | 6508 | /* Move init over to a non-isolated CPU */ |
6494 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6509 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6495 | BUG(); | 6510 | BUG(); |
6496 | sched_init_granularity(); | ||
6497 | } | 6511 | } |
6498 | #else | 6512 | #else |
6499 | void __init sched_init_smp(void) | 6513 | void __init sched_init_smp(void) |
6500 | { | 6514 | { |
6501 | sched_init_granularity(); | ||
6502 | } | 6515 | } |
6503 | #endif /* CONFIG_SMP */ | 6516 | #endif /* CONFIG_SMP */ |
6504 | 6517 | ||
@@ -6512,28 +6525,20 @@ int in_sched_functions(unsigned long addr) | |||
6512 | && addr < (unsigned long)__sched_text_end); | 6525 | && addr < (unsigned long)__sched_text_end); |
6513 | } | 6526 | } |
6514 | 6527 | ||
6515 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 6528 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
6516 | { | 6529 | { |
6517 | cfs_rq->tasks_timeline = RB_ROOT; | 6530 | cfs_rq->tasks_timeline = RB_ROOT; |
6518 | cfs_rq->fair_clock = 1; | ||
6519 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6531 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6520 | cfs_rq->rq = rq; | 6532 | cfs_rq->rq = rq; |
6521 | #endif | 6533 | #endif |
6534 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
6522 | } | 6535 | } |
6523 | 6536 | ||
6524 | void __init sched_init(void) | 6537 | void __init sched_init(void) |
6525 | { | 6538 | { |
6526 | u64 now = sched_clock(); | ||
6527 | int highest_cpu = 0; | 6539 | int highest_cpu = 0; |
6528 | int i, j; | 6540 | int i, j; |
6529 | 6541 | ||
6530 | /* | ||
6531 | * Link up the scheduling class hierarchy: | ||
6532 | */ | ||
6533 | rt_sched_class.next = &fair_sched_class; | ||
6534 | fair_sched_class.next = &idle_sched_class; | ||
6535 | idle_sched_class.next = NULL; | ||
6536 | |||
6537 | for_each_possible_cpu(i) { | 6542 | for_each_possible_cpu(i) { |
6538 | struct rt_prio_array *array; | 6543 | struct rt_prio_array *array; |
6539 | struct rq *rq; | 6544 | struct rq *rq; |
@@ -6546,10 +6551,28 @@ void __init sched_init(void) | |||
6546 | init_cfs_rq(&rq->cfs, rq); | 6551 | init_cfs_rq(&rq->cfs, rq); |
6547 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6552 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6548 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6553 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
6549 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 6554 | { |
6555 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6556 | struct sched_entity *se = | ||
6557 | &per_cpu(init_sched_entity, i); | ||
6558 | |||
6559 | init_cfs_rq_p[i] = cfs_rq; | ||
6560 | init_cfs_rq(cfs_rq, rq); | ||
6561 | cfs_rq->tg = &init_task_group; | ||
6562 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6563 | &rq->leaf_cfs_rq_list); | ||
6564 | |||
6565 | init_sched_entity_p[i] = se; | ||
6566 | se->cfs_rq = &rq->cfs; | ||
6567 | se->my_q = cfs_rq; | ||
6568 | se->load.weight = init_task_group_load; | ||
6569 | se->load.inv_weight = | ||
6570 | div64_64(1ULL<<32, init_task_group_load); | ||
6571 | se->parent = NULL; | ||
6572 | } | ||
6573 | init_task_group.shares = init_task_group_load; | ||
6574 | spin_lock_init(&init_task_group.lock); | ||
6550 | #endif | 6575 | #endif |
6551 | rq->ls.load_update_last = now; | ||
6552 | rq->ls.load_update_start = now; | ||
6553 | 6576 | ||
6554 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 6577 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6555 | rq->cpu_load[j] = 0; | 6578 | rq->cpu_load[j] = 0; |
@@ -6634,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep); | |||
6634 | #endif | 6657 | #endif |
6635 | 6658 | ||
6636 | #ifdef CONFIG_MAGIC_SYSRQ | 6659 | #ifdef CONFIG_MAGIC_SYSRQ |
6660 | static void normalize_task(struct rq *rq, struct task_struct *p) | ||
6661 | { | ||
6662 | int on_rq; | ||
6663 | update_rq_clock(rq); | ||
6664 | on_rq = p->se.on_rq; | ||
6665 | if (on_rq) | ||
6666 | deactivate_task(rq, p, 0); | ||
6667 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
6668 | if (on_rq) { | ||
6669 | activate_task(rq, p, 0); | ||
6670 | resched_task(rq->curr); | ||
6671 | } | ||
6672 | } | ||
6673 | |||
6637 | void normalize_rt_tasks(void) | 6674 | void normalize_rt_tasks(void) |
6638 | { | 6675 | { |
6639 | struct task_struct *g, *p; | 6676 | struct task_struct *g, *p; |
6640 | unsigned long flags; | 6677 | unsigned long flags; |
6641 | struct rq *rq; | 6678 | struct rq *rq; |
6642 | int on_rq; | ||
6643 | 6679 | ||
6644 | read_lock_irq(&tasklist_lock); | 6680 | read_lock_irq(&tasklist_lock); |
6645 | do_each_thread(g, p) { | 6681 | do_each_thread(g, p) { |
6646 | p->se.fair_key = 0; | 6682 | /* |
6647 | p->se.wait_runtime = 0; | 6683 | * Only normalize user tasks: |
6684 | */ | ||
6685 | if (!p->mm) | ||
6686 | continue; | ||
6687 | |||
6648 | p->se.exec_start = 0; | 6688 | p->se.exec_start = 0; |
6649 | p->se.wait_start_fair = 0; | ||
6650 | p->se.sleep_start_fair = 0; | ||
6651 | #ifdef CONFIG_SCHEDSTATS | 6689 | #ifdef CONFIG_SCHEDSTATS |
6652 | p->se.wait_start = 0; | 6690 | p->se.wait_start = 0; |
6653 | p->se.sleep_start = 0; | 6691 | p->se.sleep_start = 0; |
6654 | p->se.block_start = 0; | 6692 | p->se.block_start = 0; |
6655 | #endif | 6693 | #endif |
6656 | task_rq(p)->cfs.fair_clock = 0; | ||
6657 | task_rq(p)->clock = 0; | 6694 | task_rq(p)->clock = 0; |
6658 | 6695 | ||
6659 | if (!rt_task(p)) { | 6696 | if (!rt_task(p)) { |
@@ -6668,26 +6705,9 @@ void normalize_rt_tasks(void) | |||
6668 | 6705 | ||
6669 | spin_lock_irqsave(&p->pi_lock, flags); | 6706 | spin_lock_irqsave(&p->pi_lock, flags); |
6670 | rq = __task_rq_lock(p); | 6707 | rq = __task_rq_lock(p); |
6671 | #ifdef CONFIG_SMP | ||
6672 | /* | ||
6673 | * Do not touch the migration thread: | ||
6674 | */ | ||
6675 | if (p == rq->migration_thread) | ||
6676 | goto out_unlock; | ||
6677 | #endif | ||
6678 | 6708 | ||
6679 | update_rq_clock(rq); | 6709 | normalize_task(rq, p); |
6680 | on_rq = p->se.on_rq; | 6710 | |
6681 | if (on_rq) | ||
6682 | deactivate_task(rq, p, 0); | ||
6683 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
6684 | if (on_rq) { | ||
6685 | activate_task(rq, p, 0); | ||
6686 | resched_task(rq->curr); | ||
6687 | } | ||
6688 | #ifdef CONFIG_SMP | ||
6689 | out_unlock: | ||
6690 | #endif | ||
6691 | __task_rq_unlock(rq); | 6711 | __task_rq_unlock(rq); |
6692 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6712 | spin_unlock_irqrestore(&p->pi_lock, flags); |
6693 | } while_each_thread(g, p); | 6713 | } while_each_thread(g, p); |
@@ -6740,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6740 | } | 6760 | } |
6741 | 6761 | ||
6742 | #endif | 6762 | #endif |
6763 | |||
6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6765 | |||
6766 | /* allocate runqueue etc for a new task group */ | ||
6767 | struct task_group *sched_create_group(void) | ||
6768 | { | ||
6769 | struct task_group *tg; | ||
6770 | struct cfs_rq *cfs_rq; | ||
6771 | struct sched_entity *se; | ||
6772 | struct rq *rq; | ||
6773 | int i; | ||
6774 | |||
6775 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | ||
6776 | if (!tg) | ||
6777 | return ERR_PTR(-ENOMEM); | ||
6778 | |||
6779 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | ||
6780 | if (!tg->cfs_rq) | ||
6781 | goto err; | ||
6782 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | ||
6783 | if (!tg->se) | ||
6784 | goto err; | ||
6785 | |||
6786 | for_each_possible_cpu(i) { | ||
6787 | rq = cpu_rq(i); | ||
6788 | |||
6789 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | ||
6790 | cpu_to_node(i)); | ||
6791 | if (!cfs_rq) | ||
6792 | goto err; | ||
6793 | |||
6794 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | ||
6795 | cpu_to_node(i)); | ||
6796 | if (!se) | ||
6797 | goto err; | ||
6798 | |||
6799 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | ||
6800 | memset(se, 0, sizeof(struct sched_entity)); | ||
6801 | |||
6802 | tg->cfs_rq[i] = cfs_rq; | ||
6803 | init_cfs_rq(cfs_rq, rq); | ||
6804 | cfs_rq->tg = tg; | ||
6805 | |||
6806 | tg->se[i] = se; | ||
6807 | se->cfs_rq = &rq->cfs; | ||
6808 | se->my_q = cfs_rq; | ||
6809 | se->load.weight = NICE_0_LOAD; | ||
6810 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
6811 | se->parent = NULL; | ||
6812 | } | ||
6813 | |||
6814 | for_each_possible_cpu(i) { | ||
6815 | rq = cpu_rq(i); | ||
6816 | cfs_rq = tg->cfs_rq[i]; | ||
6817 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
6818 | } | ||
6819 | |||
6820 | tg->shares = NICE_0_LOAD; | ||
6821 | spin_lock_init(&tg->lock); | ||
6822 | |||
6823 | return tg; | ||
6824 | |||
6825 | err: | ||
6826 | for_each_possible_cpu(i) { | ||
6827 | if (tg->cfs_rq) | ||
6828 | kfree(tg->cfs_rq[i]); | ||
6829 | if (tg->se) | ||
6830 | kfree(tg->se[i]); | ||
6831 | } | ||
6832 | kfree(tg->cfs_rq); | ||
6833 | kfree(tg->se); | ||
6834 | kfree(tg); | ||
6835 | |||
6836 | return ERR_PTR(-ENOMEM); | ||
6837 | } | ||
6838 | |||
6839 | /* rcu callback to free various structures associated with a task group */ | ||
6840 | static void free_sched_group(struct rcu_head *rhp) | ||
6841 | { | ||
6842 | struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); | ||
6843 | struct task_group *tg = cfs_rq->tg; | ||
6844 | struct sched_entity *se; | ||
6845 | int i; | ||
6846 | |||
6847 | /* now it should be safe to free those cfs_rqs */ | ||
6848 | for_each_possible_cpu(i) { | ||
6849 | cfs_rq = tg->cfs_rq[i]; | ||
6850 | kfree(cfs_rq); | ||
6851 | |||
6852 | se = tg->se[i]; | ||
6853 | kfree(se); | ||
6854 | } | ||
6855 | |||
6856 | kfree(tg->cfs_rq); | ||
6857 | kfree(tg->se); | ||
6858 | kfree(tg); | ||
6859 | } | ||
6860 | |||
6861 | /* Destroy runqueue etc associated with a task group */ | ||
6862 | void sched_destroy_group(struct task_group *tg) | ||
6863 | { | ||
6864 | struct cfs_rq *cfs_rq; | ||
6865 | int i; | ||
6866 | |||
6867 | for_each_possible_cpu(i) { | ||
6868 | cfs_rq = tg->cfs_rq[i]; | ||
6869 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
6870 | } | ||
6871 | |||
6872 | cfs_rq = tg->cfs_rq[0]; | ||
6873 | |||
6874 | /* wait for possible concurrent references to cfs_rqs complete */ | ||
6875 | call_rcu(&cfs_rq->rcu, free_sched_group); | ||
6876 | } | ||
6877 | |||
6878 | /* change task's runqueue when it moves between groups. | ||
6879 | * The caller of this function should have put the task in its new group | ||
6880 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
6881 | * reflect its new group. | ||
6882 | */ | ||
6883 | void sched_move_task(struct task_struct *tsk) | ||
6884 | { | ||
6885 | int on_rq, running; | ||
6886 | unsigned long flags; | ||
6887 | struct rq *rq; | ||
6888 | |||
6889 | rq = task_rq_lock(tsk, &flags); | ||
6890 | |||
6891 | if (tsk->sched_class != &fair_sched_class) | ||
6892 | goto done; | ||
6893 | |||
6894 | update_rq_clock(rq); | ||
6895 | |||
6896 | running = task_running(rq, tsk); | ||
6897 | on_rq = tsk->se.on_rq; | ||
6898 | |||
6899 | if (on_rq) { | ||
6900 | dequeue_task(rq, tsk, 0); | ||
6901 | if (unlikely(running)) | ||
6902 | tsk->sched_class->put_prev_task(rq, tsk); | ||
6903 | } | ||
6904 | |||
6905 | set_task_cfs_rq(tsk); | ||
6906 | |||
6907 | if (on_rq) { | ||
6908 | if (unlikely(running)) | ||
6909 | tsk->sched_class->set_curr_task(rq); | ||
6910 | enqueue_task(rq, tsk, 0); | ||
6911 | } | ||
6912 | |||
6913 | done: | ||
6914 | task_rq_unlock(rq, &flags); | ||
6915 | } | ||
6916 | |||
6917 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
6918 | { | ||
6919 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
6920 | struct rq *rq = cfs_rq->rq; | ||
6921 | int on_rq; | ||
6922 | |||
6923 | spin_lock_irq(&rq->lock); | ||
6924 | |||
6925 | on_rq = se->on_rq; | ||
6926 | if (on_rq) | ||
6927 | dequeue_entity(cfs_rq, se, 0); | ||
6928 | |||
6929 | se->load.weight = shares; | ||
6930 | se->load.inv_weight = div64_64((1ULL<<32), shares); | ||
6931 | |||
6932 | if (on_rq) | ||
6933 | enqueue_entity(cfs_rq, se, 0); | ||
6934 | |||
6935 | spin_unlock_irq(&rq->lock); | ||
6936 | } | ||
6937 | |||
6938 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
6939 | { | ||
6940 | int i; | ||
6941 | |||
6942 | spin_lock(&tg->lock); | ||
6943 | if (tg->shares == shares) | ||
6944 | goto done; | ||
6945 | |||
6946 | tg->shares = shares; | ||
6947 | for_each_possible_cpu(i) | ||
6948 | set_se_shares(tg->se[i], shares); | ||
6949 | |||
6950 | done: | ||
6951 | spin_unlock(&tg->lock); | ||
6952 | return 0; | ||
6953 | } | ||
6954 | |||
6955 | unsigned long sched_group_shares(struct task_group *tg) | ||
6956 | { | ||
6957 | return tg->shares; | ||
6958 | } | ||
6959 | |||
6960 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index c3ee38bd3426..a5e517ec07c3 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -28,6 +28,31 @@ | |||
28 | printk(x); \ | 28 | printk(x); \ |
29 | } while (0) | 29 | } while (0) |
30 | 30 | ||
31 | /* | ||
32 | * Ease the printing of nsec fields: | ||
33 | */ | ||
34 | static long long nsec_high(long long nsec) | ||
35 | { | ||
36 | if (nsec < 0) { | ||
37 | nsec = -nsec; | ||
38 | do_div(nsec, 1000000); | ||
39 | return -nsec; | ||
40 | } | ||
41 | do_div(nsec, 1000000); | ||
42 | |||
43 | return nsec; | ||
44 | } | ||
45 | |||
46 | static unsigned long nsec_low(long long nsec) | ||
47 | { | ||
48 | if (nsec < 0) | ||
49 | nsec = -nsec; | ||
50 | |||
51 | return do_div(nsec, 1000000); | ||
52 | } | ||
53 | |||
54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | ||
55 | |||
31 | static void | 56 | static void |
32 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 57 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
33 | { | 58 | { |
@@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
36 | else | 61 | else |
37 | SEQ_printf(m, " "); | 62 | SEQ_printf(m, " "); |
38 | 63 | ||
39 | SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", | 64 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", |
40 | p->comm, p->pid, | 65 | p->comm, p->pid, |
41 | (long long)p->se.fair_key, | 66 | SPLIT_NS(p->se.vruntime), |
42 | (long long)(p->se.fair_key - rq->cfs.fair_clock), | ||
43 | (long long)p->se.wait_runtime, | ||
44 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
45 | p->prio); | 68 | p->prio); |
46 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
47 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", |
48 | (long long)p->se.sum_exec_runtime, | 71 | SPLIT_NS(p->se.vruntime), |
49 | (long long)p->se.sum_wait_runtime, | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
50 | (long long)p->se.sum_sleep_runtime, | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
51 | (long long)p->se.wait_runtime_overruns, | ||
52 | (long long)p->se.wait_runtime_underruns); | ||
53 | #else | 74 | #else |
54 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", |
55 | 0LL, 0LL, 0LL, 0LL, 0LL); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
56 | #endif | 77 | #endif |
57 | } | 78 | } |
58 | 79 | ||
@@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
62 | 83 | ||
63 | SEQ_printf(m, | 84 | SEQ_printf(m, |
64 | "\nrunnable tasks:\n" | 85 | "\nrunnable tasks:\n" |
65 | " task PID tree-key delta waiting" | 86 | " task PID tree-key switches prio" |
66 | " switches prio" | 87 | " exec-runtime sum-exec sum-sleep\n" |
67 | " sum-exec sum-wait sum-sleep" | 88 | "------------------------------------------------------" |
68 | " wait-overrun wait-underrun\n" | 89 | "----------------------------------------------------\n"); |
69 | "------------------------------------------------------------------" | ||
70 | "----------------" | ||
71 | "------------------------------------------------" | ||
72 | "--------------------------------\n"); | ||
73 | 90 | ||
74 | read_lock_irq(&tasklist_lock); | 91 | read_lock_irq(&tasklist_lock); |
75 | 92 | ||
@@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
83 | read_unlock_irq(&tasklist_lock); | 100 | read_unlock_irq(&tasklist_lock); |
84 | } | 101 | } |
85 | 102 | ||
86 | static void | 103 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
87 | print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
88 | { | 104 | { |
89 | s64 wait_runtime_rq_sum = 0; | 105 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
90 | struct task_struct *p; | 106 | spread, rq0_min_vruntime, spread0; |
91 | struct rb_node *curr; | ||
92 | unsigned long flags; | ||
93 | struct rq *rq = &per_cpu(runqueues, cpu); | 107 | struct rq *rq = &per_cpu(runqueues, cpu); |
108 | struct sched_entity *last; | ||
109 | unsigned long flags; | ||
94 | 110 | ||
95 | spin_lock_irqsave(&rq->lock, flags); | ||
96 | curr = first_fair(cfs_rq); | ||
97 | while (curr) { | ||
98 | p = rb_entry(curr, struct task_struct, se.run_node); | ||
99 | wait_runtime_rq_sum += p->se.wait_runtime; | ||
100 | |||
101 | curr = rb_next(curr); | ||
102 | } | ||
103 | spin_unlock_irqrestore(&rq->lock, flags); | ||
104 | |||
105 | SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", | ||
106 | (long long)wait_runtime_rq_sum); | ||
107 | } | ||
108 | |||
109 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
110 | { | ||
111 | SEQ_printf(m, "\ncfs_rq\n"); | 111 | SEQ_printf(m, "\ncfs_rq\n"); |
112 | 112 | ||
113 | #define P(x) \ | 113 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
114 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) | 114 | SPLIT_NS(cfs_rq->exec_clock)); |
115 | |||
116 | P(fair_clock); | ||
117 | P(exec_clock); | ||
118 | P(wait_runtime); | ||
119 | P(wait_runtime_overruns); | ||
120 | P(wait_runtime_underruns); | ||
121 | P(sleeper_bonus); | ||
122 | #undef P | ||
123 | 115 | ||
124 | print_cfs_rq_runtime_sum(m, cpu, cfs_rq); | 116 | spin_lock_irqsave(&rq->lock, flags); |
117 | if (cfs_rq->rb_leftmost) | ||
118 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | ||
119 | last = __pick_last_entity(cfs_rq); | ||
120 | if (last) | ||
121 | max_vruntime = last->vruntime; | ||
122 | min_vruntime = rq->cfs.min_vruntime; | ||
123 | rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; | ||
124 | spin_unlock_irqrestore(&rq->lock, flags); | ||
125 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", | ||
126 | SPLIT_NS(MIN_vruntime)); | ||
127 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", | ||
128 | SPLIT_NS(min_vruntime)); | ||
129 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", | ||
130 | SPLIT_NS(max_vruntime)); | ||
131 | spread = max_vruntime - MIN_vruntime; | ||
132 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", | ||
133 | SPLIT_NS(spread)); | ||
134 | spread0 = min_vruntime - rq0_min_vruntime; | ||
135 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | ||
136 | SPLIT_NS(spread0)); | ||
137 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
138 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
139 | #ifdef CONFIG_SCHEDSTATS | ||
140 | SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", | ||
141 | rq->bkl_count); | ||
142 | #endif | ||
143 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | ||
144 | cfs_rq->nr_spread_over); | ||
125 | } | 145 | } |
126 | 146 | ||
127 | static void print_cpu(struct seq_file *m, int cpu) | 147 | static void print_cpu(struct seq_file *m, int cpu) |
@@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
141 | 161 | ||
142 | #define P(x) \ | 162 | #define P(x) \ |
143 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 163 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) |
164 | #define PN(x) \ | ||
165 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | ||
144 | 166 | ||
145 | P(nr_running); | 167 | P(nr_running); |
146 | SEQ_printf(m, " .%-30s: %lu\n", "load", | 168 | SEQ_printf(m, " .%-30s: %lu\n", "load", |
147 | rq->ls.load.weight); | 169 | rq->load.weight); |
148 | P(ls.delta_fair); | ||
149 | P(ls.delta_exec); | ||
150 | P(nr_switches); | 170 | P(nr_switches); |
151 | P(nr_load_updates); | 171 | P(nr_load_updates); |
152 | P(nr_uninterruptible); | 172 | P(nr_uninterruptible); |
153 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); | 173 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); |
154 | P(next_balance); | 174 | PN(next_balance); |
155 | P(curr->pid); | 175 | P(curr->pid); |
156 | P(clock); | 176 | PN(clock); |
157 | P(idle_clock); | 177 | PN(idle_clock); |
158 | P(prev_clock_raw); | 178 | PN(prev_clock_raw); |
159 | P(clock_warps); | 179 | P(clock_warps); |
160 | P(clock_overflows); | 180 | P(clock_overflows); |
161 | P(clock_deep_idle_events); | 181 | P(clock_deep_idle_events); |
162 | P(clock_max_delta); | 182 | PN(clock_max_delta); |
163 | P(cpu_load[0]); | 183 | P(cpu_load[0]); |
164 | P(cpu_load[1]); | 184 | P(cpu_load[1]); |
165 | P(cpu_load[2]); | 185 | P(cpu_load[2]); |
166 | P(cpu_load[3]); | 186 | P(cpu_load[3]); |
167 | P(cpu_load[4]); | 187 | P(cpu_load[4]); |
168 | #undef P | 188 | #undef P |
189 | #undef PN | ||
169 | 190 | ||
170 | print_cfs_stats(m, cpu); | 191 | print_cfs_stats(m, cpu); |
171 | 192 | ||
@@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
177 | u64 now = ktime_to_ns(ktime_get()); | 198 | u64 now = ktime_to_ns(ktime_get()); |
178 | int cpu; | 199 | int cpu; |
179 | 200 | ||
180 | SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", | 201 | SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", |
181 | init_utsname()->release, | 202 | init_utsname()->release, |
182 | (int)strcspn(init_utsname()->version, " "), | 203 | (int)strcspn(init_utsname()->version, " "), |
183 | init_utsname()->version); | 204 | init_utsname()->version); |
184 | 205 | ||
185 | SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); | 206 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); |
207 | |||
208 | #define P(x) \ | ||
209 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | ||
210 | #define PN(x) \ | ||
211 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
212 | PN(sysctl_sched_latency); | ||
213 | PN(sysctl_sched_nr_latency); | ||
214 | PN(sysctl_sched_wakeup_granularity); | ||
215 | PN(sysctl_sched_batch_wakeup_granularity); | ||
216 | PN(sysctl_sched_child_runs_first); | ||
217 | P(sysctl_sched_features); | ||
218 | #undef PN | ||
219 | #undef P | ||
186 | 220 | ||
187 | for_each_online_cpu(cpu) | 221 | for_each_online_cpu(cpu) |
188 | print_cpu(m, cpu); | 222 | print_cpu(m, cpu); |
@@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp) | |||
202 | return single_open(filp, sched_debug_show, NULL); | 236 | return single_open(filp, sched_debug_show, NULL); |
203 | } | 237 | } |
204 | 238 | ||
205 | static struct file_operations sched_debug_fops = { | 239 | static const struct file_operations sched_debug_fops = { |
206 | .open = sched_debug_open, | 240 | .open = sched_debug_open, |
207 | .read = seq_read, | 241 | .read = seq_read, |
208 | .llseek = seq_lseek, | 242 | .llseek = seq_lseek, |
@@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs); | |||
226 | 260 | ||
227 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 261 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
228 | { | 262 | { |
263 | unsigned long nr_switches; | ||
229 | unsigned long flags; | 264 | unsigned long flags; |
230 | int num_threads = 1; | 265 | int num_threads = 1; |
231 | 266 | ||
@@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
237 | rcu_read_unlock(); | 272 | rcu_read_unlock(); |
238 | 273 | ||
239 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); | 274 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); |
240 | SEQ_printf(m, "----------------------------------------------\n"); | 275 | SEQ_printf(m, |
276 | "---------------------------------------------------------\n"); | ||
277 | #define __P(F) \ | ||
278 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) | ||
241 | #define P(F) \ | 279 | #define P(F) \ |
242 | SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) | 280 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) |
281 | #define __PN(F) \ | ||
282 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
283 | #define PN(F) \ | ||
284 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
243 | 285 | ||
244 | P(se.wait_runtime); | 286 | PN(se.exec_start); |
245 | P(se.wait_start_fair); | 287 | PN(se.vruntime); |
246 | P(se.exec_start); | 288 | PN(se.sum_exec_runtime); |
247 | P(se.sleep_start_fair); | 289 | |
248 | P(se.sum_exec_runtime); | 290 | nr_switches = p->nvcsw + p->nivcsw; |
249 | 291 | ||
250 | #ifdef CONFIG_SCHEDSTATS | 292 | #ifdef CONFIG_SCHEDSTATS |
251 | P(se.wait_start); | 293 | PN(se.wait_start); |
252 | P(se.sleep_start); | 294 | PN(se.sleep_start); |
253 | P(se.block_start); | 295 | PN(se.block_start); |
254 | P(se.sleep_max); | 296 | PN(se.sleep_max); |
255 | P(se.block_max); | 297 | PN(se.block_max); |
256 | P(se.exec_max); | 298 | PN(se.exec_max); |
257 | P(se.wait_max); | 299 | PN(se.slice_max); |
258 | P(se.wait_runtime_overruns); | 300 | PN(se.wait_max); |
259 | P(se.wait_runtime_underruns); | 301 | P(sched_info.bkl_count); |
260 | P(se.sum_wait_runtime); | 302 | P(se.nr_migrations); |
303 | P(se.nr_migrations_cold); | ||
304 | P(se.nr_failed_migrations_affine); | ||
305 | P(se.nr_failed_migrations_running); | ||
306 | P(se.nr_failed_migrations_hot); | ||
307 | P(se.nr_forced_migrations); | ||
308 | P(se.nr_forced2_migrations); | ||
309 | P(se.nr_wakeups); | ||
310 | P(se.nr_wakeups_sync); | ||
311 | P(se.nr_wakeups_migrate); | ||
312 | P(se.nr_wakeups_local); | ||
313 | P(se.nr_wakeups_remote); | ||
314 | P(se.nr_wakeups_affine); | ||
315 | P(se.nr_wakeups_affine_attempts); | ||
316 | P(se.nr_wakeups_passive); | ||
317 | P(se.nr_wakeups_idle); | ||
318 | |||
319 | { | ||
320 | u64 avg_atom, avg_per_cpu; | ||
321 | |||
322 | avg_atom = p->se.sum_exec_runtime; | ||
323 | if (nr_switches) | ||
324 | do_div(avg_atom, nr_switches); | ||
325 | else | ||
326 | avg_atom = -1LL; | ||
327 | |||
328 | avg_per_cpu = p->se.sum_exec_runtime; | ||
329 | if (p->se.nr_migrations) | ||
330 | avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); | ||
331 | else | ||
332 | avg_per_cpu = -1LL; | ||
333 | |||
334 | __PN(avg_atom); | ||
335 | __PN(avg_per_cpu); | ||
336 | } | ||
261 | #endif | 337 | #endif |
262 | SEQ_printf(m, "%-25s:%20Ld\n", | 338 | __P(nr_switches); |
263 | "nr_switches", (long long)(p->nvcsw + p->nivcsw)); | 339 | SEQ_printf(m, "%-35s:%21Ld\n", |
340 | "nr_voluntary_switches", (long long)p->nvcsw); | ||
341 | SEQ_printf(m, "%-35s:%21Ld\n", | ||
342 | "nr_involuntary_switches", (long long)p->nivcsw); | ||
343 | |||
264 | P(se.load.weight); | 344 | P(se.load.weight); |
265 | P(policy); | 345 | P(policy); |
266 | P(prio); | 346 | P(prio); |
347 | #undef PN | ||
348 | #undef __PN | ||
267 | #undef P | 349 | #undef P |
350 | #undef __P | ||
268 | 351 | ||
269 | { | 352 | { |
270 | u64 t0, t1; | 353 | u64 t0, t1; |
271 | 354 | ||
272 | t0 = sched_clock(); | 355 | t0 = sched_clock(); |
273 | t1 = sched_clock(); | 356 | t1 = sched_clock(); |
274 | SEQ_printf(m, "%-25s:%20Ld\n", | 357 | SEQ_printf(m, "%-35s:%21Ld\n", |
275 | "clock-delta", (long long)(t1-t0)); | 358 | "clock-delta", (long long)(t1-t0)); |
276 | } | 359 | } |
277 | } | 360 | } |
@@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
279 | void proc_sched_set_task(struct task_struct *p) | 362 | void proc_sched_set_task(struct task_struct *p) |
280 | { | 363 | { |
281 | #ifdef CONFIG_SCHEDSTATS | 364 | #ifdef CONFIG_SCHEDSTATS |
282 | p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; | 365 | p->se.wait_max = 0; |
283 | p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; | 366 | p->se.sleep_max = 0; |
367 | p->se.sum_sleep_runtime = 0; | ||
368 | p->se.block_max = 0; | ||
369 | p->se.exec_max = 0; | ||
370 | p->se.slice_max = 0; | ||
371 | p->se.nr_migrations = 0; | ||
372 | p->se.nr_migrations_cold = 0; | ||
373 | p->se.nr_failed_migrations_affine = 0; | ||
374 | p->se.nr_failed_migrations_running = 0; | ||
375 | p->se.nr_failed_migrations_hot = 0; | ||
376 | p->se.nr_forced_migrations = 0; | ||
377 | p->se.nr_forced2_migrations = 0; | ||
378 | p->se.nr_wakeups = 0; | ||
379 | p->se.nr_wakeups_sync = 0; | ||
380 | p->se.nr_wakeups_migrate = 0; | ||
381 | p->se.nr_wakeups_local = 0; | ||
382 | p->se.nr_wakeups_remote = 0; | ||
383 | p->se.nr_wakeups_affine = 0; | ||
384 | p->se.nr_wakeups_affine_attempts = 0; | ||
385 | p->se.nr_wakeups_passive = 0; | ||
386 | p->se.nr_wakeups_idle = 0; | ||
387 | p->sched_info.bkl_count = 0; | ||
284 | #endif | 388 | #endif |
285 | p->se.sum_exec_runtime = 0; | 389 | p->se.sum_exec_runtime = 0; |
286 | p->se.prev_sum_exec_runtime = 0; | 390 | p->se.prev_sum_exec_runtime = 0; |
391 | p->nvcsw = 0; | ||
392 | p->nivcsw = 0; | ||
287 | } | 393 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67c67a87146e..a17b785d7000 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -25,22 +25,26 @@ | |||
25 | * (default: 20ms, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
26 | * | 26 | * |
27 | * NOTE: this latency value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
28 | * 'timeslice length' - timeslices in CFS are of variable length. | 28 | * 'timeslice length' - timeslices in CFS are of variable length |
29 | * (to see the precise effective timeslice length of your workload, | 29 | * and have no persistent notion like in traditional, time-slice |
30 | * run vmstat and monitor the context-switches field) | 30 | * based scheduling concepts. |
31 | * | 31 | * |
32 | * On SMP systems the value of this is multiplied by the log2 of the | 32 | * (to see the precise effective timeslice length of your workload, |
33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * run vmstat and monitor the context-switches (cs) field) |
34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | ||
35 | * Targeted preemption latency for CPU-bound tasks: | ||
36 | */ | 34 | */ |
37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; | 35 | const_debug unsigned int sysctl_sched_latency = 20000000ULL; |
36 | |||
37 | /* | ||
38 | * After fork, child runs first. (default) If set to 0 then | ||
39 | * parent will (try to) run first. | ||
40 | */ | ||
41 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | ||
38 | 42 | ||
39 | /* | 43 | /* |
40 | * Minimal preemption granularity for CPU-bound tasks: | 44 | * Minimal preemption granularity for CPU-bound tasks: |
41 | * (default: 2 msec, units: nanoseconds) | 45 | * (default: 2 msec, units: nanoseconds) |
42 | */ | 46 | */ |
43 | unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; | 47 | const_debug unsigned int sysctl_sched_nr_latency = 20; |
44 | 48 | ||
45 | /* | 49 | /* |
46 | * sys_sched_yield() compat mode | 50 | * sys_sched_yield() compat mode |
@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
52 | 56 | ||
53 | /* | 57 | /* |
54 | * SCHED_BATCH wake-up granularity. | 58 | * SCHED_BATCH wake-up granularity. |
55 | * (default: 25 msec, units: nanoseconds) | 59 | * (default: 10 msec, units: nanoseconds) |
56 | * | 60 | * |
57 | * This option delays the preemption effects of decoupled workloads | 61 | * This option delays the preemption effects of decoupled workloads |
58 | * and reduces their over-scheduling. Synchronous workloads will still | 62 | * and reduces their over-scheduling. Synchronous workloads will still |
59 | * have immediate wakeup/sleep latencies. | 63 | * have immediate wakeup/sleep latencies. |
60 | */ | 64 | */ |
61 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; | 65 | const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; |
62 | 66 | ||
63 | /* | 67 | /* |
64 | * SCHED_OTHER wake-up granularity. | 68 | * SCHED_OTHER wake-up granularity. |
65 | * (default: 1 msec, units: nanoseconds) | 69 | * (default: 10 msec, units: nanoseconds) |
66 | * | 70 | * |
67 | * This option delays the preemption effects of decoupled workloads | 71 | * This option delays the preemption effects of decoupled workloads |
68 | * and reduces their over-scheduling. Synchronous workloads will still | 72 | * and reduces their over-scheduling. Synchronous workloads will still |
69 | * have immediate wakeup/sleep latencies. | 73 | * have immediate wakeup/sleep latencies. |
70 | */ | 74 | */ |
71 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; | 75 | const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
72 | |||
73 | unsigned int sysctl_sched_stat_granularity __read_mostly; | ||
74 | |||
75 | /* | ||
76 | * Initialized in sched_init_granularity() [to 5 times the base granularity]: | ||
77 | */ | ||
78 | unsigned int sysctl_sched_runtime_limit __read_mostly; | ||
79 | |||
80 | /* | ||
81 | * Debugging: various feature bits | ||
82 | */ | ||
83 | enum { | ||
84 | SCHED_FEAT_FAIR_SLEEPERS = 1, | ||
85 | SCHED_FEAT_SLEEPER_AVG = 2, | ||
86 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, | ||
87 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, | ||
88 | SCHED_FEAT_START_DEBIT = 16, | ||
89 | SCHED_FEAT_SKIP_INITIAL = 32, | ||
90 | }; | ||
91 | 76 | ||
92 | unsigned int sysctl_sched_features __read_mostly = | 77 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
93 | SCHED_FEAT_FAIR_SLEEPERS *1 | | ||
94 | SCHED_FEAT_SLEEPER_AVG *0 | | ||
95 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | ||
96 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | ||
97 | SCHED_FEAT_START_DEBIT *1 | | ||
98 | SCHED_FEAT_SKIP_INITIAL *0; | ||
99 | |||
100 | extern struct sched_class fair_sched_class; | ||
101 | 78 | ||
102 | /************************************************************** | 79 | /************************************************************** |
103 | * CFS operations on generic schedulable entities: | 80 | * CFS operations on generic schedulable entities: |
@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
111 | return cfs_rq->rq; | 88 | return cfs_rq->rq; |
112 | } | 89 | } |
113 | 90 | ||
114 | /* currently running entity (if any) on this cfs_rq */ | ||
115 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
116 | { | ||
117 | return cfs_rq->curr; | ||
118 | } | ||
119 | |||
120 | /* An entity is a task if it doesn't "own" a runqueue */ | 91 | /* An entity is a task if it doesn't "own" a runqueue */ |
121 | #define entity_is_task(se) (!se->my_q) | 92 | #define entity_is_task(se) (!se->my_q) |
122 | 93 | ||
123 | static inline void | ||
124 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
125 | { | ||
126 | cfs_rq->curr = se; | ||
127 | } | ||
128 | |||
129 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 94 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
130 | 95 | ||
131 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 96 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
133 | return container_of(cfs_rq, struct rq, cfs); | 98 | return container_of(cfs_rq, struct rq, cfs); |
134 | } | 99 | } |
135 | 100 | ||
136 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
137 | { | ||
138 | struct rq *rq = rq_of(cfs_rq); | ||
139 | |||
140 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) | ||
141 | return NULL; | ||
142 | |||
143 | return &rq->curr->se; | ||
144 | } | ||
145 | |||
146 | #define entity_is_task(se) 1 | 101 | #define entity_is_task(se) 1 |
147 | 102 | ||
148 | static inline void | ||
149 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
150 | |||
151 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 103 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
152 | 104 | ||
153 | static inline struct task_struct *task_of(struct sched_entity *se) | 105 | static inline struct task_struct *task_of(struct sched_entity *se) |
@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se) | |||
160 | * Scheduling class tree data structure manipulation methods: | 112 | * Scheduling class tree data structure manipulation methods: |
161 | */ | 113 | */ |
162 | 114 | ||
115 | static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) | ||
116 | { | ||
117 | s64 delta = (s64)(vruntime - min_vruntime); | ||
118 | if (delta > 0) | ||
119 | min_vruntime = vruntime; | ||
120 | |||
121 | return min_vruntime; | ||
122 | } | ||
123 | |||
124 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) | ||
125 | { | ||
126 | s64 delta = (s64)(vruntime - min_vruntime); | ||
127 | if (delta < 0) | ||
128 | min_vruntime = vruntime; | ||
129 | |||
130 | return min_vruntime; | ||
131 | } | ||
132 | |||
133 | static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
134 | { | ||
135 | return se->vruntime - cfs_rq->min_vruntime; | ||
136 | } | ||
137 | |||
163 | /* | 138 | /* |
164 | * Enqueue an entity into the rb-tree: | 139 | * Enqueue an entity into the rb-tree: |
165 | */ | 140 | */ |
166 | static inline void | 141 | static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
167 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
168 | { | 142 | { |
169 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 143 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
170 | struct rb_node *parent = NULL; | 144 | struct rb_node *parent = NULL; |
171 | struct sched_entity *entry; | 145 | struct sched_entity *entry; |
172 | s64 key = se->fair_key; | 146 | s64 key = entity_key(cfs_rq, se); |
173 | int leftmost = 1; | 147 | int leftmost = 1; |
174 | 148 | ||
175 | /* | 149 | /* |
@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
182 | * We dont care about collisions. Nodes with | 156 | * We dont care about collisions. Nodes with |
183 | * the same key stay together. | 157 | * the same key stay together. |
184 | */ | 158 | */ |
185 | if (key - entry->fair_key < 0) { | 159 | if (key < entity_key(cfs_rq, entry)) { |
186 | link = &parent->rb_left; | 160 | link = &parent->rb_left; |
187 | } else { | 161 | } else { |
188 | link = &parent->rb_right; | 162 | link = &parent->rb_right; |
@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
199 | 173 | ||
200 | rb_link_node(&se->run_node, parent, link); | 174 | rb_link_node(&se->run_node, parent, link); |
201 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | 175 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); |
202 | update_load_add(&cfs_rq->load, se->load.weight); | ||
203 | cfs_rq->nr_running++; | ||
204 | se->on_rq = 1; | ||
205 | |||
206 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
207 | } | 176 | } |
208 | 177 | ||
209 | static inline void | 178 | static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
210 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
211 | { | 179 | { |
212 | if (cfs_rq->rb_leftmost == &se->run_node) | 180 | if (cfs_rq->rb_leftmost == &se->run_node) |
213 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | 181 | cfs_rq->rb_leftmost = rb_next(&se->run_node); |
214 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
215 | update_load_sub(&cfs_rq->load, se->load.weight); | ||
216 | cfs_rq->nr_running--; | ||
217 | se->on_rq = 0; | ||
218 | 182 | ||
219 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 183 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
220 | } | 184 | } |
221 | 185 | ||
222 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | 186 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) |
@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
229 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); | 193 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); |
230 | } | 194 | } |
231 | 195 | ||
196 | static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | ||
197 | { | ||
198 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | ||
199 | struct sched_entity *se = NULL; | ||
200 | struct rb_node *parent; | ||
201 | |||
202 | while (*link) { | ||
203 | parent = *link; | ||
204 | se = rb_entry(parent, struct sched_entity, run_node); | ||
205 | link = &parent->rb_right; | ||
206 | } | ||
207 | |||
208 | return se; | ||
209 | } | ||
210 | |||
232 | /************************************************************** | 211 | /************************************************************** |
233 | * Scheduling class statistics methods: | 212 | * Scheduling class statistics methods: |
234 | */ | 213 | */ |
235 | 214 | ||
215 | |||
236 | /* | 216 | /* |
237 | * Calculate the preemption granularity needed to schedule every | 217 | * The idea is to set a period in which each task runs once. |
238 | * runnable task once per sysctl_sched_latency amount of time. | ||
239 | * (down to a sensible low limit on granularity) | ||
240 | * | ||
241 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
242 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
243 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
244 | * for each task. We do finer and finer scheduling up to until we | ||
245 | * reach the minimum granularity value. | ||
246 | * | ||
247 | * To achieve this we use the following dynamic-granularity rule: | ||
248 | * | 218 | * |
249 | * gran = lat/nr - lat/nr/nr | 219 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
220 | * this period because otherwise the slices get too small. | ||
250 | * | 221 | * |
251 | * This comes out of the following equations: | 222 | * p = (nr <= nl) ? l : l*nr/nl |
252 | * | ||
253 | * kA1 + gran = kB1 | ||
254 | * kB2 + gran = kA2 | ||
255 | * kA2 = kA1 | ||
256 | * kB2 = kB1 - d + d/nr | ||
257 | * lat = d * nr | ||
258 | * | ||
259 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
260 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
261 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
262 | * running, 'lat' is the the period of each task. ('lat' is the | ||
263 | * sched_latency that we aim for.) | ||
264 | */ | 223 | */ |
265 | static long | 224 | static u64 __sched_period(unsigned long nr_running) |
266 | sched_granularity(struct cfs_rq *cfs_rq) | ||
267 | { | 225 | { |
268 | unsigned int gran = sysctl_sched_latency; | 226 | u64 period = sysctl_sched_latency; |
269 | unsigned int nr = cfs_rq->nr_running; | 227 | unsigned long nr_latency = sysctl_sched_nr_latency; |
270 | 228 | ||
271 | if (nr > 1) { | 229 | if (unlikely(nr_running > nr_latency)) { |
272 | gran = gran/nr - gran/nr/nr; | 230 | period *= nr_running; |
273 | gran = max(gran, sysctl_sched_min_granularity); | 231 | do_div(period, nr_latency); |
274 | } | 232 | } |
275 | 233 | ||
276 | return gran; | 234 | return period; |
277 | } | 235 | } |
278 | 236 | ||
279 | /* | 237 | /* |
280 | * We rescale the rescheduling granularity of tasks according to their | 238 | * We calculate the wall-time slice from the period by taking a part |
281 | * nice level, but only linearly, not exponentially: | 239 | * proportional to the weight. |
240 | * | ||
241 | * s = p*w/rw | ||
282 | */ | 242 | */ |
283 | static long | 243 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
284 | niced_granularity(struct sched_entity *curr, unsigned long granularity) | ||
285 | { | 244 | { |
286 | u64 tmp; | 245 | u64 slice = __sched_period(cfs_rq->nr_running); |
287 | 246 | ||
288 | if (likely(curr->load.weight == NICE_0_LOAD)) | 247 | slice *= se->load.weight; |
289 | return granularity; | 248 | do_div(slice, cfs_rq->load.weight); |
290 | /* | ||
291 | * Positive nice levels get the same granularity as nice-0: | ||
292 | */ | ||
293 | if (likely(curr->load.weight < NICE_0_LOAD)) { | ||
294 | tmp = curr->load.weight * (u64)granularity; | ||
295 | return (long) (tmp >> NICE_0_SHIFT); | ||
296 | } | ||
297 | /* | ||
298 | * Negative nice level tasks get linearly finer | ||
299 | * granularity: | ||
300 | */ | ||
301 | tmp = curr->load.inv_weight * (u64)granularity; | ||
302 | 249 | ||
303 | /* | 250 | return slice; |
304 | * It will always fit into 'long': | ||
305 | */ | ||
306 | return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); | ||
307 | } | 251 | } |
308 | 252 | ||
309 | static inline void | 253 | /* |
310 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) | 254 | * We calculate the vruntime slice. |
255 | * | ||
256 | * vs = s/w = p/rw | ||
257 | */ | ||
258 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | ||
311 | { | 259 | { |
312 | long limit = sysctl_sched_runtime_limit; | 260 | u64 vslice = __sched_period(nr_running); |
313 | 261 | ||
314 | /* | 262 | do_div(vslice, rq_weight); |
315 | * Niced tasks have the same history dynamic range as | 263 | |
316 | * non-niced tasks: | 264 | return vslice; |
317 | */ | ||
318 | if (unlikely(se->wait_runtime > limit)) { | ||
319 | se->wait_runtime = limit; | ||
320 | schedstat_inc(se, wait_runtime_overruns); | ||
321 | schedstat_inc(cfs_rq, wait_runtime_overruns); | ||
322 | } | ||
323 | if (unlikely(se->wait_runtime < -limit)) { | ||
324 | se->wait_runtime = -limit; | ||
325 | schedstat_inc(se, wait_runtime_underruns); | ||
326 | schedstat_inc(cfs_rq, wait_runtime_underruns); | ||
327 | } | ||
328 | } | 265 | } |
329 | 266 | ||
330 | static inline void | 267 | static u64 sched_vslice(struct cfs_rq *cfs_rq) |
331 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
332 | { | 268 | { |
333 | se->wait_runtime += delta; | 269 | return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); |
334 | schedstat_add(se, sum_wait_runtime, delta); | ||
335 | limit_wait_runtime(cfs_rq, se); | ||
336 | } | 270 | } |
337 | 271 | ||
338 | static void | 272 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
339 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
340 | { | 273 | { |
341 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 274 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, |
342 | __add_wait_runtime(cfs_rq, se, delta); | 275 | cfs_rq->nr_running + 1); |
343 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
344 | } | 276 | } |
345 | 277 | ||
346 | /* | 278 | /* |
@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | |||
348 | * are not in our scheduling class. | 280 | * are not in our scheduling class. |
349 | */ | 281 | */ |
350 | static inline void | 282 | static inline void |
351 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 283 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, |
284 | unsigned long delta_exec) | ||
352 | { | 285 | { |
353 | unsigned long delta, delta_exec, delta_fair, delta_mine; | 286 | unsigned long delta_exec_weighted; |
354 | struct load_weight *lw = &cfs_rq->load; | 287 | u64 vruntime; |
355 | unsigned long load = lw->weight; | ||
356 | 288 | ||
357 | delta_exec = curr->delta_exec; | ||
358 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 289 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
359 | 290 | ||
360 | curr->sum_exec_runtime += delta_exec; | 291 | curr->sum_exec_runtime += delta_exec; |
361 | cfs_rq->exec_clock += delta_exec; | 292 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
362 | 293 | delta_exec_weighted = delta_exec; | |
363 | if (unlikely(!load)) | 294 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { |
364 | return; | 295 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, |
365 | 296 | &curr->load); | |
366 | delta_fair = calc_delta_fair(delta_exec, lw); | ||
367 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | ||
368 | |||
369 | if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { | ||
370 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); | ||
371 | delta = min(delta, (unsigned long)( | ||
372 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); | ||
373 | cfs_rq->sleeper_bonus -= delta; | ||
374 | delta_mine -= delta; | ||
375 | } | 297 | } |
298 | curr->vruntime += delta_exec_weighted; | ||
376 | 299 | ||
377 | cfs_rq->fair_clock += delta_fair; | ||
378 | /* | 300 | /* |
379 | * We executed delta_exec amount of time on the CPU, | 301 | * maintain cfs_rq->min_vruntime to be a monotonic increasing |
380 | * but we were only entitled to delta_mine amount of | 302 | * value tracking the leftmost vruntime in the tree. |
381 | * time during that period (if nr_running == 1 then | ||
382 | * the two values are equal) | ||
383 | * [Note: delta_mine - delta_exec is negative]: | ||
384 | */ | 303 | */ |
385 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | 304 | if (first_fair(cfs_rq)) { |
305 | vruntime = min_vruntime(curr->vruntime, | ||
306 | __pick_next_entity(cfs_rq)->vruntime); | ||
307 | } else | ||
308 | vruntime = curr->vruntime; | ||
309 | |||
310 | cfs_rq->min_vruntime = | ||
311 | max_vruntime(cfs_rq->min_vruntime, vruntime); | ||
386 | } | 312 | } |
387 | 313 | ||
388 | static void update_curr(struct cfs_rq *cfs_rq) | 314 | static void update_curr(struct cfs_rq *cfs_rq) |
389 | { | 315 | { |
390 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | 316 | struct sched_entity *curr = cfs_rq->curr; |
317 | u64 now = rq_of(cfs_rq)->clock; | ||
391 | unsigned long delta_exec; | 318 | unsigned long delta_exec; |
392 | 319 | ||
393 | if (unlikely(!curr)) | 320 | if (unlikely(!curr)) |
@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
398 | * since the last time we changed load (this cannot | 325 | * since the last time we changed load (this cannot |
399 | * overflow on 32 bits): | 326 | * overflow on 32 bits): |
400 | */ | 327 | */ |
401 | delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); | 328 | delta_exec = (unsigned long)(now - curr->exec_start); |
402 | 329 | ||
403 | curr->delta_exec += delta_exec; | 330 | __update_curr(cfs_rq, curr, delta_exec); |
404 | 331 | curr->exec_start = now; | |
405 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | ||
406 | __update_curr(cfs_rq, curr); | ||
407 | curr->delta_exec = 0; | ||
408 | } | ||
409 | curr->exec_start = rq_of(cfs_rq)->clock; | ||
410 | } | 332 | } |
411 | 333 | ||
412 | static inline void | 334 | static inline void |
413 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 335 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
414 | { | 336 | { |
415 | se->wait_start_fair = cfs_rq->fair_clock; | ||
416 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); | 337 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); |
417 | } | 338 | } |
418 | 339 | ||
419 | /* | 340 | /* |
420 | * We calculate fair deltas here, so protect against the random effects | ||
421 | * of a multiplication overflow by capping it to the runtime limit: | ||
422 | */ | ||
423 | #if BITS_PER_LONG == 32 | ||
424 | static inline unsigned long | ||
425 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
426 | { | ||
427 | u64 tmp = (u64)delta * weight >> shift; | ||
428 | |||
429 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) | ||
430 | return sysctl_sched_runtime_limit*2; | ||
431 | return tmp; | ||
432 | } | ||
433 | #else | ||
434 | static inline unsigned long | ||
435 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
436 | { | ||
437 | return delta * weight >> shift; | ||
438 | } | ||
439 | #endif | ||
440 | |||
441 | /* | ||
442 | * Task is being enqueued - update stats: | 341 | * Task is being enqueued - update stats: |
443 | */ | 342 | */ |
444 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 343 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
445 | { | 344 | { |
446 | s64 key; | ||
447 | |||
448 | /* | 345 | /* |
449 | * Are we enqueueing a waiting task? (for current tasks | 346 | * Are we enqueueing a waiting task? (for current tasks |
450 | * a dequeue/enqueue event is a NOP) | 347 | * a dequeue/enqueue event is a NOP) |
451 | */ | 348 | */ |
452 | if (se != cfs_rq_curr(cfs_rq)) | 349 | if (se != cfs_rq->curr) |
453 | update_stats_wait_start(cfs_rq, se); | 350 | update_stats_wait_start(cfs_rq, se); |
454 | /* | ||
455 | * Update the key: | ||
456 | */ | ||
457 | key = cfs_rq->fair_clock; | ||
458 | |||
459 | /* | ||
460 | * Optimize the common nice 0 case: | ||
461 | */ | ||
462 | if (likely(se->load.weight == NICE_0_LOAD)) { | ||
463 | key -= se->wait_runtime; | ||
464 | } else { | ||
465 | u64 tmp; | ||
466 | |||
467 | if (se->wait_runtime < 0) { | ||
468 | tmp = -se->wait_runtime; | ||
469 | key += (tmp * se->load.inv_weight) >> | ||
470 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
471 | } else { | ||
472 | tmp = se->wait_runtime; | ||
473 | key -= (tmp * se->load.inv_weight) >> | ||
474 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
475 | } | ||
476 | } | ||
477 | |||
478 | se->fair_key = key; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * Note: must be called with a freshly updated rq->fair_clock. | ||
483 | */ | ||
484 | static inline void | ||
485 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
486 | { | ||
487 | unsigned long delta_fair = se->delta_fair_run; | ||
488 | |||
489 | schedstat_set(se->wait_max, max(se->wait_max, | ||
490 | rq_of(cfs_rq)->clock - se->wait_start)); | ||
491 | |||
492 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
493 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
494 | NICE_0_SHIFT); | ||
495 | |||
496 | add_wait_runtime(cfs_rq, se, delta_fair); | ||
497 | } | 351 | } |
498 | 352 | ||
499 | static void | 353 | static void |
500 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 354 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
501 | { | 355 | { |
502 | unsigned long delta_fair; | 356 | schedstat_set(se->wait_max, max(se->wait_max, |
503 | 357 | rq_of(cfs_rq)->clock - se->wait_start)); | |
504 | if (unlikely(!se->wait_start_fair)) | ||
505 | return; | ||
506 | |||
507 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
508 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | ||
509 | |||
510 | se->delta_fair_run += delta_fair; | ||
511 | if (unlikely(abs(se->delta_fair_run) >= | ||
512 | sysctl_sched_stat_granularity)) { | ||
513 | __update_stats_wait_end(cfs_rq, se); | ||
514 | se->delta_fair_run = 0; | ||
515 | } | ||
516 | |||
517 | se->wait_start_fair = 0; | ||
518 | schedstat_set(se->wait_start, 0); | 358 | schedstat_set(se->wait_start, 0); |
519 | } | 359 | } |
520 | 360 | ||
521 | static inline void | 361 | static inline void |
522 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 362 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
523 | { | 363 | { |
524 | update_curr(cfs_rq); | ||
525 | /* | 364 | /* |
526 | * Mark the end of the wait period if dequeueing a | 365 | * Mark the end of the wait period if dequeueing a |
527 | * waiting task: | 366 | * waiting task: |
528 | */ | 367 | */ |
529 | if (se != cfs_rq_curr(cfs_rq)) | 368 | if (se != cfs_rq->curr) |
530 | update_stats_wait_end(cfs_rq, se); | 369 | update_stats_wait_end(cfs_rq, se); |
531 | } | 370 | } |
532 | 371 | ||
@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
542 | se->exec_start = rq_of(cfs_rq)->clock; | 381 | se->exec_start = rq_of(cfs_rq)->clock; |
543 | } | 382 | } |
544 | 383 | ||
545 | /* | ||
546 | * We are descheduling a task - update its stats: | ||
547 | */ | ||
548 | static inline void | ||
549 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
550 | { | ||
551 | se->exec_start = 0; | ||
552 | } | ||
553 | |||
554 | /************************************************** | 384 | /************************************************** |
555 | * Scheduling class queueing methods: | 385 | * Scheduling class queueing methods: |
556 | */ | 386 | */ |
557 | 387 | ||
558 | static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 388 | static void |
389 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
559 | { | 390 | { |
560 | unsigned long load = cfs_rq->load.weight, delta_fair; | 391 | update_load_add(&cfs_rq->load, se->load.weight); |
561 | long prev_runtime; | 392 | cfs_rq->nr_running++; |
562 | 393 | se->on_rq = 1; | |
563 | /* | 394 | } |
564 | * Do not boost sleepers if there's too much bonus 'in flight' | ||
565 | * already: | ||
566 | */ | ||
567 | if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) | ||
568 | return; | ||
569 | |||
570 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | ||
571 | load = rq_of(cfs_rq)->cpu_load[2]; | ||
572 | |||
573 | delta_fair = se->delta_fair_sleep; | ||
574 | |||
575 | /* | ||
576 | * Fix up delta_fair with the effect of us running | ||
577 | * during the whole sleep period: | ||
578 | */ | ||
579 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) | ||
580 | delta_fair = div64_likely32((u64)delta_fair * load, | ||
581 | load + se->load.weight); | ||
582 | |||
583 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
584 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
585 | NICE_0_SHIFT); | ||
586 | |||
587 | prev_runtime = se->wait_runtime; | ||
588 | __add_wait_runtime(cfs_rq, se, delta_fair); | ||
589 | delta_fair = se->wait_runtime - prev_runtime; | ||
590 | 395 | ||
591 | /* | 396 | static void |
592 | * Track the amount of bonus we've given to sleepers: | 397 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
593 | */ | 398 | { |
594 | cfs_rq->sleeper_bonus += delta_fair; | 399 | update_load_sub(&cfs_rq->load, se->load.weight); |
400 | cfs_rq->nr_running--; | ||
401 | se->on_rq = 0; | ||
595 | } | 402 | } |
596 | 403 | ||
597 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 404 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
598 | { | 405 | { |
599 | struct task_struct *tsk = task_of(se); | ||
600 | unsigned long delta_fair; | ||
601 | |||
602 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || | ||
603 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) | ||
604 | return; | ||
605 | |||
606 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
607 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); | ||
608 | |||
609 | se->delta_fair_sleep += delta_fair; | ||
610 | if (unlikely(abs(se->delta_fair_sleep) >= | ||
611 | sysctl_sched_stat_granularity)) { | ||
612 | __enqueue_sleeper(cfs_rq, se); | ||
613 | se->delta_fair_sleep = 0; | ||
614 | } | ||
615 | |||
616 | se->sleep_start_fair = 0; | ||
617 | |||
618 | #ifdef CONFIG_SCHEDSTATS | 406 | #ifdef CONFIG_SCHEDSTATS |
619 | if (se->sleep_start) { | 407 | if (se->sleep_start) { |
620 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 408 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
646 | * time that the task spent sleeping: | 434 | * time that the task spent sleeping: |
647 | */ | 435 | */ |
648 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 436 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
437 | struct task_struct *tsk = task_of(se); | ||
438 | |||
649 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 439 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
650 | delta >> 20); | 440 | delta >> 20); |
651 | } | 441 | } |
@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
653 | #endif | 443 | #endif |
654 | } | 444 | } |
655 | 445 | ||
446 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
447 | { | ||
448 | #ifdef CONFIG_SCHED_DEBUG | ||
449 | s64 d = se->vruntime - cfs_rq->min_vruntime; | ||
450 | |||
451 | if (d < 0) | ||
452 | d = -d; | ||
453 | |||
454 | if (d > 3*sysctl_sched_latency) | ||
455 | schedstat_inc(cfs_rq, nr_spread_over); | ||
456 | #endif | ||
457 | } | ||
458 | |||
459 | static void | ||
460 | place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | ||
461 | { | ||
462 | u64 vruntime; | ||
463 | |||
464 | vruntime = cfs_rq->min_vruntime; | ||
465 | |||
466 | if (sched_feat(TREE_AVG)) { | ||
467 | struct sched_entity *last = __pick_last_entity(cfs_rq); | ||
468 | if (last) { | ||
469 | vruntime += last->vruntime; | ||
470 | vruntime >>= 1; | ||
471 | } | ||
472 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) | ||
473 | vruntime += sched_vslice(cfs_rq)/2; | ||
474 | |||
475 | if (initial && sched_feat(START_DEBIT)) | ||
476 | vruntime += sched_vslice_add(cfs_rq, se); | ||
477 | |||
478 | if (!initial) { | ||
479 | if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && | ||
480 | task_of(se)->policy != SCHED_BATCH) | ||
481 | vruntime -= sysctl_sched_latency; | ||
482 | |||
483 | vruntime = max_t(s64, vruntime, se->vruntime); | ||
484 | } | ||
485 | |||
486 | se->vruntime = vruntime; | ||
487 | |||
488 | } | ||
489 | |||
656 | static void | 490 | static void |
657 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | 491 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) |
658 | { | 492 | { |
659 | /* | 493 | /* |
660 | * Update the fair clock. | 494 | * Update run-time statistics of the 'current'. |
661 | */ | 495 | */ |
662 | update_curr(cfs_rq); | 496 | update_curr(cfs_rq); |
663 | 497 | ||
664 | if (wakeup) | 498 | if (wakeup) { |
499 | place_entity(cfs_rq, se, 0); | ||
665 | enqueue_sleeper(cfs_rq, se); | 500 | enqueue_sleeper(cfs_rq, se); |
501 | } | ||
666 | 502 | ||
667 | update_stats_enqueue(cfs_rq, se); | 503 | update_stats_enqueue(cfs_rq, se); |
668 | __enqueue_entity(cfs_rq, se); | 504 | check_spread(cfs_rq, se); |
505 | if (se != cfs_rq->curr) | ||
506 | __enqueue_entity(cfs_rq, se); | ||
507 | account_entity_enqueue(cfs_rq, se); | ||
669 | } | 508 | } |
670 | 509 | ||
671 | static void | 510 | static void |
672 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 511 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
673 | { | 512 | { |
513 | /* | ||
514 | * Update run-time statistics of the 'current'. | ||
515 | */ | ||
516 | update_curr(cfs_rq); | ||
517 | |||
674 | update_stats_dequeue(cfs_rq, se); | 518 | update_stats_dequeue(cfs_rq, se); |
675 | if (sleep) { | 519 | if (sleep) { |
676 | se->sleep_start_fair = cfs_rq->fair_clock; | 520 | se->peer_preempt = 0; |
677 | #ifdef CONFIG_SCHEDSTATS | 521 | #ifdef CONFIG_SCHEDSTATS |
678 | if (entity_is_task(se)) { | 522 | if (entity_is_task(se)) { |
679 | struct task_struct *tsk = task_of(se); | 523 | struct task_struct *tsk = task_of(se); |
@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
685 | } | 529 | } |
686 | #endif | 530 | #endif |
687 | } | 531 | } |
688 | __dequeue_entity(cfs_rq, se); | 532 | |
533 | if (se != cfs_rq->curr) | ||
534 | __dequeue_entity(cfs_rq, se); | ||
535 | account_entity_dequeue(cfs_rq, se); | ||
689 | } | 536 | } |
690 | 537 | ||
691 | /* | 538 | /* |
692 | * Preempt the current task with a newly woken task if needed: | 539 | * Preempt the current task with a newly woken task if needed: |
693 | */ | 540 | */ |
694 | static void | 541 | static void |
695 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | 542 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
696 | struct sched_entity *curr, unsigned long granularity) | ||
697 | { | 543 | { |
698 | s64 __delta = curr->fair_key - se->fair_key; | ||
699 | unsigned long ideal_runtime, delta_exec; | 544 | unsigned long ideal_runtime, delta_exec; |
700 | 545 | ||
701 | /* | 546 | ideal_runtime = sched_slice(cfs_rq, curr); |
702 | * ideal_runtime is compared against sum_exec_runtime, which is | ||
703 | * walltime, hence do not scale. | ||
704 | */ | ||
705 | ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, | ||
706 | (unsigned long)sysctl_sched_min_granularity); | ||
707 | |||
708 | /* | ||
709 | * If we executed more than what the latency constraint suggests, | ||
710 | * reduce the rescheduling granularity. This way the total latency | ||
711 | * of how much a task is not scheduled converges to | ||
712 | * sysctl_sched_latency: | ||
713 | */ | ||
714 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 547 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
715 | if (delta_exec > ideal_runtime) | 548 | if (delta_exec > ideal_runtime || |
716 | granularity = 0; | 549 | (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) |
717 | |||
718 | /* | ||
719 | * Take scheduling granularity into account - do not | ||
720 | * preempt the current task unless the best task has | ||
721 | * a larger than sched_granularity fairness advantage: | ||
722 | * | ||
723 | * scale granularity as key space is in fair_clock. | ||
724 | */ | ||
725 | if (__delta > niced_granularity(curr, granularity)) | ||
726 | resched_task(rq_of(cfs_rq)->curr); | 550 | resched_task(rq_of(cfs_rq)->curr); |
551 | curr->peer_preempt = 0; | ||
727 | } | 552 | } |
728 | 553 | ||
729 | static inline void | 554 | static void |
730 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 555 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
731 | { | 556 | { |
557 | /* 'current' is not kept within the tree. */ | ||
558 | if (se->on_rq) { | ||
559 | /* | ||
560 | * Any task has to be enqueued before it get to execute on | ||
561 | * a CPU. So account for the time it spent waiting on the | ||
562 | * runqueue. | ||
563 | */ | ||
564 | update_stats_wait_end(cfs_rq, se); | ||
565 | __dequeue_entity(cfs_rq, se); | ||
566 | } | ||
567 | |||
568 | update_stats_curr_start(cfs_rq, se); | ||
569 | cfs_rq->curr = se; | ||
570 | #ifdef CONFIG_SCHEDSTATS | ||
732 | /* | 571 | /* |
733 | * Any task has to be enqueued before it get to execute on | 572 | * Track our maximum slice length, if the CPU's load is at |
734 | * a CPU. So account for the time it spent waiting on the | 573 | * least twice that of our own weight (i.e. dont track it |
735 | * runqueue. (note, here we rely on pick_next_task() having | 574 | * when there are only lesser-weight tasks around): |
736 | * done a put_prev_task_fair() shortly before this, which | ||
737 | * updated rq->fair_clock - used by update_stats_wait_end()) | ||
738 | */ | 575 | */ |
739 | update_stats_wait_end(cfs_rq, se); | 576 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
740 | update_stats_curr_start(cfs_rq, se); | 577 | se->slice_max = max(se->slice_max, |
741 | set_cfs_rq_curr(cfs_rq, se); | 578 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
579 | } | ||
580 | #endif | ||
742 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 581 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
743 | } | 582 | } |
744 | 583 | ||
745 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 584 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
746 | { | 585 | { |
747 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 586 | struct sched_entity *se = NULL; |
748 | 587 | ||
749 | set_next_entity(cfs_rq, se); | 588 | if (first_fair(cfs_rq)) { |
589 | se = __pick_next_entity(cfs_rq); | ||
590 | set_next_entity(cfs_rq, se); | ||
591 | } | ||
750 | 592 | ||
751 | return se; | 593 | return se; |
752 | } | 594 | } |
@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
760 | if (prev->on_rq) | 602 | if (prev->on_rq) |
761 | update_curr(cfs_rq); | 603 | update_curr(cfs_rq); |
762 | 604 | ||
763 | update_stats_curr_end(cfs_rq, prev); | 605 | check_spread(cfs_rq, prev); |
764 | 606 | if (prev->on_rq) { | |
765 | if (prev->on_rq) | ||
766 | update_stats_wait_start(cfs_rq, prev); | 607 | update_stats_wait_start(cfs_rq, prev); |
767 | set_cfs_rq_curr(cfs_rq, NULL); | 608 | /* Put 'current' back into the tree. */ |
609 | __enqueue_entity(cfs_rq, prev); | ||
610 | } | ||
611 | cfs_rq->curr = NULL; | ||
768 | } | 612 | } |
769 | 613 | ||
770 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 614 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
771 | { | 615 | { |
772 | struct sched_entity *next; | ||
773 | |||
774 | /* | 616 | /* |
775 | * Dequeue and enqueue the task to update its | 617 | * Update run-time statistics of the 'current'. |
776 | * position within the tree: | ||
777 | */ | 618 | */ |
778 | dequeue_entity(cfs_rq, curr, 0); | 619 | update_curr(cfs_rq); |
779 | enqueue_entity(cfs_rq, curr, 0); | ||
780 | |||
781 | /* | ||
782 | * Reschedule if another task tops the current one. | ||
783 | */ | ||
784 | next = __pick_next_entity(cfs_rq); | ||
785 | if (next == curr) | ||
786 | return; | ||
787 | 620 | ||
788 | __check_preempt_curr_fair(cfs_rq, next, curr, | 621 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
789 | sched_granularity(cfs_rq)); | 622 | check_preempt_tick(cfs_rq, curr); |
790 | } | 623 | } |
791 | 624 | ||
792 | /************************************************** | 625 | /************************************************** |
@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
821 | */ | 654 | */ |
822 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | 655 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) |
823 | { | 656 | { |
824 | /* A later patch will take group into account */ | 657 | return cfs_rq->tg->cfs_rq[this_cpu]; |
825 | return &cpu_rq(this_cpu)->cfs; | ||
826 | } | 658 | } |
827 | 659 | ||
828 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 660 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
829 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 661 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
830 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 662 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
831 | 663 | ||
832 | /* Do the two (enqueued) tasks belong to the same group ? */ | 664 | /* Do the two (enqueued) entities belong to the same group ? */ |
833 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 665 | static inline int |
666 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
834 | { | 667 | { |
835 | if (curr->se.cfs_rq == p->se.cfs_rq) | 668 | if (se->cfs_rq == pse->cfs_rq) |
836 | return 1; | 669 | return 1; |
837 | 670 | ||
838 | return 0; | 671 | return 0; |
839 | } | 672 | } |
840 | 673 | ||
674 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
675 | { | ||
676 | return se->parent; | ||
677 | } | ||
678 | |||
841 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 679 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
842 | 680 | ||
843 | #define for_each_sched_entity(se) \ | 681 | #define for_each_sched_entity(se) \ |
@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
870 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 708 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
871 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 709 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
872 | 710 | ||
873 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 711 | static inline int |
712 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
874 | { | 713 | { |
875 | return 1; | 714 | return 1; |
876 | } | 715 | } |
877 | 716 | ||
717 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
718 | { | ||
719 | return NULL; | ||
720 | } | ||
721 | |||
878 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 722 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
879 | 723 | ||
880 | /* | 724 | /* |
@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
892 | break; | 736 | break; |
893 | cfs_rq = cfs_rq_of(se); | 737 | cfs_rq = cfs_rq_of(se); |
894 | enqueue_entity(cfs_rq, se, wakeup); | 738 | enqueue_entity(cfs_rq, se, wakeup); |
739 | wakeup = 1; | ||
895 | } | 740 | } |
896 | } | 741 | } |
897 | 742 | ||
@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
911 | /* Don't dequeue parent if it has other entities besides us */ | 756 | /* Don't dequeue parent if it has other entities besides us */ |
912 | if (cfs_rq->load.weight) | 757 | if (cfs_rq->load.weight) |
913 | break; | 758 | break; |
759 | sleep = 1; | ||
914 | } | 760 | } |
915 | } | 761 | } |
916 | 762 | ||
@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
919 | * | 765 | * |
920 | * If compat_yield is turned on then we requeue to the end of the tree. | 766 | * If compat_yield is turned on then we requeue to the end of the tree. |
921 | */ | 767 | */ |
922 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | 768 | static void yield_task_fair(struct rq *rq) |
923 | { | 769 | { |
924 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 770 | struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); |
925 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 771 | struct sched_entity *rightmost, *se = &rq->curr->se; |
926 | struct sched_entity *rightmost, *se = &p->se; | ||
927 | struct rb_node *parent; | ||
928 | 772 | ||
929 | /* | 773 | /* |
930 | * Are we the only task in the tree? | 774 | * Are we the only task in the tree? |
@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) | |||
935 | if (likely(!sysctl_sched_compat_yield)) { | 779 | if (likely(!sysctl_sched_compat_yield)) { |
936 | __update_rq_clock(rq); | 780 | __update_rq_clock(rq); |
937 | /* | 781 | /* |
938 | * Dequeue and enqueue the task to update its | 782 | * Update run-time statistics of the 'current'. |
939 | * position within the tree: | ||
940 | */ | 783 | */ |
941 | dequeue_entity(cfs_rq, &p->se, 0); | 784 | update_curr(cfs_rq); |
942 | enqueue_entity(cfs_rq, &p->se, 0); | ||
943 | 785 | ||
944 | return; | 786 | return; |
945 | } | 787 | } |
946 | /* | 788 | /* |
947 | * Find the rightmost entry in the rbtree: | 789 | * Find the rightmost entry in the rbtree: |
948 | */ | 790 | */ |
949 | do { | 791 | rightmost = __pick_last_entity(cfs_rq); |
950 | parent = *link; | ||
951 | link = &parent->rb_right; | ||
952 | } while (*link); | ||
953 | |||
954 | rightmost = rb_entry(parent, struct sched_entity, run_node); | ||
955 | /* | 792 | /* |
956 | * Already in the rightmost position? | 793 | * Already in the rightmost position? |
957 | */ | 794 | */ |
958 | if (unlikely(rightmost == se)) | 795 | if (unlikely(rightmost->vruntime < se->vruntime)) |
959 | return; | 796 | return; |
960 | 797 | ||
961 | /* | 798 | /* |
962 | * Minimally necessary key value to be last in the tree: | 799 | * Minimally necessary key value to be last in the tree: |
800 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
801 | * 'current' within the tree based on its new key value. | ||
963 | */ | 802 | */ |
964 | se->fair_key = rightmost->fair_key + 1; | 803 | se->vruntime = rightmost->vruntime + 1; |
965 | |||
966 | if (cfs_rq->rb_leftmost == &se->run_node) | ||
967 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | ||
968 | /* | ||
969 | * Relink the task to the rightmost position: | ||
970 | */ | ||
971 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
972 | rb_link_node(&se->run_node, parent, link); | ||
973 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | ||
974 | } | 804 | } |
975 | 805 | ||
976 | /* | 806 | /* |
977 | * Preempt the current task with a newly woken task if needed: | 807 | * Preempt the current task with a newly woken task if needed: |
978 | */ | 808 | */ |
979 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | 809 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
980 | { | 810 | { |
981 | struct task_struct *curr = rq->curr; | 811 | struct task_struct *curr = rq->curr; |
982 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 812 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
983 | unsigned long gran; | 813 | struct sched_entity *se = &curr->se, *pse = &p->se; |
814 | s64 delta, gran; | ||
984 | 815 | ||
985 | if (unlikely(rt_prio(p->prio))) { | 816 | if (unlikely(rt_prio(p->prio))) { |
986 | update_rq_clock(rq); | 817 | update_rq_clock(rq); |
@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | |||
988 | resched_task(curr); | 819 | resched_task(curr); |
989 | return; | 820 | return; |
990 | } | 821 | } |
991 | |||
992 | gran = sysctl_sched_wakeup_granularity; | ||
993 | /* | 822 | /* |
994 | * Batch tasks prefer throughput over latency: | 823 | * Batch tasks do not preempt (their preemption is driven by |
824 | * the tick): | ||
995 | */ | 825 | */ |
996 | if (unlikely(p->policy == SCHED_BATCH)) | 826 | if (unlikely(p->policy == SCHED_BATCH)) |
997 | gran = sysctl_sched_batch_wakeup_granularity; | 827 | return; |
828 | |||
829 | if (sched_feat(WAKEUP_PREEMPT)) { | ||
830 | while (!is_same_group(se, pse)) { | ||
831 | se = parent_entity(se); | ||
832 | pse = parent_entity(pse); | ||
833 | } | ||
998 | 834 | ||
999 | if (is_same_group(curr, p)) | 835 | delta = se->vruntime - pse->vruntime; |
1000 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | 836 | gran = sysctl_sched_wakeup_granularity; |
837 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
838 | gran = calc_delta_fair(gran, &se->load); | ||
839 | |||
840 | if (delta > gran) { | ||
841 | int now = !sched_feat(PREEMPT_RESTRICT); | ||
842 | |||
843 | if (now || p->prio < curr->prio || !se->peer_preempt++) | ||
844 | resched_task(curr); | ||
845 | } | ||
846 | } | ||
1001 | } | 847 | } |
1002 | 848 | ||
1003 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 849 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1041 | * achieve that by always pre-iterating before returning | 887 | * achieve that by always pre-iterating before returning |
1042 | * the current task: | 888 | * the current task: |
1043 | */ | 889 | */ |
1044 | static inline struct task_struct * | 890 | static struct task_struct * |
1045 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 891 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
1046 | { | 892 | { |
1047 | struct task_struct *p; | 893 | struct task_struct *p; |
@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | |||
1078 | if (!cfs_rq->nr_running) | 924 | if (!cfs_rq->nr_running) |
1079 | return MAX_PRIO; | 925 | return MAX_PRIO; |
1080 | 926 | ||
1081 | curr = __pick_next_entity(cfs_rq); | 927 | curr = cfs_rq->curr; |
928 | if (!curr) | ||
929 | curr = __pick_next_entity(cfs_rq); | ||
930 | |||
1082 | p = task_of(curr); | 931 | p = task_of(curr); |
1083 | 932 | ||
1084 | return p->prio; | 933 | return p->prio; |
@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1153 | } | 1002 | } |
1154 | } | 1003 | } |
1155 | 1004 | ||
1005 | #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) | ||
1006 | |||
1156 | /* | 1007 | /* |
1157 | * Share the fairness runtime between parent and child, thus the | 1008 | * Share the fairness runtime between parent and child, thus the |
1158 | * total amount of pressure for CPU stays equal - new tasks | 1009 | * total amount of pressure for CPU stays equal - new tasks |
@@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1163 | static void task_new_fair(struct rq *rq, struct task_struct *p) | 1014 | static void task_new_fair(struct rq *rq, struct task_struct *p) |
1164 | { | 1015 | { |
1165 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1016 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
1166 | struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); | 1017 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; |
1018 | int this_cpu = smp_processor_id(); | ||
1167 | 1019 | ||
1168 | sched_info_queued(p); | 1020 | sched_info_queued(p); |
1169 | 1021 | ||
1170 | update_curr(cfs_rq); | 1022 | update_curr(cfs_rq); |
1171 | update_stats_enqueue(cfs_rq, se); | 1023 | place_entity(cfs_rq, se, 1); |
1172 | /* | ||
1173 | * Child runs first: we let it run before the parent | ||
1174 | * until it reschedules once. We set up the key so that | ||
1175 | * it will preempt the parent: | ||
1176 | */ | ||
1177 | se->fair_key = curr->fair_key - | ||
1178 | niced_granularity(curr, sched_granularity(cfs_rq)) - 1; | ||
1179 | /* | ||
1180 | * The first wait is dominated by the child-runs-first logic, | ||
1181 | * so do not credit it with that waiting time yet: | ||
1182 | */ | ||
1183 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | ||
1184 | se->wait_start_fair = 0; | ||
1185 | 1024 | ||
1186 | /* | 1025 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && |
1187 | * The statistical average of wait_runtime is about | 1026 | curr->vruntime < se->vruntime) { |
1188 | * -granularity/2, so initialize the task with that: | 1027 | /* |
1189 | */ | 1028 | * Upon rescheduling, sched_class::put_prev_task() will place |
1190 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1029 | * 'current' within the tree based on its new key value. |
1191 | se->wait_runtime = -(sched_granularity(cfs_rq) / 2); | 1030 | */ |
1031 | swap(curr->vruntime, se->vruntime); | ||
1032 | } | ||
1192 | 1033 | ||
1034 | update_stats_enqueue(cfs_rq, se); | ||
1035 | check_spread(cfs_rq, se); | ||
1036 | check_spread(cfs_rq, curr); | ||
1193 | __enqueue_entity(cfs_rq, se); | 1037 | __enqueue_entity(cfs_rq, se); |
1038 | account_entity_enqueue(cfs_rq, se); | ||
1039 | se->peer_preempt = 0; | ||
1040 | resched_task(rq->curr); | ||
1194 | } | 1041 | } |
1195 | 1042 | ||
1196 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1197 | /* Account for a task changing its policy or group. | 1043 | /* Account for a task changing its policy or group. |
1198 | * | 1044 | * |
1199 | * This routine is mostly called to set cfs_rq->curr field when a task | 1045 | * This routine is mostly called to set cfs_rq->curr field when a task |
@@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq) | |||
1206 | for_each_sched_entity(se) | 1052 | for_each_sched_entity(se) |
1207 | set_next_entity(cfs_rq_of(se), se); | 1053 | set_next_entity(cfs_rq_of(se), se); |
1208 | } | 1054 | } |
1209 | #else | ||
1210 | static void set_curr_task_fair(struct rq *rq) | ||
1211 | { | ||
1212 | } | ||
1213 | #endif | ||
1214 | 1055 | ||
1215 | /* | 1056 | /* |
1216 | * All the scheduling class methods: | 1057 | * All the scheduling class methods: |
1217 | */ | 1058 | */ |
1218 | struct sched_class fair_sched_class __read_mostly = { | 1059 | static const struct sched_class fair_sched_class = { |
1060 | .next = &idle_sched_class, | ||
1219 | .enqueue_task = enqueue_task_fair, | 1061 | .enqueue_task = enqueue_task_fair, |
1220 | .dequeue_task = dequeue_task_fair, | 1062 | .dequeue_task = dequeue_task_fair, |
1221 | .yield_task = yield_task_fair, | 1063 | .yield_task = yield_task_fair, |
1222 | 1064 | ||
1223 | .check_preempt_curr = check_preempt_curr_fair, | 1065 | .check_preempt_curr = check_preempt_wakeup, |
1224 | 1066 | ||
1225 | .pick_next_task = pick_next_task_fair, | 1067 | .pick_next_task = pick_next_task_fair, |
1226 | .put_prev_task = put_prev_task_fair, | 1068 | .put_prev_task = put_prev_task_fair, |
@@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1237 | { | 1079 | { |
1238 | struct cfs_rq *cfs_rq; | 1080 | struct cfs_rq *cfs_rq; |
1239 | 1081 | ||
1082 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1083 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
1084 | #endif | ||
1240 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1085 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1241 | print_cfs_rq(m, cpu, cfs_rq); | 1086 | print_cfs_rq(m, cpu, cfs_rq); |
1242 | } | 1087 | } |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2d9f96..6e2ead41516e 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) | |||
50 | { | 50 | { |
51 | } | 51 | } |
52 | 52 | ||
53 | static void set_curr_task_idle(struct rq *rq) | ||
54 | { | ||
55 | } | ||
56 | |||
53 | /* | 57 | /* |
54 | * Simple, special scheduling class for the per-CPU idle tasks: | 58 | * Simple, special scheduling class for the per-CPU idle tasks: |
55 | */ | 59 | */ |
56 | static struct sched_class idle_sched_class __read_mostly = { | 60 | const struct sched_class idle_sched_class = { |
61 | /* .next is NULL */ | ||
57 | /* no enqueue/yield_task for idle tasks */ | 62 | /* no enqueue/yield_task for idle tasks */ |
58 | 63 | ||
59 | /* dequeue is not valid, we print a debug message there: */ | 64 | /* dequeue is not valid, we print a debug message there: */ |
@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = { | |||
66 | 71 | ||
67 | .load_balance = load_balance_idle, | 72 | .load_balance = load_balance_idle, |
68 | 73 | ||
74 | .set_curr_task = set_curr_task_idle, | ||
69 | .task_tick = task_tick_idle, | 75 | .task_tick = task_tick_idle, |
70 | /* no .task_new for idle tasks */ | 76 | /* no .task_new for idle tasks */ |
71 | }; | 77 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476a02d0..d0097a0634e5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Update the current task's runtime statistics. Skip current tasks that | 7 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 8 | * are not in our scheduling class. |
9 | */ | 9 | */ |
10 | static inline void update_curr_rt(struct rq *rq) | 10 | static void update_curr_rt(struct rq *rq) |
11 | { | 11 | { |
12 | struct task_struct *curr = rq->curr; | 12 | struct task_struct *curr = rq->curr; |
13 | u64 delta_exec; | 13 | u64 delta_exec; |
@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p) | |||
59 | } | 59 | } |
60 | 60 | ||
61 | static void | 61 | static void |
62 | yield_task_rt(struct rq *rq, struct task_struct *p) | 62 | yield_task_rt(struct rq *rq) |
63 | { | 63 | { |
64 | requeue_task_rt(rq, p); | 64 | requeue_task_rt(rq, rq->curr); |
65 | } | 65 | } |
66 | 66 | ||
67 | /* | 67 | /* |
@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
206 | if (--p->time_slice) | 206 | if (--p->time_slice) |
207 | return; | 207 | return; |
208 | 208 | ||
209 | p->time_slice = static_prio_timeslice(p->static_prio); | 209 | p->time_slice = DEF_TIMESLICE; |
210 | 210 | ||
211 | /* | 211 | /* |
212 | * Requeue to the end of queue if we are not the only element | 212 | * Requeue to the end of queue if we are not the only element |
@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
218 | } | 218 | } |
219 | } | 219 | } |
220 | 220 | ||
221 | static struct sched_class rt_sched_class __read_mostly = { | 221 | static void set_curr_task_rt(struct rq *rq) |
222 | { | ||
223 | struct task_struct *p = rq->curr; | ||
224 | |||
225 | p->se.exec_start = rq->clock; | ||
226 | } | ||
227 | |||
228 | const struct sched_class rt_sched_class = { | ||
229 | .next = &fair_sched_class, | ||
222 | .enqueue_task = enqueue_task_rt, | 230 | .enqueue_task = enqueue_task_rt, |
223 | .dequeue_task = dequeue_task_rt, | 231 | .dequeue_task = dequeue_task_rt, |
224 | .yield_task = yield_task_rt, | 232 | .yield_task = yield_task_rt, |
@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = { | |||
230 | 238 | ||
231 | .load_balance = load_balance_rt, | 239 | .load_balance = load_balance_rt, |
232 | 240 | ||
241 | .set_curr_task = set_curr_task_rt, | ||
233 | .task_tick = task_tick_rt, | 242 | .task_tick = task_tick_rt, |
234 | }; | 243 | }; |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94dda61e..1c084842c3e7 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
16 | struct rq *rq = cpu_rq(cpu); | 16 | struct rq *rq = cpu_rq(cpu); |
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | struct sched_domain *sd; | 18 | struct sched_domain *sd; |
19 | int dcnt = 0; | 19 | int dcount = 0; |
20 | #endif | 20 | #endif |
21 | 21 | ||
22 | /* runqueue-specific stats */ | 22 | /* runqueue-specific stats */ |
23 | seq_printf(seq, | 23 | seq_printf(seq, |
24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", | 24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", |
25 | cpu, rq->yld_both_empty, | 25 | cpu, rq->yld_both_empty, |
26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | 26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, |
27 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | 27 | rq->sched_switch, rq->sched_count, rq->sched_goidle, |
28 | rq->ttwu_cnt, rq->ttwu_local, | 28 | rq->ttwu_count, rq->ttwu_local, |
29 | rq->rq_sched_info.cpu_time, | 29 | rq->rq_sched_info.cpu_time, |
30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | 30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); |
31 | 31 | ||
32 | seq_printf(seq, "\n"); | 32 | seq_printf(seq, "\n"); |
33 | 33 | ||
@@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
39 | char mask_str[NR_CPUS]; | 39 | char mask_str[NR_CPUS]; |
40 | 40 | ||
41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); |
42 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
44 | itype++) { | 44 | itype++) { |
45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " | 45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
46 | "%lu", | 46 | "%lu", |
47 | sd->lb_cnt[itype], | 47 | sd->lb_count[itype], |
48 | sd->lb_balanced[itype], | 48 | sd->lb_balanced[itype], |
49 | sd->lb_failed[itype], | 49 | sd->lb_failed[itype], |
50 | sd->lb_imbalance[itype], | 50 | sd->lb_imbalance[itype], |
@@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
55 | } | 55 | } |
56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" | 56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
57 | " %lu %lu %lu\n", | 57 | " %lu %lu %lu\n", |
58 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 58 | sd->alb_count, sd->alb_failed, sd->alb_pushed, |
59 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 59 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, |
60 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 60 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, |
61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
62 | sd->ttwu_move_balance); | 62 | sd->ttwu_move_balance); |
63 | } | 63 | } |
@@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | |||
101 | { | 101 | { |
102 | if (rq) { | 102 | if (rq) { |
103 | rq->rq_sched_info.run_delay += delta; | 103 | rq->rq_sched_info.run_delay += delta; |
104 | rq->rq_sched_info.pcnt++; | 104 | rq->rq_sched_info.pcount++; |
105 | } | 105 | } |
106 | } | 106 | } |
107 | 107 | ||
@@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
129 | # define schedstat_set(var, val) do { } while (0) | 129 | # define schedstat_set(var, val) do { } while (0) |
130 | #endif | 130 | #endif |
131 | 131 | ||
132 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 132 | #ifdef CONFIG_SCHEDSTATS |
133 | /* | 133 | /* |
134 | * Called when a process is dequeued from the active array and given | 134 | * Called when a process is dequeued from the active array and given |
135 | * the cpu. We should note that with the exception of interactive | 135 | * the cpu. We should note that with the exception of interactive |
@@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
164 | sched_info_dequeued(t); | 164 | sched_info_dequeued(t); |
165 | t->sched_info.run_delay += delta; | 165 | t->sched_info.run_delay += delta; |
166 | t->sched_info.last_arrival = now; | 166 | t->sched_info.last_arrival = now; |
167 | t->sched_info.pcnt++; | 167 | t->sched_info.pcount++; |
168 | 168 | ||
169 | rq_sched_info_arrive(task_rq(t), delta); | 169 | rq_sched_info_arrive(task_rq(t), delta); |
170 | } | 170 | } |
@@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
233 | #else | 233 | #else |
234 | #define sched_info_queued(t) do { } while (0) | 234 | #define sched_info_queued(t) do { } while (0) |
235 | #define sched_info_switch(t, next) do { } while (0) | 235 | #define sched_info_switch(t, next) do { } while (0) |
236 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 236 | #endif /* CONFIG_SCHEDSTATS */ |
237 | 237 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6c97259e863e..ec14aa8ac51f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -222,14 +222,11 @@ static ctl_table kern_table[] = { | |||
222 | #ifdef CONFIG_SCHED_DEBUG | 222 | #ifdef CONFIG_SCHED_DEBUG |
223 | { | 223 | { |
224 | .ctl_name = CTL_UNNUMBERED, | 224 | .ctl_name = CTL_UNNUMBERED, |
225 | .procname = "sched_min_granularity_ns", | 225 | .procname = "sched_nr_latency", |
226 | .data = &sysctl_sched_min_granularity, | 226 | .data = &sysctl_sched_nr_latency, |
227 | .maxlen = sizeof(unsigned int), | 227 | .maxlen = sizeof(unsigned int), |
228 | .mode = 0644, | 228 | .mode = 0644, |
229 | .proc_handler = &proc_dointvec_minmax, | 229 | .proc_handler = &proc_dointvec, |
230 | .strategy = &sysctl_intvec, | ||
231 | .extra1 = &min_sched_granularity_ns, | ||
232 | .extra2 = &max_sched_granularity_ns, | ||
233 | }, | 230 | }, |
234 | { | 231 | { |
235 | .ctl_name = CTL_UNNUMBERED, | 232 | .ctl_name = CTL_UNNUMBERED, |
@@ -266,38 +263,24 @@ static ctl_table kern_table[] = { | |||
266 | }, | 263 | }, |
267 | { | 264 | { |
268 | .ctl_name = CTL_UNNUMBERED, | 265 | .ctl_name = CTL_UNNUMBERED, |
269 | .procname = "sched_stat_granularity_ns", | 266 | .procname = "sched_child_runs_first", |
270 | .data = &sysctl_sched_stat_granularity, | 267 | .data = &sysctl_sched_child_runs_first, |
271 | .maxlen = sizeof(unsigned int), | ||
272 | .mode = 0644, | ||
273 | .proc_handler = &proc_dointvec_minmax, | ||
274 | .strategy = &sysctl_intvec, | ||
275 | .extra1 = &min_wakeup_granularity_ns, | ||
276 | .extra2 = &max_wakeup_granularity_ns, | ||
277 | }, | ||
278 | { | ||
279 | .ctl_name = CTL_UNNUMBERED, | ||
280 | .procname = "sched_runtime_limit_ns", | ||
281 | .data = &sysctl_sched_runtime_limit, | ||
282 | .maxlen = sizeof(unsigned int), | 268 | .maxlen = sizeof(unsigned int), |
283 | .mode = 0644, | 269 | .mode = 0644, |
284 | .proc_handler = &proc_dointvec_minmax, | 270 | .proc_handler = &proc_dointvec, |
285 | .strategy = &sysctl_intvec, | ||
286 | .extra1 = &min_sched_granularity_ns, | ||
287 | .extra2 = &max_sched_granularity_ns, | ||
288 | }, | 271 | }, |
289 | { | 272 | { |
290 | .ctl_name = CTL_UNNUMBERED, | 273 | .ctl_name = CTL_UNNUMBERED, |
291 | .procname = "sched_child_runs_first", | 274 | .procname = "sched_features", |
292 | .data = &sysctl_sched_child_runs_first, | 275 | .data = &sysctl_sched_features, |
293 | .maxlen = sizeof(unsigned int), | 276 | .maxlen = sizeof(unsigned int), |
294 | .mode = 0644, | 277 | .mode = 0644, |
295 | .proc_handler = &proc_dointvec, | 278 | .proc_handler = &proc_dointvec, |
296 | }, | 279 | }, |
297 | { | 280 | { |
298 | .ctl_name = CTL_UNNUMBERED, | 281 | .ctl_name = CTL_UNNUMBERED, |
299 | .procname = "sched_features", | 282 | .procname = "sched_migration_cost", |
300 | .data = &sysctl_sched_features, | 283 | .data = &sysctl_sched_migration_cost, |
301 | .maxlen = sizeof(unsigned int), | 284 | .maxlen = sizeof(unsigned int), |
302 | .mode = 0644, | 285 | .mode = 0644, |
303 | .proc_handler = &proc_dointvec, | 286 | .proc_handler = &proc_dointvec, |
diff --git a/kernel/user.c b/kernel/user.c index 9ca2848fc356..f0e561e6d085 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -50,12 +50,16 @@ struct user_struct root_user = { | |||
50 | .uid_keyring = &root_user_keyring, | 50 | .uid_keyring = &root_user_keyring, |
51 | .session_keyring = &root_session_keyring, | 51 | .session_keyring = &root_session_keyring, |
52 | #endif | 52 | #endif |
53 | #ifdef CONFIG_FAIR_USER_SCHED | ||
54 | .tg = &init_task_group, | ||
55 | #endif | ||
53 | }; | 56 | }; |
54 | 57 | ||
55 | /* | 58 | /* |
56 | * These routines must be called with the uidhash spinlock held! | 59 | * These routines must be called with the uidhash spinlock held! |
57 | */ | 60 | */ |
58 | static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | 61 | static inline void uid_hash_insert(struct user_struct *up, |
62 | struct hlist_head *hashent) | ||
59 | { | 63 | { |
60 | hlist_add_head(&up->uidhash_node, hashent); | 64 | hlist_add_head(&up->uidhash_node, hashent); |
61 | } | 65 | } |
@@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up) | |||
65 | hlist_del_init(&up->uidhash_node); | 69 | hlist_del_init(&up->uidhash_node); |
66 | } | 70 | } |
67 | 71 | ||
68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 72 | static inline struct user_struct *uid_hash_find(uid_t uid, |
73 | struct hlist_head *hashent) | ||
69 | { | 74 | { |
70 | struct user_struct *user; | 75 | struct user_struct *user; |
71 | struct hlist_node *h; | 76 | struct hlist_node *h; |
72 | 77 | ||
73 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 78 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
74 | if(user->uid == uid) { | 79 | if (user->uid == uid) { |
75 | atomic_inc(&user->__count); | 80 | atomic_inc(&user->__count); |
76 | return user; | 81 | return user; |
77 | } | 82 | } |
@@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha | |||
80 | return NULL; | 85 | return NULL; |
81 | } | 86 | } |
82 | 87 | ||
88 | #ifdef CONFIG_FAIR_USER_SCHED | ||
89 | |||
90 | static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ | ||
91 | static DEFINE_MUTEX(uids_mutex); | ||
92 | |||
93 | static void sched_destroy_user(struct user_struct *up) | ||
94 | { | ||
95 | sched_destroy_group(up->tg); | ||
96 | } | ||
97 | |||
98 | static int sched_create_user(struct user_struct *up) | ||
99 | { | ||
100 | int rc = 0; | ||
101 | |||
102 | up->tg = sched_create_group(); | ||
103 | if (IS_ERR(up->tg)) | ||
104 | rc = -ENOMEM; | ||
105 | |||
106 | return rc; | ||
107 | } | ||
108 | |||
109 | static void sched_switch_user(struct task_struct *p) | ||
110 | { | ||
111 | sched_move_task(p); | ||
112 | } | ||
113 | |||
114 | static inline void uids_mutex_lock(void) | ||
115 | { | ||
116 | mutex_lock(&uids_mutex); | ||
117 | } | ||
118 | |||
119 | static inline void uids_mutex_unlock(void) | ||
120 | { | ||
121 | mutex_unlock(&uids_mutex); | ||
122 | } | ||
123 | |||
124 | /* return cpu shares held by the user */ | ||
125 | ssize_t cpu_shares_show(struct kset *kset, char *buffer) | ||
126 | { | ||
127 | struct user_struct *up = container_of(kset, struct user_struct, kset); | ||
128 | |||
129 | return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); | ||
130 | } | ||
131 | |||
132 | /* modify cpu shares held by the user */ | ||
133 | ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) | ||
134 | { | ||
135 | struct user_struct *up = container_of(kset, struct user_struct, kset); | ||
136 | unsigned long shares; | ||
137 | int rc; | ||
138 | |||
139 | sscanf(buffer, "%lu", &shares); | ||
140 | |||
141 | rc = sched_group_set_shares(up->tg, shares); | ||
142 | |||
143 | return (rc ? rc : size); | ||
144 | } | ||
145 | |||
146 | static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) | ||
147 | { | ||
148 | sa->attr.name = name; | ||
149 | sa->attr.mode = mode; | ||
150 | sa->show = cpu_shares_show; | ||
151 | sa->store = cpu_shares_store; | ||
152 | } | ||
153 | |||
154 | /* Create "/sys/kernel/uids/<uid>" directory and | ||
155 | * "/sys/kernel/uids/<uid>/cpu_share" file for this user. | ||
156 | */ | ||
157 | static int user_kobject_create(struct user_struct *up) | ||
158 | { | ||
159 | struct kset *kset = &up->kset; | ||
160 | struct kobject *kobj = &kset->kobj; | ||
161 | int error; | ||
162 | |||
163 | memset(kset, 0, sizeof(struct kset)); | ||
164 | kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ | ||
165 | kobject_set_name(kobj, "%d", up->uid); | ||
166 | kset_init(kset); | ||
167 | user_attr_init(&up->user_attr, "cpu_share", 0644); | ||
168 | |||
169 | error = kobject_add(kobj); | ||
170 | if (error) | ||
171 | goto done; | ||
172 | |||
173 | error = sysfs_create_file(kobj, &up->user_attr.attr); | ||
174 | if (error) | ||
175 | kobject_del(kobj); | ||
176 | |||
177 | kobject_uevent(kobj, KOBJ_ADD); | ||
178 | |||
179 | done: | ||
180 | return error; | ||
181 | } | ||
182 | |||
183 | /* create these in sysfs filesystem: | ||
184 | * "/sys/kernel/uids" directory | ||
185 | * "/sys/kernel/uids/0" directory (for root user) | ||
186 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | ||
187 | */ | ||
188 | int __init uids_kobject_init(void) | ||
189 | { | ||
190 | int error; | ||
191 | |||
192 | /* create under /sys/kernel dir */ | ||
193 | uids_kobject.parent = &kernel_subsys.kobj; | ||
194 | uids_kobject.kset = &kernel_subsys; | ||
195 | kobject_set_name(&uids_kobject, "uids"); | ||
196 | kobject_init(&uids_kobject); | ||
197 | |||
198 | error = kobject_add(&uids_kobject); | ||
199 | if (!error) | ||
200 | error = user_kobject_create(&root_user); | ||
201 | |||
202 | return error; | ||
203 | } | ||
204 | |||
205 | /* work function to remove sysfs directory for a user and free up | ||
206 | * corresponding structures. | ||
207 | */ | ||
208 | static void remove_user_sysfs_dir(struct work_struct *w) | ||
209 | { | ||
210 | struct user_struct *up = container_of(w, struct user_struct, work); | ||
211 | struct kobject *kobj = &up->kset.kobj; | ||
212 | unsigned long flags; | ||
213 | int remove_user = 0; | ||
214 | |||
215 | /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() | ||
216 | * atomic. | ||
217 | */ | ||
218 | uids_mutex_lock(); | ||
219 | |||
220 | local_irq_save(flags); | ||
221 | |||
222 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | ||
223 | uid_hash_remove(up); | ||
224 | remove_user = 1; | ||
225 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
226 | } else { | ||
227 | local_irq_restore(flags); | ||
228 | } | ||
229 | |||
230 | if (!remove_user) | ||
231 | goto done; | ||
232 | |||
233 | sysfs_remove_file(kobj, &up->user_attr.attr); | ||
234 | kobject_uevent(kobj, KOBJ_REMOVE); | ||
235 | kobject_del(kobj); | ||
236 | |||
237 | sched_destroy_user(up); | ||
238 | key_put(up->uid_keyring); | ||
239 | key_put(up->session_keyring); | ||
240 | kmem_cache_free(uid_cachep, up); | ||
241 | |||
242 | done: | ||
243 | uids_mutex_unlock(); | ||
244 | } | ||
245 | |||
246 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
247 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
248 | * upon function exit. | ||
249 | */ | ||
250 | static inline void free_user(struct user_struct *up, unsigned long flags) | ||
251 | { | ||
252 | /* restore back the count */ | ||
253 | atomic_inc(&up->__count); | ||
254 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
255 | |||
256 | INIT_WORK(&up->work, remove_user_sysfs_dir); | ||
257 | schedule_work(&up->work); | ||
258 | } | ||
259 | |||
260 | #else /* CONFIG_FAIR_USER_SCHED */ | ||
261 | |||
262 | static void sched_destroy_user(struct user_struct *up) { } | ||
263 | static int sched_create_user(struct user_struct *up) { return 0; } | ||
264 | static void sched_switch_user(struct task_struct *p) { } | ||
265 | static inline int user_kobject_create(struct user_struct *up) { return 0; } | ||
266 | static inline void uids_mutex_lock(void) { } | ||
267 | static inline void uids_mutex_unlock(void) { } | ||
268 | |||
269 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
270 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
271 | * upon function exit. | ||
272 | */ | ||
273 | static inline void free_user(struct user_struct *up, unsigned long flags) | ||
274 | { | ||
275 | uid_hash_remove(up); | ||
276 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
277 | sched_destroy_user(up); | ||
278 | key_put(up->uid_keyring); | ||
279 | key_put(up->session_keyring); | ||
280 | kmem_cache_free(uid_cachep, up); | ||
281 | } | ||
282 | |||
283 | #endif /* CONFIG_FAIR_USER_SCHED */ | ||
284 | |||
83 | /* | 285 | /* |
84 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | 286 | * Locate the user_struct for the passed UID. If found, take a ref on it. The |
85 | * caller must undo that ref with free_uid(). | 287 | * caller must undo that ref with free_uid(). |
@@ -106,15 +308,10 @@ void free_uid(struct user_struct *up) | |||
106 | return; | 308 | return; |
107 | 309 | ||
108 | local_irq_save(flags); | 310 | local_irq_save(flags); |
109 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | 311 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) |
110 | uid_hash_remove(up); | 312 | free_user(up, flags); |
111 | spin_unlock_irqrestore(&uidhash_lock, flags); | 313 | else |
112 | key_put(up->uid_keyring); | ||
113 | key_put(up->session_keyring); | ||
114 | kmem_cache_free(uid_cachep, up); | ||
115 | } else { | ||
116 | local_irq_restore(flags); | 314 | local_irq_restore(flags); |
117 | } | ||
118 | } | 315 | } |
119 | 316 | ||
120 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 317 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
@@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
122 | struct hlist_head *hashent = uidhashentry(ns, uid); | 319 | struct hlist_head *hashent = uidhashentry(ns, uid); |
123 | struct user_struct *up; | 320 | struct user_struct *up; |
124 | 321 | ||
322 | /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() | ||
323 | * atomic. | ||
324 | */ | ||
325 | uids_mutex_lock(); | ||
326 | |||
125 | spin_lock_irq(&uidhash_lock); | 327 | spin_lock_irq(&uidhash_lock); |
126 | up = uid_hash_find(uid, hashent); | 328 | up = uid_hash_find(uid, hashent); |
127 | spin_unlock_irq(&uidhash_lock); | 329 | spin_unlock_irq(&uidhash_lock); |
@@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
150 | return NULL; | 352 | return NULL; |
151 | } | 353 | } |
152 | 354 | ||
355 | if (sched_create_user(new) < 0) { | ||
356 | key_put(new->uid_keyring); | ||
357 | key_put(new->session_keyring); | ||
358 | kmem_cache_free(uid_cachep, new); | ||
359 | return NULL; | ||
360 | } | ||
361 | |||
362 | if (user_kobject_create(new)) { | ||
363 | sched_destroy_user(new); | ||
364 | key_put(new->uid_keyring); | ||
365 | key_put(new->session_keyring); | ||
366 | kmem_cache_free(uid_cachep, new); | ||
367 | uids_mutex_unlock(); | ||
368 | return NULL; | ||
369 | } | ||
370 | |||
153 | /* | 371 | /* |
154 | * Before adding this, check whether we raced | 372 | * Before adding this, check whether we raced |
155 | * on adding the same user already.. | 373 | * on adding the same user already.. |
@@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
157 | spin_lock_irq(&uidhash_lock); | 375 | spin_lock_irq(&uidhash_lock); |
158 | up = uid_hash_find(uid, hashent); | 376 | up = uid_hash_find(uid, hashent); |
159 | if (up) { | 377 | if (up) { |
378 | /* This case is not possible when CONFIG_FAIR_USER_SCHED | ||
379 | * is defined, since we serialize alloc_uid() using | ||
380 | * uids_mutex. Hence no need to call | ||
381 | * sched_destroy_user() or remove_user_sysfs_dir(). | ||
382 | */ | ||
160 | key_put(new->uid_keyring); | 383 | key_put(new->uid_keyring); |
161 | key_put(new->session_keyring); | 384 | key_put(new->session_keyring); |
162 | kmem_cache_free(uid_cachep, new); | 385 | kmem_cache_free(uid_cachep, new); |
@@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
167 | spin_unlock_irq(&uidhash_lock); | 390 | spin_unlock_irq(&uidhash_lock); |
168 | 391 | ||
169 | } | 392 | } |
393 | |||
394 | uids_mutex_unlock(); | ||
395 | |||
170 | return up; | 396 | return up; |
171 | } | 397 | } |
172 | 398 | ||
@@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user) | |||
184 | atomic_dec(&old_user->processes); | 410 | atomic_dec(&old_user->processes); |
185 | switch_uid_keyring(new_user); | 411 | switch_uid_keyring(new_user); |
186 | current->user = new_user; | 412 | current->user = new_user; |
413 | sched_switch_user(current); | ||
187 | 414 | ||
188 | /* | 415 | /* |
189 | * We need to synchronize with __sigqueue_alloc() | 416 | * We need to synchronize with __sigqueue_alloc() |
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2b57eaf66abc..6996cba5aa96 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk) | |||
334 | read_lock(&sk->sk_callback_lock); | 334 | read_lock(&sk->sk_callback_lock); |
335 | if (unix_writable(sk)) { | 335 | if (unix_writable(sk)) { |
336 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 336 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
337 | wake_up_interruptible(sk->sk_sleep); | 337 | wake_up_interruptible_sync(sk->sk_sleep); |
338 | sk_wake_async(sk, 2, POLL_OUT); | 338 | sk_wake_async(sk, 2, POLL_OUT); |
339 | } | 339 | } |
340 | read_unlock(&sk->sk_callback_lock); | 340 | read_unlock(&sk->sk_callback_lock); |
@@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
1639 | if (!skb) | 1639 | if (!skb) |
1640 | goto out_unlock; | 1640 | goto out_unlock; |
1641 | 1641 | ||
1642 | wake_up_interruptible(&u->peer_wait); | 1642 | wake_up_interruptible_sync(&u->peer_wait); |
1643 | 1643 | ||
1644 | if (msg->msg_name) | 1644 | if (msg->msg_name) |
1645 | unix_copy_addr(msg, skb->sk); | 1645 | unix_copy_addr(msg, skb->sk); |