| -rw-r--r-- | Documentation/sched-design-CFS.txt | 67 | ||||
| -rw-r--r-- | arch/i386/Kconfig | 11 | ||||
| -rw-r--r-- | drivers/kvm/kvm.h | 10 | ||||
| -rw-r--r-- | drivers/kvm/kvm_main.c | 2 | ||||
| -rw-r--r-- | fs/pipe.c | 9 | ||||
| -rw-r--r-- | fs/proc/array.c | 17 | ||||
| -rw-r--r-- | fs/proc/base.c | 2 | ||||
| -rw-r--r-- | fs/proc/proc_misc.c | 15 | ||||
| -rw-r--r-- | include/linux/kernel_stat.h | 1 | ||||
| -rw-r--r-- | include/linux/sched.h | 99 | ||||
| -rw-r--r-- | include/linux/topology.h | 5 | ||||
| -rw-r--r-- | init/Kconfig | 21 | ||||
| -rw-r--r-- | kernel/delayacct.c | 2 | ||||
| -rw-r--r-- | kernel/exit.c | 6 | ||||
| -rw-r--r-- | kernel/fork.c | 3 | ||||
| -rw-r--r-- | kernel/ksysfs.c | 8 | ||||
| -rw-r--r-- | kernel/sched.c | 1444 | ||||
| -rw-r--r-- | kernel/sched_debug.c | 282 | ||||
| -rw-r--r-- | kernel/sched_fair.c | 811 | ||||
| -rw-r--r-- | kernel/sched_idletask.c | 8 | ||||
| -rw-r--r-- | kernel/sched_rt.c | 19 | ||||
| -rw-r--r-- | kernel/sched_stats.h | 28 | ||||
| -rw-r--r-- | kernel/sysctl.c | 37 | ||||
| -rw-r--r-- | kernel/user.c | 249 | ||||
| -rw-r--r-- | net/unix/af_unix.c | 4 |
25 files changed, 1872 insertions, 1288 deletions
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 84901e7c0508..88bcb8767335 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt | |||
| @@ -117,3 +117,70 @@ Some implementation details: | |||
| 117 | iterators of the scheduling modules are used. The balancing code got | 117 | iterators of the scheduling modules are used. The balancing code got |
| 118 | quite a bit simpler as a result. | 118 | quite a bit simpler as a result. |
| 119 | 119 | ||
| 120 | |||
| 121 | Group scheduler extension to CFS | ||
| 122 | ================================ | ||
| 123 | |||
| 124 | Normally the scheduler operates on individual tasks and strives to provide | ||
| 125 | fair CPU time to each task. Sometimes, it may be desirable to group tasks | ||
| 126 | and provide fair CPU time to each such task group. For example, it may | ||
| 127 | be desirable to first provide fair CPU time to each user on the system | ||
| 128 | and then to each task belonging to a user. | ||
| 129 | |||
| 130 | CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets | ||
| 131 | SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such | ||
| 132 | groups. At present, there are two (mutually exclusive) mechanisms to group | ||
| 133 | tasks for CPU bandwidth control purposes: | ||
| 134 | |||
| 135 | - Based on user id (CONFIG_FAIR_USER_SCHED) | ||
| 136 | In this option, tasks are grouped according to their user id. | ||
| 137 | - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) | ||
| 138 | This option lets the administrator create arbitrary groups | ||
| 139 | of tasks, using the "cgroup" pseudo filesystem. See | ||
| 140 | Documentation/cgroups.txt for more information about this | ||
| 141 | filesystem. | ||
| 142 | |||
| 143 | Only one of these options can be chosen to group tasks; they cannot both be enabled. | ||
| 144 | |||
| 145 | Group scheduler tunables: | ||
| 146 | |||
| 147 | When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for | ||
| 148 | each new user and a "cpu_share" file is added in that directory. | ||
| 149 | |||
| 150 | # cd /sys/kernel/uids | ||
| 151 | # cat 512/cpu_share # Display user 512's CPU share | ||
| 152 | 1024 | ||
| 153 | # echo 2048 > 512/cpu_share # Modify user 512's CPU share | ||
| 154 | # cat 512/cpu_share # Display user 512's CPU share | ||
| 155 | 2048 | ||
| 156 | # | ||
| 157 | |||
| 158 | CPU bandwidth between two users is divided in the ratio of their CPU shares. | ||
| 159 | For example: if you would like user "root" to get twice the bandwidth of user | ||
| 160 | "guest", then set the cpu_share for both users such that "root"'s | ||
| 161 | cpu_share is twice "guest"'s cpu_share. | ||
| 162 | |||
| 163 | |||
| 164 | When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created | ||
| 165 | for each group created using the pseudo filesystem. See the example steps | ||
| 166 | below, which create task groups and modify their CPU share using the "cgroup" | ||
| 167 | pseudo filesystem: | ||
| 168 | |||
| 169 | # mkdir /dev/cpuctl | ||
| 170 | # mount -t cgroup -ocpu none /dev/cpuctl | ||
| 171 | # cd /dev/cpuctl | ||
| 172 | |||
| 173 | # mkdir multimedia # create "multimedia" group of tasks | ||
| 174 | # mkdir browser # create "browser" group of tasks | ||
| 175 | |||
| 176 | # #Configure the multimedia group to receive twice the CPU bandwidth | ||
| 177 | # #that of the browser group | ||
| 178 | |||
| 179 | # echo 2048 > multimedia/cpu.shares | ||
| 180 | # echo 1024 > browser/cpu.shares | ||
| 181 | |||
| 182 | # firefox & # Launch firefox and move it to "browser" group | ||
| 183 | # echo <firefox_pid> > browser/tasks | ||
| 184 | |||
| 185 | # #Launch gmplayer (or your favourite movie player) | ||
| 186 | # echo <movie_player_pid> > multimedia/tasks | ||
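The /sys/kernel/uids/<uid>/cpu_share interface described above can also be driven from a program. A minimal C sketch follows; it assumes only the sysfs layout documented in this file, and the uid (512) and share value (2048) are the same example numbers used above, not anything mandated by the patch.

/* Read/update a user's cpu_share via sysfs (illustrative only). */
#include <stdio.h>

static int set_cpu_share(unsigned int uid, unsigned long share)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/uids/%u/cpu_share", uid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lu\n", share);
	return fclose(f);
}

int main(void)
{
	/* Example: give uid 512 twice the default share of 1024. */
	return set_cpu_share(512, 2048) ? 1 : 0;
}

Compile with any C compiler and run as root on a kernel built with CONFIG_FAIR_USER_SCHED.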
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f1486f8a3e6d..bf9aafad4978 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig | |||
| @@ -214,6 +214,17 @@ config X86_ES7000 | |||
| 214 | 214 | ||
| 215 | endchoice | 215 | endchoice |
| 216 | 216 | ||
| 217 | config SCHED_NO_NO_OMIT_FRAME_POINTER | ||
| 218 | bool "Single-depth WCHAN output" | ||
| 219 | default y | ||
| 220 | help | ||
| 221 | Calculate simpler /proc/<PID>/wchan values. If this option | ||
| 222 | is disabled then wchan values will recurse back to the | ||
| 223 | caller function. This provides more accurate wchan values, | ||
| 224 | at the expense of slightly more scheduling overhead. | ||
| 225 | |||
| 226 | If in doubt, say "Y". | ||
| 227 | |||
| 217 | config PARAVIRT | 228 | config PARAVIRT |
| 218 | bool "Paravirtualization support (EXPERIMENTAL)" | 229 | bool "Paravirtualization support (EXPERIMENTAL)" |
| 219 | depends on EXPERIMENTAL | 230 | depends on EXPERIMENTAL |
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index ad0813843adc..3b0bc4bda5f2 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h | |||
| @@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); | |||
| 624 | 624 | ||
| 625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); | 625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); |
| 626 | 626 | ||
| 627 | static inline void kvm_guest_enter(void) | ||
| 628 | { | ||
| 629 | current->flags |= PF_VCPU; | ||
| 630 | } | ||
| 631 | |||
| 632 | static inline void kvm_guest_exit(void) | ||
| 633 | { | ||
| 634 | current->flags &= ~PF_VCPU; | ||
| 635 | } | ||
| 636 | |||
| 627 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 637 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
| 628 | u32 error_code) | 638 | u32 error_code) |
| 629 | { | 639 | { |
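kvm_guest_enter()/kvm_guest_exit() only set and clear PF_VCPU around guest execution; the code that consumes the flag lives in the scheduler tick accounting, which is not part of this excerpt. The following is a hedged sketch of such a consumer, assuming a tick observed with PF_VCPU set is charged both to the task's gtime and to the per-cpu cpustat.guest counter (both fields are added later in this diff); the function name and exact bookkeeping are illustrative, not the patch's actual code.

/* Illustrative sketch only -- not code added by this patch. */
static void account_guest_tick(struct task_struct *p, cputime_t cputime)
{
	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
	cputime64_t tmp = cputime_to_cputime64(cputime);

	/* Guest time is charged to the task as user time and as gtime. */
	p->utime = cputime_add(p->utime, cputime);
	p->gtime = cputime_add(p->gtime, cputime);

	/* And to the cpu-wide counters exported through /proc/stat. */
	cpustat->user = cputime64_add(cpustat->user, tmp);
	cpustat->guest = cputime64_add(cpustat->guest, tmp);
}

The tick path would call something like this instead of the plain user-time accounting whenever p->flags & PF_VCPU is set.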
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 353e58527d15..af2d288c881d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c | |||
| @@ -2046,6 +2046,7 @@ again: | |||
| 2046 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | 2046 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); |
| 2047 | 2047 | ||
| 2048 | vcpu->guest_mode = 1; | 2048 | vcpu->guest_mode = 1; |
| 2049 | kvm_guest_enter(); | ||
| 2049 | 2050 | ||
| 2050 | if (vcpu->requests) | 2051 | if (vcpu->requests) |
| 2051 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) | 2052 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) |
| @@ -2053,6 +2054,7 @@ again: | |||
| 2053 | 2054 | ||
| 2054 | kvm_x86_ops->run(vcpu, kvm_run); | 2055 | kvm_x86_ops->run(vcpu, kvm_run); |
| 2055 | 2056 | ||
| 2057 | kvm_guest_exit(); | ||
| 2056 | vcpu->guest_mode = 0; | 2058 | vcpu->guest_mode = 0; |
| 2057 | local_irq_enable(); | 2059 | local_irq_enable(); |
| 2058 | 2060 | ||
diff --git a/fs/pipe.c b/fs/pipe.c --- a/fs/pipe.c +++ b/fs/pipe.c | |||
| @@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe) | |||
| 45 | * Pipes are system-local resources, so sleeping on them | 45 | * Pipes are system-local resources, so sleeping on them |
| 46 | * is considered a noninteractive wait: | 46 | * is considered a noninteractive wait: |
| 47 | */ | 47 | */ |
| 48 | prepare_to_wait(&pipe->wait, &wait, | 48 | prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); |
| 49 | TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); | ||
| 50 | if (pipe->inode) | 49 | if (pipe->inode) |
| 51 | mutex_unlock(&pipe->inode->i_mutex); | 50 | mutex_unlock(&pipe->inode->i_mutex); |
| 52 | schedule(); | 51 | schedule(); |
| @@ -383,7 +382,7 @@ redo: | |||
| 383 | 382 | ||
| 384 | /* Signal writers asynchronously that there is more room. */ | 383 | /* Signal writers asynchronously that there is more room. */ |
| 385 | if (do_wakeup) { | 384 | if (do_wakeup) { |
| 386 | wake_up_interruptible(&pipe->wait); | 385 | wake_up_interruptible_sync(&pipe->wait); |
| 387 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 386 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 388 | } | 387 | } |
| 389 | if (ret > 0) | 388 | if (ret > 0) |
| @@ -556,7 +555,7 @@ redo2: | |||
| 556 | out: | 555 | out: |
| 557 | mutex_unlock(&inode->i_mutex); | 556 | mutex_unlock(&inode->i_mutex); |
| 558 | if (do_wakeup) { | 557 | if (do_wakeup) { |
| 559 | wake_up_interruptible(&pipe->wait); | 558 | wake_up_interruptible_sync(&pipe->wait); |
| 560 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 559 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 561 | } | 560 | } |
| 562 | if (ret > 0) | 561 | if (ret > 0) |
| @@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw) | |||
| 650 | if (!pipe->readers && !pipe->writers) { | 649 | if (!pipe->readers && !pipe->writers) { |
| 651 | free_pipe_info(inode); | 650 | free_pipe_info(inode); |
| 652 | } else { | 651 | } else { |
| 653 | wake_up_interruptible(&pipe->wait); | 652 | wake_up_interruptible_sync(&pipe->wait); |
| 654 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 653 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 655 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 654 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 656 | } | 655 | } |
diff --git a/fs/proc/array.c b/fs/proc/array.c index ee4814dd98f9..27b59f5f3bd1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
| @@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p) | |||
| 370 | } | 370 | } |
| 371 | #endif | 371 | #endif |
| 372 | 372 | ||
| 373 | static cputime_t task_gtime(struct task_struct *p) | ||
| 374 | { | ||
| 375 | return p->gtime; | ||
| 376 | } | ||
| 377 | |||
| 373 | static int do_task_stat(struct task_struct *task, char *buffer, int whole) | 378 | static int do_task_stat(struct task_struct *task, char *buffer, int whole) |
| 374 | { | 379 | { |
| 375 | unsigned long vsize, eip, esp, wchan = ~0UL; | 380 | unsigned long vsize, eip, esp, wchan = ~0UL; |
| @@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 385 | unsigned long cmin_flt = 0, cmaj_flt = 0; | 390 | unsigned long cmin_flt = 0, cmaj_flt = 0; |
| 386 | unsigned long min_flt = 0, maj_flt = 0; | 391 | unsigned long min_flt = 0, maj_flt = 0; |
| 387 | cputime_t cutime, cstime, utime, stime; | 392 | cputime_t cutime, cstime, utime, stime; |
| 393 | cputime_t cgtime, gtime; | ||
| 388 | unsigned long rsslim = 0; | 394 | unsigned long rsslim = 0; |
| 389 | char tcomm[sizeof(task->comm)]; | 395 | char tcomm[sizeof(task->comm)]; |
| 390 | unsigned long flags; | 396 | unsigned long flags; |
| @@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 403 | sigemptyset(&sigign); | 409 | sigemptyset(&sigign); |
| 404 | sigemptyset(&sigcatch); | 410 | sigemptyset(&sigcatch); |
| 405 | cutime = cstime = utime = stime = cputime_zero; | 411 | cutime = cstime = utime = stime = cputime_zero; |
| 412 | cgtime = gtime = cputime_zero; | ||
| 406 | 413 | ||
| 407 | rcu_read_lock(); | 414 | rcu_read_lock(); |
| 408 | if (lock_task_sighand(task, &flags)) { | 415 | if (lock_task_sighand(task, &flags)) { |
| @@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 420 | cmaj_flt = sig->cmaj_flt; | 427 | cmaj_flt = sig->cmaj_flt; |
| 421 | cutime = sig->cutime; | 428 | cutime = sig->cutime; |
| 422 | cstime = sig->cstime; | 429 | cstime = sig->cstime; |
| 430 | cgtime = sig->cgtime; | ||
| 423 | rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; | 431 | rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; |
| 424 | 432 | ||
| 425 | /* add up live thread stats at the group level */ | 433 | /* add up live thread stats at the group level */ |
| @@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 430 | maj_flt += t->maj_flt; | 438 | maj_flt += t->maj_flt; |
| 431 | utime = cputime_add(utime, task_utime(t)); | 439 | utime = cputime_add(utime, task_utime(t)); |
| 432 | stime = cputime_add(stime, task_stime(t)); | 440 | stime = cputime_add(stime, task_stime(t)); |
| 441 | gtime = cputime_add(gtime, task_gtime(t)); | ||
| 433 | t = next_thread(t); | 442 | t = next_thread(t); |
| 434 | } while (t != task); | 443 | } while (t != task); |
| 435 | 444 | ||
| @@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 437 | maj_flt += sig->maj_flt; | 446 | maj_flt += sig->maj_flt; |
| 438 | utime = cputime_add(utime, sig->utime); | 447 | utime = cputime_add(utime, sig->utime); |
| 439 | stime = cputime_add(stime, sig->stime); | 448 | stime = cputime_add(stime, sig->stime); |
| 449 | gtime = cputime_add(gtime, sig->gtime); | ||
| 440 | } | 450 | } |
| 441 | 451 | ||
| 442 | sid = signal_session(sig); | 452 | sid = signal_session(sig); |
| @@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 454 | maj_flt = task->maj_flt; | 464 | maj_flt = task->maj_flt; |
| 455 | utime = task_utime(task); | 465 | utime = task_utime(task); |
| 456 | stime = task_stime(task); | 466 | stime = task_stime(task); |
| 467 | gtime = task_gtime(task); | ||
| 457 | } | 468 | } |
| 458 | 469 | ||
| 459 | /* scale priority and nice values from timeslices to -20..20 */ | 470 | /* scale priority and nice values from timeslices to -20..20 */ |
| @@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 471 | 482 | ||
| 472 | res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ | 483 | res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ |
| 473 | %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ | 484 | %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ |
| 474 | %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", | 485 | %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", |
| 475 | task->pid, | 486 | task->pid, |
| 476 | tcomm, | 487 | tcomm, |
| 477 | state, | 488 | state, |
| @@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) | |||
| 516 | task_cpu(task), | 527 | task_cpu(task), |
| 517 | task->rt_priority, | 528 | task->rt_priority, |
| 518 | task->policy, | 529 | task->policy, |
| 519 | (unsigned long long)delayacct_blkio_ticks(task)); | 530 | (unsigned long long)delayacct_blkio_ticks(task), |
| 531 | cputime_to_clock_t(gtime), | ||
| 532 | cputime_to_clock_t(cgtime)); | ||
| 520 | if (mm) | 533 | if (mm) |
| 521 | mmput(mm); | 534 | mmput(mm); |
| 522 | return res; | 535 | return res; |
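With the format-string change above, guest time (gtime) and children's guest time (cgtime) become the last two fields of /proc/<pid>/stat, printed in clock ticks via cputime_to_clock_t(). A small userspace sketch that reads them back, assuming only that they remain the final two space-separated fields as in the format above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
	char buf[4096], *p;
	unsigned long gtime;
	long cgtime;
	FILE *f;

	snprintf(buf, sizeof(buf), "/proc/%s/stat", argc > 1 ? argv[1] : "self");
	f = fopen(buf, "r");
	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);

	/* cgtime (%ld) is the last field, gtime (%lu) the one before it. */
	p = strrchr(buf, ' ');
	if (!p)
		return 1;
	cgtime = strtol(p, NULL, 10);
	*p = '\0';
	p = strrchr(buf, ' ');
	if (!p)
		return 1;
	gtime = strtoul(p, NULL, 10);

	printf("gtime=%lu ticks, cgtime=%ld ticks\n", gtime, cgtime);
	return 0;
}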
diff --git a/fs/proc/base.c b/fs/proc/base.c index 19489b0d5554..e5d0953d4db1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) | |||
| 304 | return sprintf(buffer, "%llu %llu %lu\n", | 304 | return sprintf(buffer, "%llu %llu %lu\n", |
| 305 | task->sched_info.cpu_time, | 305 | task->sched_info.cpu_time, |
| 306 | task->sched_info.run_delay, | 306 | task->sched_info.run_delay, |
| 307 | task->sched_info.pcnt); | 307 | task->sched_info.pcount); |
| 308 | } | 308 | } |
| 309 | #endif | 309 | #endif |
| 310 | 310 | ||
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index bee251cb87c8..b872a01ad3af 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c | |||
| @@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 443 | int i; | 443 | int i; |
| 444 | unsigned long jif; | 444 | unsigned long jif; |
| 445 | cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; | 445 | cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; |
| 446 | cputime64_t guest; | ||
| 446 | u64 sum = 0; | 447 | u64 sum = 0; |
| 447 | struct timespec boottime; | 448 | struct timespec boottime; |
| 448 | unsigned int *per_irq_sum; | 449 | unsigned int *per_irq_sum; |
| @@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 453 | 454 | ||
| 454 | user = nice = system = idle = iowait = | 455 | user = nice = system = idle = iowait = |
| 455 | irq = softirq = steal = cputime64_zero; | 456 | irq = softirq = steal = cputime64_zero; |
| 457 | guest = cputime64_zero; | ||
| 456 | getboottime(&boottime); | 458 | getboottime(&boottime); |
| 457 | jif = boottime.tv_sec; | 459 | jif = boottime.tv_sec; |
| 458 | 460 | ||
| @@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 467 | irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); | 469 | irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); |
| 468 | softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); | 470 | softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); |
| 469 | steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); | 471 | steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); |
| 472 | guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); | ||
| 470 | for (j = 0; j < NR_IRQS; j++) { | 473 | for (j = 0; j < NR_IRQS; j++) { |
| 471 | unsigned int temp = kstat_cpu(i).irqs[j]; | 474 | unsigned int temp = kstat_cpu(i).irqs[j]; |
| 472 | sum += temp; | 475 | sum += temp; |
| @@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 474 | } | 477 | } |
| 475 | } | 478 | } |
| 476 | 479 | ||
| 477 | seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", | 480 | seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", |
| 478 | (unsigned long long)cputime64_to_clock_t(user), | 481 | (unsigned long long)cputime64_to_clock_t(user), |
| 479 | (unsigned long long)cputime64_to_clock_t(nice), | 482 | (unsigned long long)cputime64_to_clock_t(nice), |
| 480 | (unsigned long long)cputime64_to_clock_t(system), | 483 | (unsigned long long)cputime64_to_clock_t(system), |
| @@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 482 | (unsigned long long)cputime64_to_clock_t(iowait), | 485 | (unsigned long long)cputime64_to_clock_t(iowait), |
| 483 | (unsigned long long)cputime64_to_clock_t(irq), | 486 | (unsigned long long)cputime64_to_clock_t(irq), |
| 484 | (unsigned long long)cputime64_to_clock_t(softirq), | 487 | (unsigned long long)cputime64_to_clock_t(softirq), |
| 485 | (unsigned long long)cputime64_to_clock_t(steal)); | 488 | (unsigned long long)cputime64_to_clock_t(steal), |
| 489 | (unsigned long long)cputime64_to_clock_t(guest)); | ||
| 486 | for_each_online_cpu(i) { | 490 | for_each_online_cpu(i) { |
| 487 | 491 | ||
| 488 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ | 492 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ |
| @@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 494 | irq = kstat_cpu(i).cpustat.irq; | 498 | irq = kstat_cpu(i).cpustat.irq; |
| 495 | softirq = kstat_cpu(i).cpustat.softirq; | 499 | softirq = kstat_cpu(i).cpustat.softirq; |
| 496 | steal = kstat_cpu(i).cpustat.steal; | 500 | steal = kstat_cpu(i).cpustat.steal; |
| 497 | seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", | 501 | guest = kstat_cpu(i).cpustat.guest; |
| 502 | seq_printf(p, | ||
| 503 | "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", | ||
| 498 | i, | 504 | i, |
| 499 | (unsigned long long)cputime64_to_clock_t(user), | 505 | (unsigned long long)cputime64_to_clock_t(user), |
| 500 | (unsigned long long)cputime64_to_clock_t(nice), | 506 | (unsigned long long)cputime64_to_clock_t(nice), |
| @@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 503 | (unsigned long long)cputime64_to_clock_t(iowait), | 509 | (unsigned long long)cputime64_to_clock_t(iowait), |
| 504 | (unsigned long long)cputime64_to_clock_t(irq), | 510 | (unsigned long long)cputime64_to_clock_t(irq), |
| 505 | (unsigned long long)cputime64_to_clock_t(softirq), | 511 | (unsigned long long)cputime64_to_clock_t(softirq), |
| 506 | (unsigned long long)cputime64_to_clock_t(steal)); | 512 | (unsigned long long)cputime64_to_clock_t(steal), |
| 513 | (unsigned long long)cputime64_to_clock_t(guest)); | ||
| 507 | } | 514 | } |
| 508 | seq_printf(p, "intr %llu", (unsigned long long)sum); | 515 | seq_printf(p, "intr %llu", (unsigned long long)sum); |
| 509 | 516 | ||
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 43e895f1cabe..12bf44f083f5 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h | |||
| @@ -23,6 +23,7 @@ struct cpu_usage_stat { | |||
| 23 | cputime64_t idle; | 23 | cputime64_t idle; |
| 24 | cputime64_t iowait; | 24 | cputime64_t iowait; |
| 25 | cputime64_t steal; | 25 | cputime64_t steal; |
| 26 | cputime64_t guest; | ||
| 26 | }; | 27 | }; |
| 27 | 28 | ||
| 28 | struct kernel_stat { | 29 | struct kernel_stat { |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 833f7dc2b8de..228e0a8ce248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -87,6 +87,7 @@ struct sched_param { | |||
| 87 | #include <linux/timer.h> | 87 | #include <linux/timer.h> |
| 88 | #include <linux/hrtimer.h> | 88 | #include <linux/hrtimer.h> |
| 89 | #include <linux/task_io_accounting.h> | 89 | #include <linux/task_io_accounting.h> |
| 90 | #include <linux/kobject.h> | ||
| 90 | 91 | ||
| 91 | #include <asm/processor.h> | 92 | #include <asm/processor.h> |
| 92 | 93 | ||
| @@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu); | |||
| 136 | 137 | ||
| 137 | struct seq_file; | 138 | struct seq_file; |
| 138 | struct cfs_rq; | 139 | struct cfs_rq; |
| 140 | struct task_group; | ||
| 139 | #ifdef CONFIG_SCHED_DEBUG | 141 | #ifdef CONFIG_SCHED_DEBUG |
| 140 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); | 142 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); |
| 141 | extern void proc_sched_set_task(struct task_struct *p); | 143 | extern void proc_sched_set_task(struct task_struct *p); |
| @@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 174 | #define EXIT_ZOMBIE 16 | 176 | #define EXIT_ZOMBIE 16 |
| 175 | #define EXIT_DEAD 32 | 177 | #define EXIT_DEAD 32 |
| 176 | /* in tsk->state again */ | 178 | /* in tsk->state again */ |
| 177 | #define TASK_NONINTERACTIVE 64 | 179 | #define TASK_DEAD 64 |
| 178 | #define TASK_DEAD 128 | ||
| 179 | 180 | ||
| 180 | #define __set_task_state(tsk, state_value) \ | 181 | #define __set_task_state(tsk, state_value) \ |
| 181 | do { (tsk)->state = (state_value); } while (0) | 182 | do { (tsk)->state = (state_value); } while (0) |
| @@ -516,6 +517,8 @@ struct signal_struct { | |||
| 516 | * in __exit_signal, except for the group leader. | 517 | * in __exit_signal, except for the group leader. |
| 517 | */ | 518 | */ |
| 518 | cputime_t utime, stime, cutime, cstime; | 519 | cputime_t utime, stime, cutime, cstime; |
| 520 | cputime_t gtime; | ||
| 521 | cputime_t cgtime; | ||
| 519 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 522 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
| 520 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 523 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
| 521 | unsigned long inblock, oublock, cinblock, coublock; | 524 | unsigned long inblock, oublock, cinblock, coublock; |
| @@ -596,8 +599,21 @@ struct user_struct { | |||
| 596 | /* Hash table maintenance information */ | 599 | /* Hash table maintenance information */ |
| 597 | struct hlist_node uidhash_node; | 600 | struct hlist_node uidhash_node; |
| 598 | uid_t uid; | 601 | uid_t uid; |
| 602 | |||
| 603 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 604 | struct task_group *tg; | ||
| 605 | struct kset kset; | ||
| 606 | struct subsys_attribute user_attr; | ||
| 607 | struct work_struct work; | ||
| 608 | #endif | ||
| 599 | }; | 609 | }; |
| 600 | 610 | ||
| 611 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 612 | extern int uids_kobject_init(void); | ||
| 613 | #else | ||
| 614 | static inline int uids_kobject_init(void) { return 0; } | ||
| 615 | #endif | ||
| 616 | |||
| 601 | extern struct user_struct *find_user(uid_t); | 617 | extern struct user_struct *find_user(uid_t); |
| 602 | 618 | ||
| 603 | extern struct user_struct root_user; | 619 | extern struct user_struct root_user; |
| @@ -609,13 +625,17 @@ struct reclaim_state; | |||
| 609 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 625 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 610 | struct sched_info { | 626 | struct sched_info { |
| 611 | /* cumulative counters */ | 627 | /* cumulative counters */ |
| 612 | unsigned long pcnt; /* # of times run on this cpu */ | 628 | unsigned long pcount; /* # of times run on this cpu */ |
| 613 | unsigned long long cpu_time, /* time spent on the cpu */ | 629 | unsigned long long cpu_time, /* time spent on the cpu */ |
| 614 | run_delay; /* time spent waiting on a runqueue */ | 630 | run_delay; /* time spent waiting on a runqueue */ |
| 615 | 631 | ||
| 616 | /* timestamps */ | 632 | /* timestamps */ |
| 617 | unsigned long long last_arrival,/* when we last ran on a cpu */ | 633 | unsigned long long last_arrival,/* when we last ran on a cpu */ |
| 618 | last_queued; /* when we were last queued to run */ | 634 | last_queued; /* when we were last queued to run */ |
| 635 | #ifdef CONFIG_SCHEDSTATS | ||
| 636 | /* BKL stats */ | ||
| 637 | unsigned long bkl_count; | ||
| 638 | #endif | ||
| 619 | }; | 639 | }; |
| 620 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ | 640 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ |
| 621 | 641 | ||
| @@ -750,7 +770,7 @@ struct sched_domain { | |||
| 750 | 770 | ||
| 751 | #ifdef CONFIG_SCHEDSTATS | 771 | #ifdef CONFIG_SCHEDSTATS |
| 752 | /* load_balance() stats */ | 772 | /* load_balance() stats */ |
| 753 | unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; | 773 | unsigned long lb_count[CPU_MAX_IDLE_TYPES]; |
| 754 | unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; | 774 | unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; |
| 755 | unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; | 775 | unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; |
| 756 | unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; | 776 | unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; |
| @@ -760,17 +780,17 @@ struct sched_domain { | |||
| 760 | unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; | 780 | unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; |
| 761 | 781 | ||
| 762 | /* Active load balancing */ | 782 | /* Active load balancing */ |
| 763 | unsigned long alb_cnt; | 783 | unsigned long alb_count; |
| 764 | unsigned long alb_failed; | 784 | unsigned long alb_failed; |
| 765 | unsigned long alb_pushed; | 785 | unsigned long alb_pushed; |
| 766 | 786 | ||
| 767 | /* SD_BALANCE_EXEC stats */ | 787 | /* SD_BALANCE_EXEC stats */ |
| 768 | unsigned long sbe_cnt; | 788 | unsigned long sbe_count; |
| 769 | unsigned long sbe_balanced; | 789 | unsigned long sbe_balanced; |
| 770 | unsigned long sbe_pushed; | 790 | unsigned long sbe_pushed; |
| 771 | 791 | ||
| 772 | /* SD_BALANCE_FORK stats */ | 792 | /* SD_BALANCE_FORK stats */ |
| 773 | unsigned long sbf_cnt; | 793 | unsigned long sbf_count; |
| 774 | unsigned long sbf_balanced; | 794 | unsigned long sbf_balanced; |
| 775 | unsigned long sbf_pushed; | 795 | unsigned long sbf_pushed; |
| 776 | 796 | ||
| @@ -854,11 +874,11 @@ struct rq; | |||
| 854 | struct sched_domain; | 874 | struct sched_domain; |
| 855 | 875 | ||
| 856 | struct sched_class { | 876 | struct sched_class { |
| 857 | struct sched_class *next; | 877 | const struct sched_class *next; |
| 858 | 878 | ||
| 859 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); | 879 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); |
| 860 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); | 880 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); |
| 861 | void (*yield_task) (struct rq *rq, struct task_struct *p); | 881 | void (*yield_task) (struct rq *rq); |
| 862 | 882 | ||
| 863 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); | 883 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); |
| 864 | 884 | ||
| @@ -888,31 +908,22 @@ struct load_weight { | |||
| 888 | * 4 se->block_start | 908 | * 4 se->block_start |
| 889 | * 4 se->run_node | 909 | * 4 se->run_node |
| 890 | * 4 se->sleep_start | 910 | * 4 se->sleep_start |
| 891 | * 4 se->sleep_start_fair | ||
| 892 | * 6 se->load.weight | 911 | * 6 se->load.weight |
| 893 | * 7 se->delta_fair | ||
| 894 | * 15 se->wait_runtime | ||
| 895 | */ | 912 | */ |
| 896 | struct sched_entity { | 913 | struct sched_entity { |
| 897 | long wait_runtime; | ||
| 898 | unsigned long delta_fair_run; | ||
| 899 | unsigned long delta_fair_sleep; | ||
| 900 | unsigned long delta_exec; | ||
| 901 | s64 fair_key; | ||
| 902 | struct load_weight load; /* for load-balancing */ | 914 | struct load_weight load; /* for load-balancing */ |
| 903 | struct rb_node run_node; | 915 | struct rb_node run_node; |
| 904 | unsigned int on_rq; | 916 | unsigned int on_rq; |
| 917 | int peer_preempt; | ||
| 905 | 918 | ||
| 906 | u64 exec_start; | 919 | u64 exec_start; |
| 907 | u64 sum_exec_runtime; | 920 | u64 sum_exec_runtime; |
| 921 | u64 vruntime; | ||
| 908 | u64 prev_sum_exec_runtime; | 922 | u64 prev_sum_exec_runtime; |
| 909 | u64 wait_start_fair; | ||
| 910 | u64 sleep_start_fair; | ||
| 911 | 923 | ||
| 912 | #ifdef CONFIG_SCHEDSTATS | 924 | #ifdef CONFIG_SCHEDSTATS |
| 913 | u64 wait_start; | 925 | u64 wait_start; |
| 914 | u64 wait_max; | 926 | u64 wait_max; |
| 915 | s64 sum_wait_runtime; | ||
| 916 | 927 | ||
| 917 | u64 sleep_start; | 928 | u64 sleep_start; |
| 918 | u64 sleep_max; | 929 | u64 sleep_max; |
| @@ -921,9 +932,25 @@ struct sched_entity { | |||
| 921 | u64 block_start; | 932 | u64 block_start; |
| 922 | u64 block_max; | 933 | u64 block_max; |
| 923 | u64 exec_max; | 934 | u64 exec_max; |
| 924 | 935 | u64 slice_max; | |
| 925 | unsigned long wait_runtime_overruns; | 936 | |
| 926 | unsigned long wait_runtime_underruns; | 937 | u64 nr_migrations; |
| 938 | u64 nr_migrations_cold; | ||
| 939 | u64 nr_failed_migrations_affine; | ||
| 940 | u64 nr_failed_migrations_running; | ||
| 941 | u64 nr_failed_migrations_hot; | ||
| 942 | u64 nr_forced_migrations; | ||
| 943 | u64 nr_forced2_migrations; | ||
| 944 | |||
| 945 | u64 nr_wakeups; | ||
| 946 | u64 nr_wakeups_sync; | ||
| 947 | u64 nr_wakeups_migrate; | ||
| 948 | u64 nr_wakeups_local; | ||
| 949 | u64 nr_wakeups_remote; | ||
| 950 | u64 nr_wakeups_affine; | ||
| 951 | u64 nr_wakeups_affine_attempts; | ||
| 952 | u64 nr_wakeups_passive; | ||
| 953 | u64 nr_wakeups_idle; | ||
| 927 | #endif | 954 | #endif |
| 928 | 955 | ||
| 929 | #ifdef CONFIG_FAIR_GROUP_SCHED | 956 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -952,7 +979,7 @@ struct task_struct { | |||
| 952 | 979 | ||
| 953 | int prio, static_prio, normal_prio; | 980 | int prio, static_prio, normal_prio; |
| 954 | struct list_head run_list; | 981 | struct list_head run_list; |
| 955 | struct sched_class *sched_class; | 982 | const struct sched_class *sched_class; |
| 956 | struct sched_entity se; | 983 | struct sched_entity se; |
| 957 | 984 | ||
| 958 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 985 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| @@ -1023,6 +1050,7 @@ struct task_struct { | |||
| 1023 | 1050 | ||
| 1024 | unsigned int rt_priority; | 1051 | unsigned int rt_priority; |
| 1025 | cputime_t utime, stime; | 1052 | cputime_t utime, stime; |
| 1053 | cputime_t gtime; | ||
| 1026 | unsigned long nvcsw, nivcsw; /* context switch counts */ | 1054 | unsigned long nvcsw, nivcsw; /* context switch counts */ |
| 1027 | struct timespec start_time; /* monotonic time */ | 1055 | struct timespec start_time; /* monotonic time */ |
| 1028 | struct timespec real_start_time; /* boot based time */ | 1056 | struct timespec real_start_time; /* boot based time */ |
| @@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t) | |||
| 1314 | #define PF_STARTING 0x00000002 /* being created */ | 1342 | #define PF_STARTING 0x00000002 /* being created */ |
| 1315 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1343 | #define PF_EXITING 0x00000004 /* getting shut down */ |
| 1316 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | 1344 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ |
| 1345 | #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ | ||
| 1317 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | 1346 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ |
| 1318 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | 1347 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ |
| 1319 | #define PF_DUMPCORE 0x00000200 /* dumped core */ | 1348 | #define PF_DUMPCORE 0x00000200 /* dumped core */ |
| @@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {} | |||
| 1401 | 1430 | ||
| 1402 | extern void sched_idle_next(void); | 1431 | extern void sched_idle_next(void); |
| 1403 | 1432 | ||
| 1433 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1404 | extern unsigned int sysctl_sched_latency; | 1434 | extern unsigned int sysctl_sched_latency; |
| 1405 | extern unsigned int sysctl_sched_min_granularity; | 1435 | extern unsigned int sysctl_sched_nr_latency; |
| 1406 | extern unsigned int sysctl_sched_wakeup_granularity; | 1436 | extern unsigned int sysctl_sched_wakeup_granularity; |
| 1407 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | 1437 | extern unsigned int sysctl_sched_batch_wakeup_granularity; |
| 1408 | extern unsigned int sysctl_sched_stat_granularity; | ||
| 1409 | extern unsigned int sysctl_sched_runtime_limit; | ||
| 1410 | extern unsigned int sysctl_sched_compat_yield; | ||
| 1411 | extern unsigned int sysctl_sched_child_runs_first; | 1438 | extern unsigned int sysctl_sched_child_runs_first; |
| 1412 | extern unsigned int sysctl_sched_features; | 1439 | extern unsigned int sysctl_sched_features; |
| 1440 | extern unsigned int sysctl_sched_migration_cost; | ||
| 1441 | #endif | ||
| 1442 | |||
| 1443 | extern unsigned int sysctl_sched_compat_yield; | ||
| 1413 | 1444 | ||
| 1414 | #ifdef CONFIG_RT_MUTEXES | 1445 | #ifdef CONFIG_RT_MUTEXES |
| 1415 | extern int rt_mutex_getprio(struct task_struct *p); | 1446 | extern int rt_mutex_getprio(struct task_struct *p); |
| @@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings; | |||
| 1843 | 1874 | ||
| 1844 | extern void normalize_rt_tasks(void); | 1875 | extern void normalize_rt_tasks(void); |
| 1845 | 1876 | ||
| 1877 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1878 | |||
| 1879 | extern struct task_group init_task_group; | ||
| 1880 | |||
| 1881 | extern struct task_group *sched_create_group(void); | ||
| 1882 | extern void sched_destroy_group(struct task_group *tg); | ||
| 1883 | extern void sched_move_task(struct task_struct *tsk); | ||
| 1884 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
| 1885 | extern unsigned long sched_group_shares(struct task_group *tg); | ||
| 1886 | |||
| 1887 | #endif | ||
| 1888 | |||
| 1846 | #ifdef CONFIG_TASK_XACCT | 1889 | #ifdef CONFIG_TASK_XACCT |
| 1847 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 1890 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
| 1848 | { | 1891 | { |
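The CONFIG_FAIR_GROUP_SCHED block above declares a small in-kernel API for task groups. Below is a hedged sketch of how a hypothetical kernel-side caller might use it, assuming sched_create_group() reports failure via ERR_PTR() (its body is not shown in this excerpt); the names example_group_setup/teardown and the share value 512 are illustrative only.

/* Illustrative only: a hypothetical in-kernel user of the group API. */
static struct task_group *example_tg;

static int example_group_setup(void)
{
	example_tg = sched_create_group();
	if (IS_ERR(example_tg))
		return PTR_ERR(example_tg);

	/* 1024 (NICE_0_LOAD) is the default share; 512 means half the weight. */
	return sched_group_set_shares(example_tg, 512);
}

static void example_group_teardown(void)
{
	WARN_ON(sched_group_shares(example_tg) != 512);
	sched_destroy_group(example_tg);
}

sched_group_shares() simply reads back the value set with sched_group_set_shares(), presumably the same pair used by the per-user sysfs front end added in kernel/user.c.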
diff --git a/include/linux/topology.h b/include/linux/topology.h index 525d437b1253..47729f18bfdf 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
| @@ -159,15 +159,14 @@ | |||
| 159 | .imbalance_pct = 125, \ | 159 | .imbalance_pct = 125, \ |
| 160 | .cache_nice_tries = 1, \ | 160 | .cache_nice_tries = 1, \ |
| 161 | .busy_idx = 2, \ | 161 | .busy_idx = 2, \ |
| 162 | .idle_idx = 0, \ | 162 | .idle_idx = 1, \ |
| 163 | .newidle_idx = 0, \ | 163 | .newidle_idx = 2, \ |
| 164 | .wake_idx = 1, \ | 164 | .wake_idx = 1, \ |
| 165 | .forkexec_idx = 1, \ | 165 | .forkexec_idx = 1, \ |
| 166 | .flags = SD_LOAD_BALANCE \ | 166 | .flags = SD_LOAD_BALANCE \ |
| 167 | | SD_BALANCE_NEWIDLE \ | 167 | | SD_BALANCE_NEWIDLE \ |
| 168 | | SD_BALANCE_EXEC \ | 168 | | SD_BALANCE_EXEC \ |
| 169 | | SD_WAKE_AFFINE \ | 169 | | SD_WAKE_AFFINE \ |
| 170 | | SD_WAKE_IDLE \ | ||
| 171 | | BALANCE_FOR_PKG_POWER,\ | 170 | | BALANCE_FOR_PKG_POWER,\ |
| 172 | .last_balance = jiffies, \ | 171 | .last_balance = jiffies, \ |
| 173 | .balance_interval = 1, \ | 172 | .balance_interval = 1, \ |
diff --git a/init/Kconfig b/init/Kconfig index d54d0cadcc06..54f31a191b88 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -281,6 +281,27 @@ config CPUSETS | |||
| 281 | 281 | ||
| 282 | Say N if unsure. | 282 | Say N if unsure. |
| 283 | 283 | ||
| 284 | config FAIR_GROUP_SCHED | ||
| 285 | bool "Fair group CPU scheduler" | ||
| 286 | default y | ||
| 287 | depends on EXPERIMENTAL | ||
| 288 | help | ||
| 289 | This feature lets the CPU scheduler recognize task groups and control CPU | ||
| 290 | bandwidth allocation to such task groups. | ||
| 291 | |||
| 292 | choice | ||
| 293 | depends on FAIR_GROUP_SCHED | ||
| 294 | prompt "Basis for grouping tasks" | ||
| 295 | default FAIR_USER_SCHED | ||
| 296 | |||
| 297 | config FAIR_USER_SCHED | ||
| 298 | bool "user id" | ||
| 299 | help | ||
| 300 | This option will choose userid as the basis for grouping | ||
| 301 | tasks, thus providing equal CPU bandwidth to each user. | ||
| 302 | |||
| 303 | endchoice | ||
| 304 | |||
| 284 | config SYSFS_DEPRECATED | 305 | config SYSFS_DEPRECATED |
| 285 | bool "Create deprecated sysfs files" | 306 | bool "Create deprecated sysfs files" |
| 286 | default y | 307 | default y |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 81e697829633..09e9574eeb26 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
| @@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
| 119 | * No locking available for sched_info (and too expensive to add one) | 119 | * No locking available for sched_info (and too expensive to add one) |
| 120 | * Mitigate by taking snapshot of values | 120 | * Mitigate by taking snapshot of values |
| 121 | */ | 121 | */ |
| 122 | t1 = tsk->sched_info.pcnt; | 122 | t1 = tsk->sched_info.pcount; |
| 123 | t2 = tsk->sched_info.run_delay; | 123 | t2 = tsk->sched_info.run_delay; |
| 124 | t3 = tsk->sched_info.cpu_time; | 124 | t3 = tsk->sched_info.cpu_time; |
| 125 | 125 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 993369ee94d1..7f7959de4a87 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 111 | */ | 111 | */ |
| 112 | sig->utime = cputime_add(sig->utime, tsk->utime); | 112 | sig->utime = cputime_add(sig->utime, tsk->utime); |
| 113 | sig->stime = cputime_add(sig->stime, tsk->stime); | 113 | sig->stime = cputime_add(sig->stime, tsk->stime); |
| 114 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | ||
| 114 | sig->min_flt += tsk->min_flt; | 115 | sig->min_flt += tsk->min_flt; |
| 115 | sig->maj_flt += tsk->maj_flt; | 116 | sig->maj_flt += tsk->maj_flt; |
| 116 | sig->nvcsw += tsk->nvcsw; | 117 | sig->nvcsw += tsk->nvcsw; |
| @@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, | |||
| 1242 | cputime_add(p->stime, | 1243 | cputime_add(p->stime, |
| 1243 | cputime_add(sig->stime, | 1244 | cputime_add(sig->stime, |
| 1244 | sig->cstime))); | 1245 | sig->cstime))); |
| 1246 | psig->cgtime = | ||
| 1247 | cputime_add(psig->cgtime, | ||
| 1248 | cputime_add(p->gtime, | ||
| 1249 | cputime_add(sig->gtime, | ||
| 1250 | sig->cgtime))); | ||
| 1245 | psig->cmin_flt += | 1251 | psig->cmin_flt += |
| 1246 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1252 | p->min_flt + sig->min_flt + sig->cmin_flt; |
| 1247 | psig->cmaj_flt += | 1253 | psig->cmaj_flt += |
diff --git a/kernel/fork.c b/kernel/fork.c index 5e67f90a1694..3fc3c1383912 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 877 | sig->tty_old_pgrp = NULL; | 877 | sig->tty_old_pgrp = NULL; |
| 878 | 878 | ||
| 879 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | 879 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; |
| 880 | sig->gtime = cputime_zero; | ||
| 881 | sig->cgtime = cputime_zero; | ||
| 880 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | 882 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; |
| 881 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | 883 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; |
| 882 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | 884 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; |
| @@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1045 | 1047 | ||
| 1046 | p->utime = cputime_zero; | 1048 | p->utime = cputime_zero; |
| 1047 | p->stime = cputime_zero; | 1049 | p->stime = cputime_zero; |
| 1050 | p->gtime = cputime_zero; | ||
| 1048 | 1051 | ||
| 1049 | #ifdef CONFIG_TASK_XACCT | 1052 | #ifdef CONFIG_TASK_XACCT |
| 1050 | p->rchar = 0; /* I/O counter: bytes read */ | 1053 | p->rchar = 0; /* I/O counter: bytes read */ |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d0e5c48e18c7..6046939d0804 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
| 16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
| 17 | #include <linux/sched.h> | ||
| 17 | 18 | ||
| 18 | #define KERNEL_ATTR_RO(_name) \ | 19 | #define KERNEL_ATTR_RO(_name) \ |
| 19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 20 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
| @@ -116,6 +117,13 @@ static int __init ksysfs_init(void) | |||
| 116 | ¬es_attr); | 117 | ¬es_attr); |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 120 | /* | ||
| 121 | * Create "/sys/kernel/uids" directory and corresponding root user's | ||
| 122 | * directory under it. | ||
| 123 | */ | ||
| 124 | if (!error) | ||
| 125 | error = uids_kobject_init(); | ||
| 126 | |||
| 119 | return error; | 127 | return error; |
| 120 | } | 128 | } |
| 121 | 129 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 6c10fa796ca0..bba57adb9504 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 96 | /* | 96 | /* |
| 97 | * Some helpers for converting nanosecond timing to jiffy resolution | 97 | * Some helpers for converting nanosecond timing to jiffy resolution |
| 98 | */ | 98 | */ |
| 99 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 99 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) |
| 100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
| 101 | 101 | ||
| 102 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 102 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
| @@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 105 | /* | 105 | /* |
| 106 | * These are the 'tuning knobs' of the scheduler: | 106 | * These are the 'tuning knobs' of the scheduler: |
| 107 | * | 107 | * |
| 108 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 108 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
| 109 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
| 110 | * Timeslices get refilled after they expire. | 109 | * Timeslices get refilled after they expire. |
| 111 | */ | 110 | */ |
| 112 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
| 113 | #define DEF_TIMESLICE (100 * HZ / 1000) | 111 | #define DEF_TIMESLICE (100 * HZ / 1000) |
| 114 | 112 | ||
| 115 | #ifdef CONFIG_SMP | 113 | #ifdef CONFIG_SMP |
| @@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | |||
| 133 | } | 131 | } |
| 134 | #endif | 132 | #endif |
| 135 | 133 | ||
| 136 | #define SCALE_PRIO(x, prio) \ | ||
| 137 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
| 138 | |||
| 139 | /* | ||
| 140 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
| 141 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
| 142 | */ | ||
| 143 | static unsigned int static_prio_timeslice(int static_prio) | ||
| 144 | { | ||
| 145 | if (static_prio == NICE_TO_PRIO(19)) | ||
| 146 | return 1; | ||
| 147 | |||
| 148 | if (static_prio < NICE_TO_PRIO(0)) | ||
| 149 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
| 150 | else | ||
| 151 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
| 152 | } | ||
| 153 | |||
| 154 | static inline int rt_policy(int policy) | 134 | static inline int rt_policy(int policy) |
| 155 | { | 135 | { |
| 156 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 136 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) |
| @@ -171,31 +151,91 @@ struct rt_prio_array { | |||
| 171 | struct list_head queue[MAX_RT_PRIO]; | 151 | struct list_head queue[MAX_RT_PRIO]; |
| 172 | }; | 152 | }; |
| 173 | 153 | ||
| 174 | struct load_stat { | 154 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 175 | struct load_weight load; | 155 | |
| 176 | u64 load_update_start, load_update_last; | 156 | struct cfs_rq; |
| 177 | unsigned long delta_fair, delta_exec, delta_stat; | 157 | |
| 158 | /* task group related information */ | ||
| 159 | struct task_group { | ||
| 160 | /* schedulable entities of this group on each cpu */ | ||
| 161 | struct sched_entity **se; | ||
| 162 | /* runqueue "owned" by this group on each cpu */ | ||
| 163 | struct cfs_rq **cfs_rq; | ||
| 164 | unsigned long shares; | ||
| 165 | /* spinlock to serialize modification to shares */ | ||
| 166 | spinlock_t lock; | ||
| 167 | }; | ||
| 168 | |||
| 169 | /* Default task group's sched entity on each cpu */ | ||
| 170 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
| 171 | /* Default task group's cfs_rq on each cpu */ | ||
| 172 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | ||
| 173 | |||
| 174 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
| 175 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
| 176 | |||
| 177 | /* Default task group. | ||
| 178 | * Every task in the system belongs to this group at bootup. | ||
| 179 | */ | ||
| 180 | struct task_group init_task_group = { | ||
| 181 | .se = init_sched_entity_p, | ||
| 182 | .cfs_rq = init_cfs_rq_p, | ||
| 178 | }; | 183 | }; |
| 179 | 184 | ||
| 185 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 186 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | ||
| 187 | #else | ||
| 188 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | ||
| 189 | #endif | ||
| 190 | |||
| 191 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | ||
| 192 | |||
| 193 | /* return group to which a task belongs */ | ||
| 194 | static inline struct task_group *task_group(struct task_struct *p) | ||
| 195 | { | ||
| 196 | struct task_group *tg; | ||
| 197 | |||
| 198 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 199 | tg = p->user->tg; | ||
| 200 | #else | ||
| 201 | tg = &init_task_group; | ||
| 202 | #endif | ||
| 203 | |||
| 204 | return tg; | ||
| 205 | } | ||
| 206 | |||
| 207 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
| 208 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
| 209 | { | ||
| 210 | p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; | ||
| 211 | p->se.parent = task_group(p)->se[task_cpu(p)]; | ||
| 212 | } | ||
| 213 | |||
| 214 | #else | ||
| 215 | |||
| 216 | static inline void set_task_cfs_rq(struct task_struct *p) { } | ||
| 217 | |||
| 218 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 219 | |||
| 180 | /* CFS-related fields in a runqueue */ | 220 | /* CFS-related fields in a runqueue */ |
| 181 | struct cfs_rq { | 221 | struct cfs_rq { |
| 182 | struct load_weight load; | 222 | struct load_weight load; |
| 183 | unsigned long nr_running; | 223 | unsigned long nr_running; |
| 184 | 224 | ||
| 185 | s64 fair_clock; | ||
| 186 | u64 exec_clock; | 225 | u64 exec_clock; |
| 187 | s64 wait_runtime; | 226 | u64 min_vruntime; |
| 188 | u64 sleeper_bonus; | ||
| 189 | unsigned long wait_runtime_overruns, wait_runtime_underruns; | ||
| 190 | 227 | ||
| 191 | struct rb_root tasks_timeline; | 228 | struct rb_root tasks_timeline; |
| 192 | struct rb_node *rb_leftmost; | 229 | struct rb_node *rb_leftmost; |
| 193 | struct rb_node *rb_load_balance_curr; | 230 | struct rb_node *rb_load_balance_curr; |
| 194 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 195 | /* 'curr' points to currently running entity on this cfs_rq. | 231 | /* 'curr' points to currently running entity on this cfs_rq. |
| 196 | * It is set to NULL otherwise (i.e when none are currently running). | 232 | * It is set to NULL otherwise (i.e when none are currently running). |
| 197 | */ | 233 | */ |
| 198 | struct sched_entity *curr; | 234 | struct sched_entity *curr; |
| 235 | |||
| 236 | unsigned long nr_spread_over; | ||
| 237 | |||
| 238 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 199 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 239 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
| 200 | 240 | ||
| 201 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 241 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
| @@ -206,6 +246,8 @@ struct cfs_rq { | |||
| 206 | * list is used during load balance. | 246 | * list is used during load balance. |
| 207 | */ | 247 | */ |
| 208 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | 248 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ |
| 249 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
| 250 | struct rcu_head rcu; | ||
| 209 | #endif | 251 | #endif |
| 210 | }; | 252 | }; |
| 211 | 253 | ||
| @@ -237,7 +279,7 @@ struct rq { | |||
| 237 | #ifdef CONFIG_NO_HZ | 279 | #ifdef CONFIG_NO_HZ |
| 238 | unsigned char in_nohz_recently; | 280 | unsigned char in_nohz_recently; |
| 239 | #endif | 281 | #endif |
| 240 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ | 282 | struct load_weight load; /* capture load from *all* tasks on this cpu */ |
| 241 | unsigned long nr_load_updates; | 283 | unsigned long nr_load_updates; |
| 242 | u64 nr_switches; | 284 | u64 nr_switches; |
| 243 | 285 | ||
| @@ -289,16 +331,19 @@ struct rq { | |||
| 289 | unsigned long yld_exp_empty; | 331 | unsigned long yld_exp_empty; |
| 290 | unsigned long yld_act_empty; | 332 | unsigned long yld_act_empty; |
| 291 | unsigned long yld_both_empty; | 333 | unsigned long yld_both_empty; |
| 292 | unsigned long yld_cnt; | 334 | unsigned long yld_count; |
| 293 | 335 | ||
| 294 | /* schedule() stats */ | 336 | /* schedule() stats */ |
| 295 | unsigned long sched_switch; | 337 | unsigned long sched_switch; |
| 296 | unsigned long sched_cnt; | 338 | unsigned long sched_count; |
| 297 | unsigned long sched_goidle; | 339 | unsigned long sched_goidle; |
| 298 | 340 | ||
| 299 | /* try_to_wake_up() stats */ | 341 | /* try_to_wake_up() stats */ |
| 300 | unsigned long ttwu_cnt; | 342 | unsigned long ttwu_count; |
| 301 | unsigned long ttwu_local; | 343 | unsigned long ttwu_local; |
| 344 | |||
| 345 | /* BKL stats */ | ||
| 346 | unsigned long bkl_count; | ||
| 302 | #endif | 347 | #endif |
| 303 | struct lock_class_key rq_lock_key; | 348 | struct lock_class_key rq_lock_key; |
| 304 | }; | 349 | }; |
| @@ -383,6 +428,37 @@ static void update_rq_clock(struct rq *rq) | |||
| 383 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 428 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 384 | 429 | ||
| 385 | /* | 430 | /* |
| 431 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
| 432 | */ | ||
| 433 | #ifdef CONFIG_SCHED_DEBUG | ||
| 434 | # define const_debug __read_mostly | ||
| 435 | #else | ||
| 436 | # define const_debug static const | ||
| 437 | #endif | ||
| 438 | |||
| 439 | /* | ||
| 440 | * Debugging: various feature bits | ||
| 441 | */ | ||
| 442 | enum { | ||
| 443 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | ||
| 444 | SCHED_FEAT_START_DEBIT = 2, | ||
| 445 | SCHED_FEAT_TREE_AVG = 4, | ||
| 446 | SCHED_FEAT_APPROX_AVG = 8, | ||
| 447 | SCHED_FEAT_WAKEUP_PREEMPT = 16, | ||
| 448 | SCHED_FEAT_PREEMPT_RESTRICT = 32, | ||
| 449 | }; | ||
| 450 | |||
| 451 | const_debug unsigned int sysctl_sched_features = | ||
| 452 | SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | | ||
| 453 | SCHED_FEAT_START_DEBIT *1 | | ||
| 454 | SCHED_FEAT_TREE_AVG *0 | | ||
| 455 | SCHED_FEAT_APPROX_AVG *0 | | ||
| 456 | SCHED_FEAT_WAKEUP_PREEMPT *1 | | ||
| 457 | SCHED_FEAT_PREEMPT_RESTRICT *1; | ||
| 458 | |||
| 459 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | ||
| 460 | |||
| 461 | /* | ||
| 386 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 462 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
| 387 | * clock constructed from sched_clock(): | 463 | * clock constructed from sched_clock(): |
| 388 | */ | 464 | */ |
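The sysctl_sched_features initializer above packs each feature into its own bit and multiplies it by 0 or 1, so a feature is toggled by editing a single digit, and sched_feat(x) tests the corresponding bit. A standalone C sketch of the same pattern (names mirror the enum above; the program is illustrative, not kernel code):

#include <stdio.h>

enum {
	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
	SCHED_FEAT_START_DEBIT		= 2,
	SCHED_FEAT_TREE_AVG		= 4,
	SCHED_FEAT_APPROX_AVG		= 8,
	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
	SCHED_FEAT_PREEMPT_RESTRICT	= 32,
};

/* Same encoding trick: "*1" enables a feature, "*0" disables it. */
static unsigned int sysctl_sched_features =
		SCHED_FEAT_NEW_FAIR_SLEEPERS	*1 |
		SCHED_FEAT_START_DEBIT		*1 |
		SCHED_FEAT_TREE_AVG		*0 |
		SCHED_FEAT_APPROX_AVG		*0 |
		SCHED_FEAT_WAKEUP_PREEMPT	*1 |
		SCHED_FEAT_PREEMPT_RESTRICT	*1;

#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

int main(void)
{
	printf("WAKEUP_PREEMPT: %s\n", sched_feat(WAKEUP_PREEMPT) ? "on" : "off");
	printf("TREE_AVG:       %s\n", sched_feat(TREE_AVG) ? "on" : "off");
	return 0;
}

Because '*' binds tighter than '|', each term contributes either its bit or zero to the mask.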
| @@ -400,18 +476,7 @@ unsigned long long cpu_clock(int cpu) | |||
| 400 | 476 | ||
| 401 | return now; | 477 | return now; |
| 402 | } | 478 | } |
| 403 | 479 | EXPORT_SYMBOL_GPL(cpu_clock); | |
| 404 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 405 | /* Change a task's ->cfs_rq if it moves across CPUs */ | ||
| 406 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
| 407 | { | ||
| 408 | p->se.cfs_rq = &task_rq(p)->cfs; | ||
| 409 | } | ||
| 410 | #else | ||
| 411 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
| 412 | { | ||
| 413 | } | ||
| 414 | #endif | ||
| 415 | 480 | ||
| 416 | #ifndef prepare_arch_switch | 481 | #ifndef prepare_arch_switch |
| 417 | # define prepare_arch_switch(next) do { } while (0) | 482 | # define prepare_arch_switch(next) do { } while (0) |
| @@ -497,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 497 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 562 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
| 498 | __acquires(rq->lock) | 563 | __acquires(rq->lock) |
| 499 | { | 564 | { |
| 500 | struct rq *rq; | 565 | for (;;) { |
| 501 | 566 | struct rq *rq = task_rq(p); | |
| 502 | repeat_lock_task: | 567 | spin_lock(&rq->lock); |
| 503 | rq = task_rq(p); | 568 | if (likely(rq == task_rq(p))) |
| 504 | spin_lock(&rq->lock); | 569 | return rq; |
| 505 | if (unlikely(rq != task_rq(p))) { | ||
| 506 | spin_unlock(&rq->lock); | 570 | spin_unlock(&rq->lock); |
| 507 | goto repeat_lock_task; | ||
| 508 | } | 571 | } |
| 509 | return rq; | ||
| 510 | } | 572 | } |
| 511 | 573 | ||
| 512 | /* | 574 | /* |
| @@ -519,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 519 | { | 581 | { |
| 520 | struct rq *rq; | 582 | struct rq *rq; |
| 521 | 583 | ||
| 522 | repeat_lock_task: | 584 | for (;;) { |
| 523 | local_irq_save(*flags); | 585 | local_irq_save(*flags); |
| 524 | rq = task_rq(p); | 586 | rq = task_rq(p); |
| 525 | spin_lock(&rq->lock); | 587 | spin_lock(&rq->lock); |
| 526 | if (unlikely(rq != task_rq(p))) { | 588 | if (likely(rq == task_rq(p))) |
| 589 | return rq; | ||
| 527 | spin_unlock_irqrestore(&rq->lock, *flags); | 590 | spin_unlock_irqrestore(&rq->lock, *flags); |
| 528 | goto repeat_lock_task; | ||
| 529 | } | 591 | } |
| 530 | return rq; | ||
| 531 | } | 592 | } |
| 532 | 593 | ||
| 533 | static inline void __task_rq_unlock(struct rq *rq) | 594 | static void __task_rq_unlock(struct rq *rq) |
| 534 | __releases(rq->lock) | 595 | __releases(rq->lock) |
| 535 | { | 596 | { |
| 536 | spin_unlock(&rq->lock); | 597 | spin_unlock(&rq->lock); |
| @@ -545,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
| 545 | /* | 606 | /* |
| 546 | * this_rq_lock - lock this runqueue and disable interrupts. | 607 | * this_rq_lock - lock this runqueue and disable interrupts. |
| 547 | */ | 608 | */ |
| 548 | static inline struct rq *this_rq_lock(void) | 609 | static struct rq *this_rq_lock(void) |
| 549 | __acquires(rq->lock) | 610 | __acquires(rq->lock) |
| 550 | { | 611 | { |
| 551 | struct rq *rq; | 612 | struct rq *rq; |
| @@ -645,19 +706,6 @@ static inline void resched_task(struct task_struct *p) | |||
| 645 | } | 706 | } |
| 646 | #endif | 707 | #endif |
| 647 | 708 | ||
| 648 | static u64 div64_likely32(u64 divident, unsigned long divisor) | ||
| 649 | { | ||
| 650 | #if BITS_PER_LONG == 32 | ||
| 651 | if (likely(divident <= 0xffffffffULL)) | ||
| 652 | return (u32)divident / divisor; | ||
| 653 | do_div(divident, divisor); | ||
| 654 | |||
| 655 | return divident; | ||
| 656 | #else | ||
| 657 | return divident / divisor; | ||
| 658 | #endif | ||
| 659 | } | ||
| 660 | |||
| 661 | #if BITS_PER_LONG == 32 | 709 | #if BITS_PER_LONG == 32 |
| 662 | # define WMULT_CONST (~0UL) | 710 | # define WMULT_CONST (~0UL) |
| 663 | #else | 711 | #else |
| @@ -699,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | |||
| 699 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | 747 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
| 700 | } | 748 | } |
| 701 | 749 | ||
| 702 | static void update_load_add(struct load_weight *lw, unsigned long inc) | 750 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 703 | { | 751 | { |
| 704 | lw->weight += inc; | 752 | lw->weight += inc; |
| 705 | lw->inv_weight = 0; | ||
| 706 | } | 753 | } |
| 707 | 754 | ||
| 708 | static void update_load_sub(struct load_weight *lw, unsigned long dec) | 755 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
| 709 | { | 756 | { |
| 710 | lw->weight -= dec; | 757 | lw->weight -= dec; |
| 711 | lw->inv_weight = 0; | ||
| 712 | } | 758 | } |
| 713 | 759 | ||
| 714 | /* | 760 | /* |
| @@ -784,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 784 | int *this_best_prio, struct rq_iterator *iterator); | 830 | int *this_best_prio, struct rq_iterator *iterator); |
| 785 | 831 | ||
| 786 | #include "sched_stats.h" | 832 | #include "sched_stats.h" |
| 787 | #include "sched_rt.c" | ||
| 788 | #include "sched_fair.c" | ||
| 789 | #include "sched_idletask.c" | 833 | #include "sched_idletask.c" |
| 834 | #include "sched_fair.c" | ||
| 835 | #include "sched_rt.c" | ||
| 790 | #ifdef CONFIG_SCHED_DEBUG | 836 | #ifdef CONFIG_SCHED_DEBUG |
| 791 | # include "sched_debug.c" | 837 | # include "sched_debug.c" |
| 792 | #endif | 838 | #endif |
| 793 | 839 | ||
| 794 | #define sched_class_highest (&rt_sched_class) | 840 | #define sched_class_highest (&rt_sched_class) |
| 795 | 841 | ||
| 796 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | ||
| 797 | { | ||
| 798 | if (rq->curr != rq->idle && ls->load.weight) { | ||
| 799 | ls->delta_exec += ls->delta_stat; | ||
| 800 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | ||
| 801 | ls->delta_stat = 0; | ||
| 802 | } | ||
| 803 | } | ||
| 804 | |||
| 805 | /* | 842 | /* |
| 806 | * Update delta_exec, delta_fair fields for rq. | 843 | * Update delta_exec, delta_fair fields for rq. |
| 807 | * | 844 | * |
| 808 | * delta_fair clock advances at a rate inversely proportional to | 845 | * delta_fair clock advances at a rate inversely proportional to |
| 809 | * total load (rq->ls.load.weight) on the runqueue, while | 846 | * total load (rq->load.weight) on the runqueue, while |
| 810 | * delta_exec advances at the same rate as wall-clock (provided | 847 | * delta_exec advances at the same rate as wall-clock (provided |
| 811 | * cpu is not idle). | 848 | * cpu is not idle). |
| 812 | * | 849 | * |
| @@ -814,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) | |||
| 814 | * runqueue over any given interval. This (smoothened) load is used | 851 | * runqueue over any given interval. This (smoothened) load is used |
| 815 | * during load balance. | 852 | * during load balance. |
| 816 | * | 853 | * |
| 817 | * This function is called /before/ updating rq->ls.load | 854 | * This function is called /before/ updating rq->load |
| 818 | * and when switching tasks. | 855 | * and when switching tasks. |
| 819 | */ | 856 | */ |
| 820 | static void update_curr_load(struct rq *rq) | ||
| 821 | { | ||
| 822 | struct load_stat *ls = &rq->ls; | ||
| 823 | u64 start; | ||
| 824 | |||
| 825 | start = ls->load_update_start; | ||
| 826 | ls->load_update_start = rq->clock; | ||
| 827 | ls->delta_stat += rq->clock - start; | ||
| 828 | /* | ||
| 829 | * Stagger updates to ls->delta_fair. Very frequent updates | ||
| 830 | * can be expensive. | ||
| 831 | */ | ||
| 832 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
| 833 | __update_curr_load(rq, ls); | ||
| 834 | } | ||
| 835 | |||
| 836 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 857 | static inline void inc_load(struct rq *rq, const struct task_struct *p) |
| 837 | { | 858 | { |
| 838 | update_curr_load(rq); | 859 | update_load_add(&rq->load, p->se.load.weight); |
| 839 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
| 840 | } | 860 | } |
| 841 | 861 | ||
| 842 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | 862 | static inline void dec_load(struct rq *rq, const struct task_struct *p) |
| 843 | { | 863 | { |
| 844 | update_curr_load(rq); | 864 | update_load_sub(&rq->load, p->se.load.weight); |
| 845 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
| 846 | } | 865 | } |
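With the old load_stat/delta_fair bookkeeping removed, inc_load() and dec_load() reduce to adding or subtracting the task's weight on rq->load, which is exactly the value weighted_cpuload() reports. A trivial stand-alone model (the 1024 and 2048 weights are illustrative; 1024 is the conventional nice-0 weight):

    #include <stdio.h>

    struct load_weight { unsigned long weight; };
    struct rq { struct load_weight load; };

    static void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; }
    static void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; }

    int main(void)
    {
            struct rq rq = { { 0 } };

            update_load_add(&rq.load, 1024);        /* nice-0 task enqueues */
            update_load_add(&rq.load, 1024);        /* another nice-0 task  */
            update_load_add(&rq.load, 2048);        /* a heavier task       */
            printf("weighted cpuload = %lu\n", rq.load.weight);     /* 4096 */

            update_load_sub(&rq.load, 1024);        /* one task dequeues    */
            printf("weighted cpuload = %lu\n", rq.load.weight);     /* 3072 */
            return 0;
    }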
| 847 | 866 | ||
| 848 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 867 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
| @@ -859,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) | |||
| 859 | 878 | ||
| 860 | static void set_load_weight(struct task_struct *p) | 879 | static void set_load_weight(struct task_struct *p) |
| 861 | { | 880 | { |
| 862 | p->se.wait_runtime = 0; | ||
| 863 | |||
| 864 | if (task_has_rt_policy(p)) { | 881 | if (task_has_rt_policy(p)) { |
| 865 | p->se.load.weight = prio_to_weight[0] * 2; | 882 | p->se.load.weight = prio_to_weight[0] * 2; |
| 866 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | 883 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
| @@ -952,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 952 | } | 969 | } |
| 953 | 970 | ||
| 954 | /* | 971 | /* |
| 955 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
| 956 | */ | ||
| 957 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
| 958 | { | ||
| 959 | update_rq_clock(rq); | ||
| 960 | |||
| 961 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 962 | rq->nr_uninterruptible--; | ||
| 963 | |||
| 964 | enqueue_task(rq, p, 0); | ||
| 965 | inc_nr_running(p, rq); | ||
| 966 | } | ||
| 967 | |||
| 968 | /* | ||
| 969 | * deactivate_task - remove a task from the runqueue. | 972 | * deactivate_task - remove a task from the runqueue. |
| 970 | */ | 973 | */ |
| 971 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 974 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
| @@ -989,32 +992,50 @@ inline int task_curr(const struct task_struct *p) | |||
| 989 | /* Used instead of source_load when we know the type == 0 */ | 992 | /* Used instead of source_load when we know the type == 0 */ |
| 990 | unsigned long weighted_cpuload(const int cpu) | 993 | unsigned long weighted_cpuload(const int cpu) |
| 991 | { | 994 | { |
| 992 | return cpu_rq(cpu)->ls.load.weight; | 995 | return cpu_rq(cpu)->load.weight; |
| 993 | } | 996 | } |
| 994 | 997 | ||
| 995 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 998 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
| 996 | { | 999 | { |
| 997 | #ifdef CONFIG_SMP | 1000 | #ifdef CONFIG_SMP |
| 998 | task_thread_info(p)->cpu = cpu; | 1001 | task_thread_info(p)->cpu = cpu; |
| 999 | set_task_cfs_rq(p); | ||
| 1000 | #endif | 1002 | #endif |
| 1003 | set_task_cfs_rq(p); | ||
| 1001 | } | 1004 | } |
| 1002 | 1005 | ||
| 1003 | #ifdef CONFIG_SMP | 1006 | #ifdef CONFIG_SMP |
| 1004 | 1007 | ||
| 1008 | /* | ||
| 1009 | * Is this task likely cache-hot: | ||
| 1010 | */ | ||
| 1011 | static inline int | ||
| 1012 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
| 1013 | { | ||
| 1014 | s64 delta; | ||
| 1015 | |||
| 1016 | if (p->sched_class != &fair_sched_class) | ||
| 1017 | return 0; | ||
| 1018 | |||
| 1019 | if (sysctl_sched_migration_cost == -1) | ||
| 1020 | return 1; | ||
| 1021 | if (sysctl_sched_migration_cost == 0) | ||
| 1022 | return 0; | ||
| 1023 | |||
| 1024 | delta = now - p->se.exec_start; | ||
| 1025 | |||
| 1026 | return delta < (s64)sysctl_sched_migration_cost; | ||
| 1027 | } | ||
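task_hot() treats a task as cache-hot if it last executed within sysctl_sched_migration_cost nanoseconds, with -1 meaning always hot and 0 meaning never hot. The same decision as a stand-alone function; the 0.5 ms default used here is an assumption for illustration, the real value is a tunable:

    #include <stdio.h>

    typedef long long s64;
    typedef unsigned long long u64;

    static s64 sysctl_sched_migration_cost = 500000;  /* ns, assumed default */

    /* A task that ran very recently probably still has a warm cache,
     * so migrating it would be expensive. */
    static int task_hot(u64 exec_start, u64 now)
    {
            s64 delta;

            if (sysctl_sched_migration_cost == -1)  /* -1: everything is hot */
                    return 1;
            if (sysctl_sched_migration_cost == 0)   /* 0: nothing is hot */
                    return 0;

            delta = now - exec_start;
            return delta < sysctl_sched_migration_cost;
    }

    int main(void)
    {
            printf("%d\n", task_hot(1000000, 1200000));  /* ran 0.2 ms ago: hot  */
            printf("%d\n", task_hot(1000000, 2000000));  /* ran 1.0 ms ago: cold */
            return 0;
    }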
| 1028 | |||
| 1029 | |||
| 1005 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1030 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 1006 | { | 1031 | { |
| 1007 | int old_cpu = task_cpu(p); | 1032 | int old_cpu = task_cpu(p); |
| 1008 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 1033 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); |
| 1009 | u64 clock_offset, fair_clock_offset; | 1034 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), |
| 1035 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | ||
| 1036 | u64 clock_offset; | ||
| 1010 | 1037 | ||
| 1011 | clock_offset = old_rq->clock - new_rq->clock; | 1038 | clock_offset = old_rq->clock - new_rq->clock; |
| 1012 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; | ||
| 1013 | |||
| 1014 | if (p->se.wait_start_fair) | ||
| 1015 | p->se.wait_start_fair -= fair_clock_offset; | ||
| 1016 | if (p->se.sleep_start_fair) | ||
| 1017 | p->se.sleep_start_fair -= fair_clock_offset; | ||
| 1018 | 1039 | ||
| 1019 | #ifdef CONFIG_SCHEDSTATS | 1040 | #ifdef CONFIG_SCHEDSTATS |
| 1020 | if (p->se.wait_start) | 1041 | if (p->se.wait_start) |
| @@ -1023,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1023 | p->se.sleep_start -= clock_offset; | 1044 | p->se.sleep_start -= clock_offset; |
| 1024 | if (p->se.block_start) | 1045 | if (p->se.block_start) |
| 1025 | p->se.block_start -= clock_offset; | 1046 | p->se.block_start -= clock_offset; |
| 1047 | if (old_cpu != new_cpu) { | ||
| 1048 | schedstat_inc(p, se.nr_migrations); | ||
| 1049 | if (task_hot(p, old_rq->clock, NULL)) | ||
| 1050 | schedstat_inc(p, se.nr_forced2_migrations); | ||
| 1051 | } | ||
| 1026 | #endif | 1052 | #endif |
| 1053 | p->se.vruntime -= old_cfsrq->min_vruntime - | ||
| 1054 | new_cfsrq->min_vruntime; | ||
| 1027 | 1055 | ||
| 1028 | __set_task_cpu(p, new_cpu); | 1056 | __set_task_cpu(p, new_cpu); |
| 1029 | } | 1057 | } |
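The one subtle new line in set_task_cpu() is the vruntime adjustment: a task's vruntime only has meaning relative to its cfs_rq's min_vruntime, so on migration it is rebased from the old queue's floor to the new one's and the task keeps the same relative lag. A worked example with invented numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long vruntime         = 1000500;  /* on the old cfs_rq */
            unsigned long long old_min_vruntime = 1000000;  /* old queue's floor */
            unsigned long long new_min_vruntime =  400000;  /* new queue's floor */

            /* Same rebasing as the diff:
             *   p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime;
             * The +500 lag above the queue floor is preserved. */
            vruntime -= old_min_vruntime - new_min_vruntime;

            printf("vruntime on the new cfs_rq: %llu\n", vruntime);  /* 400500 */
            return 0;
    }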
| @@ -1078,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p) | |||
| 1078 | int running, on_rq; | 1106 | int running, on_rq; |
| 1079 | struct rq *rq; | 1107 | struct rq *rq; |
| 1080 | 1108 | ||
| 1081 | repeat: | 1109 | for (;;) { |
| 1082 | /* | 1110 | /* |
| 1083 | * We do the initial early heuristics without holding | 1111 | * We do the initial early heuristics without holding |
| 1084 | * any task-queue locks at all. We'll only try to get | 1112 | * any task-queue locks at all. We'll only try to get |
| 1085 | * the runqueue lock when things look like they will | 1113 | * the runqueue lock when things look like they will |
| 1086 | * work out! | 1114 | * work out! |
| 1087 | */ | 1115 | */ |
| 1088 | rq = task_rq(p); | 1116 | rq = task_rq(p); |
| 1089 | 1117 | ||
| 1090 | /* | 1118 | /* |
| 1091 | * If the task is actively running on another CPU | 1119 | * If the task is actively running on another CPU |
| 1092 | * still, just relax and busy-wait without holding | 1120 | * still, just relax and busy-wait without holding |
| 1093 | * any locks. | 1121 | * any locks. |
| 1094 | * | 1122 | * |
| 1095 | * NOTE! Since we don't hold any locks, it's not | 1123 | * NOTE! Since we don't hold any locks, it's not |
| 1096 | * even sure that "rq" stays as the right runqueue! | 1124 | * even sure that "rq" stays as the right runqueue! |
| 1097 | * But we don't care, since "task_running()" will | 1125 | * But we don't care, since "task_running()" will |
| 1098 | * return false if the runqueue has changed and p | 1126 | * return false if the runqueue has changed and p |
| 1099 | * is actually now running somewhere else! | 1127 | * is actually now running somewhere else! |
| 1100 | */ | 1128 | */ |
| 1101 | while (task_running(rq, p)) | 1129 | while (task_running(rq, p)) |
| 1102 | cpu_relax(); | 1130 | cpu_relax(); |
| 1103 | 1131 | ||
| 1104 | /* | 1132 | /* |
| 1105 | * Ok, time to look more closely! We need the rq | 1133 | * Ok, time to look more closely! We need the rq |
| 1106 | * lock now, to be *sure*. If we're wrong, we'll | 1134 | * lock now, to be *sure*. If we're wrong, we'll |
| 1107 | * just go back and repeat. | 1135 | * just go back and repeat. |
| 1108 | */ | 1136 | */ |
| 1109 | rq = task_rq_lock(p, &flags); | 1137 | rq = task_rq_lock(p, &flags); |
| 1110 | running = task_running(rq, p); | 1138 | running = task_running(rq, p); |
| 1111 | on_rq = p->se.on_rq; | 1139 | on_rq = p->se.on_rq; |
| 1112 | task_rq_unlock(rq, &flags); | 1140 | task_rq_unlock(rq, &flags); |
| 1113 | 1141 | ||
| 1114 | /* | 1142 | /* |
| 1115 | * Was it really running after all now that we | 1143 | * Was it really running after all now that we |
| 1116 | * checked with the proper locks actually held? | 1144 | * checked with the proper locks actually held? |
| 1117 | * | 1145 | * |
| 1118 | * Oops. Go back and try again.. | 1146 | * Oops. Go back and try again.. |
| 1119 | */ | 1147 | */ |
| 1120 | if (unlikely(running)) { | 1148 | if (unlikely(running)) { |
| 1121 | cpu_relax(); | 1149 | cpu_relax(); |
| 1122 | goto repeat; | 1150 | continue; |
| 1123 | } | 1151 | } |
| 1124 | 1152 | ||
| 1125 | /* | 1153 | /* |
| 1126 | * It's not enough that it's not actively running, | 1154 | * It's not enough that it's not actively running, |
| 1127 | * it must be off the runqueue _entirely_, and not | 1155 | * it must be off the runqueue _entirely_, and not |
| 1128 | * preempted! | 1156 | * preempted! |
| 1129 | * | 1157 | * |
| 1130 | * So if it was still runnable (but just not actively | 1158 | * So if it was still runnable (but just not actively |
| 1131 | * running right now), it's preempted, and we should | 1159 | * running right now), it's preempted, and we should |
| 1132 | * yield - it could be a while. | 1160 | * yield - it could be a while. |
| 1133 | */ | 1161 | */ |
| 1134 | if (unlikely(on_rq)) { | 1162 | if (unlikely(on_rq)) { |
| 1135 | yield(); | 1163 | schedule_timeout_uninterruptible(1); |
| 1136 | goto repeat; | 1164 | continue; |
| 1137 | } | 1165 | } |
| 1138 | 1166 | ||
| 1139 | /* | 1167 | /* |
| 1140 | * Ahh, all good. It wasn't running, and it wasn't | 1168 | * Ahh, all good. It wasn't running, and it wasn't |
| 1141 | * runnable, which means that it will never become | 1169 | * runnable, which means that it will never become |
| 1142 | * running in the future either. We're all done! | 1170 | * running in the future either. We're all done! |
| 1143 | */ | 1171 | */ |
| 1172 | break; | ||
| 1173 | } | ||
| 1144 | } | 1174 | } |
| 1145 | 1175 | ||
| 1146 | /*** | 1176 | /*** |
| @@ -1174,7 +1204,7 @@ void kick_process(struct task_struct *p) | |||
| 1174 | * We want to under-estimate the load of migration sources, to | 1204 | * We want to under-estimate the load of migration sources, to |
| 1175 | * balance conservatively. | 1205 | * balance conservatively. |
| 1176 | */ | 1206 | */ |
| 1177 | static inline unsigned long source_load(int cpu, int type) | 1207 | static unsigned long source_load(int cpu, int type) |
| 1178 | { | 1208 | { |
| 1179 | struct rq *rq = cpu_rq(cpu); | 1209 | struct rq *rq = cpu_rq(cpu); |
| 1180 | unsigned long total = weighted_cpuload(cpu); | 1210 | unsigned long total = weighted_cpuload(cpu); |
| @@ -1189,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type) | |||
| 1189 | * Return a high guess at the load of a migration-target cpu weighted | 1219 | * Return a high guess at the load of a migration-target cpu weighted |
| 1190 | * according to the scheduling class and "nice" value. | 1220 | * according to the scheduling class and "nice" value. |
| 1191 | */ | 1221 | */ |
| 1192 | static inline unsigned long target_load(int cpu, int type) | 1222 | static unsigned long target_load(int cpu, int type) |
| 1193 | { | 1223 | { |
| 1194 | struct rq *rq = cpu_rq(cpu); | 1224 | struct rq *rq = cpu_rq(cpu); |
| 1195 | unsigned long total = weighted_cpuload(cpu); | 1225 | unsigned long total = weighted_cpuload(cpu); |
| @@ -1231,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1231 | 1261 | ||
| 1232 | /* Skip over this group if it has no CPUs allowed */ | 1262 | /* Skip over this group if it has no CPUs allowed */ |
| 1233 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1263 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
| 1234 | goto nextgroup; | 1264 | continue; |
| 1235 | 1265 | ||
| 1236 | local_group = cpu_isset(this_cpu, group->cpumask); | 1266 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 1237 | 1267 | ||
| @@ -1259,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1259 | min_load = avg_load; | 1289 | min_load = avg_load; |
| 1260 | idlest = group; | 1290 | idlest = group; |
| 1261 | } | 1291 | } |
| 1262 | nextgroup: | 1292 | } while (group = group->next, group != sd->groups); |
| 1263 | group = group->next; | ||
| 1264 | } while (group != sd->groups); | ||
| 1265 | 1293 | ||
| 1266 | if (!idlest || 100*this_load < imbalance*min_load) | 1294 | if (!idlest || 100*this_load < imbalance*min_load) |
| 1267 | return NULL; | 1295 | return NULL; |
| @@ -1393,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 1393 | if (sd->flags & SD_WAKE_IDLE) { | 1421 | if (sd->flags & SD_WAKE_IDLE) { |
| 1394 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1422 | cpus_and(tmp, sd->span, p->cpus_allowed); |
| 1395 | for_each_cpu_mask(i, tmp) { | 1423 | for_each_cpu_mask(i, tmp) { |
| 1396 | if (idle_cpu(i)) | 1424 | if (idle_cpu(i)) { |
| 1425 | if (i != task_cpu(p)) { | ||
| 1426 | schedstat_inc(p, | ||
| 1427 | se.nr_wakeups_idle); | ||
| 1428 | } | ||
| 1397 | return i; | 1429 | return i; |
| 1430 | } | ||
| 1398 | } | 1431 | } |
| 1399 | } else { | 1432 | } else { |
| 1400 | break; | 1433 | break; |
| @@ -1425,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
| 1425 | */ | 1458 | */ |
| 1426 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1459 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
| 1427 | { | 1460 | { |
| 1428 | int cpu, this_cpu, success = 0; | 1461 | int cpu, orig_cpu, this_cpu, success = 0; |
| 1429 | unsigned long flags; | 1462 | unsigned long flags; |
| 1430 | long old_state; | 1463 | long old_state; |
| 1431 | struct rq *rq; | 1464 | struct rq *rq; |
| @@ -1444,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1444 | goto out_running; | 1477 | goto out_running; |
| 1445 | 1478 | ||
| 1446 | cpu = task_cpu(p); | 1479 | cpu = task_cpu(p); |
| 1480 | orig_cpu = cpu; | ||
| 1447 | this_cpu = smp_processor_id(); | 1481 | this_cpu = smp_processor_id(); |
| 1448 | 1482 | ||
| 1449 | #ifdef CONFIG_SMP | 1483 | #ifdef CONFIG_SMP |
| @@ -1452,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1452 | 1486 | ||
| 1453 | new_cpu = cpu; | 1487 | new_cpu = cpu; |
| 1454 | 1488 | ||
| 1455 | schedstat_inc(rq, ttwu_cnt); | 1489 | schedstat_inc(rq, ttwu_count); |
| 1456 | if (cpu == this_cpu) { | 1490 | if (cpu == this_cpu) { |
| 1457 | schedstat_inc(rq, ttwu_local); | 1491 | schedstat_inc(rq, ttwu_local); |
| 1458 | goto out_set_cpu; | 1492 | goto out_set_cpu; |
| @@ -1487,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1487 | unsigned long tl = this_load; | 1521 | unsigned long tl = this_load; |
| 1488 | unsigned long tl_per_task; | 1522 | unsigned long tl_per_task; |
| 1489 | 1523 | ||
| 1524 | /* | ||
| 1525 | * Attract cache-cold tasks on sync wakeups: | ||
| 1526 | */ | ||
| 1527 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
| 1528 | goto out_set_cpu; | ||
| 1529 | |||
| 1530 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
| 1490 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1531 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
| 1491 | 1532 | ||
| 1492 | /* | 1533 | /* |
| @@ -1506,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1506 | * there is no bad imbalance. | 1547 | * there is no bad imbalance. |
| 1507 | */ | 1548 | */ |
| 1508 | schedstat_inc(this_sd, ttwu_move_affine); | 1549 | schedstat_inc(this_sd, ttwu_move_affine); |
| 1550 | schedstat_inc(p, se.nr_wakeups_affine); | ||
| 1509 | goto out_set_cpu; | 1551 | goto out_set_cpu; |
| 1510 | } | 1552 | } |
| 1511 | } | 1553 | } |
| @@ -1517,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1517 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1559 | if (this_sd->flags & SD_WAKE_BALANCE) { |
| 1518 | if (imbalance*this_load <= 100*load) { | 1560 | if (imbalance*this_load <= 100*load) { |
| 1519 | schedstat_inc(this_sd, ttwu_move_balance); | 1561 | schedstat_inc(this_sd, ttwu_move_balance); |
| 1562 | schedstat_inc(p, se.nr_wakeups_passive); | ||
| 1520 | goto out_set_cpu; | 1563 | goto out_set_cpu; |
| 1521 | } | 1564 | } |
| 1522 | } | 1565 | } |
| @@ -1542,18 +1585,18 @@ out_set_cpu: | |||
| 1542 | 1585 | ||
| 1543 | out_activate: | 1586 | out_activate: |
| 1544 | #endif /* CONFIG_SMP */ | 1587 | #endif /* CONFIG_SMP */ |
| 1588 | schedstat_inc(p, se.nr_wakeups); | ||
| 1589 | if (sync) | ||
| 1590 | schedstat_inc(p, se.nr_wakeups_sync); | ||
| 1591 | if (orig_cpu != cpu) | ||
| 1592 | schedstat_inc(p, se.nr_wakeups_migrate); | ||
| 1593 | if (cpu == this_cpu) | ||
| 1594 | schedstat_inc(p, se.nr_wakeups_local); | ||
| 1595 | else | ||
| 1596 | schedstat_inc(p, se.nr_wakeups_remote); | ||
| 1545 | update_rq_clock(rq); | 1597 | update_rq_clock(rq); |
| 1546 | activate_task(rq, p, 1); | 1598 | activate_task(rq, p, 1); |
| 1547 | /* | 1599 | check_preempt_curr(rq, p); |
| 1548 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
| 1549 | * has indicated that it will leave the CPU in short order) | ||
| 1550 | * don't trigger a preemption, if the woken up task will run on | ||
| 1551 | * this cpu. (in this case the 'I will reschedule' promise of | ||
| 1552 | * the waker guarantees that the freshly woken up task is going | ||
| 1553 | * to be considered on this CPU.) | ||
| 1554 | */ | ||
| 1555 | if (!sync || cpu != this_cpu) | ||
| 1556 | check_preempt_curr(rq, p); | ||
| 1557 | success = 1; | 1600 | success = 1; |
| 1558 | 1601 | ||
| 1559 | out_running: | 1602 | out_running: |
| @@ -1584,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1584 | */ | 1627 | */ |
| 1585 | static void __sched_fork(struct task_struct *p) | 1628 | static void __sched_fork(struct task_struct *p) |
| 1586 | { | 1629 | { |
| 1587 | p->se.wait_start_fair = 0; | ||
| 1588 | p->se.exec_start = 0; | 1630 | p->se.exec_start = 0; |
| 1589 | p->se.sum_exec_runtime = 0; | 1631 | p->se.sum_exec_runtime = 0; |
| 1590 | p->se.prev_sum_exec_runtime = 0; | 1632 | p->se.prev_sum_exec_runtime = 0; |
| 1591 | p->se.delta_exec = 0; | ||
| 1592 | p->se.delta_fair_run = 0; | ||
| 1593 | p->se.delta_fair_sleep = 0; | ||
| 1594 | p->se.wait_runtime = 0; | ||
| 1595 | p->se.sleep_start_fair = 0; | ||
| 1596 | 1633 | ||
| 1597 | #ifdef CONFIG_SCHEDSTATS | 1634 | #ifdef CONFIG_SCHEDSTATS |
| 1598 | p->se.wait_start = 0; | 1635 | p->se.wait_start = 0; |
| 1599 | p->se.sum_wait_runtime = 0; | ||
| 1600 | p->se.sum_sleep_runtime = 0; | 1636 | p->se.sum_sleep_runtime = 0; |
| 1601 | p->se.sleep_start = 0; | 1637 | p->se.sleep_start = 0; |
| 1602 | p->se.block_start = 0; | 1638 | p->se.block_start = 0; |
| 1603 | p->se.sleep_max = 0; | 1639 | p->se.sleep_max = 0; |
| 1604 | p->se.block_max = 0; | 1640 | p->se.block_max = 0; |
| 1605 | p->se.exec_max = 0; | 1641 | p->se.exec_max = 0; |
| 1642 | p->se.slice_max = 0; | ||
| 1606 | p->se.wait_max = 0; | 1643 | p->se.wait_max = 0; |
| 1607 | p->se.wait_runtime_overruns = 0; | ||
| 1608 | p->se.wait_runtime_underruns = 0; | ||
| 1609 | #endif | 1644 | #endif |
| 1610 | 1645 | ||
| 1611 | INIT_LIST_HEAD(&p->run_list); | 1646 | INIT_LIST_HEAD(&p->run_list); |
| @@ -1636,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 1636 | #ifdef CONFIG_SMP | 1671 | #ifdef CONFIG_SMP |
| 1637 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1672 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
| 1638 | #endif | 1673 | #endif |
| 1639 | __set_task_cpu(p, cpu); | 1674 | set_task_cpu(p, cpu); |
| 1640 | 1675 | ||
| 1641 | /* | 1676 | /* |
| 1642 | * Make sure we do not leak PI boosting priority to the child: | 1677 | * Make sure we do not leak PI boosting priority to the child: |
| 1643 | */ | 1678 | */ |
| 1644 | p->prio = current->normal_prio; | 1679 | p->prio = current->normal_prio; |
| 1680 | if (!rt_prio(p->prio)) | ||
| 1681 | p->sched_class = &fair_sched_class; | ||
| 1645 | 1682 | ||
| 1646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1683 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 1647 | if (likely(sched_info_on())) | 1684 | if (likely(sched_info_on())) |
| @@ -1658,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 1658 | } | 1695 | } |
| 1659 | 1696 | ||
| 1660 | /* | 1697 | /* |
| 1661 | * After fork, child runs first. (default) If set to 0 then | ||
| 1662 | * parent will (try to) run first. | ||
| 1663 | */ | ||
| 1664 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
| 1665 | |||
| 1666 | /* | ||
| 1667 | * wake_up_new_task - wake up a newly created task for the first time. | 1698 | * wake_up_new_task - wake up a newly created task for the first time. |
| 1668 | * | 1699 | * |
| 1669 | * This function will do some initial scheduler statistics housekeeping | 1700 | * This function will do some initial scheduler statistics housekeeping |
| @@ -1674,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 1674 | { | 1705 | { |
| 1675 | unsigned long flags; | 1706 | unsigned long flags; |
| 1676 | struct rq *rq; | 1707 | struct rq *rq; |
| 1677 | int this_cpu; | ||
| 1678 | 1708 | ||
| 1679 | rq = task_rq_lock(p, &flags); | 1709 | rq = task_rq_lock(p, &flags); |
| 1680 | BUG_ON(p->state != TASK_RUNNING); | 1710 | BUG_ON(p->state != TASK_RUNNING); |
| 1681 | this_cpu = smp_processor_id(); /* parent's CPU */ | ||
| 1682 | update_rq_clock(rq); | 1711 | update_rq_clock(rq); |
| 1683 | 1712 | ||
| 1684 | p->prio = effective_prio(p); | 1713 | p->prio = effective_prio(p); |
| 1685 | 1714 | ||
| 1686 | if (rt_prio(p->prio)) | 1715 | if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) { |
| 1687 | p->sched_class = &rt_sched_class; | ||
| 1688 | else | ||
| 1689 | p->sched_class = &fair_sched_class; | ||
| 1690 | |||
| 1691 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || | ||
| 1692 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || | ||
| 1693 | !current->se.on_rq) { | ||
| 1694 | |||
| 1695 | activate_task(rq, p, 0); | 1716 | activate_task(rq, p, 0); |
| 1696 | } else { | 1717 | } else { |
| 1697 | /* | 1718 | /* |
| @@ -1800,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 1800 | * with the lock held can cause deadlocks; see schedule() for | 1821 | * with the lock held can cause deadlocks; see schedule() for |
| 1801 | * details.) | 1822 | * details.) |
| 1802 | */ | 1823 | */ |
| 1803 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | 1824 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) |
| 1804 | __releases(rq->lock) | 1825 | __releases(rq->lock) |
| 1805 | { | 1826 | { |
| 1806 | struct mm_struct *mm = rq->prev_mm; | 1827 | struct mm_struct *mm = rq->prev_mm; |
| @@ -1982,42 +2003,10 @@ unsigned long nr_active(void) | |||
| 1982 | */ | 2003 | */ |
| 1983 | static void update_cpu_load(struct rq *this_rq) | 2004 | static void update_cpu_load(struct rq *this_rq) |
| 1984 | { | 2005 | { |
| 1985 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; | 2006 | unsigned long this_load = this_rq->load.weight; |
| 1986 | unsigned long total_load = this_rq->ls.load.weight; | ||
| 1987 | unsigned long this_load = total_load; | ||
| 1988 | struct load_stat *ls = &this_rq->ls; | ||
| 1989 | int i, scale; | 2007 | int i, scale; |
| 1990 | 2008 | ||
| 1991 | this_rq->nr_load_updates++; | 2009 | this_rq->nr_load_updates++; |
| 1992 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
| 1993 | goto do_avg; | ||
| 1994 | |||
| 1995 | /* Update delta_fair/delta_exec fields first */ | ||
| 1996 | update_curr_load(this_rq); | ||
| 1997 | |||
| 1998 | fair_delta64 = ls->delta_fair + 1; | ||
| 1999 | ls->delta_fair = 0; | ||
| 2000 | |||
| 2001 | exec_delta64 = ls->delta_exec + 1; | ||
| 2002 | ls->delta_exec = 0; | ||
| 2003 | |||
| 2004 | sample_interval64 = this_rq->clock - ls->load_update_last; | ||
| 2005 | ls->load_update_last = this_rq->clock; | ||
| 2006 | |||
| 2007 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
| 2008 | sample_interval64 = TICK_NSEC; | ||
| 2009 | |||
| 2010 | if (exec_delta64 > sample_interval64) | ||
| 2011 | exec_delta64 = sample_interval64; | ||
| 2012 | |||
| 2013 | idle_delta64 = sample_interval64 - exec_delta64; | ||
| 2014 | |||
| 2015 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
| 2016 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
| 2017 | |||
| 2018 | this_load = (unsigned long)tmp64; | ||
| 2019 | |||
| 2020 | do_avg: | ||
| 2021 | 2010 | ||
| 2022 | /* Update our load: */ | 2011 | /* Update our load: */ |
| 2023 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2012 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
| @@ -2027,7 +2016,13 @@ do_avg: | |||
| 2027 | 2016 | ||
| 2028 | old_load = this_rq->cpu_load[i]; | 2017 | old_load = this_rq->cpu_load[i]; |
| 2029 | new_load = this_load; | 2018 | new_load = this_load; |
| 2030 | 2019 | /* | |
| 2020 | * Round up the averaging division if load is increasing. This | ||
| 2021 | * prevents us from getting stuck on 9 if the load is 10, for | ||
| 2022 | * example. | ||
| 2023 | */ | ||
| 2024 | if (new_load > old_load) | ||
| 2025 | new_load += scale-1; | ||
| 2031 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2026 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
| 2032 | } | 2027 | } |
| 2033 | } | 2028 | } |
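Each cpu_load[i] is a decaying average, new = (old * (2^i - 1) + sample) >> i, with a longer horizon for larger i. The round-up matters when the load is rising: at i = 1, an old value of 9 against a steady sample of 10 yields (9 + 10) >> 1 = 9 forever, while adding scale - 1 lets the average actually reach 10. A small stand-alone model of the loop:

    #include <stdio.h>

    #define CPU_LOAD_IDX_MAX 5

    int main(void)
    {
            unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
            unsigned long this_load = 10;   /* steady instantaneous load */
            unsigned long scale;
            int tick, i;

            for (tick = 0; tick < 20; tick++) {
                    for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                            unsigned long old_load = cpu_load[i];
                            unsigned long new_load = this_load;

                            /* Round up when rising, as in the diff, so the
                             * average converges to 10 instead of stalling at 9. */
                            if (new_load > old_load)
                                    new_load += scale - 1;
                            cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
                    }
            }
            for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                    printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
            return 0;
    }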
| @@ -2179,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 2179 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2174 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
| 2180 | * 3) are cache-hot on their current CPU. | 2175 | * 3) are cache-hot on their current CPU. |
| 2181 | */ | 2176 | */ |
| 2182 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 2177 | if (!cpu_isset(this_cpu, p->cpus_allowed)) { |
| 2178 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
| 2183 | return 0; | 2179 | return 0; |
| 2180 | } | ||
| 2184 | *all_pinned = 0; | 2181 | *all_pinned = 0; |
| 2185 | 2182 | ||
| 2186 | if (task_running(rq, p)) | 2183 | if (task_running(rq, p)) { |
| 2184 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
| 2187 | return 0; | 2185 | return 0; |
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Aggressive migration if: | ||
| 2190 | * 1) task is cache cold, or | ||
| 2191 | * 2) too many balance attempts have failed. | ||
| 2192 | */ | ||
| 2193 | |||
| 2194 | if (!task_hot(p, rq->clock, sd) || | ||
| 2195 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
| 2196 | #ifdef CONFIG_SCHEDSTATS | ||
| 2197 | if (task_hot(p, rq->clock, sd)) { | ||
| 2198 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2199 | schedstat_inc(p, se.nr_forced_migrations); | ||
| 2200 | } | ||
| 2201 | #endif | ||
| 2202 | return 1; | ||
| 2203 | } | ||
| 2188 | 2204 | ||
| 2205 | if (task_hot(p, rq->clock, sd)) { | ||
| 2206 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
| 2207 | return 0; | ||
| 2208 | } | ||
| 2189 | return 1; | 2209 | return 1; |
| 2190 | } | 2210 | } |
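can_migrate_task() now rejects candidates in a fixed order (wrong affinity, then currently running, then cache-hot) and only overrides cache-hotness when the domain has already failed to balance more than cache_nice_tries times; each early exit bumps the matching nr_failed_migrations_* schedstat. The bare decision order, restated as a stand-alone predicate with placeholder parameters and the statistics left out:

    enum verdict { DENY_AFFINE, DENY_RUNNING, DENY_HOT, ALLOW };

    enum verdict migration_verdict(int cpu_allowed, int running, int cache_hot,
                                   int nr_balance_failed, int cache_nice_tries)
    {
            if (!cpu_allowed)
                    return DENY_AFFINE;     /* pinned to other CPUs */
            if (running)
                    return DENY_RUNNING;    /* cannot move the current task */
            /* Aggressive migration: the task is cold, or balancing has
             * already failed too many times to stay polite. */
            if (!cache_hot || nr_balance_failed > cache_nice_tries)
                    return ALLOW;
            return DENY_HOT;                /* warm cache, be conservative */
    }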
| 2191 | 2211 | ||
| @@ -2264,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2264 | struct sched_domain *sd, enum cpu_idle_type idle, | 2284 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 2265 | int *all_pinned) | 2285 | int *all_pinned) |
| 2266 | { | 2286 | { |
| 2267 | struct sched_class *class = sched_class_highest; | 2287 | const struct sched_class *class = sched_class_highest; |
| 2268 | unsigned long total_load_moved = 0; | 2288 | unsigned long total_load_moved = 0; |
| 2269 | int this_best_prio = this_rq->curr->prio; | 2289 | int this_best_prio = this_rq->curr->prio; |
| 2270 | 2290 | ||
| @@ -2289,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2289 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2309 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 2290 | struct sched_domain *sd, enum cpu_idle_type idle) | 2310 | struct sched_domain *sd, enum cpu_idle_type idle) |
| 2291 | { | 2311 | { |
| 2292 | struct sched_class *class; | 2312 | const struct sched_class *class; |
| 2293 | int this_best_prio = MAX_PRIO; | 2313 | int this_best_prio = MAX_PRIO; |
| 2294 | 2314 | ||
| 2295 | for (class = sched_class_highest; class; class = class->next) | 2315 | for (class = sched_class_highest; class; class = class->next) |
| @@ -2653,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 2653 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2673 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 2654 | sd_idle = 1; | 2674 | sd_idle = 1; |
| 2655 | 2675 | ||
| 2656 | schedstat_inc(sd, lb_cnt[idle]); | 2676 | schedstat_inc(sd, lb_count[idle]); |
| 2657 | 2677 | ||
| 2658 | redo: | 2678 | redo: |
| 2659 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2679 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| @@ -2806,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 2806 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2826 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 2807 | sd_idle = 1; | 2827 | sd_idle = 1; |
| 2808 | 2828 | ||
| 2809 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); | 2829 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
| 2810 | redo: | 2830 | redo: |
| 2811 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 2831 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
| 2812 | &sd_idle, &cpus, NULL); | 2832 | &sd_idle, &cpus, NULL); |
| @@ -2940,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 2940 | } | 2960 | } |
| 2941 | 2961 | ||
| 2942 | if (likely(sd)) { | 2962 | if (likely(sd)) { |
| 2943 | schedstat_inc(sd, alb_cnt); | 2963 | schedstat_inc(sd, alb_count); |
| 2944 | 2964 | ||
| 2945 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 2965 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
| 2946 | sd, CPU_IDLE)) | 2966 | sd, CPU_IDLE)) |
| @@ -3033,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing); | |||
| 3033 | * | 3053 | * |
| 3034 | * Balancing parameters are set up in arch_init_sched_domains. | 3054 | * Balancing parameters are set up in arch_init_sched_domains. |
| 3035 | */ | 3055 | */ |
| 3036 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | 3056 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
| 3037 | { | 3057 | { |
| 3038 | int balance = 1; | 3058 | int balance = 1; |
| 3039 | struct rq *rq = cpu_rq(cpu); | 3059 | struct rq *rq = cpu_rq(cpu); |
| @@ -3280,6 +3300,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
| 3280 | } | 3300 | } |
| 3281 | 3301 | ||
| 3282 | /* | 3302 | /* |
| 3303 | * Account guest cpu time to a process. | ||
| 3304 | * @p: the process that the cpu time gets accounted to | ||
| 3305 | * @cputime: the cpu time spent in virtual machine since the last update | ||
| 3306 | */ | ||
| 3307 | void account_guest_time(struct task_struct *p, cputime_t cputime) | ||
| 3308 | { | ||
| 3309 | cputime64_t tmp; | ||
| 3310 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
| 3311 | |||
| 3312 | tmp = cputime_to_cputime64(cputime); | ||
| 3313 | |||
| 3314 | p->utime = cputime_add(p->utime, cputime); | ||
| 3315 | p->gtime = cputime_add(p->gtime, cputime); | ||
| 3316 | |||
| 3317 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
| 3318 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
| 3319 | } | ||
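account_guest_time() charges the tick to both utime/gtime on the task and to the user and guest buckets of the per-cpu stats; account_system_time() below diverts a tick here whenever PF_VCPU is set and clears the flag, and the flag is raised by the virtualization side around guest execution (the kvm changes elsewhere in this series). A stand-alone model of that handshake, with an invented flag value and simplified fields:

    #include <stdio.h>

    #define PF_VCPU 0x1     /* illustrative flag value, not the kernel's */

    struct task { unsigned int flags; unsigned long utime, stime, gtime; };

    /* A tick that lands while the task is flagged as running guest code is
     * accounted as user+guest time; otherwise it is ordinary system time. */
    static void account_tick(struct task *p, unsigned long cputime)
    {
            if (p->flags & PF_VCPU) {
                    p->utime += cputime;
                    p->gtime += cputime;
                    p->flags &= ~PF_VCPU;   /* one-shot, re-armed on guest entry */
                    return;
            }
            p->stime += cputime;
    }

    int main(void)
    {
            struct task t = { 0 };

            t.flags |= PF_VCPU;     /* "hypervisor" marks guest entry */
            account_tick(&t, 1);    /* this tick counts as guest time */
            account_tick(&t, 1);    /* flag cleared: plain system time */
            printf("utime=%lu stime=%lu gtime=%lu\n", t.utime, t.stime, t.gtime);
            return 0;
    }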
| 3320 | |||
| 3321 | /* | ||
| 3283 | * Account system cpu time to a process. | 3322 | * Account system cpu time to a process. |
| 3284 | * @p: the process that the cpu time gets accounted to | 3323 | * @p: the process that the cpu time gets accounted to |
| 3285 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3324 | * @hardirq_offset: the offset to subtract from hardirq_count() |
| @@ -3292,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3292 | struct rq *rq = this_rq(); | 3331 | struct rq *rq = this_rq(); |
| 3293 | cputime64_t tmp; | 3332 | cputime64_t tmp; |
| 3294 | 3333 | ||
| 3334 | if (p->flags & PF_VCPU) { | ||
| 3335 | account_guest_time(p, cputime); | ||
| 3336 | p->flags &= ~PF_VCPU; | ||
| 3337 | return; | ||
| 3338 | } | ||
| 3339 | |||
| 3295 | p->stime = cputime_add(p->stime, cputime); | 3340 | p->stime = cputime_add(p->stime, cputime); |
| 3296 | 3341 | ||
| 3297 | /* Add system time to cpustat. */ | 3342 | /* Add system time to cpustat. */ |
| @@ -3430,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3430 | 3475 | ||
| 3431 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3476 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| 3432 | 3477 | ||
| 3433 | schedstat_inc(this_rq(), sched_cnt); | 3478 | schedstat_inc(this_rq(), sched_count); |
| 3479 | #ifdef CONFIG_SCHEDSTATS | ||
| 3480 | if (unlikely(prev->lock_depth >= 0)) { | ||
| 3481 | schedstat_inc(this_rq(), bkl_count); | ||
| 3482 | schedstat_inc(prev, sched_info.bkl_count); | ||
| 3483 | } | ||
| 3484 | #endif | ||
| 3434 | } | 3485 | } |
| 3435 | 3486 | ||
| 3436 | /* | 3487 | /* |
| @@ -3439,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3439 | static inline struct task_struct * | 3490 | static inline struct task_struct * |
| 3440 | pick_next_task(struct rq *rq, struct task_struct *prev) | 3491 | pick_next_task(struct rq *rq, struct task_struct *prev) |
| 3441 | { | 3492 | { |
| 3442 | struct sched_class *class; | 3493 | const struct sched_class *class; |
| 3443 | struct task_struct *p; | 3494 | struct task_struct *p; |
| 3444 | 3495 | ||
| 3445 | /* | 3496 | /* |
| @@ -3488,9 +3539,13 @@ need_resched_nonpreemptible: | |||
| 3488 | 3539 | ||
| 3489 | schedule_debug(prev); | 3540 | schedule_debug(prev); |
| 3490 | 3541 | ||
| 3491 | spin_lock_irq(&rq->lock); | 3542 | /* |
| 3492 | clear_tsk_need_resched(prev); | 3543 | * Do the rq-clock update outside the rq lock: |
| 3544 | */ | ||
| 3545 | local_irq_disable(); | ||
| 3493 | __update_rq_clock(rq); | 3546 | __update_rq_clock(rq); |
| 3547 | spin_lock(&rq->lock); | ||
| 3548 | clear_tsk_need_resched(prev); | ||
| 3494 | 3549 | ||
| 3495 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3550 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 3496 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3551 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
| @@ -3550,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void) | |||
| 3550 | if (likely(ti->preempt_count || irqs_disabled())) | 3605 | if (likely(ti->preempt_count || irqs_disabled())) |
| 3551 | return; | 3606 | return; |
| 3552 | 3607 | ||
| 3553 | need_resched: | 3608 | do { |
| 3554 | add_preempt_count(PREEMPT_ACTIVE); | 3609 | add_preempt_count(PREEMPT_ACTIVE); |
| 3555 | /* | 3610 | |
| 3556 | * We keep the big kernel semaphore locked, but we | 3611 | /* |
| 3557 | * clear ->lock_depth so that schedule() doesn't | 3612 | * We keep the big kernel semaphore locked, but we |
| 3558 | * auto-release the semaphore: | 3613 | * clear ->lock_depth so that schedule() doesn't |
| 3559 | */ | 3614 | * auto-release the semaphore: |
| 3615 | */ | ||
| 3560 | #ifdef CONFIG_PREEMPT_BKL | 3616 | #ifdef CONFIG_PREEMPT_BKL |
| 3561 | saved_lock_depth = task->lock_depth; | 3617 | saved_lock_depth = task->lock_depth; |
| 3562 | task->lock_depth = -1; | 3618 | task->lock_depth = -1; |
| 3563 | #endif | 3619 | #endif |
| 3564 | schedule(); | 3620 | schedule(); |
| 3565 | #ifdef CONFIG_PREEMPT_BKL | 3621 | #ifdef CONFIG_PREEMPT_BKL |
| 3566 | task->lock_depth = saved_lock_depth; | 3622 | task->lock_depth = saved_lock_depth; |
| 3567 | #endif | 3623 | #endif |
| 3568 | sub_preempt_count(PREEMPT_ACTIVE); | 3624 | sub_preempt_count(PREEMPT_ACTIVE); |
| 3569 | 3625 | ||
| 3570 | /* we could miss a preemption opportunity between schedule and now */ | 3626 | /* |
| 3571 | barrier(); | 3627 | * Check again in case we missed a preemption opportunity |
| 3572 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3628 | * between schedule and now. |
| 3573 | goto need_resched; | 3629 | */ |
| 3630 | barrier(); | ||
| 3631 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
| 3574 | } | 3632 | } |
| 3575 | EXPORT_SYMBOL(preempt_schedule); | 3633 | EXPORT_SYMBOL(preempt_schedule); |
| 3576 | 3634 | ||
| @@ -3590,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 3590 | /* Catch callers which need to be fixed */ | 3648 | /* Catch callers which need to be fixed */ |
| 3591 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3649 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
| 3592 | 3650 | ||
| 3593 | need_resched: | 3651 | do { |
| 3594 | add_preempt_count(PREEMPT_ACTIVE); | 3652 | add_preempt_count(PREEMPT_ACTIVE); |
| 3595 | /* | 3653 | |
| 3596 | * We keep the big kernel semaphore locked, but we | 3654 | /* |
| 3597 | * clear ->lock_depth so that schedule() doesn't | 3655 | * We keep the big kernel semaphore locked, but we |
| 3598 | * auto-release the semaphore: | 3656 | * clear ->lock_depth so that schedule() doesn't |
| 3599 | */ | 3657 | * auto-release the semaphore: |
| 3658 | */ | ||
| 3600 | #ifdef CONFIG_PREEMPT_BKL | 3659 | #ifdef CONFIG_PREEMPT_BKL |
| 3601 | saved_lock_depth = task->lock_depth; | 3660 | saved_lock_depth = task->lock_depth; |
| 3602 | task->lock_depth = -1; | 3661 | task->lock_depth = -1; |
| 3603 | #endif | 3662 | #endif |
| 3604 | local_irq_enable(); | 3663 | local_irq_enable(); |
| 3605 | schedule(); | 3664 | schedule(); |
| 3606 | local_irq_disable(); | 3665 | local_irq_disable(); |
| 3607 | #ifdef CONFIG_PREEMPT_BKL | 3666 | #ifdef CONFIG_PREEMPT_BKL |
| 3608 | task->lock_depth = saved_lock_depth; | 3667 | task->lock_depth = saved_lock_depth; |
| 3609 | #endif | 3668 | #endif |
| 3610 | sub_preempt_count(PREEMPT_ACTIVE); | 3669 | sub_preempt_count(PREEMPT_ACTIVE); |
| 3611 | 3670 | ||
| 3612 | /* we could miss a preemption opportunity between schedule and now */ | 3671 | /* |
| 3613 | barrier(); | 3672 | * Check again in case we missed a preemption opportunity |
| 3614 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3673 | * between schedule and now. |
| 3615 | goto need_resched; | 3674 | */ |
| 3675 | barrier(); | ||
| 3676 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
| 3616 | } | 3677 | } |
| 3617 | 3678 | ||
| 3618 | #endif /* CONFIG_PREEMPT */ | 3679 | #endif /* CONFIG_PREEMPT */ |
| @@ -3636,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 3636 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3697 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
| 3637 | int nr_exclusive, int sync, void *key) | 3698 | int nr_exclusive, int sync, void *key) |
| 3638 | { | 3699 | { |
| 3639 | struct list_head *tmp, *next; | 3700 | wait_queue_t *curr, *next; |
| 3640 | 3701 | ||
| 3641 | list_for_each_safe(tmp, next, &q->task_list) { | 3702 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
| 3642 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | ||
| 3643 | unsigned flags = curr->flags; | 3703 | unsigned flags = curr->flags; |
| 3644 | 3704 | ||
| 3645 | if (curr->func(curr, mode, sync, key) && | 3705 | if (curr->func(curr, mode, sync, key) && |
| @@ -3729,206 +3789,116 @@ void fastcall complete_all(struct completion *x) | |||
| 3729 | } | 3789 | } |
| 3730 | EXPORT_SYMBOL(complete_all); | 3790 | EXPORT_SYMBOL(complete_all); |
| 3731 | 3791 | ||
| 3732 | void fastcall __sched wait_for_completion(struct completion *x) | 3792 | static inline long __sched |
| 3733 | { | 3793 | do_wait_for_common(struct completion *x, long timeout, int state) |
| 3734 | might_sleep(); | ||
| 3735 | |||
| 3736 | spin_lock_irq(&x->wait.lock); | ||
| 3737 | if (!x->done) { | ||
| 3738 | DECLARE_WAITQUEUE(wait, current); | ||
| 3739 | |||
| 3740 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
| 3741 | __add_wait_queue_tail(&x->wait, &wait); | ||
| 3742 | do { | ||
| 3743 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 3744 | spin_unlock_irq(&x->wait.lock); | ||
| 3745 | schedule(); | ||
| 3746 | spin_lock_irq(&x->wait.lock); | ||
| 3747 | } while (!x->done); | ||
| 3748 | __remove_wait_queue(&x->wait, &wait); | ||
| 3749 | } | ||
| 3750 | x->done--; | ||
| 3751 | spin_unlock_irq(&x->wait.lock); | ||
| 3752 | } | ||
| 3753 | EXPORT_SYMBOL(wait_for_completion); | ||
| 3754 | |||
| 3755 | unsigned long fastcall __sched | ||
| 3756 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
| 3757 | { | 3794 | { |
| 3758 | might_sleep(); | ||
| 3759 | |||
| 3760 | spin_lock_irq(&x->wait.lock); | ||
| 3761 | if (!x->done) { | 3795 | if (!x->done) { |
| 3762 | DECLARE_WAITQUEUE(wait, current); | 3796 | DECLARE_WAITQUEUE(wait, current); |
| 3763 | 3797 | ||
| 3764 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3798 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
| 3765 | __add_wait_queue_tail(&x->wait, &wait); | 3799 | __add_wait_queue_tail(&x->wait, &wait); |
| 3766 | do { | 3800 | do { |
| 3767 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3801 | if (state == TASK_INTERRUPTIBLE && |
| 3802 | signal_pending(current)) { | ||
| 3803 | __remove_wait_queue(&x->wait, &wait); | ||
| 3804 | return -ERESTARTSYS; | ||
| 3805 | } | ||
| 3806 | __set_current_state(state); | ||
| 3768 | spin_unlock_irq(&x->wait.lock); | 3807 | spin_unlock_irq(&x->wait.lock); |
| 3769 | timeout = schedule_timeout(timeout); | 3808 | timeout = schedule_timeout(timeout); |
| 3770 | spin_lock_irq(&x->wait.lock); | 3809 | spin_lock_irq(&x->wait.lock); |
| 3771 | if (!timeout) { | 3810 | if (!timeout) { |
| 3772 | __remove_wait_queue(&x->wait, &wait); | 3811 | __remove_wait_queue(&x->wait, &wait); |
| 3773 | goto out; | 3812 | return timeout; |
| 3774 | } | 3813 | } |
| 3775 | } while (!x->done); | 3814 | } while (!x->done); |
| 3776 | __remove_wait_queue(&x->wait, &wait); | 3815 | __remove_wait_queue(&x->wait, &wait); |
| 3777 | } | 3816 | } |
| 3778 | x->done--; | 3817 | x->done--; |
| 3779 | out: | ||
| 3780 | spin_unlock_irq(&x->wait.lock); | ||
| 3781 | return timeout; | 3818 | return timeout; |
| 3782 | } | 3819 | } |
| 3783 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
| 3784 | 3820 | ||
| 3785 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3821 | static long __sched |
| 3822 | wait_for_common(struct completion *x, long timeout, int state) | ||
| 3786 | { | 3823 | { |
| 3787 | int ret = 0; | ||
| 3788 | |||
| 3789 | might_sleep(); | 3824 | might_sleep(); |
| 3790 | 3825 | ||
| 3791 | spin_lock_irq(&x->wait.lock); | 3826 | spin_lock_irq(&x->wait.lock); |
| 3792 | if (!x->done) { | 3827 | timeout = do_wait_for_common(x, timeout, state); |
| 3793 | DECLARE_WAITQUEUE(wait, current); | ||
| 3794 | |||
| 3795 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
| 3796 | __add_wait_queue_tail(&x->wait, &wait); | ||
| 3797 | do { | ||
| 3798 | if (signal_pending(current)) { | ||
| 3799 | ret = -ERESTARTSYS; | ||
| 3800 | __remove_wait_queue(&x->wait, &wait); | ||
| 3801 | goto out; | ||
| 3802 | } | ||
| 3803 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 3804 | spin_unlock_irq(&x->wait.lock); | ||
| 3805 | schedule(); | ||
| 3806 | spin_lock_irq(&x->wait.lock); | ||
| 3807 | } while (!x->done); | ||
| 3808 | __remove_wait_queue(&x->wait, &wait); | ||
| 3809 | } | ||
| 3810 | x->done--; | ||
| 3811 | out: | ||
| 3812 | spin_unlock_irq(&x->wait.lock); | 3828 | spin_unlock_irq(&x->wait.lock); |
| 3829 | return timeout; | ||
| 3830 | } | ||
| 3813 | 3831 | ||
| 3814 | return ret; | 3832 | void fastcall __sched wait_for_completion(struct completion *x) |
| 3833 | { | ||
| 3834 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 3815 | } | 3835 | } |
| 3816 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3836 | EXPORT_SYMBOL(wait_for_completion); |
| 3817 | 3837 | ||
| 3818 | unsigned long fastcall __sched | 3838 | unsigned long fastcall __sched |
| 3819 | wait_for_completion_interruptible_timeout(struct completion *x, | 3839 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
| 3820 | unsigned long timeout) | ||
| 3821 | { | 3840 | { |
| 3822 | might_sleep(); | 3841 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
| 3823 | |||
| 3824 | spin_lock_irq(&x->wait.lock); | ||
| 3825 | if (!x->done) { | ||
| 3826 | DECLARE_WAITQUEUE(wait, current); | ||
| 3827 | |||
| 3828 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
| 3829 | __add_wait_queue_tail(&x->wait, &wait); | ||
| 3830 | do { | ||
| 3831 | if (signal_pending(current)) { | ||
| 3832 | timeout = -ERESTARTSYS; | ||
| 3833 | __remove_wait_queue(&x->wait, &wait); | ||
| 3834 | goto out; | ||
| 3835 | } | ||
| 3836 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 3837 | spin_unlock_irq(&x->wait.lock); | ||
| 3838 | timeout = schedule_timeout(timeout); | ||
| 3839 | spin_lock_irq(&x->wait.lock); | ||
| 3840 | if (!timeout) { | ||
| 3841 | __remove_wait_queue(&x->wait, &wait); | ||
| 3842 | goto out; | ||
| 3843 | } | ||
| 3844 | } while (!x->done); | ||
| 3845 | __remove_wait_queue(&x->wait, &wait); | ||
| 3846 | } | ||
| 3847 | x->done--; | ||
| 3848 | out: | ||
| 3849 | spin_unlock_irq(&x->wait.lock); | ||
| 3850 | return timeout; | ||
| 3851 | } | 3842 | } |
| 3852 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3843 | EXPORT_SYMBOL(wait_for_completion_timeout); |
| 3853 | 3844 | ||
| 3854 | static inline void | 3845 | int __sched wait_for_completion_interruptible(struct completion *x) |
| 3855 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | ||
| 3856 | { | 3846 | { |
| 3857 | spin_lock_irqsave(&q->lock, *flags); | 3847 | return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
| 3858 | __add_wait_queue(q, wait); | ||
| 3859 | spin_unlock(&q->lock); | ||
| 3860 | } | 3848 | } |
| 3849 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
| 3861 | 3850 | ||
| 3862 | static inline void | 3851 | unsigned long fastcall __sched |
| 3863 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | 3852 | wait_for_completion_interruptible_timeout(struct completion *x, |
| 3853 | unsigned long timeout) | ||
| 3864 | { | 3854 | { |
| 3865 | spin_lock_irq(&q->lock); | 3855 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
| 3866 | __remove_wait_queue(q, wait); | ||
| 3867 | spin_unlock_irqrestore(&q->lock, *flags); | ||
| 3868 | } | 3856 | } |
| 3857 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
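All four wait_for_completion*() variants are now thin wrappers around a single wait_for_common()/do_wait_for_common() pair parameterized by timeout and task state, so callers see no behavioural change. For reference, a typical caller of the timeout variant; this is an illustrative kernel-context fragment that assumes some other context (an interrupt handler, say) signals the completion with complete():

    /* Illustrative only: wait up to 100 ms for 'done' to be completed. */
    static int wait_for_device(struct completion *done)
    {
            unsigned long left;

            left = wait_for_completion_timeout(done, msecs_to_jiffies(100));
            if (!left)
                    return -ETIMEDOUT;      /* timed out, never completed */
            return 0;                       /* completed with time to spare */
    }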
| 3869 | 3858 | ||
| 3870 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 3859 | static long __sched |
| 3860 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | ||
| 3871 | { | 3861 | { |
| 3872 | unsigned long flags; | 3862 | unsigned long flags; |
| 3873 | wait_queue_t wait; | 3863 | wait_queue_t wait; |
| 3874 | 3864 | ||
| 3875 | init_waitqueue_entry(&wait, current); | 3865 | init_waitqueue_entry(&wait, current); |
| 3876 | 3866 | ||
| 3877 | current->state = TASK_INTERRUPTIBLE; | 3867 | __set_current_state(state); |
| 3878 | 3868 | ||
| 3879 | sleep_on_head(q, &wait, &flags); | 3869 | spin_lock_irqsave(&q->lock, flags); |
| 3880 | schedule(); | 3870 | __add_wait_queue(q, &wait); |
| 3881 | sleep_on_tail(q, &wait, &flags); | 3871 | spin_unlock(&q->lock); |
| 3872 | timeout = schedule_timeout(timeout); | ||
| 3873 | spin_lock_irq(&q->lock); | ||
| 3874 | __remove_wait_queue(q, &wait); | ||
| 3875 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 3876 | |||
| 3877 | return timeout; | ||
| 3878 | } | ||
| 3879 | |||
| 3880 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | ||
| 3881 | { | ||
| 3882 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 3882 | } | 3883 | } |
| 3883 | EXPORT_SYMBOL(interruptible_sleep_on); | 3884 | EXPORT_SYMBOL(interruptible_sleep_on); |
| 3884 | 3885 | ||
| 3885 | long __sched | 3886 | long __sched |
| 3886 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3887 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
| 3887 | { | 3888 | { |
| 3888 | unsigned long flags; | 3889 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
| 3889 | wait_queue_t wait; | ||
| 3890 | |||
| 3891 | init_waitqueue_entry(&wait, current); | ||
| 3892 | |||
| 3893 | current->state = TASK_INTERRUPTIBLE; | ||
| 3894 | |||
| 3895 | sleep_on_head(q, &wait, &flags); | ||
| 3896 | timeout = schedule_timeout(timeout); | ||
| 3897 | sleep_on_tail(q, &wait, &flags); | ||
| 3898 | |||
| 3899 | return timeout; | ||
| 3900 | } | 3890 | } |
| 3901 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3891 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
| 3902 | 3892 | ||
| 3903 | void __sched sleep_on(wait_queue_head_t *q) | 3893 | void __sched sleep_on(wait_queue_head_t *q) |
| 3904 | { | 3894 | { |
| 3905 | unsigned long flags; | 3895 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
| 3906 | wait_queue_t wait; | ||
| 3907 | |||
| 3908 | init_waitqueue_entry(&wait, current); | ||
| 3909 | |||
| 3910 | current->state = TASK_UNINTERRUPTIBLE; | ||
| 3911 | |||
| 3912 | sleep_on_head(q, &wait, &flags); | ||
| 3913 | schedule(); | ||
| 3914 | sleep_on_tail(q, &wait, &flags); | ||
| 3915 | } | 3896 | } |
| 3916 | EXPORT_SYMBOL(sleep_on); | 3897 | EXPORT_SYMBOL(sleep_on); |
| 3917 | 3898 | ||
| 3918 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3899 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
| 3919 | { | 3900 | { |
| 3920 | unsigned long flags; | 3901 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
| 3921 | wait_queue_t wait; | ||
| 3922 | |||
| 3923 | init_waitqueue_entry(&wait, current); | ||
| 3924 | |||
| 3925 | current->state = TASK_UNINTERRUPTIBLE; | ||
| 3926 | |||
| 3927 | sleep_on_head(q, &wait, &flags); | ||
| 3928 | timeout = schedule_timeout(timeout); | ||
| 3929 | sleep_on_tail(q, &wait, &flags); | ||
| 3930 | |||
| 3931 | return timeout; | ||
| 3932 | } | 3902 | } |
| 3933 | EXPORT_SYMBOL(sleep_on_timeout); | 3903 | EXPORT_SYMBOL(sleep_on_timeout); |
| 3934 | 3904 | ||
| @@ -3947,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
| 3947 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3917 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 3948 | { | 3918 | { |
| 3949 | unsigned long flags; | 3919 | unsigned long flags; |
| 3950 | int oldprio, on_rq; | 3920 | int oldprio, on_rq, running; |
| 3951 | struct rq *rq; | 3921 | struct rq *rq; |
| 3952 | 3922 | ||
| 3953 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3923 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
| @@ -3957,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3957 | 3927 | ||
| 3958 | oldprio = p->prio; | 3928 | oldprio = p->prio; |
| 3959 | on_rq = p->se.on_rq; | 3929 | on_rq = p->se.on_rq; |
| 3960 | if (on_rq) | 3930 | running = task_running(rq, p); |
| 3931 | if (on_rq) { | ||
| 3961 | dequeue_task(rq, p, 0); | 3932 | dequeue_task(rq, p, 0); |
| 3933 | if (running) | ||
| 3934 | p->sched_class->put_prev_task(rq, p); | ||
| 3935 | } | ||
| 3962 | 3936 | ||
| 3963 | if (rt_prio(prio)) | 3937 | if (rt_prio(prio)) |
| 3964 | p->sched_class = &rt_sched_class; | 3938 | p->sched_class = &rt_sched_class; |
| @@ -3968,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3968 | p->prio = prio; | 3942 | p->prio = prio; |
| 3969 | 3943 | ||
| 3970 | if (on_rq) { | 3944 | if (on_rq) { |
| 3945 | if (running) | ||
| 3946 | p->sched_class->set_curr_task(rq); | ||
| 3971 | enqueue_task(rq, p, 0); | 3947 | enqueue_task(rq, p, 0); |
| 3972 | /* | 3948 | /* |
| 3973 | * Reschedule if we are currently running on this runqueue and | 3949 | * Reschedule if we are currently running on this runqueue and |
| 3974 | * our priority decreased, or if we are not currently running on | 3950 | * our priority decreased, or if we are not currently running on |
| 3975 | * this runqueue and our priority is higher than the current's | 3951 | * this runqueue and our priority is higher than the current's |
| 3976 | */ | 3952 | */ |
| 3977 | if (task_running(rq, p)) { | 3953 | if (running) { |
| 3978 | if (p->prio > oldprio) | 3954 | if (p->prio > oldprio) |
| 3979 | resched_task(rq->curr); | 3955 | resched_task(rq->curr); |
| 3980 | } else { | 3956 | } else { |
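The hunk above caches `running = task_running(rq, p)` once and brackets the priority change with the scheduling-class hooks: put_prev_task() before the change, set_curr_task() after the class/priority is updated, and the resched decision then reuses the cached flag. Below is a minimal userspace sketch of that ordering; the structs, the fake_class hooks and change_prio() are invented stand-ins for illustration only, not kernel types.

    /*
     * Illustrative model of the dequeue -> put_prev_task -> change ->
     * set_curr_task -> enqueue ordering. All names here are made up.
     */
    #include <stdio.h>
    #include <stdbool.h>

    struct task {
        int prio;        /* lower value == higher priority */
        bool on_rq;
        bool running;
    };

    struct class_ops {
        void (*put_prev_task)(struct task *p);
        void (*set_curr_task)(struct task *p);
    };

    static void put_prev(struct task *p) { printf("put_prev_task(prio=%d)\n", p->prio); }
    static void set_curr(struct task *p) { printf("set_curr_task(prio=%d)\n", p->prio); }

    static const struct class_ops fake_class = { put_prev, set_curr };

    static void change_prio(struct task *p, int prio)
    {
        int oldprio = p->prio;
        bool on_rq = p->on_rq, running = p->running;

        if (on_rq) {
            p->on_rq = false;                  /* dequeue_task() */
            if (running)
                fake_class.put_prev_task(p);   /* class gives up the current task */
        }

        p->prio = prio;                        /* the actual change */

        if (on_rq) {
            if (running)
                fake_class.set_curr_task(p);   /* class re-adopts the current task */
            p->on_rq = true;                   /* enqueue_task() */

            if (running) {
                if (prio > oldprio)            /* we got weaker while running */
                    printf("resched current task\n");
            } else {
                printf("check preemption against the current task\n");
            }
        }
    }

    int main(void)
    {
        struct task t = { .prio = 120, .on_rq = true, .running = true };

        change_prio(&t, 130);                  /* priority lowered -> resched */
        return 0;
    }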
| @@ -4138,7 +4114,7 @@ struct task_struct *idle_task(int cpu) | |||
| 4138 | * find_process_by_pid - find a process with a matching PID value. | 4114 | * find_process_by_pid - find a process with a matching PID value. |
| 4139 | * @pid: the pid in question. | 4115 | * @pid: the pid in question. |
| 4140 | */ | 4116 | */ |
| 4141 | static inline struct task_struct *find_process_by_pid(pid_t pid) | 4117 | static struct task_struct *find_process_by_pid(pid_t pid) |
| 4142 | { | 4118 | { |
| 4143 | return pid ? find_task_by_pid(pid) : current; | 4119 | return pid ? find_task_by_pid(pid) : current; |
| 4144 | } | 4120 | } |
| @@ -4180,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
| 4180 | int sched_setscheduler(struct task_struct *p, int policy, | 4156 | int sched_setscheduler(struct task_struct *p, int policy, |
| 4181 | struct sched_param *param) | 4157 | struct sched_param *param) |
| 4182 | { | 4158 | { |
| 4183 | int retval, oldprio, oldpolicy = -1, on_rq; | 4159 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
| 4184 | unsigned long flags; | 4160 | unsigned long flags; |
| 4185 | struct rq *rq; | 4161 | struct rq *rq; |
| 4186 | 4162 | ||
| @@ -4262,18 +4238,26 @@ recheck: | |||
| 4262 | } | 4238 | } |
| 4263 | update_rq_clock(rq); | 4239 | update_rq_clock(rq); |
| 4264 | on_rq = p->se.on_rq; | 4240 | on_rq = p->se.on_rq; |
| 4265 | if (on_rq) | 4241 | running = task_running(rq, p); |
| 4242 | if (on_rq) { | ||
| 4266 | deactivate_task(rq, p, 0); | 4243 | deactivate_task(rq, p, 0); |
| 4244 | if (running) | ||
| 4245 | p->sched_class->put_prev_task(rq, p); | ||
| 4246 | } | ||
| 4247 | |||
| 4267 | oldprio = p->prio; | 4248 | oldprio = p->prio; |
| 4268 | __setscheduler(rq, p, policy, param->sched_priority); | 4249 | __setscheduler(rq, p, policy, param->sched_priority); |
| 4250 | |||
| 4269 | if (on_rq) { | 4251 | if (on_rq) { |
| 4252 | if (running) | ||
| 4253 | p->sched_class->set_curr_task(rq); | ||
| 4270 | activate_task(rq, p, 0); | 4254 | activate_task(rq, p, 0); |
| 4271 | /* | 4255 | /* |
| 4272 | * Reschedule if we are currently running on this runqueue and | 4256 | * Reschedule if we are currently running on this runqueue and |
| 4273 | * our priority decreased, or if we are not currently running on | 4257 | * our priority decreased, or if we are not currently running on |
| 4274 | * this runqueue and our priority is higher than the current's | 4258 | * this runqueue and our priority is higher than the current's |
| 4275 | */ | 4259 | */ |
| 4276 | if (task_running(rq, p)) { | 4260 | if (running) { |
| 4277 | if (p->prio > oldprio) | 4261 | if (p->prio > oldprio) |
| 4278 | resched_task(rq->curr); | 4262 | resched_task(rq->curr); |
| 4279 | } else { | 4263 | } else { |
| @@ -4344,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
| 4344 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4328 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
| 4345 | { | 4329 | { |
| 4346 | struct task_struct *p; | 4330 | struct task_struct *p; |
| 4347 | int retval = -EINVAL; | 4331 | int retval; |
| 4348 | 4332 | ||
| 4349 | if (pid < 0) | 4333 | if (pid < 0) |
| 4350 | goto out_nounlock; | 4334 | return -EINVAL; |
| 4351 | 4335 | ||
| 4352 | retval = -ESRCH; | 4336 | retval = -ESRCH; |
| 4353 | read_lock(&tasklist_lock); | 4337 | read_lock(&tasklist_lock); |
| @@ -4358,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) | |||
| 4358 | retval = p->policy; | 4342 | retval = p->policy; |
| 4359 | } | 4343 | } |
| 4360 | read_unlock(&tasklist_lock); | 4344 | read_unlock(&tasklist_lock); |
| 4361 | |||
| 4362 | out_nounlock: | ||
| 4363 | return retval; | 4345 | return retval; |
| 4364 | } | 4346 | } |
| 4365 | 4347 | ||
| @@ -4372,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
| 4372 | { | 4354 | { |
| 4373 | struct sched_param lp; | 4355 | struct sched_param lp; |
| 4374 | struct task_struct *p; | 4356 | struct task_struct *p; |
| 4375 | int retval = -EINVAL; | 4357 | int retval; |
| 4376 | 4358 | ||
| 4377 | if (!param || pid < 0) | 4359 | if (!param || pid < 0) |
| 4378 | goto out_nounlock; | 4360 | return -EINVAL; |
| 4379 | 4361 | ||
| 4380 | read_lock(&tasklist_lock); | 4362 | read_lock(&tasklist_lock); |
| 4381 | p = find_process_by_pid(pid); | 4363 | p = find_process_by_pid(pid); |
| @@ -4395,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
| 4395 | */ | 4377 | */ |
| 4396 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4378 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
| 4397 | 4379 | ||
| 4398 | out_nounlock: | ||
| 4399 | return retval; | 4380 | return retval; |
| 4400 | 4381 | ||
| 4401 | out_unlock: | 4382 | out_unlock: |
| @@ -4555,8 +4536,8 @@ asmlinkage long sys_sched_yield(void) | |||
| 4555 | { | 4536 | { |
| 4556 | struct rq *rq = this_rq_lock(); | 4537 | struct rq *rq = this_rq_lock(); |
| 4557 | 4538 | ||
| 4558 | schedstat_inc(rq, yld_cnt); | 4539 | schedstat_inc(rq, yld_count); |
| 4559 | current->sched_class->yield_task(rq, current); | 4540 | current->sched_class->yield_task(rq); |
| 4560 | 4541 | ||
| 4561 | /* | 4542 | /* |
| 4562 | * Since we are going to call schedule() anyway, there's | 4543 | * Since we are going to call schedule() anyway, there's |
| @@ -4750,11 +4731,12 @@ asmlinkage | |||
| 4750 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4731 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
| 4751 | { | 4732 | { |
| 4752 | struct task_struct *p; | 4733 | struct task_struct *p; |
| 4753 | int retval = -EINVAL; | 4734 | unsigned int time_slice; |
| 4735 | int retval; | ||
| 4754 | struct timespec t; | 4736 | struct timespec t; |
| 4755 | 4737 | ||
| 4756 | if (pid < 0) | 4738 | if (pid < 0) |
| 4757 | goto out_nounlock; | 4739 | return -EINVAL; |
| 4758 | 4740 | ||
| 4759 | retval = -ESRCH; | 4741 | retval = -ESRCH; |
| 4760 | read_lock(&tasklist_lock); | 4742 | read_lock(&tasklist_lock); |
| @@ -4766,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
| 4766 | if (retval) | 4748 | if (retval) |
| 4767 | goto out_unlock; | 4749 | goto out_unlock; |
| 4768 | 4750 | ||
| 4769 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4751 | if (p->policy == SCHED_FIFO) |
| 4770 | 0 : static_prio_timeslice(p->static_prio), &t); | 4752 | time_slice = 0; |
| 4753 | else if (p->policy == SCHED_RR) | ||
| 4754 | time_slice = DEF_TIMESLICE; | ||
| 4755 | else { | ||
| 4756 | struct sched_entity *se = &p->se; | ||
| 4757 | unsigned long flags; | ||
| 4758 | struct rq *rq; | ||
| 4759 | |||
| 4760 | rq = task_rq_lock(p, &flags); | ||
| 4761 | time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); | ||
| 4762 | task_rq_unlock(rq, &flags); | ||
| 4763 | } | ||
| 4771 | read_unlock(&tasklist_lock); | 4764 | read_unlock(&tasklist_lock); |
| 4765 | jiffies_to_timespec(time_slice, &t); | ||
| 4772 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4766 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
| 4773 | out_nounlock: | ||
| 4774 | return retval; | 4767 | return retval; |
| 4768 | |||
| 4775 | out_unlock: | 4769 | out_unlock: |
| 4776 | read_unlock(&tasklist_lock); | 4770 | read_unlock(&tasklist_lock); |
| 4777 | return retval; | 4771 | return retval; |
| @@ -4900,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4900 | */ | 4894 | */ |
| 4901 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4895 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
| 4902 | 4896 | ||
| 4903 | /* | ||
| 4904 | * Increase the granularity value when there are more CPUs, | ||
| 4905 | * because with more CPUs the 'effective latency' as visible | ||
| 4906 | * to users decreases. But the relationship is not linear, | ||
| 4907 | * so pick a second-best guess by going with the log2 of the | ||
| 4908 | * number of CPUs. | ||
| 4909 | * | ||
| 4910 | * This idea comes from the SD scheduler of Con Kolivas: | ||
| 4911 | */ | ||
| 4912 | static inline void sched_init_granularity(void) | ||
| 4913 | { | ||
| 4914 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
| 4915 | const unsigned long limit = 100000000; | ||
| 4916 | |||
| 4917 | sysctl_sched_min_granularity *= factor; | ||
| 4918 | if (sysctl_sched_min_granularity > limit) | ||
| 4919 | sysctl_sched_min_granularity = limit; | ||
| 4920 | |||
| 4921 | sysctl_sched_latency *= factor; | ||
| 4922 | if (sysctl_sched_latency > limit) | ||
| 4923 | sysctl_sched_latency = limit; | ||
| 4924 | |||
| 4925 | sysctl_sched_runtime_limit = sysctl_sched_latency; | ||
| 4926 | sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; | ||
| 4927 | } | ||
| 4928 | |||
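The boot-time scaling removed above multiplied the latency tunables by 1 + log2(nr_cpus), capped at 100 ms. A standalone calculation of what that factor used to do, assuming the old 20 ms latency and 2 ms minimum-granularity defaults (illustration only, not kernel code):

    #include <stdio.h>

    /* integer log2, matching what ilog2() returns for these values */
    static unsigned int ilog2_u(unsigned int n)
    {
        unsigned int r = 0;

        while (n >>= 1)
            r++;
        return r;
    }

    int main(void)
    {
        const unsigned long limit = 100000000UL;    /* 100 ms cap */
        const unsigned long latency = 20000000UL;   /* old default: 20 ms */
        const unsigned long min_gran = 2000000UL;   /* old default: 2 ms */
        unsigned int cpus;

        for (cpus = 1; cpus <= 16; cpus *= 2) {
            unsigned int factor = 1 + ilog2_u(cpus);
            unsigned long l = latency * factor;
            unsigned long g = min_gran * factor;

            if (l > limit)
                l = limit;
            if (g > limit)
                g = limit;
            printf("%2u CPUs: factor %u -> latency %lu ns, min granularity %lu ns\n",
                   cpus, factor, l, g);
        }
        return 0;
    }

On a 2-way box the factor is 2, on a 4-way it is 3, and the scaled 20 ms latency reaches the 100 ms cap at 16 CPUs.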
| 4929 | #ifdef CONFIG_SMP | 4897 | #ifdef CONFIG_SMP |
| 4930 | /* | 4898 | /* |
| 4931 | * This is how migration works: | 4899 | * This is how migration works: |
| @@ -5103,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5103 | struct rq *rq; | 5071 | struct rq *rq; |
| 5104 | int dest_cpu; | 5072 | int dest_cpu; |
| 5105 | 5073 | ||
| 5106 | restart: | 5074 | do { |
| 5107 | /* On same node? */ | 5075 | /* On same node? */ |
| 5108 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5076 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
| 5109 | cpus_and(mask, mask, p->cpus_allowed); | 5077 | cpus_and(mask, mask, p->cpus_allowed); |
| 5110 | dest_cpu = any_online_cpu(mask); | 5078 | dest_cpu = any_online_cpu(mask); |
| 5111 | 5079 | ||
| 5112 | /* On any allowed CPU? */ | 5080 | /* On any allowed CPU? */ |
| 5113 | if (dest_cpu == NR_CPUS) | 5081 | if (dest_cpu == NR_CPUS) |
| 5114 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5082 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 5115 | 5083 | ||
| 5116 | /* No more Mr. Nice Guy. */ | 5084 | /* No more Mr. Nice Guy. */ |
| 5117 | if (dest_cpu == NR_CPUS) { | 5085 | if (dest_cpu == NR_CPUS) { |
| 5118 | rq = task_rq_lock(p, &flags); | 5086 | rq = task_rq_lock(p, &flags); |
| 5119 | cpus_setall(p->cpus_allowed); | 5087 | cpus_setall(p->cpus_allowed); |
| 5120 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5088 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 5121 | task_rq_unlock(rq, &flags); | 5089 | task_rq_unlock(rq, &flags); |
| 5122 | 5090 | ||
| 5123 | /* | 5091 | /* |
| 5124 | * Don't tell them about moving exiting tasks or | 5092 | * Don't tell them about moving exiting tasks or |
| 5125 | * kernel threads (both mm NULL), since they never | 5093 | * kernel threads (both mm NULL), since they never |
| 5126 | * leave kernel. | 5094 | * leave kernel. |
| 5127 | */ | 5095 | */ |
| 5128 | if (p->mm && printk_ratelimit()) | 5096 | if (p->mm && printk_ratelimit()) |
| 5129 | printk(KERN_INFO "process %d (%s) no " | 5097 | printk(KERN_INFO "process %d (%s) no " |
| 5130 | "longer affine to cpu%d\n", | 5098 | "longer affine to cpu%d\n", |
| 5131 | p->pid, p->comm, dead_cpu); | 5099 | p->pid, p->comm, dead_cpu); |
| 5132 | } | 5100 | } |
| 5133 | if (!__migrate_task(p, dead_cpu, dest_cpu)) | 5101 | } while (!__migrate_task(p, dead_cpu, dest_cpu)); |
| 5134 | goto restart; | ||
| 5135 | } | 5102 | } |
| 5136 | 5103 | ||
| 5137 | /* | 5104 | /* |
| @@ -5173,6 +5140,20 @@ static void migrate_live_tasks(int src_cpu) | |||
| 5173 | } | 5140 | } |
| 5174 | 5141 | ||
| 5175 | /* | 5142 | /* |
| 5143 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
| 5144 | */ | ||
| 5145 | static void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
| 5146 | { | ||
| 5147 | update_rq_clock(rq); | ||
| 5148 | |||
| 5149 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 5150 | rq->nr_uninterruptible--; | ||
| 5151 | |||
| 5152 | enqueue_task(rq, p, 0); | ||
| 5153 | inc_nr_running(p, rq); | ||
| 5154 | } | ||
| 5155 | |||
| 5156 | /* | ||
| 5176 | * Schedules idle task to be the next runnable task on current CPU. | 5157 | * Schedules idle task to be the next runnable task on current CPU. |
| 5177 | * It does so by boosting its priority to highest possible and adding it to | 5158 | * It does so by boosting its priority to highest possible and adding it to |
| 5178 | * the _front_ of the runqueue. Used by CPU offline code. | 5159 | * the _front_ of the runqueue. Used by CPU offline code. |
| @@ -5284,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = { | |||
| 5284 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5265 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
| 5285 | { | 5266 | { |
| 5286 | struct ctl_table *entry = | 5267 | struct ctl_table *entry = |
| 5287 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); | 5268 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); |
| 5288 | |||
| 5289 | BUG_ON(!entry); | ||
| 5290 | memset(entry, 0, n * sizeof(struct ctl_table)); | ||
| 5291 | 5269 | ||
| 5292 | return entry; | 5270 | return entry; |
| 5293 | } | 5271 | } |
| 5294 | 5272 | ||
| 5273 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
| 5274 | { | ||
| 5275 | struct ctl_table *entry = *tablep; | ||
| 5276 | |||
| 5277 | for (entry = *tablep; entry->procname; entry++) | ||
| 5278 | if (entry->child) | ||
| 5279 | sd_free_ctl_entry(&entry->child); | ||
| 5280 | |||
| 5281 | kfree(*tablep); | ||
| 5282 | *tablep = NULL; | ||
| 5283 | } | ||
| 5284 | |||
| 5295 | static void | 5285 | static void |
| 5296 | set_table_entry(struct ctl_table *entry, | 5286 | set_table_entry(struct ctl_table *entry, |
| 5297 | const char *procname, void *data, int maxlen, | 5287 | const char *procname, void *data, int maxlen, |
| @@ -5307,7 +5297,10 @@ set_table_entry(struct ctl_table *entry, | |||
| 5307 | static struct ctl_table * | 5297 | static struct ctl_table * |
| 5308 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 5298 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
| 5309 | { | 5299 | { |
| 5310 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 5300 | struct ctl_table *table = sd_alloc_ctl_entry(12); |
| 5301 | |||
| 5302 | if (table == NULL) | ||
| 5303 | return NULL; | ||
| 5311 | 5304 | ||
| 5312 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 5305 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
| 5313 | sizeof(long), 0644, proc_doulongvec_minmax); | 5306 | sizeof(long), 0644, proc_doulongvec_minmax); |
| @@ -5327,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 5327 | sizeof(int), 0644, proc_dointvec_minmax); | 5320 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5328 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 5321 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
| 5329 | sizeof(int), 0644, proc_dointvec_minmax); | 5322 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5330 | set_table_entry(&table[10], "cache_nice_tries", | 5323 | set_table_entry(&table[9], "cache_nice_tries", |
| 5331 | &sd->cache_nice_tries, | 5324 | &sd->cache_nice_tries, |
| 5332 | sizeof(int), 0644, proc_dointvec_minmax); | 5325 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5333 | set_table_entry(&table[12], "flags", &sd->flags, | 5326 | set_table_entry(&table[10], "flags", &sd->flags, |
| 5334 | sizeof(int), 0644, proc_dointvec_minmax); | 5327 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5328 | /* &table[11] is terminator */ | ||
| 5335 | 5329 | ||
| 5336 | return table; | 5330 | return table; |
| 5337 | } | 5331 | } |
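The allocation above switches to a zeroed kcalloc() so the one extra slot (table[11]) is already the NULL-procname terminator that the new recursive sd_free_ctl_entry() walker stops on. A small userspace sketch of the same zero-terminated-table-with-children pattern, using a simplified stand-in struct rather than the real ctl_table:

    #include <stdio.h>
    #include <stdlib.h>

    /* simplified stand-in for struct ctl_table */
    struct tbl {
        const char *procname;   /* NULL procname terminates a table */
        struct tbl *child;      /* optional sub-table */
    };

    static struct tbl *alloc_tbl(int n)
    {
        /* calloc() zeroes the entries, so the last one is the terminator */
        return calloc(n, sizeof(struct tbl));
    }

    static void free_tbl(struct tbl **tablep)
    {
        struct tbl *entry;

        for (entry = *tablep; entry->procname; entry++)
            if (entry->child)
                free_tbl(&entry->child);

        free(*tablep);
        *tablep = NULL;
    }

    int main(void)
    {
        struct tbl *root = alloc_tbl(2);             /* one entry + terminator */

        root[0].procname = "cpu0";
        root[0].child = alloc_tbl(3);                /* two entries + terminator */
        root[0].child[0].procname = "min_interval";
        root[0].child[1].procname = "max_interval";

        free_tbl(&root);                             /* children first, then root */
        printf("root after free_tbl: %p\n", (void *)root);
        return 0;
    }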
| @@ -5346,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
| 5346 | for_each_domain(cpu, sd) | 5340 | for_each_domain(cpu, sd) |
| 5347 | domain_num++; | 5341 | domain_num++; |
| 5348 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 5342 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
| 5343 | if (table == NULL) | ||
| 5344 | return NULL; | ||
| 5349 | 5345 | ||
| 5350 | i = 0; | 5346 | i = 0; |
| 5351 | for_each_domain(cpu, sd) { | 5347 | for_each_domain(cpu, sd) { |
| @@ -5360,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
| 5360 | } | 5356 | } |
| 5361 | 5357 | ||
| 5362 | static struct ctl_table_header *sd_sysctl_header; | 5358 | static struct ctl_table_header *sd_sysctl_header; |
| 5363 | static void init_sched_domain_sysctl(void) | 5359 | static void register_sched_domain_sysctl(void) |
| 5364 | { | 5360 | { |
| 5365 | int i, cpu_num = num_online_cpus(); | 5361 | int i, cpu_num = num_online_cpus(); |
| 5366 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5362 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
| 5367 | char buf[32]; | 5363 | char buf[32]; |
| 5368 | 5364 | ||
| 5365 | if (entry == NULL) | ||
| 5366 | return; | ||
| 5367 | |||
| 5369 | sd_ctl_dir[0].child = entry; | 5368 | sd_ctl_dir[0].child = entry; |
| 5370 | 5369 | ||
| 5371 | for (i = 0; i < cpu_num; i++, entry++) { | 5370 | for_each_online_cpu(i) { |
| 5372 | snprintf(buf, 32, "cpu%d", i); | 5371 | snprintf(buf, 32, "cpu%d", i); |
| 5373 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5372 | entry->procname = kstrdup(buf, GFP_KERNEL); |
| 5374 | entry->mode = 0555; | 5373 | entry->mode = 0555; |
| 5375 | entry->child = sd_alloc_ctl_cpu_table(i); | 5374 | entry->child = sd_alloc_ctl_cpu_table(i); |
| 5375 | entry++; | ||
| 5376 | } | 5376 | } |
| 5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
| 5378 | } | 5378 | } |
| 5379 | |||
| 5380 | static void unregister_sched_domain_sysctl(void) | ||
| 5381 | { | ||
| 5382 | unregister_sysctl_table(sd_sysctl_header); | ||
| 5383 | sd_sysctl_header = NULL; | ||
| 5384 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
| 5385 | } | ||
| 5379 | #else | 5386 | #else |
| 5380 | static void init_sched_domain_sysctl(void) | 5387 | static void register_sched_domain_sysctl(void) |
| 5388 | { | ||
| 5389 | } | ||
| 5390 | static void unregister_sched_domain_sysctl(void) | ||
| 5381 | { | 5391 | { |
| 5382 | } | 5392 | } |
| 5383 | #endif | 5393 | #endif |
| @@ -5499,8 +5509,7 @@ int __init migration_init(void) | |||
| 5499 | int nr_cpu_ids __read_mostly = NR_CPUS; | 5509 | int nr_cpu_ids __read_mostly = NR_CPUS; |
| 5500 | EXPORT_SYMBOL(nr_cpu_ids); | 5510 | EXPORT_SYMBOL(nr_cpu_ids); |
| 5501 | 5511 | ||
| 5502 | #undef SCHED_DOMAIN_DEBUG | 5512 | #ifdef CONFIG_SCHED_DEBUG |
| 5503 | #ifdef SCHED_DOMAIN_DEBUG | ||
| 5504 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 5513 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
| 5505 | { | 5514 | { |
| 5506 | int level = 0; | 5515 | int level = 0; |
| @@ -5558,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5558 | printk("\n"); | 5567 | printk("\n"); |
| 5559 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5568 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 5560 | "set\n"); | 5569 | "set\n"); |
| 5570 | break; | ||
| 5561 | } | 5571 | } |
| 5562 | 5572 | ||
| 5563 | if (!cpus_weight(group->cpumask)) { | 5573 | if (!cpus_weight(group->cpumask)) { |
| 5564 | printk("\n"); | 5574 | printk("\n"); |
| 5565 | printk(KERN_ERR "ERROR: empty group\n"); | 5575 | printk(KERN_ERR "ERROR: empty group\n"); |
| 5576 | break; | ||
| 5566 | } | 5577 | } |
| 5567 | 5578 | ||
| 5568 | if (cpus_intersects(groupmask, group->cpumask)) { | 5579 | if (cpus_intersects(groupmask, group->cpumask)) { |
| 5569 | printk("\n"); | 5580 | printk("\n"); |
| 5570 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5581 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
| 5582 | break; | ||
| 5571 | } | 5583 | } |
| 5572 | 5584 | ||
| 5573 | cpus_or(groupmask, groupmask, group->cpumask); | 5585 | cpus_or(groupmask, groupmask, group->cpumask); |
| @@ -5701,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str) | |||
| 5701 | return 1; | 5713 | return 1; |
| 5702 | } | 5714 | } |
| 5703 | 5715 | ||
| 5704 | __setup ("isolcpus=", isolated_cpu_setup); | 5716 | __setup("isolcpus=", isolated_cpu_setup); |
| 5705 | 5717 | ||
| 5706 | /* | 5718 | /* |
| 5707 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 5719 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
| @@ -5930,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 5930 | 5942 | ||
| 5931 | if (!sg) | 5943 | if (!sg) |
| 5932 | return; | 5944 | return; |
| 5933 | next_sg: | 5945 | do { |
| 5934 | for_each_cpu_mask(j, sg->cpumask) { | 5946 | for_each_cpu_mask(j, sg->cpumask) { |
| 5935 | struct sched_domain *sd; | 5947 | struct sched_domain *sd; |
| 5936 | 5948 | ||
| 5937 | sd = &per_cpu(phys_domains, j); | 5949 | sd = &per_cpu(phys_domains, j); |
| 5938 | if (j != first_cpu(sd->groups->cpumask)) { | 5950 | if (j != first_cpu(sd->groups->cpumask)) { |
| 5939 | /* | 5951 | /* |
| 5940 | * Only add "power" once for each | 5952 | * Only add "power" once for each |
| 5941 | * physical package. | 5953 | * physical package. |
| 5942 | */ | 5954 | */ |
| 5943 | continue; | 5955 | continue; |
| 5944 | } | 5956 | } |
| 5945 | 5957 | ||
| 5946 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 5958 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
| 5947 | } | 5959 | } |
| 5948 | sg = sg->next; | 5960 | sg = sg->next; |
| 5949 | if (sg != group_head) | 5961 | } while (sg != group_head); |
| 5950 | goto next_sg; | ||
| 5951 | } | 5962 | } |
| 5952 | #endif | 5963 | #endif |
| 5953 | 5964 | ||
| @@ -6058,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6058 | /* | 6069 | /* |
| 6059 | * Allocate the per-node list of sched groups | 6070 | * Allocate the per-node list of sched groups |
| 6060 | */ | 6071 | */ |
| 6061 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, | 6072 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), |
| 6062 | GFP_KERNEL); | 6073 | GFP_KERNEL); |
| 6063 | if (!sched_group_nodes) { | 6074 | if (!sched_group_nodes) { |
| 6064 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6075 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| @@ -6311,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 6311 | 6322 | ||
| 6312 | err = build_sched_domains(&cpu_default_map); | 6323 | err = build_sched_domains(&cpu_default_map); |
| 6313 | 6324 | ||
| 6325 | register_sched_domain_sysctl(); | ||
| 6326 | |||
| 6314 | return err; | 6327 | return err; |
| 6315 | } | 6328 | } |
| 6316 | 6329 | ||
| @@ -6327,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6327 | { | 6340 | { |
| 6328 | int i; | 6341 | int i; |
| 6329 | 6342 | ||
| 6343 | unregister_sched_domain_sysctl(); | ||
| 6344 | |||
| 6330 | for_each_cpu_mask(i, *cpu_map) | 6345 | for_each_cpu_mask(i, *cpu_map) |
| 6331 | cpu_attach_domain(NULL, i); | 6346 | cpu_attach_domain(NULL, i); |
| 6332 | synchronize_sched(); | 6347 | synchronize_sched(); |
| @@ -6357,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
| 6357 | if (!err && !cpus_empty(*partition2)) | 6372 | if (!err && !cpus_empty(*partition2)) |
| 6358 | err = build_sched_domains(partition2); | 6373 | err = build_sched_domains(partition2); |
| 6359 | 6374 | ||
| 6375 | register_sched_domain_sysctl(); | ||
| 6376 | |||
| 6360 | return err; | 6377 | return err; |
| 6361 | } | 6378 | } |
| 6362 | 6379 | ||
| @@ -6488,17 +6505,13 @@ void __init sched_init_smp(void) | |||
| 6488 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6505 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
| 6489 | hotcpu_notifier(update_sched_domains, 0); | 6506 | hotcpu_notifier(update_sched_domains, 0); |
| 6490 | 6507 | ||
| 6491 | init_sched_domain_sysctl(); | ||
| 6492 | |||
| 6493 | /* Move init over to a non-isolated CPU */ | 6508 | /* Move init over to a non-isolated CPU */ |
| 6494 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6509 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
| 6495 | BUG(); | 6510 | BUG(); |
| 6496 | sched_init_granularity(); | ||
| 6497 | } | 6511 | } |
| 6498 | #else | 6512 | #else |
| 6499 | void __init sched_init_smp(void) | 6513 | void __init sched_init_smp(void) |
| 6500 | { | 6514 | { |
| 6501 | sched_init_granularity(); | ||
| 6502 | } | 6515 | } |
| 6503 | #endif /* CONFIG_SMP */ | 6516 | #endif /* CONFIG_SMP */ |
| 6504 | 6517 | ||
| @@ -6512,28 +6525,20 @@ int in_sched_functions(unsigned long addr) | |||
| 6512 | && addr < (unsigned long)__sched_text_end); | 6525 | && addr < (unsigned long)__sched_text_end); |
| 6513 | } | 6526 | } |
| 6514 | 6527 | ||
| 6515 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 6528 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
| 6516 | { | 6529 | { |
| 6517 | cfs_rq->tasks_timeline = RB_ROOT; | 6530 | cfs_rq->tasks_timeline = RB_ROOT; |
| 6518 | cfs_rq->fair_clock = 1; | ||
| 6519 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6531 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6520 | cfs_rq->rq = rq; | 6532 | cfs_rq->rq = rq; |
| 6521 | #endif | 6533 | #endif |
| 6534 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
| 6522 | } | 6535 | } |
| 6523 | 6536 | ||
| 6524 | void __init sched_init(void) | 6537 | void __init sched_init(void) |
| 6525 | { | 6538 | { |
| 6526 | u64 now = sched_clock(); | ||
| 6527 | int highest_cpu = 0; | 6539 | int highest_cpu = 0; |
| 6528 | int i, j; | 6540 | int i, j; |
| 6529 | 6541 | ||
| 6530 | /* | ||
| 6531 | * Link up the scheduling class hierarchy: | ||
| 6532 | */ | ||
| 6533 | rt_sched_class.next = &fair_sched_class; | ||
| 6534 | fair_sched_class.next = &idle_sched_class; | ||
| 6535 | idle_sched_class.next = NULL; | ||
| 6536 | |||
| 6537 | for_each_possible_cpu(i) { | 6542 | for_each_possible_cpu(i) { |
| 6538 | struct rt_prio_array *array; | 6543 | struct rt_prio_array *array; |
| 6539 | struct rq *rq; | 6544 | struct rq *rq; |
| @@ -6546,10 +6551,28 @@ void __init sched_init(void) | |||
| 6546 | init_cfs_rq(&rq->cfs, rq); | 6551 | init_cfs_rq(&rq->cfs, rq); |
| 6547 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6552 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6548 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6553 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 6549 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 6554 | { |
| 6555 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
| 6556 | struct sched_entity *se = | ||
| 6557 | &per_cpu(init_sched_entity, i); | ||
| 6558 | |||
| 6559 | init_cfs_rq_p[i] = cfs_rq; | ||
| 6560 | init_cfs_rq(cfs_rq, rq); | ||
| 6561 | cfs_rq->tg = &init_task_group; | ||
| 6562 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
| 6563 | &rq->leaf_cfs_rq_list); | ||
| 6564 | |||
| 6565 | init_sched_entity_p[i] = se; | ||
| 6566 | se->cfs_rq = &rq->cfs; | ||
| 6567 | se->my_q = cfs_rq; | ||
| 6568 | se->load.weight = init_task_group_load; | ||
| 6569 | se->load.inv_weight = | ||
| 6570 | div64_64(1ULL<<32, init_task_group_load); | ||
| 6571 | se->parent = NULL; | ||
| 6572 | } | ||
| 6573 | init_task_group.shares = init_task_group_load; | ||
| 6574 | spin_lock_init(&init_task_group.lock); | ||
| 6550 | #endif | 6575 | #endif |
| 6551 | rq->ls.load_update_last = now; | ||
| 6552 | rq->ls.load_update_start = now; | ||
| 6553 | 6576 | ||
| 6554 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 6577 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 6555 | rq->cpu_load[j] = 0; | 6578 | rq->cpu_load[j] = 0; |
| @@ -6634,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 6634 | #endif | 6657 | #endif |
| 6635 | 6658 | ||
| 6636 | #ifdef CONFIG_MAGIC_SYSRQ | 6659 | #ifdef CONFIG_MAGIC_SYSRQ |
| 6660 | static void normalize_task(struct rq *rq, struct task_struct *p) | ||
| 6661 | { | ||
| 6662 | int on_rq; | ||
| 6663 | update_rq_clock(rq); | ||
| 6664 | on_rq = p->se.on_rq; | ||
| 6665 | if (on_rq) | ||
| 6666 | deactivate_task(rq, p, 0); | ||
| 6667 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
| 6668 | if (on_rq) { | ||
| 6669 | activate_task(rq, p, 0); | ||
| 6670 | resched_task(rq->curr); | ||
| 6671 | } | ||
| 6672 | } | ||
| 6673 | |||
| 6637 | void normalize_rt_tasks(void) | 6674 | void normalize_rt_tasks(void) |
| 6638 | { | 6675 | { |
| 6639 | struct task_struct *g, *p; | 6676 | struct task_struct *g, *p; |
| 6640 | unsigned long flags; | 6677 | unsigned long flags; |
| 6641 | struct rq *rq; | 6678 | struct rq *rq; |
| 6642 | int on_rq; | ||
| 6643 | 6679 | ||
| 6644 | read_lock_irq(&tasklist_lock); | 6680 | read_lock_irq(&tasklist_lock); |
| 6645 | do_each_thread(g, p) { | 6681 | do_each_thread(g, p) { |
| 6646 | p->se.fair_key = 0; | 6682 | /* |
| 6647 | p->se.wait_runtime = 0; | 6683 | * Only normalize user tasks: |
| 6684 | */ | ||
| 6685 | if (!p->mm) | ||
| 6686 | continue; | ||
| 6687 | |||
| 6648 | p->se.exec_start = 0; | 6688 | p->se.exec_start = 0; |
| 6649 | p->se.wait_start_fair = 0; | ||
| 6650 | p->se.sleep_start_fair = 0; | ||
| 6651 | #ifdef CONFIG_SCHEDSTATS | 6689 | #ifdef CONFIG_SCHEDSTATS |
| 6652 | p->se.wait_start = 0; | 6690 | p->se.wait_start = 0; |
| 6653 | p->se.sleep_start = 0; | 6691 | p->se.sleep_start = 0; |
| 6654 | p->se.block_start = 0; | 6692 | p->se.block_start = 0; |
| 6655 | #endif | 6693 | #endif |
| 6656 | task_rq(p)->cfs.fair_clock = 0; | ||
| 6657 | task_rq(p)->clock = 0; | 6694 | task_rq(p)->clock = 0; |
| 6658 | 6695 | ||
| 6659 | if (!rt_task(p)) { | 6696 | if (!rt_task(p)) { |
| @@ -6668,26 +6705,9 @@ void normalize_rt_tasks(void) | |||
| 6668 | 6705 | ||
| 6669 | spin_lock_irqsave(&p->pi_lock, flags); | 6706 | spin_lock_irqsave(&p->pi_lock, flags); |
| 6670 | rq = __task_rq_lock(p); | 6707 | rq = __task_rq_lock(p); |
| 6671 | #ifdef CONFIG_SMP | ||
| 6672 | /* | ||
| 6673 | * Do not touch the migration thread: | ||
| 6674 | */ | ||
| 6675 | if (p == rq->migration_thread) | ||
| 6676 | goto out_unlock; | ||
| 6677 | #endif | ||
| 6678 | 6708 | ||
| 6679 | update_rq_clock(rq); | 6709 | normalize_task(rq, p); |
| 6680 | on_rq = p->se.on_rq; | 6710 | |
| 6681 | if (on_rq) | ||
| 6682 | deactivate_task(rq, p, 0); | ||
| 6683 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
| 6684 | if (on_rq) { | ||
| 6685 | activate_task(rq, p, 0); | ||
| 6686 | resched_task(rq->curr); | ||
| 6687 | } | ||
| 6688 | #ifdef CONFIG_SMP | ||
| 6689 | out_unlock: | ||
| 6690 | #endif | ||
| 6691 | __task_rq_unlock(rq); | 6711 | __task_rq_unlock(rq); |
| 6692 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6712 | spin_unlock_irqrestore(&p->pi_lock, flags); |
| 6693 | } while_each_thread(g, p); | 6713 | } while_each_thread(g, p); |
| @@ -6740,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
| 6740 | } | 6760 | } |
| 6741 | 6761 | ||
| 6742 | #endif | 6762 | #endif |
| 6763 | |||
| 6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 6765 | |||
| 6766 | /* allocate runqueue etc for a new task group */ | ||
| 6767 | struct task_group *sched_create_group(void) | ||
| 6768 | { | ||
| 6769 | struct task_group *tg; | ||
| 6770 | struct cfs_rq *cfs_rq; | ||
| 6771 | struct sched_entity *se; | ||
| 6772 | struct rq *rq; | ||
| 6773 | int i; | ||
| 6774 | |||
| 6775 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | ||
| 6776 | if (!tg) | ||
| 6777 | return ERR_PTR(-ENOMEM); | ||
| 6778 | |||
| 6779 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | ||
| 6780 | if (!tg->cfs_rq) | ||
| 6781 | goto err; | ||
| 6782 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | ||
| 6783 | if (!tg->se) | ||
| 6784 | goto err; | ||
| 6785 | |||
| 6786 | for_each_possible_cpu(i) { | ||
| 6787 | rq = cpu_rq(i); | ||
| 6788 | |||
| 6789 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | ||
| 6790 | cpu_to_node(i)); | ||
| 6791 | if (!cfs_rq) | ||
| 6792 | goto err; | ||
| 6793 | |||
| 6794 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | ||
| 6795 | cpu_to_node(i)); | ||
| 6796 | if (!se) | ||
| 6797 | goto err; | ||
| 6798 | |||
| 6799 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | ||
| 6800 | memset(se, 0, sizeof(struct sched_entity)); | ||
| 6801 | |||
| 6802 | tg->cfs_rq[i] = cfs_rq; | ||
| 6803 | init_cfs_rq(cfs_rq, rq); | ||
| 6804 | cfs_rq->tg = tg; | ||
| 6805 | |||
| 6806 | tg->se[i] = se; | ||
| 6807 | se->cfs_rq = &rq->cfs; | ||
| 6808 | se->my_q = cfs_rq; | ||
| 6809 | se->load.weight = NICE_0_LOAD; | ||
| 6810 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
| 6811 | se->parent = NULL; | ||
| 6812 | } | ||
| 6813 | |||
| 6814 | for_each_possible_cpu(i) { | ||
| 6815 | rq = cpu_rq(i); | ||
| 6816 | cfs_rq = tg->cfs_rq[i]; | ||
| 6817 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
| 6818 | } | ||
| 6819 | |||
| 6820 | tg->shares = NICE_0_LOAD; | ||
| 6821 | spin_lock_init(&tg->lock); | ||
| 6822 | |||
| 6823 | return tg; | ||
| 6824 | |||
| 6825 | err: | ||
| 6826 | for_each_possible_cpu(i) { | ||
| 6827 | if (tg->cfs_rq) | ||
| 6828 | kfree(tg->cfs_rq[i]); | ||
| 6829 | if (tg->se) | ||
| 6830 | kfree(tg->se[i]); | ||
| 6831 | } | ||
| 6832 | kfree(tg->cfs_rq); | ||
| 6833 | kfree(tg->se); | ||
| 6834 | kfree(tg); | ||
| 6835 | |||
| 6836 | return ERR_PTR(-ENOMEM); | ||
| 6837 | } | ||
| 6838 | |||
| 6839 | /* rcu callback to free various structures associated with a task group */ | ||
| 6840 | static void free_sched_group(struct rcu_head *rhp) | ||
| 6841 | { | ||
| 6842 | struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); | ||
| 6843 | struct task_group *tg = cfs_rq->tg; | ||
| 6844 | struct sched_entity *se; | ||
| 6845 | int i; | ||
| 6846 | |||
| 6847 | /* now it should be safe to free those cfs_rqs */ | ||
| 6848 | for_each_possible_cpu(i) { | ||
| 6849 | cfs_rq = tg->cfs_rq[i]; | ||
| 6850 | kfree(cfs_rq); | ||
| 6851 | |||
| 6852 | se = tg->se[i]; | ||
| 6853 | kfree(se); | ||
| 6854 | } | ||
| 6855 | |||
| 6856 | kfree(tg->cfs_rq); | ||
| 6857 | kfree(tg->se); | ||
| 6858 | kfree(tg); | ||
| 6859 | } | ||
| 6860 | |||
| 6861 | /* Destroy runqueue etc associated with a task group */ | ||
| 6862 | void sched_destroy_group(struct task_group *tg) | ||
| 6863 | { | ||
| 6864 | struct cfs_rq *cfs_rq; | ||
| 6865 | int i; | ||
| 6866 | |||
| 6867 | for_each_possible_cpu(i) { | ||
| 6868 | cfs_rq = tg->cfs_rq[i]; | ||
| 6869 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
| 6870 | } | ||
| 6871 | |||
| 6872 | cfs_rq = tg->cfs_rq[0]; | ||
| 6873 | |||
| 6874 | /* wait for possible concurrent references to cfs_rqs to complete */ | ||
| 6875 | call_rcu(&cfs_rq->rcu, free_sched_group); | ||
| 6876 | } | ||
| 6877 | |||
| 6878 | /* change task's runqueue when it moves between groups. | ||
| 6879 | * The caller of this function should have put the task in its new group | ||
| 6880 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
| 6881 | * reflect its new group. | ||
| 6882 | */ | ||
| 6883 | void sched_move_task(struct task_struct *tsk) | ||
| 6884 | { | ||
| 6885 | int on_rq, running; | ||
| 6886 | unsigned long flags; | ||
| 6887 | struct rq *rq; | ||
| 6888 | |||
| 6889 | rq = task_rq_lock(tsk, &flags); | ||
| 6890 | |||
| 6891 | if (tsk->sched_class != &fair_sched_class) | ||
| 6892 | goto done; | ||
| 6893 | |||
| 6894 | update_rq_clock(rq); | ||
| 6895 | |||
| 6896 | running = task_running(rq, tsk); | ||
| 6897 | on_rq = tsk->se.on_rq; | ||
| 6898 | |||
| 6899 | if (on_rq) { | ||
| 6900 | dequeue_task(rq, tsk, 0); | ||
| 6901 | if (unlikely(running)) | ||
| 6902 | tsk->sched_class->put_prev_task(rq, tsk); | ||
| 6903 | } | ||
| 6904 | |||
| 6905 | set_task_cfs_rq(tsk); | ||
| 6906 | |||
| 6907 | if (on_rq) { | ||
| 6908 | if (unlikely(running)) | ||
| 6909 | tsk->sched_class->set_curr_task(rq); | ||
| 6910 | enqueue_task(rq, tsk, 0); | ||
| 6911 | } | ||
| 6912 | |||
| 6913 | done: | ||
| 6914 | task_rq_unlock(rq, &flags); | ||
| 6915 | } | ||
| 6916 | |||
| 6917 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
| 6918 | { | ||
| 6919 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 6920 | struct rq *rq = cfs_rq->rq; | ||
| 6921 | int on_rq; | ||
| 6922 | |||
| 6923 | spin_lock_irq(&rq->lock); | ||
| 6924 | |||
| 6925 | on_rq = se->on_rq; | ||
| 6926 | if (on_rq) | ||
| 6927 | dequeue_entity(cfs_rq, se, 0); | ||
| 6928 | |||
| 6929 | se->load.weight = shares; | ||
| 6930 | se->load.inv_weight = div64_64((1ULL<<32), shares); | ||
| 6931 | |||
| 6932 | if (on_rq) | ||
| 6933 | enqueue_entity(cfs_rq, se, 0); | ||
| 6934 | |||
| 6935 | spin_unlock_irq(&rq->lock); | ||
| 6936 | } | ||
| 6937 | |||
| 6938 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
| 6939 | { | ||
| 6940 | int i; | ||
| 6941 | |||
| 6942 | spin_lock(&tg->lock); | ||
| 6943 | if (tg->shares == shares) | ||
| 6944 | goto done; | ||
| 6945 | |||
| 6946 | tg->shares = shares; | ||
| 6947 | for_each_possible_cpu(i) | ||
| 6948 | set_se_shares(tg->se[i], shares); | ||
| 6949 | |||
| 6950 | done: | ||
| 6951 | spin_unlock(&tg->lock); | ||
| 6952 | return 0; | ||
| 6953 | } | ||
| 6954 | |||
| 6955 | unsigned long sched_group_shares(struct task_group *tg) | ||
| 6956 | { | ||
| 6957 | return tg->shares; | ||
| 6958 | } | ||
| 6959 | |||
| 6960 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
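set_se_shares() above stores a group's share value as a load weight together with the precomputed 2^32/weight inverse, and CPU time then splits between competing groups in proportion to those weights. A standalone arithmetic sketch of that split; the two share values are arbitrary examples, not defaults:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* arbitrary example shares for two task groups */
        unsigned long shares[2] = { 3072, 1024 };
        unsigned long total = shares[0] + shares[1];
        int i;

        for (i = 0; i < 2; i++) {
            /* same precomputation as set_se_shares(): 2^32 / weight */
            uint64_t inv_weight = (1ULL << 32) / shares[i];

            printf("group %d: shares %lu, inv_weight %llu, ~%.1f%% of the CPU\n",
                   i, shares[i], (unsigned long long)inv_weight,
                   100.0 * shares[i] / total);
        }
        return 0;
    }

With these numbers the first group ends up with roughly 75% and the second with roughly 25% of the contended CPU time.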
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index c3ee38bd3426..a5e517ec07c3 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
| @@ -28,6 +28,31 @@ | |||
| 28 | printk(x); \ | 28 | printk(x); \ |
| 29 | } while (0) | 29 | } while (0) |
| 30 | 30 | ||
| 31 | /* | ||
| 32 | * Ease the printing of nsec fields: | ||
| 33 | */ | ||
| 34 | static long long nsec_high(long long nsec) | ||
| 35 | { | ||
| 36 | if (nsec < 0) { | ||
| 37 | nsec = -nsec; | ||
| 38 | do_div(nsec, 1000000); | ||
| 39 | return -nsec; | ||
| 40 | } | ||
| 41 | do_div(nsec, 1000000); | ||
| 42 | |||
| 43 | return nsec; | ||
| 44 | } | ||
| 45 | |||
| 46 | static unsigned long nsec_low(long long nsec) | ||
| 47 | { | ||
| 48 | if (nsec < 0) | ||
| 49 | nsec = -nsec; | ||
| 50 | |||
| 51 | return do_div(nsec, 1000000); | ||
| 52 | } | ||
| 53 | |||
| 54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | ||
| 55 | |||
| 31 | static void | 56 | static void |
| 32 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 57 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
| 33 | { | 58 | { |
| @@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 36 | else | 61 | else |
| 37 | SEQ_printf(m, " "); | 62 | SEQ_printf(m, " "); |
| 38 | 63 | ||
| 39 | SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", | 64 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", |
| 40 | p->comm, p->pid, | 65 | p->comm, p->pid, |
| 41 | (long long)p->se.fair_key, | 66 | SPLIT_NS(p->se.vruntime), |
| 42 | (long long)(p->se.fair_key - rq->cfs.fair_clock), | ||
| 43 | (long long)p->se.wait_runtime, | ||
| 44 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
| 45 | p->prio); | 68 | p->prio); |
| 46 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
| 47 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", |
| 48 | (long long)p->se.sum_exec_runtime, | 71 | SPLIT_NS(p->se.vruntime), |
| 49 | (long long)p->se.sum_wait_runtime, | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
| 50 | (long long)p->se.sum_sleep_runtime, | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
| 51 | (long long)p->se.wait_runtime_overruns, | ||
| 52 | (long long)p->se.wait_runtime_underruns); | ||
| 53 | #else | 74 | #else |
| 54 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", |
| 55 | 0LL, 0LL, 0LL, 0LL, 0LL); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 56 | #endif | 77 | #endif |
| 57 | } | 78 | } |
| 58 | 79 | ||
| @@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 62 | 83 | ||
| 63 | SEQ_printf(m, | 84 | SEQ_printf(m, |
| 64 | "\nrunnable tasks:\n" | 85 | "\nrunnable tasks:\n" |
| 65 | " task PID tree-key delta waiting" | 86 | " task PID tree-key switches prio" |
| 66 | " switches prio" | 87 | " exec-runtime sum-exec sum-sleep\n" |
| 67 | " sum-exec sum-wait sum-sleep" | 88 | "------------------------------------------------------" |
| 68 | " wait-overrun wait-underrun\n" | 89 | "----------------------------------------------------\n"); |
| 69 | "------------------------------------------------------------------" | ||
| 70 | "----------------" | ||
| 71 | "------------------------------------------------" | ||
| 72 | "--------------------------------\n"); | ||
| 73 | 90 | ||
| 74 | read_lock_irq(&tasklist_lock); | 91 | read_lock_irq(&tasklist_lock); |
| 75 | 92 | ||
| @@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 83 | read_unlock_irq(&tasklist_lock); | 100 | read_unlock_irq(&tasklist_lock); |
| 84 | } | 101 | } |
| 85 | 102 | ||
| 86 | static void | 103 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
| 87 | print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
| 88 | { | 104 | { |
| 89 | s64 wait_runtime_rq_sum = 0; | 105 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
| 90 | struct task_struct *p; | 106 | spread, rq0_min_vruntime, spread0; |
| 91 | struct rb_node *curr; | ||
| 92 | unsigned long flags; | ||
| 93 | struct rq *rq = &per_cpu(runqueues, cpu); | 107 | struct rq *rq = &per_cpu(runqueues, cpu); |
| 108 | struct sched_entity *last; | ||
| 109 | unsigned long flags; | ||
| 94 | 110 | ||
| 95 | spin_lock_irqsave(&rq->lock, flags); | ||
| 96 | curr = first_fair(cfs_rq); | ||
| 97 | while (curr) { | ||
| 98 | p = rb_entry(curr, struct task_struct, se.run_node); | ||
| 99 | wait_runtime_rq_sum += p->se.wait_runtime; | ||
| 100 | |||
| 101 | curr = rb_next(curr); | ||
| 102 | } | ||
| 103 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 104 | |||
| 105 | SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", | ||
| 106 | (long long)wait_runtime_rq_sum); | ||
| 107 | } | ||
| 108 | |||
| 109 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
| 110 | { | ||
| 111 | SEQ_printf(m, "\ncfs_rq\n"); | 111 | SEQ_printf(m, "\ncfs_rq\n"); |
| 112 | 112 | ||
| 113 | #define P(x) \ | 113 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
| 114 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) | 114 | SPLIT_NS(cfs_rq->exec_clock)); |
| 115 | |||
| 116 | P(fair_clock); | ||
| 117 | P(exec_clock); | ||
| 118 | P(wait_runtime); | ||
| 119 | P(wait_runtime_overruns); | ||
| 120 | P(wait_runtime_underruns); | ||
| 121 | P(sleeper_bonus); | ||
| 122 | #undef P | ||
| 123 | 115 | ||
| 124 | print_cfs_rq_runtime_sum(m, cpu, cfs_rq); | 116 | spin_lock_irqsave(&rq->lock, flags); |
| 117 | if (cfs_rq->rb_leftmost) | ||
| 118 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | ||
| 119 | last = __pick_last_entity(cfs_rq); | ||
| 120 | if (last) | ||
| 121 | max_vruntime = last->vruntime; | ||
| 122 | min_vruntime = rq->cfs.min_vruntime; | ||
| 123 | rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; | ||
| 124 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 125 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", | ||
| 126 | SPLIT_NS(MIN_vruntime)); | ||
| 127 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", | ||
| 128 | SPLIT_NS(min_vruntime)); | ||
| 129 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", | ||
| 130 | SPLIT_NS(max_vruntime)); | ||
| 131 | spread = max_vruntime - MIN_vruntime; | ||
| 132 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", | ||
| 133 | SPLIT_NS(spread)); | ||
| 134 | spread0 = min_vruntime - rq0_min_vruntime; | ||
| 135 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | ||
| 136 | SPLIT_NS(spread0)); | ||
| 137 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
| 138 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
| 139 | #ifdef CONFIG_SCHEDSTATS | ||
| 140 | SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", | ||
| 141 | rq->bkl_count); | ||
| 142 | #endif | ||
| 143 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | ||
| 144 | cfs_rq->nr_spread_over); | ||
| 125 | } | 145 | } |
| 126 | 146 | ||
| 127 | static void print_cpu(struct seq_file *m, int cpu) | 147 | static void print_cpu(struct seq_file *m, int cpu) |
| @@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 141 | 161 | ||
| 142 | #define P(x) \ | 162 | #define P(x) \ |
| 143 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 163 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) |
| 164 | #define PN(x) \ | ||
| 165 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | ||
| 144 | 166 | ||
| 145 | P(nr_running); | 167 | P(nr_running); |
| 146 | SEQ_printf(m, " .%-30s: %lu\n", "load", | 168 | SEQ_printf(m, " .%-30s: %lu\n", "load", |
| 147 | rq->ls.load.weight); | 169 | rq->load.weight); |
| 148 | P(ls.delta_fair); | ||
| 149 | P(ls.delta_exec); | ||
| 150 | P(nr_switches); | 170 | P(nr_switches); |
| 151 | P(nr_load_updates); | 171 | P(nr_load_updates); |
| 152 | P(nr_uninterruptible); | 172 | P(nr_uninterruptible); |
| 153 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); | 173 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); |
| 154 | P(next_balance); | 174 | PN(next_balance); |
| 155 | P(curr->pid); | 175 | P(curr->pid); |
| 156 | P(clock); | 176 | PN(clock); |
| 157 | P(idle_clock); | 177 | PN(idle_clock); |
| 158 | P(prev_clock_raw); | 178 | PN(prev_clock_raw); |
| 159 | P(clock_warps); | 179 | P(clock_warps); |
| 160 | P(clock_overflows); | 180 | P(clock_overflows); |
| 161 | P(clock_deep_idle_events); | 181 | P(clock_deep_idle_events); |
| 162 | P(clock_max_delta); | 182 | PN(clock_max_delta); |
| 163 | P(cpu_load[0]); | 183 | P(cpu_load[0]); |
| 164 | P(cpu_load[1]); | 184 | P(cpu_load[1]); |
| 165 | P(cpu_load[2]); | 185 | P(cpu_load[2]); |
| 166 | P(cpu_load[3]); | 186 | P(cpu_load[3]); |
| 167 | P(cpu_load[4]); | 187 | P(cpu_load[4]); |
| 168 | #undef P | 188 | #undef P |
| 189 | #undef PN | ||
| 169 | 190 | ||
| 170 | print_cfs_stats(m, cpu); | 191 | print_cfs_stats(m, cpu); |
| 171 | 192 | ||
| @@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
| 177 | u64 now = ktime_to_ns(ktime_get()); | 198 | u64 now = ktime_to_ns(ktime_get()); |
| 178 | int cpu; | 199 | int cpu; |
| 179 | 200 | ||
| 180 | SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", | 201 | SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", |
| 181 | init_utsname()->release, | 202 | init_utsname()->release, |
| 182 | (int)strcspn(init_utsname()->version, " "), | 203 | (int)strcspn(init_utsname()->version, " "), |
| 183 | init_utsname()->version); | 204 | init_utsname()->version); |
| 184 | 205 | ||
| 185 | SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); | 206 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); |
| 207 | |||
| 208 | #define P(x) \ | ||
| 209 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | ||
| 210 | #define PN(x) \ | ||
| 211 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
| 212 | PN(sysctl_sched_latency); | ||
| 213 | PN(sysctl_sched_nr_latency); | ||
| 214 | PN(sysctl_sched_wakeup_granularity); | ||
| 215 | PN(sysctl_sched_batch_wakeup_granularity); | ||
| 216 | PN(sysctl_sched_child_runs_first); | ||
| 217 | P(sysctl_sched_features); | ||
| 218 | #undef PN | ||
| 219 | #undef P | ||
| 186 | 220 | ||
| 187 | for_each_online_cpu(cpu) | 221 | for_each_online_cpu(cpu) |
| 188 | print_cpu(m, cpu); | 222 | print_cpu(m, cpu); |
| @@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp) | |||
| 202 | return single_open(filp, sched_debug_show, NULL); | 236 | return single_open(filp, sched_debug_show, NULL); |
| 203 | } | 237 | } |
| 204 | 238 | ||
| 205 | static struct file_operations sched_debug_fops = { | 239 | static const struct file_operations sched_debug_fops = { |
| 206 | .open = sched_debug_open, | 240 | .open = sched_debug_open, |
| 207 | .read = seq_read, | 241 | .read = seq_read, |
| 208 | .llseek = seq_lseek, | 242 | .llseek = seq_lseek, |
| @@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs); | |||
| 226 | 260 | ||
| 227 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 261 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
| 228 | { | 262 | { |
| 263 | unsigned long nr_switches; | ||
| 229 | unsigned long flags; | 264 | unsigned long flags; |
| 230 | int num_threads = 1; | 265 | int num_threads = 1; |
| 231 | 266 | ||
| @@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 237 | rcu_read_unlock(); | 272 | rcu_read_unlock(); |
| 238 | 273 | ||
| 239 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); | 274 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); |
| 240 | SEQ_printf(m, "----------------------------------------------\n"); | 275 | SEQ_printf(m, |
| 276 | "---------------------------------------------------------\n"); | ||
| 277 | #define __P(F) \ | ||
| 278 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) | ||
| 241 | #define P(F) \ | 279 | #define P(F) \ |
| 242 | SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) | 280 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) |
| 281 | #define __PN(F) \ | ||
| 282 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
| 283 | #define PN(F) \ | ||
| 284 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
| 243 | 285 | ||
| 244 | P(se.wait_runtime); | 286 | PN(se.exec_start); |
| 245 | P(se.wait_start_fair); | 287 | PN(se.vruntime); |
| 246 | P(se.exec_start); | 288 | PN(se.sum_exec_runtime); |
| 247 | P(se.sleep_start_fair); | 289 | |
| 248 | P(se.sum_exec_runtime); | 290 | nr_switches = p->nvcsw + p->nivcsw; |
| 249 | 291 | ||
| 250 | #ifdef CONFIG_SCHEDSTATS | 292 | #ifdef CONFIG_SCHEDSTATS |
| 251 | P(se.wait_start); | 293 | PN(se.wait_start); |
| 252 | P(se.sleep_start); | 294 | PN(se.sleep_start); |
| 253 | P(se.block_start); | 295 | PN(se.block_start); |
| 254 | P(se.sleep_max); | 296 | PN(se.sleep_max); |
| 255 | P(se.block_max); | 297 | PN(se.block_max); |
| 256 | P(se.exec_max); | 298 | PN(se.exec_max); |
| 257 | P(se.wait_max); | 299 | PN(se.slice_max); |
| 258 | P(se.wait_runtime_overruns); | 300 | PN(se.wait_max); |
| 259 | P(se.wait_runtime_underruns); | 301 | P(sched_info.bkl_count); |
| 260 | P(se.sum_wait_runtime); | 302 | P(se.nr_migrations); |
| 303 | P(se.nr_migrations_cold); | ||
| 304 | P(se.nr_failed_migrations_affine); | ||
| 305 | P(se.nr_failed_migrations_running); | ||
| 306 | P(se.nr_failed_migrations_hot); | ||
| 307 | P(se.nr_forced_migrations); | ||
| 308 | P(se.nr_forced2_migrations); | ||
| 309 | P(se.nr_wakeups); | ||
| 310 | P(se.nr_wakeups_sync); | ||
| 311 | P(se.nr_wakeups_migrate); | ||
| 312 | P(se.nr_wakeups_local); | ||
| 313 | P(se.nr_wakeups_remote); | ||
| 314 | P(se.nr_wakeups_affine); | ||
| 315 | P(se.nr_wakeups_affine_attempts); | ||
| 316 | P(se.nr_wakeups_passive); | ||
| 317 | P(se.nr_wakeups_idle); | ||
| 318 | |||
| 319 | { | ||
| 320 | u64 avg_atom, avg_per_cpu; | ||
| 321 | |||
| 322 | avg_atom = p->se.sum_exec_runtime; | ||
| 323 | if (nr_switches) | ||
| 324 | do_div(avg_atom, nr_switches); | ||
| 325 | else | ||
| 326 | avg_atom = -1LL; | ||
| 327 | |||
| 328 | avg_per_cpu = p->se.sum_exec_runtime; | ||
| 329 | if (p->se.nr_migrations) | ||
| 330 | avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); | ||
| 331 | else | ||
| 332 | avg_per_cpu = -1LL; | ||
| 333 | |||
| 334 | __PN(avg_atom); | ||
| 335 | __PN(avg_per_cpu); | ||
| 336 | } | ||
| 261 | #endif | 337 | #endif |
| 262 | SEQ_printf(m, "%-25s:%20Ld\n", | 338 | __P(nr_switches); |
| 263 | "nr_switches", (long long)(p->nvcsw + p->nivcsw)); | 339 | SEQ_printf(m, "%-35s:%21Ld\n", |
| 340 | "nr_voluntary_switches", (long long)p->nvcsw); | ||
| 341 | SEQ_printf(m, "%-35s:%21Ld\n", | ||
| 342 | "nr_involuntary_switches", (long long)p->nivcsw); | ||
| 343 | |||
| 264 | P(se.load.weight); | 344 | P(se.load.weight); |
| 265 | P(policy); | 345 | P(policy); |
| 266 | P(prio); | 346 | P(prio); |
| 347 | #undef PN | ||
| 348 | #undef __PN | ||
| 267 | #undef P | 349 | #undef P |
| 350 | #undef __P | ||
| 268 | 351 | ||
| 269 | { | 352 | { |
| 270 | u64 t0, t1; | 353 | u64 t0, t1; |
| 271 | 354 | ||
| 272 | t0 = sched_clock(); | 355 | t0 = sched_clock(); |
| 273 | t1 = sched_clock(); | 356 | t1 = sched_clock(); |
| 274 | SEQ_printf(m, "%-25s:%20Ld\n", | 357 | SEQ_printf(m, "%-35s:%21Ld\n", |
| 275 | "clock-delta", (long long)(t1-t0)); | 358 | "clock-delta", (long long)(t1-t0)); |
| 276 | } | 359 | } |
| 277 | } | 360 | } |
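The avg_atom and avg_per_cpu fields added in the hunk above are plain ratios: total CPU time divided by the number of context switches, and by the number of migrations, with -1 reported when the divisor is zero. A quick illustration with made-up sample counters:

    #include <stdio.h>
    #include <stdint.h>

    static long long safe_div(uint64_t sum, uint64_t count)
    {
        return count ? (long long)(sum / count) : -1LL;
    }

    int main(void)
    {
        uint64_t sum_exec_runtime = 300000000ULL;   /* 300 ms of CPU time, made up */
        uint64_t nr_switches = 1500;                /* voluntary + involuntary */
        uint64_t nr_migrations = 4;

        printf("avg_atom:    %lld ns of CPU time per scheduling atom\n",
               safe_div(sum_exec_runtime, nr_switches));
        printf("avg_per_cpu: %lld ns of CPU time per stay on one CPU\n",
               safe_div(sum_exec_runtime, nr_migrations));
        return 0;
    }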
| @@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 279 | void proc_sched_set_task(struct task_struct *p) | 362 | void proc_sched_set_task(struct task_struct *p) |
| 280 | { | 363 | { |
| 281 | #ifdef CONFIG_SCHEDSTATS | 364 | #ifdef CONFIG_SCHEDSTATS |
| 282 | p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; | 365 | p->se.wait_max = 0; |
| 283 | p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; | 366 | p->se.sleep_max = 0; |
| 367 | p->se.sum_sleep_runtime = 0; | ||
| 368 | p->se.block_max = 0; | ||
| 369 | p->se.exec_max = 0; | ||
| 370 | p->se.slice_max = 0; | ||
| 371 | p->se.nr_migrations = 0; | ||
| 372 | p->se.nr_migrations_cold = 0; | ||
| 373 | p->se.nr_failed_migrations_affine = 0; | ||
| 374 | p->se.nr_failed_migrations_running = 0; | ||
| 375 | p->se.nr_failed_migrations_hot = 0; | ||
| 376 | p->se.nr_forced_migrations = 0; | ||
| 377 | p->se.nr_forced2_migrations = 0; | ||
| 378 | p->se.nr_wakeups = 0; | ||
| 379 | p->se.nr_wakeups_sync = 0; | ||
| 380 | p->se.nr_wakeups_migrate = 0; | ||
| 381 | p->se.nr_wakeups_local = 0; | ||
| 382 | p->se.nr_wakeups_remote = 0; | ||
| 383 | p->se.nr_wakeups_affine = 0; | ||
| 384 | p->se.nr_wakeups_affine_attempts = 0; | ||
| 385 | p->se.nr_wakeups_passive = 0; | ||
| 386 | p->se.nr_wakeups_idle = 0; | ||
| 387 | p->sched_info.bkl_count = 0; | ||
| 284 | #endif | 388 | #endif |
| 285 | p->se.sum_exec_runtime = 0; | 389 | p->se.sum_exec_runtime = 0; |
| 286 | p->se.prev_sum_exec_runtime = 0; | 390 | p->se.prev_sum_exec_runtime = 0; |
| 391 | p->nvcsw = 0; | ||
| 392 | p->nivcsw = 0; | ||
| 287 | } | 393 | } |
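With the old wait_runtime statistics gone, proc_sched_set_task() now clears every one of the new per-entity schedstat counters explicitly and also zeroes the voluntary/involuntary context-switch counts, so a reset gives a clean baseline for everything /proc/<pid>/sched prints. A small user-space sketch of triggering that reset follows; it assumes the write handler for /proc/<pid>/sched ends up calling proc_sched_set_task(), which is how fs/proc/base.c wires it up in this series.

/* Hypothetical helper: reset a task's scheduler statistics by writing to
 * /proc/<pid>/sched (assumes the proc write handler calls
 * proc_sched_set_task(), as wired up in fs/proc/base.c). */
#include <stdio.h>
#include <sys/types.h>

static int reset_sched_stats(pid_t pid)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/sched", (int)pid);
        f = fopen(path, "w");
        if (!f)
                return -1;      /* no such task, or not privileged */
        fputs("0\n", f);
        fclose(f);
        return 0;
}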
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67c67a87146e..a17b785d7000 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -25,22 +25,26 @@ | |||
| 25 | * (default: 20ms, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
| 26 | * | 26 | * |
| 27 | * NOTE: this latency value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
| 28 | * 'timeslice length' - timeslices in CFS are of variable length. | 28 | * 'timeslice length' - timeslices in CFS are of variable length |
| 29 | * (to see the precise effective timeslice length of your workload, | 29 | * and have no persistent notion like in traditional, time-slice |
| 30 | * run vmstat and monitor the context-switches field) | 30 | * based scheduling concepts. |
| 31 | * | 31 | * |
| 32 | * On SMP systems the value of this is multiplied by the log2 of the | 32 | * (to see the precise effective timeslice length of your workload, |
| 33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * run vmstat and monitor the context-switches (cs) field) |
| 34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | ||
| 35 | * Targeted preemption latency for CPU-bound tasks: | ||
| 36 | */ | 34 | */ |
| 37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; | 35 | const_debug unsigned int sysctl_sched_latency = 20000000ULL; |
| 36 | |||
| 37 | /* | ||
| 38 | * After fork, child runs first. (default) If set to 0 then | ||
| 39 | * parent will (try to) run first. | ||
| 40 | */ | ||
| 41 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | ||
| 38 | 42 | ||
| 39 | /* | 43 | /* |
| 40 | * Minimal preemption granularity for CPU-bound tasks: | 44 | * Minimal preemption granularity for CPU-bound tasks: |
| 41 | * (default: 2 msec, units: nanoseconds) | 45 | * (default: 2 msec, units: nanoseconds) |
| 42 | */ | 46 | */ |
| 43 | unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; | 47 | const_debug unsigned int sysctl_sched_nr_latency = 20; |
| 44 | 48 | ||
| 45 | /* | 49 | /* |
| 46 | * sys_sched_yield() compat mode | 50 | * sys_sched_yield() compat mode |
| @@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
| 52 | 56 | ||
| 53 | /* | 57 | /* |
| 54 | * SCHED_BATCH wake-up granularity. | 58 | * SCHED_BATCH wake-up granularity. |
| 55 | * (default: 25 msec, units: nanoseconds) | 59 | * (default: 10 msec, units: nanoseconds) |
| 56 | * | 60 | * |
| 57 | * This option delays the preemption effects of decoupled workloads | 61 | * This option delays the preemption effects of decoupled workloads |
| 58 | * and reduces their over-scheduling. Synchronous workloads will still | 62 | * and reduces their over-scheduling. Synchronous workloads will still |
| 59 | * have immediate wakeup/sleep latencies. | 63 | * have immediate wakeup/sleep latencies. |
| 60 | */ | 64 | */ |
| 61 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; | 65 | const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; |
| 62 | 66 | ||
| 63 | /* | 67 | /* |
| 64 | * SCHED_OTHER wake-up granularity. | 68 | * SCHED_OTHER wake-up granularity. |
| 65 | * (default: 1 msec, units: nanoseconds) | 69 | * (default: 10 msec, units: nanoseconds) |
| 66 | * | 70 | * |
| 67 | * This option delays the preemption effects of decoupled workloads | 71 | * This option delays the preemption effects of decoupled workloads |
| 68 | * and reduces their over-scheduling. Synchronous workloads will still | 72 | * and reduces their over-scheduling. Synchronous workloads will still |
| 69 | * have immediate wakeup/sleep latencies. | 73 | * have immediate wakeup/sleep latencies. |
| 70 | */ | 74 | */ |
| 71 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; | 75 | const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
| 72 | |||
| 73 | unsigned int sysctl_sched_stat_granularity __read_mostly; | ||
| 74 | |||
| 75 | /* | ||
| 76 | * Initialized in sched_init_granularity() [to 5 times the base granularity]: | ||
| 77 | */ | ||
| 78 | unsigned int sysctl_sched_runtime_limit __read_mostly; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Debugging: various feature bits | ||
| 82 | */ | ||
| 83 | enum { | ||
| 84 | SCHED_FEAT_FAIR_SLEEPERS = 1, | ||
| 85 | SCHED_FEAT_SLEEPER_AVG = 2, | ||
| 86 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, | ||
| 87 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, | ||
| 88 | SCHED_FEAT_START_DEBIT = 16, | ||
| 89 | SCHED_FEAT_SKIP_INITIAL = 32, | ||
| 90 | }; | ||
| 91 | 76 | ||
| 92 | unsigned int sysctl_sched_features __read_mostly = | 77 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 93 | SCHED_FEAT_FAIR_SLEEPERS *1 | | ||
| 94 | SCHED_FEAT_SLEEPER_AVG *0 | | ||
| 95 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | ||
| 96 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | ||
| 97 | SCHED_FEAT_START_DEBIT *1 | | ||
| 98 | SCHED_FEAT_SKIP_INITIAL *0; | ||
| 99 | |||
| 100 | extern struct sched_class fair_sched_class; | ||
| 101 | 78 | ||
| 102 | /************************************************************** | 79 | /************************************************************** |
| 103 | * CFS operations on generic schedulable entities: | 80 | * CFS operations on generic schedulable entities: |
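All of these knobs are now marked const_debug: they stay runtime-tunable on CONFIG_SCHED_DEBUG kernels and become compile-time constants otherwise, so production builds pay nothing for them. The qualifier itself is defined elsewhere in this series (kernel/sched.c), so treat the exact form below as an assumption; this is the usual shape of such a macro. Note also the new defaults: both wakeup granularities are now 10 msec, and sysctl_sched_migration_cost (500 usec) is the cut-off below which a recently-run task is treated as cache-hot and left alone by the load balancer.

/* Sketch (assumption about kernel/sched.c in this series): a const_debug
 * qualifier that is writable only when scheduler debugging is built in. */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug const
#endif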
| @@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 111 | return cfs_rq->rq; | 88 | return cfs_rq->rq; |
| 112 | } | 89 | } |
| 113 | 90 | ||
| 114 | /* currently running entity (if any) on this cfs_rq */ | ||
| 115 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
| 116 | { | ||
| 117 | return cfs_rq->curr; | ||
| 118 | } | ||
| 119 | |||
| 120 | /* An entity is a task if it doesn't "own" a runqueue */ | 91 | /* An entity is a task if it doesn't "own" a runqueue */ |
| 121 | #define entity_is_task(se) (!se->my_q) | 92 | #define entity_is_task(se) (!se->my_q) |
| 122 | 93 | ||
| 123 | static inline void | ||
| 124 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 125 | { | ||
| 126 | cfs_rq->curr = se; | ||
| 127 | } | ||
| 128 | |||
| 129 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 94 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| 130 | 95 | ||
| 131 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 96 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
| @@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 133 | return container_of(cfs_rq, struct rq, cfs); | 98 | return container_of(cfs_rq, struct rq, cfs); |
| 134 | } | 99 | } |
| 135 | 100 | ||
| 136 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
| 137 | { | ||
| 138 | struct rq *rq = rq_of(cfs_rq); | ||
| 139 | |||
| 140 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) | ||
| 141 | return NULL; | ||
| 142 | |||
| 143 | return &rq->curr->se; | ||
| 144 | } | ||
| 145 | |||
| 146 | #define entity_is_task(se) 1 | 101 | #define entity_is_task(se) 1 |
| 147 | 102 | ||
| 148 | static inline void | ||
| 149 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
| 150 | |||
| 151 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 103 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 152 | 104 | ||
| 153 | static inline struct task_struct *task_of(struct sched_entity *se) | 105 | static inline struct task_struct *task_of(struct sched_entity *se) |
| @@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se) | |||
| 160 | * Scheduling class tree data structure manipulation methods: | 112 | * Scheduling class tree data structure manipulation methods: |
| 161 | */ | 113 | */ |
| 162 | 114 | ||
| 115 | static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) | ||
| 116 | { | ||
| 117 | s64 delta = (s64)(vruntime - min_vruntime); | ||
| 118 | if (delta > 0) | ||
| 119 | min_vruntime = vruntime; | ||
| 120 | |||
| 121 | return min_vruntime; | ||
| 122 | } | ||
| 123 | |||
| 124 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) | ||
| 125 | { | ||
| 126 | s64 delta = (s64)(vruntime - min_vruntime); | ||
| 127 | if (delta < 0) | ||
| 128 | min_vruntime = vruntime; | ||
| 129 | |||
| 130 | return min_vruntime; | ||
| 131 | } | ||
| 132 | |||
| 133 | static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 134 | { | ||
| 135 | return se->vruntime - cfs_rq->min_vruntime; | ||
| 136 | } | ||
| 137 | |||
| 163 | /* | 138 | /* |
| 164 | * Enqueue an entity into the rb-tree: | 139 | * Enqueue an entity into the rb-tree: |
| 165 | */ | 140 | */ |
| 166 | static inline void | 141 | static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 167 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 168 | { | 142 | { |
| 169 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 143 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
| 170 | struct rb_node *parent = NULL; | 144 | struct rb_node *parent = NULL; |
| 171 | struct sched_entity *entry; | 145 | struct sched_entity *entry; |
| 172 | s64 key = se->fair_key; | 146 | s64 key = entity_key(cfs_rq, se); |
| 173 | int leftmost = 1; | 147 | int leftmost = 1; |
| 174 | 148 | ||
| 175 | /* | 149 | /* |
| @@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 182 | * We dont care about collisions. Nodes with | 156 | * We dont care about collisions. Nodes with |
| 183 | * the same key stay together. | 157 | * the same key stay together. |
| 184 | */ | 158 | */ |
| 185 | if (key - entry->fair_key < 0) { | 159 | if (key < entity_key(cfs_rq, entry)) { |
| 186 | link = &parent->rb_left; | 160 | link = &parent->rb_left; |
| 187 | } else { | 161 | } else { |
| 188 | link = &parent->rb_right; | 162 | link = &parent->rb_right; |
| @@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 199 | 173 | ||
| 200 | rb_link_node(&se->run_node, parent, link); | 174 | rb_link_node(&se->run_node, parent, link); |
| 201 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | 175 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); |
| 202 | update_load_add(&cfs_rq->load, se->load.weight); | ||
| 203 | cfs_rq->nr_running++; | ||
| 204 | se->on_rq = 1; | ||
| 205 | |||
| 206 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
| 207 | } | 176 | } |
| 208 | 177 | ||
| 209 | static inline void | 178 | static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 210 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 211 | { | 179 | { |
| 212 | if (cfs_rq->rb_leftmost == &se->run_node) | 180 | if (cfs_rq->rb_leftmost == &se->run_node) |
| 213 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | 181 | cfs_rq->rb_leftmost = rb_next(&se->run_node); |
| 214 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
| 215 | update_load_sub(&cfs_rq->load, se->load.weight); | ||
| 216 | cfs_rq->nr_running--; | ||
| 217 | se->on_rq = 0; | ||
| 218 | 182 | ||
| 219 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 183 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
| 220 | } | 184 | } |
| 221 | 185 | ||
| 222 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | 186 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) |
| @@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
| 229 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); | 193 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); |
| 230 | } | 194 | } |
| 231 | 195 | ||
| 196 | static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | ||
| 197 | { | ||
| 198 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | ||
| 199 | struct sched_entity *se = NULL; | ||
| 200 | struct rb_node *parent; | ||
| 201 | |||
| 202 | while (*link) { | ||
| 203 | parent = *link; | ||
| 204 | se = rb_entry(parent, struct sched_entity, run_node); | ||
| 205 | link = &parent->rb_right; | ||
| 206 | } | ||
| 207 | |||
| 208 | return se; | ||
| 209 | } | ||
| 210 | |||
| 232 | /************************************************************** | 211 | /************************************************************** |
| 233 | * Scheduling class statistics methods: | 212 | * Scheduling class statistics methods: |
| 234 | */ | 213 | */ |
| 235 | 214 | ||
| 215 | |||
| 236 | /* | 216 | /* |
| 237 | * Calculate the preemption granularity needed to schedule every | 217 | * The idea is to set a period in which each task runs once. |
| 238 | * runnable task once per sysctl_sched_latency amount of time. | ||
| 239 | * (down to a sensible low limit on granularity) | ||
| 240 | * | ||
| 241 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
| 242 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
| 243 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
| 244 | * for each task. We do finer and finer scheduling up to until we | ||
| 245 | * reach the minimum granularity value. | ||
| 246 | * | ||
| 247 | * To achieve this we use the following dynamic-granularity rule: | ||
| 248 | * | 218 | * |
| 249 | * gran = lat/nr - lat/nr/nr | 219 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
| 220 | * this period because otherwise the slices get too small. | ||
| 250 | * | 221 | * |
| 251 | * This comes out of the following equations: | 222 | * p = (nr <= nl) ? l : l*nr/nl |
| 252 | * | ||
| 253 | * kA1 + gran = kB1 | ||
| 254 | * kB2 + gran = kA2 | ||
| 255 | * kA2 = kA1 | ||
| 256 | * kB2 = kB1 - d + d/nr | ||
| 257 | * lat = d * nr | ||
| 258 | * | ||
| 259 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
| 260 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
| 261 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
| 262 | * running, 'lat' is the the period of each task. ('lat' is the | ||
| 263 | * sched_latency that we aim for.) | ||
| 264 | */ | 223 | */ |
| 265 | static long | 224 | static u64 __sched_period(unsigned long nr_running) |
| 266 | sched_granularity(struct cfs_rq *cfs_rq) | ||
| 267 | { | 225 | { |
| 268 | unsigned int gran = sysctl_sched_latency; | 226 | u64 period = sysctl_sched_latency; |
| 269 | unsigned int nr = cfs_rq->nr_running; | 227 | unsigned long nr_latency = sysctl_sched_nr_latency; |
| 270 | 228 | ||
| 271 | if (nr > 1) { | 229 | if (unlikely(nr_running > nr_latency)) { |
| 272 | gran = gran/nr - gran/nr/nr; | 230 | period *= nr_running; |
| 273 | gran = max(gran, sysctl_sched_min_granularity); | 231 | do_div(period, nr_latency); |
| 274 | } | 232 | } |
| 275 | 233 | ||
| 276 | return gran; | 234 | return period; |
| 277 | } | 235 | } |
| 278 | 236 | ||
| 279 | /* | 237 | /* |
| 280 | * We rescale the rescheduling granularity of tasks according to their | 238 | * We calculate the wall-time slice from the period by taking a part |
| 281 | * nice level, but only linearly, not exponentially: | 239 | * proportional to the weight. |
| 240 | * | ||
| 241 | * s = p*w/rw | ||
| 282 | */ | 242 | */ |
| 283 | static long | 243 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 284 | niced_granularity(struct sched_entity *curr, unsigned long granularity) | ||
| 285 | { | 244 | { |
| 286 | u64 tmp; | 245 | u64 slice = __sched_period(cfs_rq->nr_running); |
| 287 | 246 | ||
| 288 | if (likely(curr->load.weight == NICE_0_LOAD)) | 247 | slice *= se->load.weight; |
| 289 | return granularity; | 248 | do_div(slice, cfs_rq->load.weight); |
| 290 | /* | ||
| 291 | * Positive nice levels get the same granularity as nice-0: | ||
| 292 | */ | ||
| 293 | if (likely(curr->load.weight < NICE_0_LOAD)) { | ||
| 294 | tmp = curr->load.weight * (u64)granularity; | ||
| 295 | return (long) (tmp >> NICE_0_SHIFT); | ||
| 296 | } | ||
| 297 | /* | ||
| 298 | * Negative nice level tasks get linearly finer | ||
| 299 | * granularity: | ||
| 300 | */ | ||
| 301 | tmp = curr->load.inv_weight * (u64)granularity; | ||
| 302 | 249 | ||
| 303 | /* | 250 | return slice; |
| 304 | * It will always fit into 'long': | ||
| 305 | */ | ||
| 306 | return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); | ||
| 307 | } | 251 | } |
| 308 | 252 | ||
| 309 | static inline void | 253 | /* |
| 310 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) | 254 | * We calculate the vruntime slice. |
| 255 | * | ||
| 256 | * vs = s/w = p/rw | ||
| 257 | */ | ||
| 258 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | ||
| 311 | { | 259 | { |
| 312 | long limit = sysctl_sched_runtime_limit; | 260 | u64 vslice = __sched_period(nr_running); |
| 313 | 261 | ||
| 314 | /* | 262 | do_div(vslice, rq_weight); |
| 315 | * Niced tasks have the same history dynamic range as | 263 | |
| 316 | * non-niced tasks: | 264 | return vslice; |
| 317 | */ | ||
| 318 | if (unlikely(se->wait_runtime > limit)) { | ||
| 319 | se->wait_runtime = limit; | ||
| 320 | schedstat_inc(se, wait_runtime_overruns); | ||
| 321 | schedstat_inc(cfs_rq, wait_runtime_overruns); | ||
| 322 | } | ||
| 323 | if (unlikely(se->wait_runtime < -limit)) { | ||
| 324 | se->wait_runtime = -limit; | ||
| 325 | schedstat_inc(se, wait_runtime_underruns); | ||
| 326 | schedstat_inc(cfs_rq, wait_runtime_underruns); | ||
| 327 | } | ||
| 328 | } | 265 | } |
| 329 | 266 | ||
| 330 | static inline void | 267 | static u64 sched_vslice(struct cfs_rq *cfs_rq) |
| 331 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
| 332 | { | 268 | { |
| 333 | se->wait_runtime += delta; | 269 | return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); |
| 334 | schedstat_add(se, sum_wait_runtime, delta); | ||
| 335 | limit_wait_runtime(cfs_rq, se); | ||
| 336 | } | 270 | } |
| 337 | 271 | ||
| 338 | static void | 272 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 339 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
| 340 | { | 273 | { |
| 341 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 274 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, |
| 342 | __add_wait_runtime(cfs_rq, se, delta); | 275 | cfs_rq->nr_running + 1); |
| 343 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
| 344 | } | 276 | } |
| 345 | 277 | ||
| 346 | /* | 278 | /* |
| @@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | |||
| 348 | * are not in our scheduling class. | 280 | * are not in our scheduling class. |
| 349 | */ | 281 | */ |
| 350 | static inline void | 282 | static inline void |
| 351 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 283 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, |
| 284 | unsigned long delta_exec) | ||
| 352 | { | 285 | { |
| 353 | unsigned long delta, delta_exec, delta_fair, delta_mine; | 286 | unsigned long delta_exec_weighted; |
| 354 | struct load_weight *lw = &cfs_rq->load; | 287 | u64 vruntime; |
| 355 | unsigned long load = lw->weight; | ||
| 356 | 288 | ||
| 357 | delta_exec = curr->delta_exec; | ||
| 358 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 289 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
| 359 | 290 | ||
| 360 | curr->sum_exec_runtime += delta_exec; | 291 | curr->sum_exec_runtime += delta_exec; |
| 361 | cfs_rq->exec_clock += delta_exec; | 292 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
| 362 | 293 | delta_exec_weighted = delta_exec; | |
| 363 | if (unlikely(!load)) | 294 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { |
| 364 | return; | 295 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, |
| 365 | 296 | &curr->load); | |
| 366 | delta_fair = calc_delta_fair(delta_exec, lw); | ||
| 367 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | ||
| 368 | |||
| 369 | if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { | ||
| 370 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); | ||
| 371 | delta = min(delta, (unsigned long)( | ||
| 372 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); | ||
| 373 | cfs_rq->sleeper_bonus -= delta; | ||
| 374 | delta_mine -= delta; | ||
| 375 | } | 297 | } |
| 298 | curr->vruntime += delta_exec_weighted; | ||
| 376 | 299 | ||
| 377 | cfs_rq->fair_clock += delta_fair; | ||
| 378 | /* | 300 | /* |
| 379 | * We executed delta_exec amount of time on the CPU, | 301 | * maintain cfs_rq->min_vruntime to be a monotonic increasing |
| 380 | * but we were only entitled to delta_mine amount of | 302 | * value tracking the leftmost vruntime in the tree. |
| 381 | * time during that period (if nr_running == 1 then | ||
| 382 | * the two values are equal) | ||
| 383 | * [Note: delta_mine - delta_exec is negative]: | ||
| 384 | */ | 303 | */ |
| 385 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | 304 | if (first_fair(cfs_rq)) { |
| 305 | vruntime = min_vruntime(curr->vruntime, | ||
| 306 | __pick_next_entity(cfs_rq)->vruntime); | ||
| 307 | } else | ||
| 308 | vruntime = curr->vruntime; | ||
| 309 | |||
| 310 | cfs_rq->min_vruntime = | ||
| 311 | max_vruntime(cfs_rq->min_vruntime, vruntime); | ||
| 386 | } | 312 | } |
| 387 | 313 | ||
| 388 | static void update_curr(struct cfs_rq *cfs_rq) | 314 | static void update_curr(struct cfs_rq *cfs_rq) |
| 389 | { | 315 | { |
| 390 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | 316 | struct sched_entity *curr = cfs_rq->curr; |
| 317 | u64 now = rq_of(cfs_rq)->clock; | ||
| 391 | unsigned long delta_exec; | 318 | unsigned long delta_exec; |
| 392 | 319 | ||
| 393 | if (unlikely(!curr)) | 320 | if (unlikely(!curr)) |
| @@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
| 398 | * since the last time we changed load (this cannot | 325 | * since the last time we changed load (this cannot |
| 399 | * overflow on 32 bits): | 326 | * overflow on 32 bits): |
| 400 | */ | 327 | */ |
| 401 | delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); | 328 | delta_exec = (unsigned long)(now - curr->exec_start); |
| 402 | 329 | ||
| 403 | curr->delta_exec += delta_exec; | 330 | __update_curr(cfs_rq, curr, delta_exec); |
| 404 | 331 | curr->exec_start = now; | |
| 405 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | ||
| 406 | __update_curr(cfs_rq, curr); | ||
| 407 | curr->delta_exec = 0; | ||
| 408 | } | ||
| 409 | curr->exec_start = rq_of(cfs_rq)->clock; | ||
| 410 | } | 332 | } |
| 411 | 333 | ||
| 412 | static inline void | 334 | static inline void |
| 413 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 335 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 414 | { | 336 | { |
| 415 | se->wait_start_fair = cfs_rq->fair_clock; | ||
| 416 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); | 337 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); |
| 417 | } | 338 | } |
| 418 | 339 | ||
| 419 | /* | 340 | /* |
| 420 | * We calculate fair deltas here, so protect against the random effects | ||
| 421 | * of a multiplication overflow by capping it to the runtime limit: | ||
| 422 | */ | ||
| 423 | #if BITS_PER_LONG == 32 | ||
| 424 | static inline unsigned long | ||
| 425 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
| 426 | { | ||
| 427 | u64 tmp = (u64)delta * weight >> shift; | ||
| 428 | |||
| 429 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) | ||
| 430 | return sysctl_sched_runtime_limit*2; | ||
| 431 | return tmp; | ||
| 432 | } | ||
| 433 | #else | ||
| 434 | static inline unsigned long | ||
| 435 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
| 436 | { | ||
| 437 | return delta * weight >> shift; | ||
| 438 | } | ||
| 439 | #endif | ||
| 440 | |||
| 441 | /* | ||
| 442 | * Task is being enqueued - update stats: | 341 | * Task is being enqueued - update stats: |
| 443 | */ | 342 | */ |
| 444 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 343 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 445 | { | 344 | { |
| 446 | s64 key; | ||
| 447 | |||
| 448 | /* | 345 | /* |
| 449 | * Are we enqueueing a waiting task? (for current tasks | 346 | * Are we enqueueing a waiting task? (for current tasks |
| 450 | * a dequeue/enqueue event is a NOP) | 347 | * a dequeue/enqueue event is a NOP) |
| 451 | */ | 348 | */ |
| 452 | if (se != cfs_rq_curr(cfs_rq)) | 349 | if (se != cfs_rq->curr) |
| 453 | update_stats_wait_start(cfs_rq, se); | 350 | update_stats_wait_start(cfs_rq, se); |
| 454 | /* | ||
| 455 | * Update the key: | ||
| 456 | */ | ||
| 457 | key = cfs_rq->fair_clock; | ||
| 458 | |||
| 459 | /* | ||
| 460 | * Optimize the common nice 0 case: | ||
| 461 | */ | ||
| 462 | if (likely(se->load.weight == NICE_0_LOAD)) { | ||
| 463 | key -= se->wait_runtime; | ||
| 464 | } else { | ||
| 465 | u64 tmp; | ||
| 466 | |||
| 467 | if (se->wait_runtime < 0) { | ||
| 468 | tmp = -se->wait_runtime; | ||
| 469 | key += (tmp * se->load.inv_weight) >> | ||
| 470 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
| 471 | } else { | ||
| 472 | tmp = se->wait_runtime; | ||
| 473 | key -= (tmp * se->load.inv_weight) >> | ||
| 474 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
| 475 | } | ||
| 476 | } | ||
| 477 | |||
| 478 | se->fair_key = key; | ||
| 479 | } | ||
| 480 | |||
| 481 | /* | ||
| 482 | * Note: must be called with a freshly updated rq->fair_clock. | ||
| 483 | */ | ||
| 484 | static inline void | ||
| 485 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 486 | { | ||
| 487 | unsigned long delta_fair = se->delta_fair_run; | ||
| 488 | |||
| 489 | schedstat_set(se->wait_max, max(se->wait_max, | ||
| 490 | rq_of(cfs_rq)->clock - se->wait_start)); | ||
| 491 | |||
| 492 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
| 493 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
| 494 | NICE_0_SHIFT); | ||
| 495 | |||
| 496 | add_wait_runtime(cfs_rq, se, delta_fair); | ||
| 497 | } | 351 | } |
| 498 | 352 | ||
| 499 | static void | 353 | static void |
| 500 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 354 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 501 | { | 355 | { |
| 502 | unsigned long delta_fair; | 356 | schedstat_set(se->wait_max, max(se->wait_max, |
| 503 | 357 | rq_of(cfs_rq)->clock - se->wait_start)); | |
| 504 | if (unlikely(!se->wait_start_fair)) | ||
| 505 | return; | ||
| 506 | |||
| 507 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
| 508 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | ||
| 509 | |||
| 510 | se->delta_fair_run += delta_fair; | ||
| 511 | if (unlikely(abs(se->delta_fair_run) >= | ||
| 512 | sysctl_sched_stat_granularity)) { | ||
| 513 | __update_stats_wait_end(cfs_rq, se); | ||
| 514 | se->delta_fair_run = 0; | ||
| 515 | } | ||
| 516 | |||
| 517 | se->wait_start_fair = 0; | ||
| 518 | schedstat_set(se->wait_start, 0); | 358 | schedstat_set(se->wait_start, 0); |
| 519 | } | 359 | } |
| 520 | 360 | ||
| 521 | static inline void | 361 | static inline void |
| 522 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 362 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 523 | { | 363 | { |
| 524 | update_curr(cfs_rq); | ||
| 525 | /* | 364 | /* |
| 526 | * Mark the end of the wait period if dequeueing a | 365 | * Mark the end of the wait period if dequeueing a |
| 527 | * waiting task: | 366 | * waiting task: |
| 528 | */ | 367 | */ |
| 529 | if (se != cfs_rq_curr(cfs_rq)) | 368 | if (se != cfs_rq->curr) |
| 530 | update_stats_wait_end(cfs_rq, se); | 369 | update_stats_wait_end(cfs_rq, se); |
| 531 | } | 370 | } |
| 532 | 371 | ||
| @@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 542 | se->exec_start = rq_of(cfs_rq)->clock; | 381 | se->exec_start = rq_of(cfs_rq)->clock; |
| 543 | } | 382 | } |
| 544 | 383 | ||
| 545 | /* | ||
| 546 | * We are descheduling a task - update its stats: | ||
| 547 | */ | ||
| 548 | static inline void | ||
| 549 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 550 | { | ||
| 551 | se->exec_start = 0; | ||
| 552 | } | ||
| 553 | |||
| 554 | /************************************************** | 384 | /************************************************** |
| 555 | * Scheduling class queueing methods: | 385 | * Scheduling class queueing methods: |
| 556 | */ | 386 | */ |
| 557 | 387 | ||
| 558 | static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 388 | static void |
| 389 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 559 | { | 390 | { |
| 560 | unsigned long load = cfs_rq->load.weight, delta_fair; | 391 | update_load_add(&cfs_rq->load, se->load.weight); |
| 561 | long prev_runtime; | 392 | cfs_rq->nr_running++; |
| 562 | 393 | se->on_rq = 1; | |
| 563 | /* | 394 | } |
| 564 | * Do not boost sleepers if there's too much bonus 'in flight' | ||
| 565 | * already: | ||
| 566 | */ | ||
| 567 | if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) | ||
| 568 | return; | ||
| 569 | |||
| 570 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | ||
| 571 | load = rq_of(cfs_rq)->cpu_load[2]; | ||
| 572 | |||
| 573 | delta_fair = se->delta_fair_sleep; | ||
| 574 | |||
| 575 | /* | ||
| 576 | * Fix up delta_fair with the effect of us running | ||
| 577 | * during the whole sleep period: | ||
| 578 | */ | ||
| 579 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) | ||
| 580 | delta_fair = div64_likely32((u64)delta_fair * load, | ||
| 581 | load + se->load.weight); | ||
| 582 | |||
| 583 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
| 584 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
| 585 | NICE_0_SHIFT); | ||
| 586 | |||
| 587 | prev_runtime = se->wait_runtime; | ||
| 588 | __add_wait_runtime(cfs_rq, se, delta_fair); | ||
| 589 | delta_fair = se->wait_runtime - prev_runtime; | ||
| 590 | 395 | ||
| 591 | /* | 396 | static void |
| 592 | * Track the amount of bonus we've given to sleepers: | 397 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 593 | */ | 398 | { |
| 594 | cfs_rq->sleeper_bonus += delta_fair; | 399 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 400 | cfs_rq->nr_running--; | ||
| 401 | se->on_rq = 0; | ||
| 595 | } | 402 | } |
| 596 | 403 | ||
| 597 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 404 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 598 | { | 405 | { |
| 599 | struct task_struct *tsk = task_of(se); | ||
| 600 | unsigned long delta_fair; | ||
| 601 | |||
| 602 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || | ||
| 603 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) | ||
| 604 | return; | ||
| 605 | |||
| 606 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
| 607 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); | ||
| 608 | |||
| 609 | se->delta_fair_sleep += delta_fair; | ||
| 610 | if (unlikely(abs(se->delta_fair_sleep) >= | ||
| 611 | sysctl_sched_stat_granularity)) { | ||
| 612 | __enqueue_sleeper(cfs_rq, se); | ||
| 613 | se->delta_fair_sleep = 0; | ||
| 614 | } | ||
| 615 | |||
| 616 | se->sleep_start_fair = 0; | ||
| 617 | |||
| 618 | #ifdef CONFIG_SCHEDSTATS | 406 | #ifdef CONFIG_SCHEDSTATS |
| 619 | if (se->sleep_start) { | 407 | if (se->sleep_start) { |
| 620 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 408 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
| @@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 646 | * time that the task spent sleeping: | 434 | * time that the task spent sleeping: |
| 647 | */ | 435 | */ |
| 648 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 436 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
| 437 | struct task_struct *tsk = task_of(se); | ||
| 438 | |||
| 649 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 439 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
| 650 | delta >> 20); | 440 | delta >> 20); |
| 651 | } | 441 | } |
| @@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 653 | #endif | 443 | #endif |
| 654 | } | 444 | } |
| 655 | 445 | ||
| 446 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 447 | { | ||
| 448 | #ifdef CONFIG_SCHED_DEBUG | ||
| 449 | s64 d = se->vruntime - cfs_rq->min_vruntime; | ||
| 450 | |||
| 451 | if (d < 0) | ||
| 452 | d = -d; | ||
| 453 | |||
| 454 | if (d > 3*sysctl_sched_latency) | ||
| 455 | schedstat_inc(cfs_rq, nr_spread_over); | ||
| 456 | #endif | ||
| 457 | } | ||
| 458 | |||
| 459 | static void | ||
| 460 | place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | ||
| 461 | { | ||
| 462 | u64 vruntime; | ||
| 463 | |||
| 464 | vruntime = cfs_rq->min_vruntime; | ||
| 465 | |||
| 466 | if (sched_feat(TREE_AVG)) { | ||
| 467 | struct sched_entity *last = __pick_last_entity(cfs_rq); | ||
| 468 | if (last) { | ||
| 469 | vruntime += last->vruntime; | ||
| 470 | vruntime >>= 1; | ||
| 471 | } | ||
| 472 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) | ||
| 473 | vruntime += sched_vslice(cfs_rq)/2; | ||
| 474 | |||
| 475 | if (initial && sched_feat(START_DEBIT)) | ||
| 476 | vruntime += sched_vslice_add(cfs_rq, se); | ||
| 477 | |||
| 478 | if (!initial) { | ||
| 479 | if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && | ||
| 480 | task_of(se)->policy != SCHED_BATCH) | ||
| 481 | vruntime -= sysctl_sched_latency; | ||
| 482 | |||
| 483 | vruntime = max_t(s64, vruntime, se->vruntime); | ||
| 484 | } | ||
| 485 | |||
| 486 | se->vruntime = vruntime; | ||
| 487 | |||
| 488 | } | ||
| 489 | |||
| 656 | static void | 490 | static void |
| 657 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | 491 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) |
| 658 | { | 492 | { |
| 659 | /* | 493 | /* |
| 660 | * Update the fair clock. | 494 | * Update run-time statistics of the 'current'. |
| 661 | */ | 495 | */ |
| 662 | update_curr(cfs_rq); | 496 | update_curr(cfs_rq); |
| 663 | 497 | ||
| 664 | if (wakeup) | 498 | if (wakeup) { |
| 499 | place_entity(cfs_rq, se, 0); | ||
| 665 | enqueue_sleeper(cfs_rq, se); | 500 | enqueue_sleeper(cfs_rq, se); |
| 501 | } | ||
| 666 | 502 | ||
| 667 | update_stats_enqueue(cfs_rq, se); | 503 | update_stats_enqueue(cfs_rq, se); |
| 668 | __enqueue_entity(cfs_rq, se); | 504 | check_spread(cfs_rq, se); |
| 505 | if (se != cfs_rq->curr) | ||
| 506 | __enqueue_entity(cfs_rq, se); | ||
| 507 | account_entity_enqueue(cfs_rq, se); | ||
| 669 | } | 508 | } |
| 670 | 509 | ||
| 671 | static void | 510 | static void |
| 672 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 511 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
| 673 | { | 512 | { |
| 513 | /* | ||
| 514 | * Update run-time statistics of the 'current'. | ||
| 515 | */ | ||
| 516 | update_curr(cfs_rq); | ||
| 517 | |||
| 674 | update_stats_dequeue(cfs_rq, se); | 518 | update_stats_dequeue(cfs_rq, se); |
| 675 | if (sleep) { | 519 | if (sleep) { |
| 676 | se->sleep_start_fair = cfs_rq->fair_clock; | 520 | se->peer_preempt = 0; |
| 677 | #ifdef CONFIG_SCHEDSTATS | 521 | #ifdef CONFIG_SCHEDSTATS |
| 678 | if (entity_is_task(se)) { | 522 | if (entity_is_task(se)) { |
| 679 | struct task_struct *tsk = task_of(se); | 523 | struct task_struct *tsk = task_of(se); |
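place_entity() is the replacement for the old sleeper-bonus machinery: new entities start around min_vruntime, pushed to the right by their own vruntime slice when START_DEBIT is set, while entities waking from sleep may be pulled up to one sched_latency to the left, but never behind the vruntime they already had. A simplified sketch of that decision, with the TREE_AVG/APPROX_AVG feature branches dropped for clarity; the feature names and the SCHED_BATCH exclusion mirror the hunk above:

/* Simplified placement logic mirroring place_entity() above. */
#include <stdint.h>

static uint64_t place(uint64_t min_vruntime, uint64_t se_vruntime,
                      uint64_t vslice, uint64_t sched_latency,
                      int initial, int fair_sleeper)
{
        uint64_t vruntime = min_vruntime;

        if (initial)                    /* START_DEBIT: pay the slice up front */
                vruntime += vslice;

        if (!initial) {
                if (fair_sleeper)       /* NEW_FAIR_SLEEPERS credit, at most one latency */
                        vruntime -= sched_latency;
                /* never place an entity behind the vruntime it already had */
                if ((int64_t)(vruntime - se_vruntime) < 0)
                        vruntime = se_vruntime;
        }
        return vruntime;
}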
| @@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
| 685 | } | 529 | } |
| 686 | #endif | 530 | #endif |
| 687 | } | 531 | } |
| 688 | __dequeue_entity(cfs_rq, se); | 532 | |
| 533 | if (se != cfs_rq->curr) | ||
| 534 | __dequeue_entity(cfs_rq, se); | ||
| 535 | account_entity_dequeue(cfs_rq, se); | ||
| 689 | } | 536 | } |
| 690 | 537 | ||
| 691 | /* | 538 | /* |
| 692 | * Preempt the current task with a newly woken task if needed: | 539 | * Preempt the current task with a newly woken task if needed: |
| 693 | */ | 540 | */ |
| 694 | static void | 541 | static void |
| 695 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | 542 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
| 696 | struct sched_entity *curr, unsigned long granularity) | ||
| 697 | { | 543 | { |
| 698 | s64 __delta = curr->fair_key - se->fair_key; | ||
| 699 | unsigned long ideal_runtime, delta_exec; | 544 | unsigned long ideal_runtime, delta_exec; |
| 700 | 545 | ||
| 701 | /* | 546 | ideal_runtime = sched_slice(cfs_rq, curr); |
| 702 | * ideal_runtime is compared against sum_exec_runtime, which is | ||
| 703 | * walltime, hence do not scale. | ||
| 704 | */ | ||
| 705 | ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, | ||
| 706 | (unsigned long)sysctl_sched_min_granularity); | ||
| 707 | |||
| 708 | /* | ||
| 709 | * If we executed more than what the latency constraint suggests, | ||
| 710 | * reduce the rescheduling granularity. This way the total latency | ||
| 711 | * of how much a task is not scheduled converges to | ||
| 712 | * sysctl_sched_latency: | ||
| 713 | */ | ||
| 714 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 547 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
| 715 | if (delta_exec > ideal_runtime) | 548 | if (delta_exec > ideal_runtime || |
| 716 | granularity = 0; | 549 | (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) |
| 717 | |||
| 718 | /* | ||
| 719 | * Take scheduling granularity into account - do not | ||
| 720 | * preempt the current task unless the best task has | ||
| 721 | * a larger than sched_granularity fairness advantage: | ||
| 722 | * | ||
| 723 | * scale granularity as key space is in fair_clock. | ||
| 724 | */ | ||
| 725 | if (__delta > niced_granularity(curr, granularity)) | ||
| 726 | resched_task(rq_of(cfs_rq)->curr); | 550 | resched_task(rq_of(cfs_rq)->curr); |
| 551 | curr->peer_preempt = 0; | ||
| 727 | } | 552 | } |
| 728 | 553 | ||
| 729 | static inline void | 554 | static void |
| 730 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 555 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 731 | { | 556 | { |
| 557 | /* 'current' is not kept within the tree. */ | ||
| 558 | if (se->on_rq) { | ||
| 559 | /* | ||
| 560 | * Any task has to be enqueued before it get to execute on | ||
| 561 | * a CPU. So account for the time it spent waiting on the | ||
| 562 | * runqueue. | ||
| 563 | */ | ||
| 564 | update_stats_wait_end(cfs_rq, se); | ||
| 565 | __dequeue_entity(cfs_rq, se); | ||
| 566 | } | ||
| 567 | |||
| 568 | update_stats_curr_start(cfs_rq, se); | ||
| 569 | cfs_rq->curr = se; | ||
| 570 | #ifdef CONFIG_SCHEDSTATS | ||
| 732 | /* | 571 | /* |
| 733 | * Any task has to be enqueued before it get to execute on | 572 | * Track our maximum slice length, if the CPU's load is at |
| 734 | * a CPU. So account for the time it spent waiting on the | 573 | * least twice that of our own weight (i.e. dont track it |
| 735 | * runqueue. (note, here we rely on pick_next_task() having | 574 | * when there are only lesser-weight tasks around): |
| 736 | * done a put_prev_task_fair() shortly before this, which | ||
| 737 | * updated rq->fair_clock - used by update_stats_wait_end()) | ||
| 738 | */ | 575 | */ |
| 739 | update_stats_wait_end(cfs_rq, se); | 576 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
| 740 | update_stats_curr_start(cfs_rq, se); | 577 | se->slice_max = max(se->slice_max, |
| 741 | set_cfs_rq_curr(cfs_rq, se); | 578 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
| 579 | } | ||
| 580 | #endif | ||
| 742 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 581 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 743 | } | 582 | } |
| 744 | 583 | ||
| 745 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 584 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
| 746 | { | 585 | { |
| 747 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 586 | struct sched_entity *se = NULL; |
| 748 | 587 | ||
| 749 | set_next_entity(cfs_rq, se); | 588 | if (first_fair(cfs_rq)) { |
| 589 | se = __pick_next_entity(cfs_rq); | ||
| 590 | set_next_entity(cfs_rq, se); | ||
| 591 | } | ||
| 750 | 592 | ||
| 751 | return se; | 593 | return se; |
| 752 | } | 594 | } |
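Two invariants are worth spelling out at this point: set_next_entity() removes the picked entity from the rb-tree and put_prev_entity() re-inserts it, so the tree only ever holds the entities that are waiting; and tick preemption now simply compares how long 'current' has run since it was picked against the wall-clock slice it is entitled to. A compact restatement of that tick check:

/* Restatement of check_preempt_tick(): reschedule once the running
 * entity has used up the wall-clock slice (sched_slice()) it is
 * entitled to in the current period. */
#include <stdint.h>

static int should_resched_on_tick(uint64_t sum_exec_runtime,
                                  uint64_t prev_sum_exec_runtime,
                                  uint64_t ideal_runtime)
{
        return sum_exec_runtime - prev_sum_exec_runtime > ideal_runtime;
}

With PREEMPT_RESTRICT enabled, a pending peer_preempt mark forces the reschedule as well, as the hunk above shows.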
| @@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
| 760 | if (prev->on_rq) | 602 | if (prev->on_rq) |
| 761 | update_curr(cfs_rq); | 603 | update_curr(cfs_rq); |
| 762 | 604 | ||
| 763 | update_stats_curr_end(cfs_rq, prev); | 605 | check_spread(cfs_rq, prev); |
| 764 | 606 | if (prev->on_rq) { | |
| 765 | if (prev->on_rq) | ||
| 766 | update_stats_wait_start(cfs_rq, prev); | 607 | update_stats_wait_start(cfs_rq, prev); |
| 767 | set_cfs_rq_curr(cfs_rq, NULL); | 608 | /* Put 'current' back into the tree. */ |
| 609 | __enqueue_entity(cfs_rq, prev); | ||
| 610 | } | ||
| 611 | cfs_rq->curr = NULL; | ||
| 768 | } | 612 | } |
| 769 | 613 | ||
| 770 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 614 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
| 771 | { | 615 | { |
| 772 | struct sched_entity *next; | ||
| 773 | |||
| 774 | /* | 616 | /* |
| 775 | * Dequeue and enqueue the task to update its | 617 | * Update run-time statistics of the 'current'. |
| 776 | * position within the tree: | ||
| 777 | */ | 618 | */ |
| 778 | dequeue_entity(cfs_rq, curr, 0); | 619 | update_curr(cfs_rq); |
| 779 | enqueue_entity(cfs_rq, curr, 0); | ||
| 780 | |||
| 781 | /* | ||
| 782 | * Reschedule if another task tops the current one. | ||
| 783 | */ | ||
| 784 | next = __pick_next_entity(cfs_rq); | ||
| 785 | if (next == curr) | ||
| 786 | return; | ||
| 787 | 620 | ||
| 788 | __check_preempt_curr_fair(cfs_rq, next, curr, | 621 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
| 789 | sched_granularity(cfs_rq)); | 622 | check_preempt_tick(cfs_rq, curr); |
| 790 | } | 623 | } |
| 791 | 624 | ||
| 792 | /************************************************** | 625 | /************************************************** |
| @@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 821 | */ | 654 | */ |
| 822 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | 655 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) |
| 823 | { | 656 | { |
| 824 | /* A later patch will take group into account */ | 657 | return cfs_rq->tg->cfs_rq[this_cpu]; |
| 825 | return &cpu_rq(this_cpu)->cfs; | ||
| 826 | } | 658 | } |
| 827 | 659 | ||
| 828 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 660 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
| 829 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 661 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
| 830 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 662 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
| 831 | 663 | ||
| 832 | /* Do the two (enqueued) tasks belong to the same group ? */ | 664 | /* Do the two (enqueued) entities belong to the same group ? */ |
| 833 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 665 | static inline int |
| 666 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 834 | { | 667 | { |
| 835 | if (curr->se.cfs_rq == p->se.cfs_rq) | 668 | if (se->cfs_rq == pse->cfs_rq) |
| 836 | return 1; | 669 | return 1; |
| 837 | 670 | ||
| 838 | return 0; | 671 | return 0; |
| 839 | } | 672 | } |
| 840 | 673 | ||
| 674 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 675 | { | ||
| 676 | return se->parent; | ||
| 677 | } | ||
| 678 | |||
| 841 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 679 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| 842 | 680 | ||
| 843 | #define for_each_sched_entity(se) \ | 681 | #define for_each_sched_entity(se) \ |
| @@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
| 870 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 708 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
| 871 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 709 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
| 872 | 710 | ||
| 873 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 711 | static inline int |
| 712 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 874 | { | 713 | { |
| 875 | return 1; | 714 | return 1; |
| 876 | } | 715 | } |
| 877 | 716 | ||
| 717 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 718 | { | ||
| 719 | return NULL; | ||
| 720 | } | ||
| 721 | |||
| 878 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 722 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 879 | 723 | ||
| 880 | /* | 724 | /* |
| @@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 892 | break; | 736 | break; |
| 893 | cfs_rq = cfs_rq_of(se); | 737 | cfs_rq = cfs_rq_of(se); |
| 894 | enqueue_entity(cfs_rq, se, wakeup); | 738 | enqueue_entity(cfs_rq, se, wakeup); |
| 739 | wakeup = 1; | ||
| 895 | } | 740 | } |
| 896 | } | 741 | } |
| 897 | 742 | ||
| @@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
| 911 | /* Don't dequeue parent if it has other entities besides us */ | 756 | /* Don't dequeue parent if it has other entities besides us */ |
| 912 | if (cfs_rq->load.weight) | 757 | if (cfs_rq->load.weight) |
| 913 | break; | 758 | break; |
| 759 | sleep = 1; | ||
| 914 | } | 760 | } |
| 915 | } | 761 | } |
| 916 | 762 | ||
| @@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
| 919 | * | 765 | * |
| 920 | * If compat_yield is turned on then we requeue to the end of the tree. | 766 | * If compat_yield is turned on then we requeue to the end of the tree. |
| 921 | */ | 767 | */ |
| 922 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | 768 | static void yield_task_fair(struct rq *rq) |
| 923 | { | 769 | { |
| 924 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 770 | struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); |
| 925 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 771 | struct sched_entity *rightmost, *se = &rq->curr->se; |
| 926 | struct sched_entity *rightmost, *se = &p->se; | ||
| 927 | struct rb_node *parent; | ||
| 928 | 772 | ||
| 929 | /* | 773 | /* |
| 930 | * Are we the only task in the tree? | 774 | * Are we the only task in the tree? |
| @@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) | |||
| 935 | if (likely(!sysctl_sched_compat_yield)) { | 779 | if (likely(!sysctl_sched_compat_yield)) { |
| 936 | __update_rq_clock(rq); | 780 | __update_rq_clock(rq); |
| 937 | /* | 781 | /* |
| 938 | * Dequeue and enqueue the task to update its | 782 | * Update run-time statistics of the 'current'. |
| 939 | * position within the tree: | ||
| 940 | */ | 783 | */ |
| 941 | dequeue_entity(cfs_rq, &p->se, 0); | 784 | update_curr(cfs_rq); |
| 942 | enqueue_entity(cfs_rq, &p->se, 0); | ||
| 943 | 785 | ||
| 944 | return; | 786 | return; |
| 945 | } | 787 | } |
| 946 | /* | 788 | /* |
| 947 | * Find the rightmost entry in the rbtree: | 789 | * Find the rightmost entry in the rbtree: |
| 948 | */ | 790 | */ |
| 949 | do { | 791 | rightmost = __pick_last_entity(cfs_rq); |
| 950 | parent = *link; | ||
| 951 | link = &parent->rb_right; | ||
| 952 | } while (*link); | ||
| 953 | |||
| 954 | rightmost = rb_entry(parent, struct sched_entity, run_node); | ||
| 955 | /* | 792 | /* |
| 956 | * Already in the rightmost position? | 793 | * Already in the rightmost position? |
| 957 | */ | 794 | */ |
| 958 | if (unlikely(rightmost == se)) | 795 | if (unlikely(rightmost->vruntime < se->vruntime)) |
| 959 | return; | 796 | return; |
| 960 | 797 | ||
| 961 | /* | 798 | /* |
| 962 | * Minimally necessary key value to be last in the tree: | 799 | * Minimally necessary key value to be last in the tree: |
| 800 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
| 801 | * 'current' within the tree based on its new key value. | ||
| 963 | */ | 802 | */ |
| 964 | se->fair_key = rightmost->fair_key + 1; | 803 | se->vruntime = rightmost->vruntime + 1; |
| 965 | |||
| 966 | if (cfs_rq->rb_leftmost == &se->run_node) | ||
| 967 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | ||
| 968 | /* | ||
| 969 | * Relink the task to the rightmost position: | ||
| 970 | */ | ||
| 971 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
| 972 | rb_link_node(&se->run_node, parent, link); | ||
| 973 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | ||
| 974 | } | 804 | } |
| 975 | 805 | ||
| 976 | /* | 806 | /* |
| 977 | * Preempt the current task with a newly woken task if needed: | 807 | * Preempt the current task with a newly woken task if needed: |
| 978 | */ | 808 | */ |
| 979 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | 809 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
| 980 | { | 810 | { |
| 981 | struct task_struct *curr = rq->curr; | 811 | struct task_struct *curr = rq->curr; |
| 982 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 812 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 983 | unsigned long gran; | 813 | struct sched_entity *se = &curr->se, *pse = &p->se; |
| 814 | s64 delta, gran; | ||
| 984 | 815 | ||
| 985 | if (unlikely(rt_prio(p->prio))) { | 816 | if (unlikely(rt_prio(p->prio))) { |
| 986 | update_rq_clock(rq); | 817 | update_rq_clock(rq); |
| @@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | |||
| 988 | resched_task(curr); | 819 | resched_task(curr); |
| 989 | return; | 820 | return; |
| 990 | } | 821 | } |
| 991 | |||
| 992 | gran = sysctl_sched_wakeup_granularity; | ||
| 993 | /* | 822 | /* |
| 994 | * Batch tasks prefer throughput over latency: | 823 | * Batch tasks do not preempt (their preemption is driven by |
| 824 | * the tick): | ||
| 995 | */ | 825 | */ |
| 996 | if (unlikely(p->policy == SCHED_BATCH)) | 826 | if (unlikely(p->policy == SCHED_BATCH)) |
| 997 | gran = sysctl_sched_batch_wakeup_granularity; | 827 | return; |
| 828 | |||
| 829 | if (sched_feat(WAKEUP_PREEMPT)) { | ||
| 830 | while (!is_same_group(se, pse)) { | ||
| 831 | se = parent_entity(se); | ||
| 832 | pse = parent_entity(pse); | ||
| 833 | } | ||
| 998 | 834 | ||
| 999 | if (is_same_group(curr, p)) | 835 | delta = se->vruntime - pse->vruntime; |
| 1000 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | 836 | gran = sysctl_sched_wakeup_granularity; |
| 837 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
| 838 | gran = calc_delta_fair(gran, &se->load); | ||
| 839 | |||
| 840 | if (delta > gran) { | ||
| 841 | int now = !sched_feat(PREEMPT_RESTRICT); | ||
| 842 | |||
| 843 | if (now || p->prio < curr->prio || !se->peer_preempt++) | ||
| 844 | resched_task(curr); | ||
| 845 | } | ||
| 846 | } | ||
| 1001 | } | 847 | } |
| 1002 | 848 | ||
| 1003 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 849 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
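check_preempt_wakeup() first walks both entities up to a common cfs_rq (relevant for group scheduling), then preempts only if the woken entity's vruntime is ahead of 'current' by more than the wakeup granularity, scaled by the current entity's weight so that heavier tasks are harder to kick off the CPU; SCHED_BATCH wakeups never preempt at all, and PREEMPT_RESTRICT limits how often a peer may do so. A sketch of the core comparison, with plain division standing in for calc_delta_fair() and NICE_0_LOAD assumed to be 1024:

/* Core wakeup-preemption test: preempt when the woken entity leads by
 * more than a weight-scaled granularity. */
#include <stdint.h>

static int should_preempt_on_wakeup(uint64_t curr_vruntime,
                                    uint64_t woken_vruntime,
                                    unsigned long curr_weight,
                                    uint64_t wakeup_gran)
{
        int64_t delta = (int64_t)(curr_vruntime - woken_vruntime);
        uint64_t gran = wakeup_gran;

        if (curr_weight != 1024UL)
                gran = gran * 1024UL / curr_weight;

        return delta > (int64_t)gran;
}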
| @@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
| 1041 | * achieve that by always pre-iterating before returning | 887 | * achieve that by always pre-iterating before returning |
| 1042 | * the current task: | 888 | * the current task: |
| 1043 | */ | 889 | */ |
| 1044 | static inline struct task_struct * | 890 | static struct task_struct * |
| 1045 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 891 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
| 1046 | { | 892 | { |
| 1047 | struct task_struct *p; | 893 | struct task_struct *p; |
| @@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | |||
| 1078 | if (!cfs_rq->nr_running) | 924 | if (!cfs_rq->nr_running) |
| 1079 | return MAX_PRIO; | 925 | return MAX_PRIO; |
| 1080 | 926 | ||
| 1081 | curr = __pick_next_entity(cfs_rq); | 927 | curr = cfs_rq->curr; |
| 928 | if (!curr) | ||
| 929 | curr = __pick_next_entity(cfs_rq); | ||
| 930 | |||
| 1082 | p = task_of(curr); | 931 | p = task_of(curr); |
| 1083 | 932 | ||
| 1084 | return p->prio; | 933 | return p->prio; |
| @@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
| 1153 | } | 1002 | } |
| 1154 | } | 1003 | } |
| 1155 | 1004 | ||
| 1005 | #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) | ||
| 1006 | |||
| 1156 | /* | 1007 | /* |
| 1157 | * Share the fairness runtime between parent and child, thus the | 1008 | * Share the fairness runtime between parent and child, thus the |
| 1158 | * total amount of pressure for CPU stays equal - new tasks | 1009 | * total amount of pressure for CPU stays equal - new tasks |
| @@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
| 1163 | static void task_new_fair(struct rq *rq, struct task_struct *p) | 1014 | static void task_new_fair(struct rq *rq, struct task_struct *p) |
| 1164 | { | 1015 | { |
| 1165 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1016 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
| 1166 | struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); | 1017 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; |
| 1018 | int this_cpu = smp_processor_id(); | ||
| 1167 | 1019 | ||
| 1168 | sched_info_queued(p); | 1020 | sched_info_queued(p); |
| 1169 | 1021 | ||
| 1170 | update_curr(cfs_rq); | 1022 | update_curr(cfs_rq); |
| 1171 | update_stats_enqueue(cfs_rq, se); | 1023 | place_entity(cfs_rq, se, 1); |
| 1172 | /* | ||
| 1173 | * Child runs first: we let it run before the parent | ||
| 1174 | * until it reschedules once. We set up the key so that | ||
| 1175 | * it will preempt the parent: | ||
| 1176 | */ | ||
| 1177 | se->fair_key = curr->fair_key - | ||
| 1178 | niced_granularity(curr, sched_granularity(cfs_rq)) - 1; | ||
| 1179 | /* | ||
| 1180 | * The first wait is dominated by the child-runs-first logic, | ||
| 1181 | * so do not credit it with that waiting time yet: | ||
| 1182 | */ | ||
| 1183 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | ||
| 1184 | se->wait_start_fair = 0; | ||
| 1185 | 1024 | ||
| 1186 | /* | 1025 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && |
| 1187 | * The statistical average of wait_runtime is about | 1026 | curr->vruntime < se->vruntime) { |
| 1188 | * -granularity/2, so initialize the task with that: | 1027 | /* |
| 1189 | */ | 1028 | * Upon rescheduling, sched_class::put_prev_task() will place |
| 1190 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1029 | * 'current' within the tree based on its new key value. |
| 1191 | se->wait_runtime = -(sched_granularity(cfs_rq) / 2); | 1030 | */ |
| 1031 | swap(curr->vruntime, se->vruntime); | ||
| 1032 | } | ||
| 1192 | 1033 | ||
| 1034 | update_stats_enqueue(cfs_rq, se); | ||
| 1035 | check_spread(cfs_rq, se); | ||
| 1036 | check_spread(cfs_rq, curr); | ||
| 1193 | __enqueue_entity(cfs_rq, se); | 1037 | __enqueue_entity(cfs_rq, se); |
| 1038 | account_entity_enqueue(cfs_rq, se); | ||
| 1039 | se->peer_preempt = 0; | ||
| 1040 | resched_task(rq->curr); | ||
| 1194 | } | 1041 | } |
| 1195 | 1042 | ||
| 1196 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1197 | /* Account for a task changing its policy or group. | 1043 | /* Account for a task changing its policy or group. |
| 1198 | * | 1044 | * |
| 1199 | * This routine is mostly called to set cfs_rq->curr field when a task | 1045 | * This routine is mostly called to set cfs_rq->curr field when a task |
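task_new_fair() now expresses 'child runs first' purely in vruntime terms: the child is placed with the usual start debit, and if sysctl_sched_child_runs_first is set, the child was forked on this CPU, and the parent would otherwise keep the smaller key, the two vruntimes are simply swapped before the child is enqueued; resched_task() then lets the normal pick logic do the rest. A stand-alone illustration with hypothetical numbers:

/* Stand-alone illustration of the child-runs-first swap, with
 * hypothetical vruntime values (not taken from a real trace). */
#include <stdio.h>
#include <stdint.h>

#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)

int main(void)
{
        uint64_t parent = 1000000;      /* parent's current vruntime */
        uint64_t child  = 1005000;      /* child after place_entity() start debit */

        if (parent < child)             /* parent would stay leftmost... */
                swap(parent, child);    /* ...so swap: the child runs first */

        printf("parent=%llu child=%llu\n",
               (unsigned long long)parent, (unsigned long long)child);
        return 0;
}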
| @@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq) | |||
| 1206 | for_each_sched_entity(se) | 1052 | for_each_sched_entity(se) |
| 1207 | set_next_entity(cfs_rq_of(se), se); | 1053 | set_next_entity(cfs_rq_of(se), se); |
| 1208 | } | 1054 | } |
| 1209 | #else | ||
| 1210 | static void set_curr_task_fair(struct rq *rq) | ||
| 1211 | { | ||
| 1212 | } | ||
| 1213 | #endif | ||
| 1214 | 1055 | ||
| 1215 | /* | 1056 | /* |
| 1216 | * All the scheduling class methods: | 1057 | * All the scheduling class methods: |
| 1217 | */ | 1058 | */ |
| 1218 | struct sched_class fair_sched_class __read_mostly = { | 1059 | static const struct sched_class fair_sched_class = { |
| 1060 | .next = &idle_sched_class, | ||
| 1219 | .enqueue_task = enqueue_task_fair, | 1061 | .enqueue_task = enqueue_task_fair, |
| 1220 | .dequeue_task = dequeue_task_fair, | 1062 | .dequeue_task = dequeue_task_fair, |
| 1221 | .yield_task = yield_task_fair, | 1063 | .yield_task = yield_task_fair, |
| 1222 | 1064 | ||
| 1223 | .check_preempt_curr = check_preempt_curr_fair, | 1065 | .check_preempt_curr = check_preempt_wakeup, |
| 1224 | 1066 | ||
| 1225 | .pick_next_task = pick_next_task_fair, | 1067 | .pick_next_task = pick_next_task_fair, |
| 1226 | .put_prev_task = put_prev_task_fair, | 1068 | .put_prev_task = put_prev_task_fair, |
| @@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
| 1237 | { | 1079 | { |
| 1238 | struct cfs_rq *cfs_rq; | 1080 | struct cfs_rq *cfs_rq; |
| 1239 | 1081 | ||
| 1082 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1083 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
| 1084 | #endif | ||
| 1240 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1085 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
| 1241 | print_cfs_rq(m, cpu, cfs_rq); | 1086 | print_cfs_rq(m, cpu, cfs_rq); |
| 1242 | } | 1087 | } |
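
Note on the task_new_fair() hunk above: the old fair_key/wait_runtime setup is gone. The child is placed with place_entity(), and when sysctl_sched_child_runs_first is set and the child starts on the parent's CPU, parent and child simply swap vruntime so the child ends up leftmost in the rbtree before resched_task() asks the parent to reschedule. A stand-alone user-space model of that placement decision (hypothetical names; group scheduling and statistics omitted):

    /* Hypothetical model of the child-runs-first placement above.
     * Not kernel code: entities are reduced to a single vruntime value. */
    #include <stdio.h>

    typedef unsigned long long u64;

    struct entity { const char *name; u64 vruntime; };

    #define swap_u64(a, b) do { u64 tmp = (a); (a) = (b); (b) = tmp; } while (0)

    /* Mirrors the condition in task_new_fair(): only swap when the knob is
     * set, the child starts on the parent's CPU, and the parent would
     * otherwise still have the smaller (more favoured) vruntime. */
    static void place_child(struct entity *parent, struct entity *child,
                            int child_runs_first, int same_cpu)
    {
            if (child_runs_first && same_cpu &&
                parent->vruntime < child->vruntime)
                    swap_u64(parent->vruntime, child->vruntime);
    }

    int main(void)
    {
            struct entity parent = { "parent", 1000 };
            struct entity child  = { "child",  1500 }; /* after place_entity() */

            place_child(&parent, &child, 1, 1);
            printf("%s=%llu %s=%llu -> child picked first: %s\n",
                   parent.name, parent.vruntime, child.name, child.vruntime,
                   child.vruntime < parent.vruntime ? "yes" : "no");
            return 0;
    }

The swap leaves the set of vruntimes unchanged, so overall fairness is preserved while the child still gets to run first.
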
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2d9f96..6e2ead41516e 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
| @@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) | |||
| 50 | { | 50 | { |
| 51 | } | 51 | } |
| 52 | 52 | ||
| 53 | static void set_curr_task_idle(struct rq *rq) | ||
| 54 | { | ||
| 55 | } | ||
| 56 | |||
| 53 | /* | 57 | /* |
| 54 | * Simple, special scheduling class for the per-CPU idle tasks: | 58 | * Simple, special scheduling class for the per-CPU idle tasks: |
| 55 | */ | 59 | */ |
| 56 | static struct sched_class idle_sched_class __read_mostly = { | 60 | const struct sched_class idle_sched_class = { |
| 61 | /* .next is NULL */ | ||
| 57 | /* no enqueue/yield_task for idle tasks */ | 62 | /* no enqueue/yield_task for idle tasks */ |
| 58 | 63 | ||
| 59 | /* dequeue is not valid, we print a debug message there: */ | 64 | /* dequeue is not valid, we print a debug message there: */ |
| @@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = { | |||
| 66 | 71 | ||
| 67 | .load_balance = load_balance_idle, | 72 | .load_balance = load_balance_idle, |
| 68 | 73 | ||
| 74 | .set_curr_task = set_curr_task_idle, | ||
| 69 | .task_tick = task_tick_idle, | 75 | .task_tick = task_tick_idle, |
| 70 | /* no .task_new for idle tasks */ | 76 | /* no .task_new for idle tasks */ |
| 71 | }; | 77 | }; |
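
Note: set_curr_task() is now a method every scheduling class provides, even where it is an empty stub as in the idle class above, so the core can invoke it through the class pointer without a NULL check. A stand-alone sketch of that convention (hypothetical names, reduced to plain function pointers):

    /* Illustration only: a mandatory hook means no NULL test at the caller. */
    #include <stdio.h>

    struct klass {
            const char *name;
            void (*set_curr_task)(void);
    };

    static void set_curr_task_fair(void) { printf("fair: refresh exec_start\n"); }
    static void set_curr_task_rt(void)   { printf("rt: refresh exec_start\n"); }
    static void set_curr_task_idle(void) { /* intentionally a no-op */ }

    static const struct klass classes[] = {
            { "rt",   set_curr_task_rt },
            { "fair", set_curr_task_fair },
            { "idle", set_curr_task_idle },
    };

    int main(void)
    {
            /* No "if (hook)" check needed anywhere. */
            for (unsigned int i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
                    classes[i].set_curr_task();
            return 0;
    }
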
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476a02d0..d0097a0634e5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | * Update the current task's runtime statistics. Skip current tasks that | 7 | * Update the current task's runtime statistics. Skip current tasks that |
| 8 | * are not in our scheduling class. | 8 | * are not in our scheduling class. |
| 9 | */ | 9 | */ |
| 10 | static inline void update_curr_rt(struct rq *rq) | 10 | static void update_curr_rt(struct rq *rq) |
| 11 | { | 11 | { |
| 12 | struct task_struct *curr = rq->curr; | 12 | struct task_struct *curr = rq->curr; |
| 13 | u64 delta_exec; | 13 | u64 delta_exec; |
| @@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p) | |||
| 59 | } | 59 | } |
| 60 | 60 | ||
| 61 | static void | 61 | static void |
| 62 | yield_task_rt(struct rq *rq, struct task_struct *p) | 62 | yield_task_rt(struct rq *rq) |
| 63 | { | 63 | { |
| 64 | requeue_task_rt(rq, p); | 64 | requeue_task_rt(rq, rq->curr); |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | /* | 67 | /* |
| @@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
| 206 | if (--p->time_slice) | 206 | if (--p->time_slice) |
| 207 | return; | 207 | return; |
| 208 | 208 | ||
| 209 | p->time_slice = static_prio_timeslice(p->static_prio); | 209 | p->time_slice = DEF_TIMESLICE; |
| 210 | 210 | ||
| 211 | /* | 211 | /* |
| 212 | * Requeue to the end of queue if we are not the only element | 212 | * Requeue to the end of queue if we are not the only element |
| @@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
| 218 | } | 218 | } |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | static struct sched_class rt_sched_class __read_mostly = { | 221 | static void set_curr_task_rt(struct rq *rq) |
| 222 | { | ||
| 223 | struct task_struct *p = rq->curr; | ||
| 224 | |||
| 225 | p->se.exec_start = rq->clock; | ||
| 226 | } | ||
| 227 | |||
| 228 | const struct sched_class rt_sched_class = { | ||
| 229 | .next = &fair_sched_class, | ||
| 222 | .enqueue_task = enqueue_task_rt, | 230 | .enqueue_task = enqueue_task_rt, |
| 223 | .dequeue_task = dequeue_task_rt, | 231 | .dequeue_task = dequeue_task_rt, |
| 224 | .yield_task = yield_task_rt, | 232 | .yield_task = yield_task_rt, |
| @@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = { | |||
| 230 | 238 | ||
| 231 | .load_balance = load_balance_rt, | 239 | .load_balance = load_balance_rt, |
| 232 | 240 | ||
| 241 | .set_curr_task = set_curr_task_rt, | ||
| 233 | .task_tick = task_tick_rt, | 242 | .task_tick = task_tick_rt, |
| 234 | }; | 243 | }; |
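
Note: with the new .next pointers the classes form a fixed priority chain, rt_sched_class -> fair_sched_class -> idle_sched_class, which the core can walk instead of naming each class explicitly (the real pick_next_task() also short-circuits the common all-CFS case). A stand-alone model of that walk, with hypothetical names:

    /* Illustration of picking the next task by following class->next. */
    #include <stdio.h>
    #include <stddef.h>

    struct fake_class {
            const char *name;
            const struct fake_class *next;
            int (*pick_next)(void);     /* returns a task id, or -1 if none */
    };

    static int pick_rt(void)   { return -1; } /* no runnable RT task */
    static int pick_fair(void) { return 42; } /* a CFS task is runnable */
    static int pick_idle(void) { return 0;  } /* idle task, always present */

    static const struct fake_class idle_class = { "idle", NULL,        pick_idle };
    static const struct fake_class fair_class = { "fair", &idle_class, pick_fair };
    static const struct fake_class rt_class   = { "rt",   &fair_class, pick_rt   };

    int main(void)
    {
            const struct fake_class *class;
            int task;

            for (class = &rt_class; class; class = class->next) {
                    task = class->pick_next();
                    if (task >= 0) {
                            printf("picked task %d from %s class\n",
                                   task, class->name);
                            break;
                    }
            }
            return 0;
    }
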
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94dda61e..1c084842c3e7 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
| @@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 16 | struct rq *rq = cpu_rq(cpu); | 16 | struct rq *rq = cpu_rq(cpu); |
| 17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
| 18 | struct sched_domain *sd; | 18 | struct sched_domain *sd; |
| 19 | int dcnt = 0; | 19 | int dcount = 0; |
| 20 | #endif | 20 | #endif |
| 21 | 21 | ||
| 22 | /* runqueue-specific stats */ | 22 | /* runqueue-specific stats */ |
| 23 | seq_printf(seq, | 23 | seq_printf(seq, |
| 24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", | 24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", |
| 25 | cpu, rq->yld_both_empty, | 25 | cpu, rq->yld_both_empty, |
| 26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | 26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, |
| 27 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | 27 | rq->sched_switch, rq->sched_count, rq->sched_goidle, |
| 28 | rq->ttwu_cnt, rq->ttwu_local, | 28 | rq->ttwu_count, rq->ttwu_local, |
| 29 | rq->rq_sched_info.cpu_time, | 29 | rq->rq_sched_info.cpu_time, |
| 30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | 30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); |
| 31 | 31 | ||
| 32 | seq_printf(seq, "\n"); | 32 | seq_printf(seq, "\n"); |
| 33 | 33 | ||
| @@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 39 | char mask_str[NR_CPUS]; | 39 | char mask_str[NR_CPUS]; |
| 40 | 40 | ||
| 41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); |
| 42 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
| 43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
| 44 | itype++) { | 44 | itype++) { |
| 45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " | 45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
| 46 | "%lu", | 46 | "%lu", |
| 47 | sd->lb_cnt[itype], | 47 | sd->lb_count[itype], |
| 48 | sd->lb_balanced[itype], | 48 | sd->lb_balanced[itype], |
| 49 | sd->lb_failed[itype], | 49 | sd->lb_failed[itype], |
| 50 | sd->lb_imbalance[itype], | 50 | sd->lb_imbalance[itype], |
| @@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 55 | } | 55 | } |
| 56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" | 56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
| 57 | " %lu %lu %lu\n", | 57 | " %lu %lu %lu\n", |
| 58 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 58 | sd->alb_count, sd->alb_failed, sd->alb_pushed, |
| 59 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 59 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, |
| 60 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 60 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, |
| 61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
| 62 | sd->ttwu_move_balance); | 62 | sd->ttwu_move_balance); |
| 63 | } | 63 | } |
| @@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | |||
| 101 | { | 101 | { |
| 102 | if (rq) { | 102 | if (rq) { |
| 103 | rq->rq_sched_info.run_delay += delta; | 103 | rq->rq_sched_info.run_delay += delta; |
| 104 | rq->rq_sched_info.pcnt++; | 104 | rq->rq_sched_info.pcount++; |
| 105 | } | 105 | } |
| 106 | } | 106 | } |
| 107 | 107 | ||
| @@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
| 129 | # define schedstat_set(var, val) do { } while (0) | 129 | # define schedstat_set(var, val) do { } while (0) |
| 130 | #endif | 130 | #endif |
| 131 | 131 | ||
| 132 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 132 | #ifdef CONFIG_SCHEDSTATS |
| 133 | /* | 133 | /* |
| 134 | * Called when a process is dequeued from the active array and given | 134 | * Called when a process is dequeued from the active array and given |
| 135 | * the cpu. We should note that with the exception of interactive | 135 | * the cpu. We should note that with the exception of interactive |
| @@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
| 164 | sched_info_dequeued(t); | 164 | sched_info_dequeued(t); |
| 165 | t->sched_info.run_delay += delta; | 165 | t->sched_info.run_delay += delta; |
| 166 | t->sched_info.last_arrival = now; | 166 | t->sched_info.last_arrival = now; |
| 167 | t->sched_info.pcnt++; | 167 | t->sched_info.pcount++; |
| 168 | 168 | ||
| 169 | rq_sched_info_arrive(task_rq(t), delta); | 169 | rq_sched_info_arrive(task_rq(t), delta); |
| 170 | } | 170 | } |
| @@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
| 233 | #else | 233 | #else |
| 234 | #define sched_info_queued(t) do { } while (0) | 234 | #define sched_info_queued(t) do { } while (0) |
| 235 | #define sched_info_switch(t, next) do { } while (0) | 235 | #define sched_info_switch(t, next) do { } while (0) |
| 236 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 236 | #endif /* CONFIG_SCHEDSTATS */ |
| 237 | 237 | ||
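
Note: the schedstats change is purely a rename (_cnt -> _count, pcnt -> pcount); the /proc/schedstat output format above is untouched. Counters are still bumped through the schedstat_inc()-style helpers, which compile away when CONFIG_SCHEDSTATS is off. A reduced, compilable sketch of that pattern with the renamed fields (struct rq stubbed out; the real macros live in sched_stats.h):

    #include <stdio.h>

    #define CONFIG_SCHEDSTATS 1

    struct rq { unsigned long yld_count, sched_count, ttwu_count; };

    #ifdef CONFIG_SCHEDSTATS
    # define schedstat_inc(rq, field)   do { (rq)->field++; } while (0)
    #else
    # define schedstat_inc(rq, field)   do { } while (0)
    #endif

    int main(void)
    {
            struct rq rq = { 0, 0, 0 };

            schedstat_inc(&rq, yld_count);  /* as in sys_sched_yield() */
            schedstat_inc(&rq, ttwu_count); /* as in try_to_wake_up() */
            printf("yld_count=%lu ttwu_count=%lu\n",
                   rq.yld_count, rq.ttwu_count);
            return 0;
    }
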
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6c97259e863e..ec14aa8ac51f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -222,14 +222,11 @@ static ctl_table kern_table[] = { | |||
| 222 | #ifdef CONFIG_SCHED_DEBUG | 222 | #ifdef CONFIG_SCHED_DEBUG |
| 223 | { | 223 | { |
| 224 | .ctl_name = CTL_UNNUMBERED, | 224 | .ctl_name = CTL_UNNUMBERED, |
| 225 | .procname = "sched_min_granularity_ns", | 225 | .procname = "sched_nr_latency", |
| 226 | .data = &sysctl_sched_min_granularity, | 226 | .data = &sysctl_sched_nr_latency, |
| 227 | .maxlen = sizeof(unsigned int), | 227 | .maxlen = sizeof(unsigned int), |
| 228 | .mode = 0644, | 228 | .mode = 0644, |
| 229 | .proc_handler = &proc_dointvec_minmax, | 229 | .proc_handler = &proc_dointvec, |
| 230 | .strategy = &sysctl_intvec, | ||
| 231 | .extra1 = &min_sched_granularity_ns, | ||
| 232 | .extra2 = &max_sched_granularity_ns, | ||
| 233 | }, | 230 | }, |
| 234 | { | 231 | { |
| 235 | .ctl_name = CTL_UNNUMBERED, | 232 | .ctl_name = CTL_UNNUMBERED, |
| @@ -266,38 +263,24 @@ static ctl_table kern_table[] = { | |||
| 266 | }, | 263 | }, |
| 267 | { | 264 | { |
| 268 | .ctl_name = CTL_UNNUMBERED, | 265 | .ctl_name = CTL_UNNUMBERED, |
| 269 | .procname = "sched_stat_granularity_ns", | 266 | .procname = "sched_child_runs_first", |
| 270 | .data = &sysctl_sched_stat_granularity, | 267 | .data = &sysctl_sched_child_runs_first, |
| 271 | .maxlen = sizeof(unsigned int), | ||
| 272 | .mode = 0644, | ||
| 273 | .proc_handler = &proc_dointvec_minmax, | ||
| 274 | .strategy = &sysctl_intvec, | ||
| 275 | .extra1 = &min_wakeup_granularity_ns, | ||
| 276 | .extra2 = &max_wakeup_granularity_ns, | ||
| 277 | }, | ||
| 278 | { | ||
| 279 | .ctl_name = CTL_UNNUMBERED, | ||
| 280 | .procname = "sched_runtime_limit_ns", | ||
| 281 | .data = &sysctl_sched_runtime_limit, | ||
| 282 | .maxlen = sizeof(unsigned int), | 268 | .maxlen = sizeof(unsigned int), |
| 283 | .mode = 0644, | 269 | .mode = 0644, |
| 284 | .proc_handler = &proc_dointvec_minmax, | 270 | .proc_handler = &proc_dointvec, |
| 285 | .strategy = &sysctl_intvec, | ||
| 286 | .extra1 = &min_sched_granularity_ns, | ||
| 287 | .extra2 = &max_sched_granularity_ns, | ||
| 288 | }, | 271 | }, |
| 289 | { | 272 | { |
| 290 | .ctl_name = CTL_UNNUMBERED, | 273 | .ctl_name = CTL_UNNUMBERED, |
| 291 | .procname = "sched_child_runs_first", | 274 | .procname = "sched_features", |
| 292 | .data = &sysctl_sched_child_runs_first, | 275 | .data = &sysctl_sched_features, |
| 293 | .maxlen = sizeof(unsigned int), | 276 | .maxlen = sizeof(unsigned int), |
| 294 | .mode = 0644, | 277 | .mode = 0644, |
| 295 | .proc_handler = &proc_dointvec, | 278 | .proc_handler = &proc_dointvec, |
| 296 | }, | 279 | }, |
| 297 | { | 280 | { |
| 298 | .ctl_name = CTL_UNNUMBERED, | 281 | .ctl_name = CTL_UNNUMBERED, |
| 299 | .procname = "sched_features", | 282 | .procname = "sched_migration_cost", |
| 300 | .data = &sysctl_sched_features, | 283 | .data = &sysctl_sched_migration_cost, |
| 301 | .maxlen = sizeof(unsigned int), | 284 | .maxlen = sizeof(unsigned int), |
| 302 | .mode = 0644, | 285 | .mode = 0644, |
| 303 | .proc_handler = &proc_dointvec, | 286 | .proc_handler = &proc_dointvec, |
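
Note: the sysctl cleanup above drops the min/max-clamped granularity knobs; the remaining CFS tunables (sched_nr_latency, sched_child_runs_first, sched_features, sched_migration_cost) are plain unsigned ints handled by proc_dointvec, so no range checking is applied. A small usage sketch that reads one of them from procfs (the path assumes the entry is compiled into the running kernel):

    #include <stdio.h>

    int main(void)
    {
            unsigned int val;
            FILE *f = fopen("/proc/sys/kernel/sched_child_runs_first", "r");

            if (!f) {
                    perror("sched_child_runs_first");
                    return 1;
            }
            if (fscanf(f, "%u", &val) == 1)
                    printf("sched_child_runs_first = %u\n", val);
            fclose(f);
            return 0;
    }
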
diff --git a/kernel/user.c b/kernel/user.c index 9ca2848fc356..f0e561e6d085 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -50,12 +50,16 @@ struct user_struct root_user = { | |||
| 50 | .uid_keyring = &root_user_keyring, | 50 | .uid_keyring = &root_user_keyring, |
| 51 | .session_keyring = &root_session_keyring, | 51 | .session_keyring = &root_session_keyring, |
| 52 | #endif | 52 | #endif |
| 53 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 54 | .tg = &init_task_group, | ||
| 55 | #endif | ||
| 53 | }; | 56 | }; |
| 54 | 57 | ||
| 55 | /* | 58 | /* |
| 56 | * These routines must be called with the uidhash spinlock held! | 59 | * These routines must be called with the uidhash spinlock held! |
| 57 | */ | 60 | */ |
| 58 | static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | 61 | static inline void uid_hash_insert(struct user_struct *up, |
| 62 | struct hlist_head *hashent) | ||
| 59 | { | 63 | { |
| 60 | hlist_add_head(&up->uidhash_node, hashent); | 64 | hlist_add_head(&up->uidhash_node, hashent); |
| 61 | } | 65 | } |
| @@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up) | |||
| 65 | hlist_del_init(&up->uidhash_node); | 69 | hlist_del_init(&up->uidhash_node); |
| 66 | } | 70 | } |
| 67 | 71 | ||
| 68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 72 | static inline struct user_struct *uid_hash_find(uid_t uid, |
| 73 | struct hlist_head *hashent) | ||
| 69 | { | 74 | { |
| 70 | struct user_struct *user; | 75 | struct user_struct *user; |
| 71 | struct hlist_node *h; | 76 | struct hlist_node *h; |
| 72 | 77 | ||
| 73 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 78 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
| 74 | if(user->uid == uid) { | 79 | if (user->uid == uid) { |
| 75 | atomic_inc(&user->__count); | 80 | atomic_inc(&user->__count); |
| 76 | return user; | 81 | return user; |
| 77 | } | 82 | } |
| @@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha | |||
| 80 | return NULL; | 85 | return NULL; |
| 81 | } | 86 | } |
| 82 | 87 | ||
| 88 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 89 | |||
| 90 | static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ | ||
| 91 | static DEFINE_MUTEX(uids_mutex); | ||
| 92 | |||
| 93 | static void sched_destroy_user(struct user_struct *up) | ||
| 94 | { | ||
| 95 | sched_destroy_group(up->tg); | ||
| 96 | } | ||
| 97 | |||
| 98 | static int sched_create_user(struct user_struct *up) | ||
| 99 | { | ||
| 100 | int rc = 0; | ||
| 101 | |||
| 102 | up->tg = sched_create_group(); | ||
| 103 | if (IS_ERR(up->tg)) | ||
| 104 | rc = -ENOMEM; | ||
| 105 | |||
| 106 | return rc; | ||
| 107 | } | ||
| 108 | |||
| 109 | static void sched_switch_user(struct task_struct *p) | ||
| 110 | { | ||
| 111 | sched_move_task(p); | ||
| 112 | } | ||
| 113 | |||
| 114 | static inline void uids_mutex_lock(void) | ||
| 115 | { | ||
| 116 | mutex_lock(&uids_mutex); | ||
| 117 | } | ||
| 118 | |||
| 119 | static inline void uids_mutex_unlock(void) | ||
| 120 | { | ||
| 121 | mutex_unlock(&uids_mutex); | ||
| 122 | } | ||
| 123 | |||
| 124 | /* return cpu shares held by the user */ | ||
| 125 | ssize_t cpu_shares_show(struct kset *kset, char *buffer) | ||
| 126 | { | ||
| 127 | struct user_struct *up = container_of(kset, struct user_struct, kset); | ||
| 128 | |||
| 129 | return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); | ||
| 130 | } | ||
| 131 | |||
| 132 | /* modify cpu shares held by the user */ | ||
| 133 | ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) | ||
| 134 | { | ||
| 135 | struct user_struct *up = container_of(kset, struct user_struct, kset); | ||
| 136 | unsigned long shares; | ||
| 137 | int rc; | ||
| 138 | |||
| 139 | sscanf(buffer, "%lu", &shares); | ||
| 140 | |||
| 141 | rc = sched_group_set_shares(up->tg, shares); | ||
| 142 | |||
| 143 | return (rc ? rc : size); | ||
| 144 | } | ||
| 145 | |||
| 146 | static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) | ||
| 147 | { | ||
| 148 | sa->attr.name = name; | ||
| 149 | sa->attr.mode = mode; | ||
| 150 | sa->show = cpu_shares_show; | ||
| 151 | sa->store = cpu_shares_store; | ||
| 152 | } | ||
| 153 | |||
| 154 | /* Create "/sys/kernel/uids/<uid>" directory and | ||
| 155 | * "/sys/kernel/uids/<uid>/cpu_share" file for this user. | ||
| 156 | */ | ||
| 157 | static int user_kobject_create(struct user_struct *up) | ||
| 158 | { | ||
| 159 | struct kset *kset = &up->kset; | ||
| 160 | struct kobject *kobj = &kset->kobj; | ||
| 161 | int error; | ||
| 162 | |||
| 163 | memset(kset, 0, sizeof(struct kset)); | ||
| 164 | kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ | ||
| 165 | kobject_set_name(kobj, "%d", up->uid); | ||
| 166 | kset_init(kset); | ||
| 167 | user_attr_init(&up->user_attr, "cpu_share", 0644); | ||
| 168 | |||
| 169 | error = kobject_add(kobj); | ||
| 170 | if (error) | ||
| 171 | goto done; | ||
| 172 | |||
| 173 | error = sysfs_create_file(kobj, &up->user_attr.attr); | ||
| 174 | if (error) | ||
| 175 | kobject_del(kobj); | ||
| 176 | |||
| 177 | kobject_uevent(kobj, KOBJ_ADD); | ||
| 178 | |||
| 179 | done: | ||
| 180 | return error; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* create these in sysfs filesystem: | ||
| 184 | * "/sys/kernel/uids" directory | ||
| 185 | * "/sys/kernel/uids/0" directory (for root user) | ||
| 186 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | ||
| 187 | */ | ||
| 188 | int __init uids_kobject_init(void) | ||
| 189 | { | ||
| 190 | int error; | ||
| 191 | |||
| 192 | /* create under /sys/kernel dir */ | ||
| 193 | uids_kobject.parent = &kernel_subsys.kobj; | ||
| 194 | uids_kobject.kset = &kernel_subsys; | ||
| 195 | kobject_set_name(&uids_kobject, "uids"); | ||
| 196 | kobject_init(&uids_kobject); | ||
| 197 | |||
| 198 | error = kobject_add(&uids_kobject); | ||
| 199 | if (!error) | ||
| 200 | error = user_kobject_create(&root_user); | ||
| 201 | |||
| 202 | return error; | ||
| 203 | } | ||
| 204 | |||
| 205 | /* work function to remove sysfs directory for a user and free up | ||
| 206 | * corresponding structures. | ||
| 207 | */ | ||
| 208 | static void remove_user_sysfs_dir(struct work_struct *w) | ||
| 209 | { | ||
| 210 | struct user_struct *up = container_of(w, struct user_struct, work); | ||
| 211 | struct kobject *kobj = &up->kset.kobj; | ||
| 212 | unsigned long flags; | ||
| 213 | int remove_user = 0; | ||
| 214 | |||
| 215 | /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() | ||
| 216 | * atomic. | ||
| 217 | */ | ||
| 218 | uids_mutex_lock(); | ||
| 219 | |||
| 220 | local_irq_save(flags); | ||
| 221 | |||
| 222 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | ||
| 223 | uid_hash_remove(up); | ||
| 224 | remove_user = 1; | ||
| 225 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
| 226 | } else { | ||
| 227 | local_irq_restore(flags); | ||
| 228 | } | ||
| 229 | |||
| 230 | if (!remove_user) | ||
| 231 | goto done; | ||
| 232 | |||
| 233 | sysfs_remove_file(kobj, &up->user_attr.attr); | ||
| 234 | kobject_uevent(kobj, KOBJ_REMOVE); | ||
| 235 | kobject_del(kobj); | ||
| 236 | |||
| 237 | sched_destroy_user(up); | ||
| 238 | key_put(up->uid_keyring); | ||
| 239 | key_put(up->session_keyring); | ||
| 240 | kmem_cache_free(uid_cachep, up); | ||
| 241 | |||
| 242 | done: | ||
| 243 | uids_mutex_unlock(); | ||
| 244 | } | ||
| 245 | |||
| 246 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
| 247 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
| 248 | * upon function exit. | ||
| 249 | */ | ||
| 250 | static inline void free_user(struct user_struct *up, unsigned long flags) | ||
| 251 | { | ||
| 252 | /* restore back the count */ | ||
| 253 | atomic_inc(&up->__count); | ||
| 254 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
| 255 | |||
| 256 | INIT_WORK(&up->work, remove_user_sysfs_dir); | ||
| 257 | schedule_work(&up->work); | ||
| 258 | } | ||
| 259 | |||
| 260 | #else /* CONFIG_FAIR_USER_SCHED */ | ||
| 261 | |||
| 262 | static void sched_destroy_user(struct user_struct *up) { } | ||
| 263 | static int sched_create_user(struct user_struct *up) { return 0; } | ||
| 264 | static void sched_switch_user(struct task_struct *p) { } | ||
| 265 | static inline int user_kobject_create(struct user_struct *up) { return 0; } | ||
| 266 | static inline void uids_mutex_lock(void) { } | ||
| 267 | static inline void uids_mutex_unlock(void) { } | ||
| 268 | |||
| 269 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
| 270 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
| 271 | * upon function exit. | ||
| 272 | */ | ||
| 273 | static inline void free_user(struct user_struct *up, unsigned long flags) | ||
| 274 | { | ||
| 275 | uid_hash_remove(up); | ||
| 276 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
| 277 | sched_destroy_user(up); | ||
| 278 | key_put(up->uid_keyring); | ||
| 279 | key_put(up->session_keyring); | ||
| 280 | kmem_cache_free(uid_cachep, up); | ||
| 281 | } | ||
| 282 | |||
| 283 | #endif /* CONFIG_FAIR_USER_SCHED */ | ||
| 284 | |||
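
Note: free_uid() runs with interrupts disabled and takes uidhash_lock, but tearing down the per-user sysfs directory and kobject can sleep, so the CONFIG_FAIR_USER_SCHED free_user() above only re-takes the reference, drops the lock and defers the actual cleanup to remove_user_sysfs_dir() via schedule_work(). A stand-alone user-space model of that hand-off (hypothetical names; a simple list stands in for the kernel workqueue):

    #include <stdio.h>

    struct work { void (*func)(struct work *); struct work *next; };

    static struct work *pending;                 /* stands in for the workqueue */

    static void queue_work_stub(struct work *w)  /* callable from "atomic" code */
    {
            w->next = pending;
            pending = w;
    }

    static void run_pending(void)                /* runs later, may "sleep" */
    {
            while (pending) {
                    struct work *w = pending;
                    pending = w->next;
                    w->func(w);
            }
    }

    static void remove_user_dir(struct work *w)
    {
            (void)w;
            printf("remove sysfs dir + free user (sleeping allowed here)\n");
    }

    int main(void)
    {
            static struct work w = { remove_user_dir, NULL };

            /* "free_user()": cannot sleep, so just hand the work off. */
            queue_work_stub(&w);

            /* later, in worker context: */
            run_pending();
            return 0;
    }
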
| 83 | /* | 285 | /* |
| 84 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | 286 | * Locate the user_struct for the passed UID. If found, take a ref on it. The |
| 85 | * caller must undo that ref with free_uid(). | 287 | * caller must undo that ref with free_uid(). |
| @@ -106,15 +308,10 @@ void free_uid(struct user_struct *up) | |||
| 106 | return; | 308 | return; |
| 107 | 309 | ||
| 108 | local_irq_save(flags); | 310 | local_irq_save(flags); |
| 109 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | 311 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) |
| 110 | uid_hash_remove(up); | 312 | free_user(up, flags); |
| 111 | spin_unlock_irqrestore(&uidhash_lock, flags); | 313 | else |
| 112 | key_put(up->uid_keyring); | ||
| 113 | key_put(up->session_keyring); | ||
| 114 | kmem_cache_free(uid_cachep, up); | ||
| 115 | } else { | ||
| 116 | local_irq_restore(flags); | 314 | local_irq_restore(flags); |
| 117 | } | ||
| 118 | } | 315 | } |
| 119 | 316 | ||
| 120 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 317 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
| @@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 122 | struct hlist_head *hashent = uidhashentry(ns, uid); | 319 | struct hlist_head *hashent = uidhashentry(ns, uid); |
| 123 | struct user_struct *up; | 320 | struct user_struct *up; |
| 124 | 321 | ||
| 322 | /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() | ||
| 323 | * atomic. | ||
| 324 | */ | ||
| 325 | uids_mutex_lock(); | ||
| 326 | |||
| 125 | spin_lock_irq(&uidhash_lock); | 327 | spin_lock_irq(&uidhash_lock); |
| 126 | up = uid_hash_find(uid, hashent); | 328 | up = uid_hash_find(uid, hashent); |
| 127 | spin_unlock_irq(&uidhash_lock); | 329 | spin_unlock_irq(&uidhash_lock); |
| @@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 150 | return NULL; | 352 | return NULL; |
| 151 | } | 353 | } |
| 152 | 354 | ||
| 355 | if (sched_create_user(new) < 0) { | ||
| 356 | key_put(new->uid_keyring); | ||
| 357 | key_put(new->session_keyring); | ||
| 358 | kmem_cache_free(uid_cachep, new); | ||
| 359 | return NULL; | ||
| 360 | } | ||
| 361 | |||
| 362 | if (user_kobject_create(new)) { | ||
| 363 | sched_destroy_user(new); | ||
| 364 | key_put(new->uid_keyring); | ||
| 365 | key_put(new->session_keyring); | ||
| 366 | kmem_cache_free(uid_cachep, new); | ||
| 367 | uids_mutex_unlock(); | ||
| 368 | return NULL; | ||
| 369 | } | ||
| 370 | |||
| 153 | /* | 371 | /* |
| 154 | * Before adding this, check whether we raced | 372 | * Before adding this, check whether we raced |
| 155 | * on adding the same user already.. | 373 | * on adding the same user already.. |
| @@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 157 | spin_lock_irq(&uidhash_lock); | 375 | spin_lock_irq(&uidhash_lock); |
| 158 | up = uid_hash_find(uid, hashent); | 376 | up = uid_hash_find(uid, hashent); |
| 159 | if (up) { | 377 | if (up) { |
| 378 | /* This case is not possible when CONFIG_FAIR_USER_SCHED | ||
| 379 | * is defined, since we serialize alloc_uid() using | ||
| 380 | * uids_mutex. Hence no need to call | ||
| 381 | * sched_destroy_user() or remove_user_sysfs_dir(). | ||
| 382 | */ | ||
| 160 | key_put(new->uid_keyring); | 383 | key_put(new->uid_keyring); |
| 161 | key_put(new->session_keyring); | 384 | key_put(new->session_keyring); |
| 162 | kmem_cache_free(uid_cachep, new); | 385 | kmem_cache_free(uid_cachep, new); |
| @@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 167 | spin_unlock_irq(&uidhash_lock); | 390 | spin_unlock_irq(&uidhash_lock); |
| 168 | 391 | ||
| 169 | } | 392 | } |
| 393 | |||
| 394 | uids_mutex_unlock(); | ||
| 395 | |||
| 170 | return up; | 396 | return up; |
| 171 | } | 397 | } |
| 172 | 398 | ||
| @@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user) | |||
| 184 | atomic_dec(&old_user->processes); | 410 | atomic_dec(&old_user->processes); |
| 185 | switch_uid_keyring(new_user); | 411 | switch_uid_keyring(new_user); |
| 186 | current->user = new_user; | 412 | current->user = new_user; |
| 413 | sched_switch_user(current); | ||
| 187 | 414 | ||
| 188 | /* | 415 | /* |
| 189 | * We need to synchronize with __sigqueue_alloc() | 416 | * We need to synchronize with __sigqueue_alloc() |
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2b57eaf66abc..6996cba5aa96 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
| @@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk) | |||
| 334 | read_lock(&sk->sk_callback_lock); | 334 | read_lock(&sk->sk_callback_lock); |
| 335 | if (unix_writable(sk)) { | 335 | if (unix_writable(sk)) { |
| 336 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 336 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) |
| 337 | wake_up_interruptible(sk->sk_sleep); | 337 | wake_up_interruptible_sync(sk->sk_sleep); |
| 338 | sk_wake_async(sk, 2, POLL_OUT); | 338 | sk_wake_async(sk, 2, POLL_OUT); |
| 339 | } | 339 | } |
| 340 | read_unlock(&sk->sk_callback_lock); | 340 | read_unlock(&sk->sk_callback_lock); |
| @@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
| 1639 | if (!skb) | 1639 | if (!skb) |
| 1640 | goto out_unlock; | 1640 | goto out_unlock; |
| 1641 | 1641 | ||
| 1642 | wake_up_interruptible(&u->peer_wait); | 1642 | wake_up_interruptible_sync(&u->peer_wait); |
| 1643 | 1643 | ||
| 1644 | if (msg->msg_name) | 1644 | if (msg->msg_name) |
| 1645 | unix_copy_addr(msg, skb->sk); | 1645 | unix_copy_addr(msg, skb->sk); |
