aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/sched-design-CFS.txt67
-rw-r--r--arch/i386/Kconfig11
-rw-r--r--drivers/kvm/kvm.h10
-rw-r--r--drivers/kvm/kvm_main.c2
-rw-r--r--fs/pipe.c9
-rw-r--r--fs/proc/array.c17
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/proc_misc.c15
-rw-r--r--include/linux/kernel_stat.h1
-rw-r--r--include/linux/sched.h99
-rw-r--r--include/linux/topology.h5
-rw-r--r--init/Kconfig21
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/exit.c6
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/ksysfs.c8
-rw-r--r--kernel/sched.c1444
-rw-r--r--kernel/sched_debug.c282
-rw-r--r--kernel/sched_fair.c811
-rw-r--r--kernel/sched_idletask.c8
-rw-r--r--kernel/sched_rt.c19
-rw-r--r--kernel/sched_stats.h28
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/user.c249
-rw-r--r--net/unix/af_unix.c4
25 files changed, 1872 insertions, 1288 deletions
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7c0508..88bcb8767335 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
117 iterators of the scheduling modules are used. The balancing code got 117 iterators of the scheduling modules are used. The balancing code got
118 quite a bit simpler as a result. 118 quite a bit simpler as a result.
119 119
120
121Group scheduler extension to CFS
122================================
123
124Normally the scheduler operates on individual tasks and strives to provide
125fair CPU time to each task. Sometimes, it may be desirable to group tasks
126and provide fair CPU time to each such task group. For example, it may
127be desirable to first provide fair CPU time to each user on the system
128and then to each task belonging to a user.
129
130CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
131SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
132groups. At present, there are two (mutually exclusive) mechanisms to group
133tasks for CPU bandwidth control purposes:
134
135 - Based on user id (CONFIG_FAIR_USER_SCHED)
136 In this option, tasks are grouped according to their user id.
137 - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
138 This option lets the administrator create arbitrary groups
139 of tasks, using the "cgroup" pseudo filesystem. See
140 Documentation/cgroups.txt for more information about this
141 filesystem.
142
143Only one of these options to group tasks can be chosen and not both.
144
145Group scheduler tunables:
146
147When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
148each new user and a "cpu_share" file is added in that directory.
149
150 # cd /sys/kernel/uids
151 # cat 512/cpu_share # Display user 512's CPU share
152 1024
153 # echo 2048 > 512/cpu_share # Modify user 512's CPU share
154 # cat 512/cpu_share # Display user 512's CPU share
155 2048
156 #
157
158CPU bandwidth between two users is divided in the ratio of their CPU shares.
159For example, if you would like user "root" to get twice the bandwidth of user
160"guest", then set the cpu_share for both the users such that "root"'s
161cpu_share is twice "guest"'s cpu_share.
162
163
164When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
165for each group created using the pseudo filesystem. See example steps
166below to create task groups and modify their CPU share using the "cgroup"
167pseudo filesystem:
168
169 # mkdir /dev/cpuctl
170 # mount -t cgroup -ocpu none /dev/cpuctl
171 # cd /dev/cpuctl
172
173 # mkdir multimedia # create "multimedia" group of tasks
174 # mkdir browser # create "browser" group of tasks
175
176 # #Configure the multimedia group to receive twice the CPU bandwidth
177 # #of the browser group
178
179 # echo 2048 > multimedia/cpu.shares
180 # echo 1024 > browser/cpu.shares
181
182 # firefox & # Launch firefox and move it to "browser" group
183 # echo <firefox_pid> > browser/tasks
184
185 # #Launch gmplayer (or your favourite movie player)
186 # echo <movie_player_pid> > multimedia/tasks
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f1486f8a3e6d..bf9aafad4978 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -214,6 +214,17 @@ config X86_ES7000
214 214
215endchoice 215endchoice
216 216
217config SCHED_NO_NO_OMIT_FRAME_POINTER
218 bool "Single-depth WCHAN output"
219 default y
220 help
221 Calculate simpler /proc/<PID>/wchan values. If this option
222 is disabled then wchan values will recurse back to the
223 caller function. This provides more accurate wchan values,
224 at the expense of slightly more scheduling overhead.
225
226 If in doubt, say "Y".
227
217config PARAVIRT 228config PARAVIRT
218 bool "Paravirtualization support (EXPERIMENTAL)" 229 bool "Paravirtualization support (EXPERIMENTAL)"
219 depends on EXPERIMENTAL 230 depends on EXPERIMENTAL
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index ad0813843adc..3b0bc4bda5f2 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
624 624
625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); 625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
626 626
/* Flag the current task as running as a virtual CPU by setting PF_VCPU. */
627static inline void kvm_guest_enter(void)
628{
629 current->flags |= PF_VCPU;
630}
631
/* Clear PF_VCPU on the current task; counterpart of kvm_guest_enter(). */
632static inline void kvm_guest_exit(void)
633{
634 current->flags &= ~PF_VCPU;
635}
636
627static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 637static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
628 u32 error_code) 638 u32 error_code)
629{ 639{
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 353e58527d15..af2d288c881d 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2046,6 +2046,7 @@ again:
2046 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2046 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2047 2047
2048 vcpu->guest_mode = 1; 2048 vcpu->guest_mode = 1;
2049 kvm_guest_enter();
2049 2050
2050 if (vcpu->requests) 2051 if (vcpu->requests)
2051 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) 2052 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
@@ -2053,6 +2054,7 @@ again:
2053 2054
2054 kvm_x86_ops->run(vcpu, kvm_run); 2055 kvm_x86_ops->run(vcpu, kvm_run);
2055 2056
2057 kvm_guest_exit();
2056 vcpu->guest_mode = 0; 2058 vcpu->guest_mode = 0;
2057 local_irq_enable(); 2059 local_irq_enable();
2058 2060
diff --git a/fs/pipe.c b/fs/pipe.c
index 6b3d91a691bf..e66ec48e95d8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe)
45 * Pipes are system-local resources, so sleeping on them 45 * Pipes are system-local resources, so sleeping on them
46 * is considered a noninteractive wait: 46 * is considered a noninteractive wait:
47 */ 47 */
48 prepare_to_wait(&pipe->wait, &wait, 48 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
49 TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
50 if (pipe->inode) 49 if (pipe->inode)
51 mutex_unlock(&pipe->inode->i_mutex); 50 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 51 schedule();
@@ -383,7 +382,7 @@ redo:
383 382
384 /* Signal writers asynchronously that there is more room. */ 383 /* Signal writers asynchronously that there is more room. */
385 if (do_wakeup) { 384 if (do_wakeup) {
386 wake_up_interruptible(&pipe->wait); 385 wake_up_interruptible_sync(&pipe->wait);
387 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 386 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
388 } 387 }
389 if (ret > 0) 388 if (ret > 0)
@@ -556,7 +555,7 @@ redo2:
556out: 555out:
557 mutex_unlock(&inode->i_mutex); 556 mutex_unlock(&inode->i_mutex);
558 if (do_wakeup) { 557 if (do_wakeup) {
559 wake_up_interruptible(&pipe->wait); 558 wake_up_interruptible_sync(&pipe->wait);
560 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 559 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
561 } 560 }
562 if (ret > 0) 561 if (ret > 0)
@@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw)
650 if (!pipe->readers && !pipe->writers) { 649 if (!pipe->readers && !pipe->writers) {
651 free_pipe_info(inode); 650 free_pipe_info(inode);
652 } else { 651 } else {
653 wake_up_interruptible(&pipe->wait); 652 wake_up_interruptible_sync(&pipe->wait);
654 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 653 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
655 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 654 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
656 } 655 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ee4814dd98f9..27b59f5f3bd1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p)
370} 370}
371#endif 371#endif
372 372
/*
 * Return the gtime counter of task @p.
 * NOTE(review): presumably guest time accrued while PF_VCPU is set; the
 * accounting code is not visible here -- confirm.
 */
373static cputime_t task_gtime(struct task_struct *p)
374{
375 return p->gtime;
376}
377
373static int do_task_stat(struct task_struct *task, char *buffer, int whole) 378static int do_task_stat(struct task_struct *task, char *buffer, int whole)
374{ 379{
375 unsigned long vsize, eip, esp, wchan = ~0UL; 380 unsigned long vsize, eip, esp, wchan = ~0UL;
@@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
385 unsigned long cmin_flt = 0, cmaj_flt = 0; 390 unsigned long cmin_flt = 0, cmaj_flt = 0;
386 unsigned long min_flt = 0, maj_flt = 0; 391 unsigned long min_flt = 0, maj_flt = 0;
387 cputime_t cutime, cstime, utime, stime; 392 cputime_t cutime, cstime, utime, stime;
393 cputime_t cgtime, gtime;
388 unsigned long rsslim = 0; 394 unsigned long rsslim = 0;
389 char tcomm[sizeof(task->comm)]; 395 char tcomm[sizeof(task->comm)];
390 unsigned long flags; 396 unsigned long flags;
@@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
403 sigemptyset(&sigign); 409 sigemptyset(&sigign);
404 sigemptyset(&sigcatch); 410 sigemptyset(&sigcatch);
405 cutime = cstime = utime = stime = cputime_zero; 411 cutime = cstime = utime = stime = cputime_zero;
412 cgtime = gtime = cputime_zero;
406 413
407 rcu_read_lock(); 414 rcu_read_lock();
408 if (lock_task_sighand(task, &flags)) { 415 if (lock_task_sighand(task, &flags)) {
@@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
420 cmaj_flt = sig->cmaj_flt; 427 cmaj_flt = sig->cmaj_flt;
421 cutime = sig->cutime; 428 cutime = sig->cutime;
422 cstime = sig->cstime; 429 cstime = sig->cstime;
430 cgtime = sig->cgtime;
423 rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; 431 rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
424 432
425 /* add up live thread stats at the group level */ 433 /* add up live thread stats at the group level */
@@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
430 maj_flt += t->maj_flt; 438 maj_flt += t->maj_flt;
431 utime = cputime_add(utime, task_utime(t)); 439 utime = cputime_add(utime, task_utime(t));
432 stime = cputime_add(stime, task_stime(t)); 440 stime = cputime_add(stime, task_stime(t));
441 gtime = cputime_add(gtime, task_gtime(t));
433 t = next_thread(t); 442 t = next_thread(t);
434 } while (t != task); 443 } while (t != task);
435 444
@@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
437 maj_flt += sig->maj_flt; 446 maj_flt += sig->maj_flt;
438 utime = cputime_add(utime, sig->utime); 447 utime = cputime_add(utime, sig->utime);
439 stime = cputime_add(stime, sig->stime); 448 stime = cputime_add(stime, sig->stime);
449 gtime = cputime_add(gtime, sig->gtime); /* plain assignment, as for utime/stime: '+=' counted the live threads' gtime twice */
440 } 450 }
441 451
442 sid = signal_session(sig); 452 sid = signal_session(sig);
@@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
454 maj_flt = task->maj_flt; 464 maj_flt = task->maj_flt;
455 utime = task_utime(task); 465 utime = task_utime(task);
456 stime = task_stime(task); 466 stime = task_stime(task);
467 gtime = task_gtime(task);
457 } 468 }
458 469
459 /* scale priority and nice values from timeslices to -20..20 */ 470 /* scale priority and nice values from timeslices to -20..20 */
@@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
471 482
472 res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ 483 res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
473%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 484%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
474%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", 485%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
475 task->pid, 486 task->pid,
476 tcomm, 487 tcomm,
477 state, 488 state,
@@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
516 task_cpu(task), 527 task_cpu(task),
517 task->rt_priority, 528 task->rt_priority,
518 task->policy, 529 task->policy,
519 (unsigned long long)delayacct_blkio_ticks(task)); 530 (unsigned long long)delayacct_blkio_ticks(task),
531 cputime_to_clock_t(gtime),
532 cputime_to_clock_t(cgtime));
520 if (mm) 533 if (mm)
521 mmput(mm); 534 mmput(mm);
522 return res; 535 return res;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 19489b0d5554..e5d0953d4db1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
304 return sprintf(buffer, "%llu %llu %lu\n", 304 return sprintf(buffer, "%llu %llu %lu\n",
305 task->sched_info.cpu_time, 305 task->sched_info.cpu_time,
306 task->sched_info.run_delay, 306 task->sched_info.run_delay,
307 task->sched_info.pcnt); 307 task->sched_info.pcount);
308} 308}
309#endif 309#endif
310 310
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index bee251cb87c8..b872a01ad3af 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v)
443 int i; 443 int i;
444 unsigned long jif; 444 unsigned long jif;
445 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 445 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
446 cputime64_t guest;
446 u64 sum = 0; 447 u64 sum = 0;
447 struct timespec boottime; 448 struct timespec boottime;
448 unsigned int *per_irq_sum; 449 unsigned int *per_irq_sum;
@@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v)
453 454
454 user = nice = system = idle = iowait = 455 user = nice = system = idle = iowait =
455 irq = softirq = steal = cputime64_zero; 456 irq = softirq = steal = cputime64_zero;
457 guest = cputime64_zero;
456 getboottime(&boottime); 458 getboottime(&boottime);
457 jif = boottime.tv_sec; 459 jif = boottime.tv_sec;
458 460
@@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v)
467 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 469 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
468 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 470 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
469 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 471 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
472 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
470 for (j = 0; j < NR_IRQS; j++) { 473 for (j = 0; j < NR_IRQS; j++) {
471 unsigned int temp = kstat_cpu(i).irqs[j]; 474 unsigned int temp = kstat_cpu(i).irqs[j];
472 sum += temp; 475 sum += temp;
@@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v)
474 } 477 }
475 } 478 }
476 479
477 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", 480 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
478 (unsigned long long)cputime64_to_clock_t(user), 481 (unsigned long long)cputime64_to_clock_t(user),
479 (unsigned long long)cputime64_to_clock_t(nice), 482 (unsigned long long)cputime64_to_clock_t(nice),
480 (unsigned long long)cputime64_to_clock_t(system), 483 (unsigned long long)cputime64_to_clock_t(system),
@@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v)
482 (unsigned long long)cputime64_to_clock_t(iowait), 485 (unsigned long long)cputime64_to_clock_t(iowait),
483 (unsigned long long)cputime64_to_clock_t(irq), 486 (unsigned long long)cputime64_to_clock_t(irq),
484 (unsigned long long)cputime64_to_clock_t(softirq), 487 (unsigned long long)cputime64_to_clock_t(softirq),
485 (unsigned long long)cputime64_to_clock_t(steal)); 488 (unsigned long long)cputime64_to_clock_t(steal),
489 (unsigned long long)cputime64_to_clock_t(guest));
486 for_each_online_cpu(i) { 490 for_each_online_cpu(i) {
487 491
488 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 492 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v)
494 irq = kstat_cpu(i).cpustat.irq; 498 irq = kstat_cpu(i).cpustat.irq;
495 softirq = kstat_cpu(i).cpustat.softirq; 499 softirq = kstat_cpu(i).cpustat.softirq;
496 steal = kstat_cpu(i).cpustat.steal; 500 steal = kstat_cpu(i).cpustat.steal;
497 seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", 501 guest = kstat_cpu(i).cpustat.guest;
502 seq_printf(p,
503 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
498 i, 504 i,
499 (unsigned long long)cputime64_to_clock_t(user), 505 (unsigned long long)cputime64_to_clock_t(user),
500 (unsigned long long)cputime64_to_clock_t(nice), 506 (unsigned long long)cputime64_to_clock_t(nice),
@@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v)
503 (unsigned long long)cputime64_to_clock_t(iowait), 509 (unsigned long long)cputime64_to_clock_t(iowait),
504 (unsigned long long)cputime64_to_clock_t(irq), 510 (unsigned long long)cputime64_to_clock_t(irq),
505 (unsigned long long)cputime64_to_clock_t(softirq), 511 (unsigned long long)cputime64_to_clock_t(softirq),
506 (unsigned long long)cputime64_to_clock_t(steal)); 512 (unsigned long long)cputime64_to_clock_t(steal),
513 (unsigned long long)cputime64_to_clock_t(guest));
507 } 514 }
508 seq_printf(p, "intr %llu", (unsigned long long)sum); 515 seq_printf(p, "intr %llu", (unsigned long long)sum);
509 516
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 43e895f1cabe..12bf44f083f5 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -23,6 +23,7 @@ struct cpu_usage_stat {
23 cputime64_t idle; 23 cputime64_t idle;
24 cputime64_t iowait; 24 cputime64_t iowait;
25 cputime64_t steal; 25 cputime64_t steal;
26 cputime64_t guest;
26}; 27};
27 28
28struct kernel_stat { 29struct kernel_stat {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 833f7dc2b8de..228e0a8ce248 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -87,6 +87,7 @@ struct sched_param {
87#include <linux/timer.h> 87#include <linux/timer.h>
88#include <linux/hrtimer.h> 88#include <linux/hrtimer.h>
89#include <linux/task_io_accounting.h> 89#include <linux/task_io_accounting.h>
90#include <linux/kobject.h>
90 91
91#include <asm/processor.h> 92#include <asm/processor.h>
92 93
@@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu);
136 137
137struct seq_file; 138struct seq_file;
138struct cfs_rq; 139struct cfs_rq;
140struct task_group;
139#ifdef CONFIG_SCHED_DEBUG 141#ifdef CONFIG_SCHED_DEBUG
140extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 142extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
141extern void proc_sched_set_task(struct task_struct *p); 143extern void proc_sched_set_task(struct task_struct *p);
@@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
174#define EXIT_ZOMBIE 16 176#define EXIT_ZOMBIE 16
175#define EXIT_DEAD 32 177#define EXIT_DEAD 32
176/* in tsk->state again */ 178/* in tsk->state again */
177#define TASK_NONINTERACTIVE 64 179#define TASK_DEAD 64
178#define TASK_DEAD 128
179 180
180#define __set_task_state(tsk, state_value) \ 181#define __set_task_state(tsk, state_value) \
181 do { (tsk)->state = (state_value); } while (0) 182 do { (tsk)->state = (state_value); } while (0)
@@ -516,6 +517,8 @@ struct signal_struct {
516 * in __exit_signal, except for the group leader. 517 * in __exit_signal, except for the group leader.
517 */ 518 */
518 cputime_t utime, stime, cutime, cstime; 519 cputime_t utime, stime, cutime, cstime;
520 cputime_t gtime;
521 cputime_t cgtime;
519 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 522 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
520 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 523 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
521 unsigned long inblock, oublock, cinblock, coublock; 524 unsigned long inblock, oublock, cinblock, coublock;
@@ -596,8 +599,21 @@ struct user_struct {
596 /* Hash table maintenance information */ 599 /* Hash table maintenance information */
597 struct hlist_node uidhash_node; 600 struct hlist_node uidhash_node;
598 uid_t uid; 601 uid_t uid;
602
603#ifdef CONFIG_FAIR_USER_SCHED
604 struct task_group *tg;
605 struct kset kset;
606 struct subsys_attribute user_attr;
607 struct work_struct work;
608#endif
599}; 609};
600 610
611#ifdef CONFIG_FAIR_USER_SCHED
612extern int uids_kobject_init(void);
613#else
614static inline int uids_kobject_init(void) { return 0; }
615#endif
616
601extern struct user_struct *find_user(uid_t); 617extern struct user_struct *find_user(uid_t);
602 618
603extern struct user_struct root_user; 619extern struct user_struct root_user;
@@ -609,13 +625,17 @@ struct reclaim_state;
609#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 625#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
610struct sched_info { 626struct sched_info {
611 /* cumulative counters */ 627 /* cumulative counters */
612 unsigned long pcnt; /* # of times run on this cpu */ 628 unsigned long pcount; /* # of times run on this cpu */
613 unsigned long long cpu_time, /* time spent on the cpu */ 629 unsigned long long cpu_time, /* time spent on the cpu */
614 run_delay; /* time spent waiting on a runqueue */ 630 run_delay; /* time spent waiting on a runqueue */
615 631
616 /* timestamps */ 632 /* timestamps */
617 unsigned long long last_arrival,/* when we last ran on a cpu */ 633 unsigned long long last_arrival,/* when we last ran on a cpu */
618 last_queued; /* when we were last queued to run */ 634 last_queued; /* when we were last queued to run */
635#ifdef CONFIG_SCHEDSTATS
636 /* BKL stats */
637 unsigned long bkl_count;
638#endif
619}; 639};
620#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 640#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
621 641
@@ -750,7 +770,7 @@ struct sched_domain {
750 770
751#ifdef CONFIG_SCHEDSTATS 771#ifdef CONFIG_SCHEDSTATS
752 /* load_balance() stats */ 772 /* load_balance() stats */
753 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; 773 unsigned long lb_count[CPU_MAX_IDLE_TYPES];
754 unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; 774 unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
755 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; 775 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
756 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; 776 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
@@ -760,17 +780,17 @@ struct sched_domain {
760 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; 780 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
761 781
762 /* Active load balancing */ 782 /* Active load balancing */
763 unsigned long alb_cnt; 783 unsigned long alb_count;
764 unsigned long alb_failed; 784 unsigned long alb_failed;
765 unsigned long alb_pushed; 785 unsigned long alb_pushed;
766 786
767 /* SD_BALANCE_EXEC stats */ 787 /* SD_BALANCE_EXEC stats */
768 unsigned long sbe_cnt; 788 unsigned long sbe_count;
769 unsigned long sbe_balanced; 789 unsigned long sbe_balanced;
770 unsigned long sbe_pushed; 790 unsigned long sbe_pushed;
771 791
772 /* SD_BALANCE_FORK stats */ 792 /* SD_BALANCE_FORK stats */
773 unsigned long sbf_cnt; 793 unsigned long sbf_count;
774 unsigned long sbf_balanced; 794 unsigned long sbf_balanced;
775 unsigned long sbf_pushed; 795 unsigned long sbf_pushed;
776 796
@@ -854,11 +874,11 @@ struct rq;
854struct sched_domain; 874struct sched_domain;
855 875
856struct sched_class { 876struct sched_class {
857 struct sched_class *next; 877 const struct sched_class *next;
858 878
859 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); 879 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
860 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); 880 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
861 void (*yield_task) (struct rq *rq, struct task_struct *p); 881 void (*yield_task) (struct rq *rq);
862 882
863 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); 883 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
864 884
@@ -888,31 +908,22 @@ struct load_weight {
888 * 4 se->block_start 908 * 4 se->block_start
889 * 4 se->run_node 909 * 4 se->run_node
890 * 4 se->sleep_start 910 * 4 se->sleep_start
891 * 4 se->sleep_start_fair
892 * 6 se->load.weight 911 * 6 se->load.weight
893 * 7 se->delta_fair
894 * 15 se->wait_runtime
895 */ 912 */
896struct sched_entity { 913struct sched_entity {
897 long wait_runtime;
898 unsigned long delta_fair_run;
899 unsigned long delta_fair_sleep;
900 unsigned long delta_exec;
901 s64 fair_key;
902 struct load_weight load; /* for load-balancing */ 914 struct load_weight load; /* for load-balancing */
903 struct rb_node run_node; 915 struct rb_node run_node;
904 unsigned int on_rq; 916 unsigned int on_rq;
917 int peer_preempt;
905 918
906 u64 exec_start; 919 u64 exec_start;
907 u64 sum_exec_runtime; 920 u64 sum_exec_runtime;
921 u64 vruntime;
908 u64 prev_sum_exec_runtime; 922 u64 prev_sum_exec_runtime;
909 u64 wait_start_fair;
910 u64 sleep_start_fair;
911 923
912#ifdef CONFIG_SCHEDSTATS 924#ifdef CONFIG_SCHEDSTATS
913 u64 wait_start; 925 u64 wait_start;
914 u64 wait_max; 926 u64 wait_max;
915 s64 sum_wait_runtime;
916 927
917 u64 sleep_start; 928 u64 sleep_start;
918 u64 sleep_max; 929 u64 sleep_max;
@@ -921,9 +932,25 @@ struct sched_entity {
921 u64 block_start; 932 u64 block_start;
922 u64 block_max; 933 u64 block_max;
923 u64 exec_max; 934 u64 exec_max;
924 935 u64 slice_max;
925 unsigned long wait_runtime_overruns; 936
926 unsigned long wait_runtime_underruns; 937 u64 nr_migrations;
938 u64 nr_migrations_cold;
939 u64 nr_failed_migrations_affine;
940 u64 nr_failed_migrations_running;
941 u64 nr_failed_migrations_hot;
942 u64 nr_forced_migrations;
943 u64 nr_forced2_migrations;
944
945 u64 nr_wakeups;
946 u64 nr_wakeups_sync;
947 u64 nr_wakeups_migrate;
948 u64 nr_wakeups_local;
949 u64 nr_wakeups_remote;
950 u64 nr_wakeups_affine;
951 u64 nr_wakeups_affine_attempts;
952 u64 nr_wakeups_passive;
953 u64 nr_wakeups_idle;
927#endif 954#endif
928 955
929#ifdef CONFIG_FAIR_GROUP_SCHED 956#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -952,7 +979,7 @@ struct task_struct {
952 979
953 int prio, static_prio, normal_prio; 980 int prio, static_prio, normal_prio;
954 struct list_head run_list; 981 struct list_head run_list;
955 struct sched_class *sched_class; 982 const struct sched_class *sched_class;
956 struct sched_entity se; 983 struct sched_entity se;
957 984
958#ifdef CONFIG_PREEMPT_NOTIFIERS 985#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1023,6 +1050,7 @@ struct task_struct {
1023 1050
1024 unsigned int rt_priority; 1051 unsigned int rt_priority;
1025 cputime_t utime, stime; 1052 cputime_t utime, stime;
1053 cputime_t gtime;
1026 unsigned long nvcsw, nivcsw; /* context switch counts */ 1054 unsigned long nvcsw, nivcsw; /* context switch counts */
1027 struct timespec start_time; /* monotonic time */ 1055 struct timespec start_time; /* monotonic time */
1028 struct timespec real_start_time; /* boot based time */ 1056 struct timespec real_start_time; /* boot based time */
@@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t)
1314#define PF_STARTING 0x00000002 /* being created */ 1342#define PF_STARTING 0x00000002 /* being created */
1315#define PF_EXITING 0x00000004 /* getting shut down */ 1343#define PF_EXITING 0x00000004 /* getting shut down */
1316#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1344#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1345#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1317#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1346#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1318#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1347#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1319#define PF_DUMPCORE 0x00000200 /* dumped core */ 1348#define PF_DUMPCORE 0x00000200 /* dumped core */
@@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {}
1401 1430
1402extern void sched_idle_next(void); 1431extern void sched_idle_next(void);
1403 1432
1433#ifdef CONFIG_SCHED_DEBUG
1404extern unsigned int sysctl_sched_latency; 1434extern unsigned int sysctl_sched_latency;
1405extern unsigned int sysctl_sched_min_granularity; 1435extern unsigned int sysctl_sched_nr_latency;
1406extern unsigned int sysctl_sched_wakeup_granularity; 1436extern unsigned int sysctl_sched_wakeup_granularity;
1407extern unsigned int sysctl_sched_batch_wakeup_granularity; 1437extern unsigned int sysctl_sched_batch_wakeup_granularity;
1408extern unsigned int sysctl_sched_stat_granularity;
1409extern unsigned int sysctl_sched_runtime_limit;
1410extern unsigned int sysctl_sched_compat_yield;
1411extern unsigned int sysctl_sched_child_runs_first; 1438extern unsigned int sysctl_sched_child_runs_first;
1412extern unsigned int sysctl_sched_features; 1439extern unsigned int sysctl_sched_features;
1440extern unsigned int sysctl_sched_migration_cost;
1441#endif
1442
1443extern unsigned int sysctl_sched_compat_yield;
1413 1444
1414#ifdef CONFIG_RT_MUTEXES 1445#ifdef CONFIG_RT_MUTEXES
1415extern int rt_mutex_getprio(struct task_struct *p); 1446extern int rt_mutex_getprio(struct task_struct *p);
@@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
1843 1874
1844extern void normalize_rt_tasks(void); 1875extern void normalize_rt_tasks(void);
1845 1876
1877#ifdef CONFIG_FAIR_GROUP_SCHED
1878
1879extern struct task_group init_task_group;
1880
1881extern struct task_group *sched_create_group(void);
1882extern void sched_destroy_group(struct task_group *tg);
1883extern void sched_move_task(struct task_struct *tsk);
1884extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
1885extern unsigned long sched_group_shares(struct task_group *tg);
1886
1887#endif
1888
1846#ifdef CONFIG_TASK_XACCT 1889#ifdef CONFIG_TASK_XACCT
1847static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 1890static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
1848{ 1891{
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 525d437b1253..47729f18bfdf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -159,15 +159,14 @@
159 .imbalance_pct = 125, \ 159 .imbalance_pct = 125, \
160 .cache_nice_tries = 1, \ 160 .cache_nice_tries = 1, \
161 .busy_idx = 2, \ 161 .busy_idx = 2, \
162 .idle_idx = 0, \ 162 .idle_idx = 1, \
163 .newidle_idx = 0, \ 163 .newidle_idx = 2, \
164 .wake_idx = 1, \ 164 .wake_idx = 1, \
165 .forkexec_idx = 1, \ 165 .forkexec_idx = 1, \
166 .flags = SD_LOAD_BALANCE \ 166 .flags = SD_LOAD_BALANCE \
167 | SD_BALANCE_NEWIDLE \ 167 | SD_BALANCE_NEWIDLE \
168 | SD_BALANCE_EXEC \ 168 | SD_BALANCE_EXEC \
169 | SD_WAKE_AFFINE \ 169 | SD_WAKE_AFFINE \
170 | SD_WAKE_IDLE \
171 | BALANCE_FOR_PKG_POWER,\ 170 | BALANCE_FOR_PKG_POWER,\
172 .last_balance = jiffies, \ 171 .last_balance = jiffies, \
173 .balance_interval = 1, \ 172 .balance_interval = 1, \
diff --git a/init/Kconfig b/init/Kconfig
index d54d0cadcc06..54f31a191b88 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -281,6 +281,27 @@ config CPUSETS
281 281
282 Say N if unsure. 282 Say N if unsure.
283 283
284config FAIR_GROUP_SCHED
285 bool "Fair group CPU scheduler"
286 default y
287 depends on EXPERIMENTAL
288 help
289 This feature lets the CPU scheduler recognize task groups and control CPU
290 bandwidth allocation to such task groups.
291
292choice
293 depends on FAIR_GROUP_SCHED
294 prompt "Basis for grouping tasks"
295 default FAIR_USER_SCHED
296
297config FAIR_USER_SCHED
298 bool "user id"
299 help
300 This option will choose userid as the basis for grouping
301 tasks, thus providing equal CPU bandwidth to each user.
302
303endchoice
304
284config SYSFS_DEPRECATED 305config SYSFS_DEPRECATED
285 bool "Create deprecated sysfs files" 306 bool "Create deprecated sysfs files"
286 default y 307 default y
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e697829633..09e9574eeb26 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
119 * No locking available for sched_info (and too expensive to add one) 119 * No locking available for sched_info (and too expensive to add one)
120 * Mitigate by taking snapshot of values 120 * Mitigate by taking snapshot of values
121 */ 121 */
122 t1 = tsk->sched_info.pcnt; 122 t1 = tsk->sched_info.pcount;
123 t2 = tsk->sched_info.run_delay; 123 t2 = tsk->sched_info.run_delay;
124 t3 = tsk->sched_info.cpu_time; 124 t3 = tsk->sched_info.cpu_time;
125 125
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d1..7f7959de4a87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
111 */ 111 */
112 sig->utime = cputime_add(sig->utime, tsk->utime); 112 sig->utime = cputime_add(sig->utime, tsk->utime);
113 sig->stime = cputime_add(sig->stime, tsk->stime); 113 sig->stime = cputime_add(sig->stime, tsk->stime);
114 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
114 sig->min_flt += tsk->min_flt; 115 sig->min_flt += tsk->min_flt;
115 sig->maj_flt += tsk->maj_flt; 116 sig->maj_flt += tsk->maj_flt;
116 sig->nvcsw += tsk->nvcsw; 117 sig->nvcsw += tsk->nvcsw;
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1242 cputime_add(p->stime, 1243 cputime_add(p->stime,
1243 cputime_add(sig->stime, 1244 cputime_add(sig->stime,
1244 sig->cstime))); 1245 sig->cstime)));
1246 psig->cgtime =
1247 cputime_add(psig->cgtime,
1248 cputime_add(p->gtime,
1249 cputime_add(sig->gtime,
1250 sig->cgtime)));
1245 psig->cmin_flt += 1251 psig->cmin_flt +=
1246 p->min_flt + sig->min_flt + sig->cmin_flt; 1252 p->min_flt + sig->min_flt + sig->cmin_flt;
1247 psig->cmaj_flt += 1253 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e67f90a1694..3fc3c1383912 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->tty_old_pgrp = NULL; 877 sig->tty_old_pgrp = NULL;
878 878
879 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 879 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
880 sig->gtime = cputime_zero;
881 sig->cgtime = cputime_zero;
880 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 882 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
881 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 883 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
882 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 884 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 1047
1046 p->utime = cputime_zero; 1048 p->utime = cputime_zero;
1047 p->stime = cputime_zero; 1049 p->stime = cputime_zero;
1050 p->gtime = cputime_zero;
1048 1051
1049#ifdef CONFIG_TASK_XACCT 1052#ifdef CONFIG_TASK_XACCT
1050 p->rchar = 0; /* I/O counter: bytes read */ 1053 p->rchar = 0; /* I/O counter: bytes read */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c7..6046939d0804 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/sched.h>
17 18
18#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
116 &notes_attr); 117 &notes_attr);
117 } 118 }
118 119
120 /*
121 * Create "/sys/kernel/uids" directory and corresponding root user's
122 * directory under it.
123 */
124 if (!error)
125 error = uids_kobject_init();
126
119 return error; 127 return error;
120} 128}
121 129
diff --git a/kernel/sched.c b/kernel/sched.c
index 6c10fa796ca0..bba57adb9504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
96/* 96/*
97 * Some helpers for converting nanosecond timing to jiffy resolution 97 * Some helpers for converting nanosecond timing to jiffy resolution
98 */ 98 */
99#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 99#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ))
100#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 100#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
101 101
102#define NICE_0_LOAD SCHED_LOAD_SCALE 102#define NICE_0_LOAD SCHED_LOAD_SCALE
@@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
105/* 105/*
106 * These are the 'tuning knobs' of the scheduler: 106 * These are the 'tuning knobs' of the scheduler:
107 * 107 *
108 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 108 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
109 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
110 * Timeslices get refilled after they expire. 109 * Timeslices get refilled after they expire.
111 */ 110 */
112#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
113#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
114 112
115#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
@@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
133} 131}
134#endif 132#endif
135 133
136#define SCALE_PRIO(x, prio) \
137 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
138
139/*
140 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
141 * to time slice values: [800ms ... 100ms ... 5ms]
142 */
143static unsigned int static_prio_timeslice(int static_prio)
144{
145 if (static_prio == NICE_TO_PRIO(19))
146 return 1;
147
148 if (static_prio < NICE_TO_PRIO(0))
149 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
150 else
151 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
152}
153
154static inline int rt_policy(int policy) 134static inline int rt_policy(int policy)
155{ 135{
156 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 136 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -171,31 +151,91 @@ struct rt_prio_array {
171 struct list_head queue[MAX_RT_PRIO]; 151 struct list_head queue[MAX_RT_PRIO];
172}; 152};
173 153
174struct load_stat { 154#ifdef CONFIG_FAIR_GROUP_SCHED
175 struct load_weight load; 155
176 u64 load_update_start, load_update_last; 156struct cfs_rq;
177 unsigned long delta_fair, delta_exec, delta_stat; 157
158/* task group related information */
159struct task_group {
160 /* schedulable entities of this group on each cpu */
161 struct sched_entity **se;
162 /* runqueue "owned" by this group on each cpu */
163 struct cfs_rq **cfs_rq;
164 unsigned long shares;
165 /* spinlock to serialize modification to shares */
166 spinlock_t lock;
167};
168
169/* Default task group's sched entity on each cpu */
170static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
171/* Default task group's cfs_rq on each cpu */
172static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
173
174static struct sched_entity *init_sched_entity_p[NR_CPUS];
175static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
176
177/* Default task group.
178 * Every task in the system belongs to this group at bootup.
179 */
180struct task_group init_task_group = {
181 .se = init_sched_entity_p,
182 .cfs_rq = init_cfs_rq_p,
178}; 183};
179 184
185#ifdef CONFIG_FAIR_USER_SCHED
186# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
187#else
188# define INIT_TASK_GRP_LOAD NICE_0_LOAD
189#endif
190
191static int init_task_group_load = INIT_TASK_GRP_LOAD;
192
193/* return group to which a task belongs */
194static inline struct task_group *task_group(struct task_struct *p)
195{
196 struct task_group *tg;
197
198#ifdef CONFIG_FAIR_USER_SCHED
199 tg = p->user->tg;
200#else
201 tg = &init_task_group;
202#endif
203
204 return tg;
205}
206
207/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
208static inline void set_task_cfs_rq(struct task_struct *p)
209{
210 p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
211 p->se.parent = task_group(p)->se[task_cpu(p)];
212}
213
214#else
215
216static inline void set_task_cfs_rq(struct task_struct *p) { }
217
218#endif /* CONFIG_FAIR_GROUP_SCHED */
219
180/* CFS-related fields in a runqueue */ 220/* CFS-related fields in a runqueue */
181struct cfs_rq { 221struct cfs_rq {
182 struct load_weight load; 222 struct load_weight load;
183 unsigned long nr_running; 223 unsigned long nr_running;
184 224
185 s64 fair_clock;
186 u64 exec_clock; 225 u64 exec_clock;
187 s64 wait_runtime; 226 u64 min_vruntime;
188 u64 sleeper_bonus;
189 unsigned long wait_runtime_overruns, wait_runtime_underruns;
190 227
191 struct rb_root tasks_timeline; 228 struct rb_root tasks_timeline;
192 struct rb_node *rb_leftmost; 229 struct rb_node *rb_leftmost;
193 struct rb_node *rb_load_balance_curr; 230 struct rb_node *rb_load_balance_curr;
194#ifdef CONFIG_FAIR_GROUP_SCHED
195 /* 'curr' points to currently running entity on this cfs_rq. 231 /* 'curr' points to currently running entity on this cfs_rq.
196 * It is set to NULL otherwise (i.e when none are currently running). 232 * It is set to NULL otherwise (i.e when none are currently running).
197 */ 233 */
198 struct sched_entity *curr; 234 struct sched_entity *curr;
235
236 unsigned long nr_spread_over;
237
238#ifdef CONFIG_FAIR_GROUP_SCHED
199 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 239 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
200 240
201 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 241 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
@@ -206,6 +246,8 @@ struct cfs_rq {
206 * list is used during load balance. 246 * list is used during load balance.
207 */ 247 */
208 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ 248 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
249 struct task_group *tg; /* group that "owns" this runqueue */
250 struct rcu_head rcu;
209#endif 251#endif
210}; 252};
211 253
@@ -237,7 +279,7 @@ struct rq {
237#ifdef CONFIG_NO_HZ 279#ifdef CONFIG_NO_HZ
238 unsigned char in_nohz_recently; 280 unsigned char in_nohz_recently;
239#endif 281#endif
240 struct load_stat ls; /* capture load from *all* tasks on this cpu */ 282 struct load_weight load; /* capture load from *all* tasks on this cpu */
241 unsigned long nr_load_updates; 283 unsigned long nr_load_updates;
242 u64 nr_switches; 284 u64 nr_switches;
243 285
@@ -289,16 +331,19 @@ struct rq {
289 unsigned long yld_exp_empty; 331 unsigned long yld_exp_empty;
290 unsigned long yld_act_empty; 332 unsigned long yld_act_empty;
291 unsigned long yld_both_empty; 333 unsigned long yld_both_empty;
292 unsigned long yld_cnt; 334 unsigned long yld_count;
293 335
294 /* schedule() stats */ 336 /* schedule() stats */
295 unsigned long sched_switch; 337 unsigned long sched_switch;
296 unsigned long sched_cnt; 338 unsigned long sched_count;
297 unsigned long sched_goidle; 339 unsigned long sched_goidle;
298 340
299 /* try_to_wake_up() stats */ 341 /* try_to_wake_up() stats */
300 unsigned long ttwu_cnt; 342 unsigned long ttwu_count;
301 unsigned long ttwu_local; 343 unsigned long ttwu_local;
344
345 /* BKL stats */
346 unsigned long bkl_count;
302#endif 347#endif
303 struct lock_class_key rq_lock_key; 348 struct lock_class_key rq_lock_key;
304}; 349};
@@ -383,6 +428,37 @@ static void update_rq_clock(struct rq *rq)
383#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 428#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
384 429
385/* 430/*
431 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
432 */
433#ifdef CONFIG_SCHED_DEBUG
434# define const_debug __read_mostly
435#else
436# define const_debug static const
437#endif
438
439/*
440 * Debugging: various feature bits
441 */
442enum {
443 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
444 SCHED_FEAT_START_DEBIT = 2,
445 SCHED_FEAT_TREE_AVG = 4,
446 SCHED_FEAT_APPROX_AVG = 8,
447 SCHED_FEAT_WAKEUP_PREEMPT = 16,
448 SCHED_FEAT_PREEMPT_RESTRICT = 32,
449};
450
451const_debug unsigned int sysctl_sched_features =
452 SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
453 SCHED_FEAT_START_DEBIT *1 |
454 SCHED_FEAT_TREE_AVG *0 |
455 SCHED_FEAT_APPROX_AVG *0 |
456 SCHED_FEAT_WAKEUP_PREEMPT *1 |
457 SCHED_FEAT_PREEMPT_RESTRICT *1;
458
459#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
460
461/*
386 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 462 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
387 * clock constructed from sched_clock(): 463 * clock constructed from sched_clock():
388 */ 464 */
@@ -400,18 +476,7 @@ unsigned long long cpu_clock(int cpu)
400 476
401 return now; 477 return now;
402} 478}
403 479EXPORT_SYMBOL_GPL(cpu_clock);
404#ifdef CONFIG_FAIR_GROUP_SCHED
405/* Change a task's ->cfs_rq if it moves across CPUs */
406static inline void set_task_cfs_rq(struct task_struct *p)
407{
408 p->se.cfs_rq = &task_rq(p)->cfs;
409}
410#else
411static inline void set_task_cfs_rq(struct task_struct *p)
412{
413}
414#endif
415 480
416#ifndef prepare_arch_switch 481#ifndef prepare_arch_switch
417# define prepare_arch_switch(next) do { } while (0) 482# define prepare_arch_switch(next) do { } while (0)
@@ -497,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
497static inline struct rq *__task_rq_lock(struct task_struct *p) 562static inline struct rq *__task_rq_lock(struct task_struct *p)
498 __acquires(rq->lock) 563 __acquires(rq->lock)
499{ 564{
500 struct rq *rq; 565 for (;;) {
501 566 struct rq *rq = task_rq(p);
502repeat_lock_task: 567 spin_lock(&rq->lock);
503 rq = task_rq(p); 568 if (likely(rq == task_rq(p)))
504 spin_lock(&rq->lock); 569 return rq;
505 if (unlikely(rq != task_rq(p))) {
506 spin_unlock(&rq->lock); 570 spin_unlock(&rq->lock);
507 goto repeat_lock_task;
508 } 571 }
509 return rq;
510} 572}
511 573
512/* 574/*
@@ -519,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
519{ 581{
520 struct rq *rq; 582 struct rq *rq;
521 583
522repeat_lock_task: 584 for (;;) {
523 local_irq_save(*flags); 585 local_irq_save(*flags);
524 rq = task_rq(p); 586 rq = task_rq(p);
525 spin_lock(&rq->lock); 587 spin_lock(&rq->lock);
526 if (unlikely(rq != task_rq(p))) { 588 if (likely(rq == task_rq(p)))
589 return rq;
527 spin_unlock_irqrestore(&rq->lock, *flags); 590 spin_unlock_irqrestore(&rq->lock, *flags);
528 goto repeat_lock_task;
529 } 591 }
530 return rq;
531} 592}
532 593
533static inline void __task_rq_unlock(struct rq *rq) 594static void __task_rq_unlock(struct rq *rq)
534 __releases(rq->lock) 595 __releases(rq->lock)
535{ 596{
536 spin_unlock(&rq->lock); 597 spin_unlock(&rq->lock);
@@ -545,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
545/* 606/*
546 * this_rq_lock - lock this runqueue and disable interrupts. 607 * this_rq_lock - lock this runqueue and disable interrupts.
547 */ 608 */
548static inline struct rq *this_rq_lock(void) 609static struct rq *this_rq_lock(void)
549 __acquires(rq->lock) 610 __acquires(rq->lock)
550{ 611{
551 struct rq *rq; 612 struct rq *rq;
@@ -645,19 +706,6 @@ static inline void resched_task(struct task_struct *p)
645} 706}
646#endif 707#endif
647 708
648static u64 div64_likely32(u64 divident, unsigned long divisor)
649{
650#if BITS_PER_LONG == 32
651 if (likely(divident <= 0xffffffffULL))
652 return (u32)divident / divisor;
653 do_div(divident, divisor);
654
655 return divident;
656#else
657 return divident / divisor;
658#endif
659}
660
661#if BITS_PER_LONG == 32 709#if BITS_PER_LONG == 32
662# define WMULT_CONST (~0UL) 710# define WMULT_CONST (~0UL)
663#else 711#else
@@ -699,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
699 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); 747 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
700} 748}
701 749
702static void update_load_add(struct load_weight *lw, unsigned long inc) 750static inline void update_load_add(struct load_weight *lw, unsigned long inc)
703{ 751{
704 lw->weight += inc; 752 lw->weight += inc;
705 lw->inv_weight = 0;
706} 753}
707 754
708static void update_load_sub(struct load_weight *lw, unsigned long dec) 755static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
709{ 756{
710 lw->weight -= dec; 757 lw->weight -= dec;
711 lw->inv_weight = 0;
712} 758}
713 759
714/* 760/*
@@ -784,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
784 int *this_best_prio, struct rq_iterator *iterator); 830 int *this_best_prio, struct rq_iterator *iterator);
785 831
786#include "sched_stats.h" 832#include "sched_stats.h"
787#include "sched_rt.c"
788#include "sched_fair.c"
789#include "sched_idletask.c" 833#include "sched_idletask.c"
834#include "sched_fair.c"
835#include "sched_rt.c"
790#ifdef CONFIG_SCHED_DEBUG 836#ifdef CONFIG_SCHED_DEBUG
791# include "sched_debug.c" 837# include "sched_debug.c"
792#endif 838#endif
793 839
794#define sched_class_highest (&rt_sched_class) 840#define sched_class_highest (&rt_sched_class)
795 841
796static void __update_curr_load(struct rq *rq, struct load_stat *ls)
797{
798 if (rq->curr != rq->idle && ls->load.weight) {
799 ls->delta_exec += ls->delta_stat;
800 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
801 ls->delta_stat = 0;
802 }
803}
804
805/* 842/*
806 * Update delta_exec, delta_fair fields for rq. 843 * Update delta_exec, delta_fair fields for rq.
807 * 844 *
808 * delta_fair clock advances at a rate inversely proportional to 845 * delta_fair clock advances at a rate inversely proportional to
809 * total load (rq->ls.load.weight) on the runqueue, while 846 * total load (rq->load.weight) on the runqueue, while
810 * delta_exec advances at the same rate as wall-clock (provided 847 * delta_exec advances at the same rate as wall-clock (provided
811 * cpu is not idle). 848 * cpu is not idle).
812 * 849 *
@@ -814,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls)
814 * runqueue over any given interval. This (smoothened) load is used 851 * runqueue over any given interval. This (smoothened) load is used
815 * during load balance. 852 * during load balance.
816 * 853 *
817 * This function is called /before/ updating rq->ls.load 854 * This function is called /before/ updating rq->load
818 * and when switching tasks. 855 * and when switching tasks.
819 */ 856 */
820static void update_curr_load(struct rq *rq)
821{
822 struct load_stat *ls = &rq->ls;
823 u64 start;
824
825 start = ls->load_update_start;
826 ls->load_update_start = rq->clock;
827 ls->delta_stat += rq->clock - start;
828 /*
829 * Stagger updates to ls->delta_fair. Very frequent updates
830 * can be expensive.
831 */
832 if (ls->delta_stat >= sysctl_sched_stat_granularity)
833 __update_curr_load(rq, ls);
834}
835
836static inline void inc_load(struct rq *rq, const struct task_struct *p) 857static inline void inc_load(struct rq *rq, const struct task_struct *p)
837{ 858{
838 update_curr_load(rq); 859 update_load_add(&rq->load, p->se.load.weight);
839 update_load_add(&rq->ls.load, p->se.load.weight);
840} 860}
841 861
842static inline void dec_load(struct rq *rq, const struct task_struct *p) 862static inline void dec_load(struct rq *rq, const struct task_struct *p)
843{ 863{
844 update_curr_load(rq); 864 update_load_sub(&rq->load, p->se.load.weight);
845 update_load_sub(&rq->ls.load, p->se.load.weight);
846} 865}
847 866
848static void inc_nr_running(struct task_struct *p, struct rq *rq) 867static void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -859,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
859 878
860static void set_load_weight(struct task_struct *p) 879static void set_load_weight(struct task_struct *p)
861{ 880{
862 p->se.wait_runtime = 0;
863
864 if (task_has_rt_policy(p)) { 881 if (task_has_rt_policy(p)) {
865 p->se.load.weight = prio_to_weight[0] * 2; 882 p->se.load.weight = prio_to_weight[0] * 2;
866 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 883 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
@@ -952,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
952} 969}
953 970
954/* 971/*
955 * activate_idle_task - move idle task to the _front_ of runqueue.
956 */
957static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
958{
959 update_rq_clock(rq);
960
961 if (p->state == TASK_UNINTERRUPTIBLE)
962 rq->nr_uninterruptible--;
963
964 enqueue_task(rq, p, 0);
965 inc_nr_running(p, rq);
966}
967
968/*
969 * deactivate_task - remove a task from the runqueue. 972 * deactivate_task - remove a task from the runqueue.
970 */ 973 */
971static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 974static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
@@ -989,32 +992,50 @@ inline int task_curr(const struct task_struct *p)
989/* Used instead of source_load when we know the type == 0 */ 992/* Used instead of source_load when we know the type == 0 */
990unsigned long weighted_cpuload(const int cpu) 993unsigned long weighted_cpuload(const int cpu)
991{ 994{
992 return cpu_rq(cpu)->ls.load.weight; 995 return cpu_rq(cpu)->load.weight;
993} 996}
994 997
995static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 998static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
996{ 999{
997#ifdef CONFIG_SMP 1000#ifdef CONFIG_SMP
998 task_thread_info(p)->cpu = cpu; 1001 task_thread_info(p)->cpu = cpu;
999 set_task_cfs_rq(p);
1000#endif 1002#endif
1003 set_task_cfs_rq(p);
1001} 1004}
1002 1005
1003#ifdef CONFIG_SMP 1006#ifdef CONFIG_SMP
1004 1007
1008/*
1009 * Is this task likely cache-hot:
1010 */
1011static inline int
1012task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1013{
1014 s64 delta;
1015
1016 if (p->sched_class != &fair_sched_class)
1017 return 0;
1018
1019 if (sysctl_sched_migration_cost == -1)
1020 return 1;
1021 if (sysctl_sched_migration_cost == 0)
1022 return 0;
1023
1024 delta = now - p->se.exec_start;
1025
1026 return delta < (s64)sysctl_sched_migration_cost;
1027}
1028
1029
1005void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1030void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1006{ 1031{
1007 int old_cpu = task_cpu(p); 1032 int old_cpu = task_cpu(p);
1008 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 1033 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1009 u64 clock_offset, fair_clock_offset; 1034 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1035 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1036 u64 clock_offset;
1010 1037
1011 clock_offset = old_rq->clock - new_rq->clock; 1038 clock_offset = old_rq->clock - new_rq->clock;
1012 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
1013
1014 if (p->se.wait_start_fair)
1015 p->se.wait_start_fair -= fair_clock_offset;
1016 if (p->se.sleep_start_fair)
1017 p->se.sleep_start_fair -= fair_clock_offset;
1018 1039
1019#ifdef CONFIG_SCHEDSTATS 1040#ifdef CONFIG_SCHEDSTATS
1020 if (p->se.wait_start) 1041 if (p->se.wait_start)
@@ -1023,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1023 p->se.sleep_start -= clock_offset; 1044 p->se.sleep_start -= clock_offset;
1024 if (p->se.block_start) 1045 if (p->se.block_start)
1025 p->se.block_start -= clock_offset; 1046 p->se.block_start -= clock_offset;
1047 if (old_cpu != new_cpu) {
1048 schedstat_inc(p, se.nr_migrations);
1049 if (task_hot(p, old_rq->clock, NULL))
1050 schedstat_inc(p, se.nr_forced2_migrations);
1051 }
1026#endif 1052#endif
1053 p->se.vruntime -= old_cfsrq->min_vruntime -
1054 new_cfsrq->min_vruntime;
1027 1055
1028 __set_task_cpu(p, new_cpu); 1056 __set_task_cpu(p, new_cpu);
1029} 1057}
@@ -1078,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p)
1078 int running, on_rq; 1106 int running, on_rq;
1079 struct rq *rq; 1107 struct rq *rq;
1080 1108
1081repeat: 1109 for (;;) {
1082 /* 1110 /*
1083 * We do the initial early heuristics without holding 1111 * We do the initial early heuristics without holding
1084 * any task-queue locks at all. We'll only try to get 1112 * any task-queue locks at all. We'll only try to get
1085 * the runqueue lock when things look like they will 1113 * the runqueue lock when things look like they will
1086 * work out! 1114 * work out!
1087 */ 1115 */
1088 rq = task_rq(p); 1116 rq = task_rq(p);
1089 1117
1090 /* 1118 /*
1091 * If the task is actively running on another CPU 1119 * If the task is actively running on another CPU
1092 * still, just relax and busy-wait without holding 1120 * still, just relax and busy-wait without holding
1093 * any locks. 1121 * any locks.
1094 * 1122 *
1095 * NOTE! Since we don't hold any locks, it's not 1123 * NOTE! Since we don't hold any locks, it's not
1096 * even sure that "rq" stays as the right runqueue! 1124 * even sure that "rq" stays as the right runqueue!
1097 * But we don't care, since "task_running()" will 1125 * But we don't care, since "task_running()" will
1098 * return false if the runqueue has changed and p 1126 * return false if the runqueue has changed and p
1099 * is actually now running somewhere else! 1127 * is actually now running somewhere else!
1100 */ 1128 */
1101 while (task_running(rq, p)) 1129 while (task_running(rq, p))
1102 cpu_relax(); 1130 cpu_relax();
1103 1131
1104 /* 1132 /*
1105 * Ok, time to look more closely! We need the rq 1133 * Ok, time to look more closely! We need the rq
1106 * lock now, to be *sure*. If we're wrong, we'll 1134 * lock now, to be *sure*. If we're wrong, we'll
1107 * just go back and repeat. 1135 * just go back and repeat.
1108 */ 1136 */
1109 rq = task_rq_lock(p, &flags); 1137 rq = task_rq_lock(p, &flags);
1110 running = task_running(rq, p); 1138 running = task_running(rq, p);
1111 on_rq = p->se.on_rq; 1139 on_rq = p->se.on_rq;
1112 task_rq_unlock(rq, &flags); 1140 task_rq_unlock(rq, &flags);
1113 1141
1114 /* 1142 /*
1115 * Was it really running after all now that we 1143 * Was it really running after all now that we
1116 * checked with the proper locks actually held? 1144 * checked with the proper locks actually held?
1117 * 1145 *
1118 * Oops. Go back and try again.. 1146 * Oops. Go back and try again..
1119 */ 1147 */
1120 if (unlikely(running)) { 1148 if (unlikely(running)) {
1121 cpu_relax(); 1149 cpu_relax();
1122 goto repeat; 1150 continue;
1123 } 1151 }
1124 1152
1125 /* 1153 /*
1126 * It's not enough that it's not actively running, 1154 * It's not enough that it's not actively running,
1127 * it must be off the runqueue _entirely_, and not 1155 * it must be off the runqueue _entirely_, and not
1128 * preempted! 1156 * preempted!
1129 * 1157 *
1130 * So if it wa still runnable (but just not actively 1158 * So if it wa still runnable (but just not actively
1131 * running right now), it's preempted, and we should 1159 * running right now), it's preempted, and we should
1132 * yield - it could be a while. 1160 * yield - it could be a while.
1133 */ 1161 */
1134 if (unlikely(on_rq)) { 1162 if (unlikely(on_rq)) {
1135 yield(); 1163 schedule_timeout_uninterruptible(1);
1136 goto repeat; 1164 continue;
1137 } 1165 }
1138 1166
1139 /* 1167 /*
1140 * Ahh, all good. It wasn't running, and it wasn't 1168 * Ahh, all good. It wasn't running, and it wasn't
1141 * runnable, which means that it will never become 1169 * runnable, which means that it will never become
1142 * running in the future either. We're all done! 1170 * running in the future either. We're all done!
1143 */ 1171 */
1172 break;
1173 }
1144} 1174}
1145 1175
1146/*** 1176/***
@@ -1174,7 +1204,7 @@ void kick_process(struct task_struct *p)
1174 * We want to under-estimate the load of migration sources, to 1204 * We want to under-estimate the load of migration sources, to
1175 * balance conservatively. 1205 * balance conservatively.
1176 */ 1206 */
1177static inline unsigned long source_load(int cpu, int type) 1207static unsigned long source_load(int cpu, int type)
1178{ 1208{
1179 struct rq *rq = cpu_rq(cpu); 1209 struct rq *rq = cpu_rq(cpu);
1180 unsigned long total = weighted_cpuload(cpu); 1210 unsigned long total = weighted_cpuload(cpu);
@@ -1189,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type)
1189 * Return a high guess at the load of a migration-target cpu weighted 1219 * Return a high guess at the load of a migration-target cpu weighted
1190 * according to the scheduling class and "nice" value. 1220 * according to the scheduling class and "nice" value.
1191 */ 1221 */
1192static inline unsigned long target_load(int cpu, int type) 1222static unsigned long target_load(int cpu, int type)
1193{ 1223{
1194 struct rq *rq = cpu_rq(cpu); 1224 struct rq *rq = cpu_rq(cpu);
1195 unsigned long total = weighted_cpuload(cpu); 1225 unsigned long total = weighted_cpuload(cpu);
@@ -1231,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1231 1261
1232 /* Skip over this group if it has no CPUs allowed */ 1262 /* Skip over this group if it has no CPUs allowed */
1233 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1263 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1234 goto nextgroup; 1264 continue;
1235 1265
1236 local_group = cpu_isset(this_cpu, group->cpumask); 1266 local_group = cpu_isset(this_cpu, group->cpumask);
1237 1267
@@ -1259,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1259 min_load = avg_load; 1289 min_load = avg_load;
1260 idlest = group; 1290 idlest = group;
1261 } 1291 }
1262nextgroup: 1292 } while (group = group->next, group != sd->groups);
1263 group = group->next;
1264 } while (group != sd->groups);
1265 1293
1266 if (!idlest || 100*this_load < imbalance*min_load) 1294 if (!idlest || 100*this_load < imbalance*min_load)
1267 return NULL; 1295 return NULL;
@@ -1393,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p)
1393 if (sd->flags & SD_WAKE_IDLE) { 1421 if (sd->flags & SD_WAKE_IDLE) {
1394 cpus_and(tmp, sd->span, p->cpus_allowed); 1422 cpus_and(tmp, sd->span, p->cpus_allowed);
1395 for_each_cpu_mask(i, tmp) { 1423 for_each_cpu_mask(i, tmp) {
1396 if (idle_cpu(i)) 1424 if (idle_cpu(i)) {
1425 if (i != task_cpu(p)) {
1426 schedstat_inc(p,
1427 se.nr_wakeups_idle);
1428 }
1397 return i; 1429 return i;
1430 }
1398 } 1431 }
1399 } else { 1432 } else {
1400 break; 1433 break;
@@ -1425,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1425 */ 1458 */
1426static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1459static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1427{ 1460{
1428 int cpu, this_cpu, success = 0; 1461 int cpu, orig_cpu, this_cpu, success = 0;
1429 unsigned long flags; 1462 unsigned long flags;
1430 long old_state; 1463 long old_state;
1431 struct rq *rq; 1464 struct rq *rq;
@@ -1444,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1444 goto out_running; 1477 goto out_running;
1445 1478
1446 cpu = task_cpu(p); 1479 cpu = task_cpu(p);
1480 orig_cpu = cpu;
1447 this_cpu = smp_processor_id(); 1481 this_cpu = smp_processor_id();
1448 1482
1449#ifdef CONFIG_SMP 1483#ifdef CONFIG_SMP
@@ -1452,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1452 1486
1453 new_cpu = cpu; 1487 new_cpu = cpu;
1454 1488
1455 schedstat_inc(rq, ttwu_cnt); 1489 schedstat_inc(rq, ttwu_count);
1456 if (cpu == this_cpu) { 1490 if (cpu == this_cpu) {
1457 schedstat_inc(rq, ttwu_local); 1491 schedstat_inc(rq, ttwu_local);
1458 goto out_set_cpu; 1492 goto out_set_cpu;
@@ -1487,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1487 unsigned long tl = this_load; 1521 unsigned long tl = this_load;
1488 unsigned long tl_per_task; 1522 unsigned long tl_per_task;
1489 1523
1524 /*
1525 * Attract cache-cold tasks on sync wakeups:
1526 */
1527 if (sync && !task_hot(p, rq->clock, this_sd))
1528 goto out_set_cpu;
1529
1530 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1490 tl_per_task = cpu_avg_load_per_task(this_cpu); 1531 tl_per_task = cpu_avg_load_per_task(this_cpu);
1491 1532
1492 /* 1533 /*
@@ -1506,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1506 * there is no bad imbalance. 1547 * there is no bad imbalance.
1507 */ 1548 */
1508 schedstat_inc(this_sd, ttwu_move_affine); 1549 schedstat_inc(this_sd, ttwu_move_affine);
1550 schedstat_inc(p, se.nr_wakeups_affine);
1509 goto out_set_cpu; 1551 goto out_set_cpu;
1510 } 1552 }
1511 } 1553 }
@@ -1517,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1517 if (this_sd->flags & SD_WAKE_BALANCE) { 1559 if (this_sd->flags & SD_WAKE_BALANCE) {
1518 if (imbalance*this_load <= 100*load) { 1560 if (imbalance*this_load <= 100*load) {
1519 schedstat_inc(this_sd, ttwu_move_balance); 1561 schedstat_inc(this_sd, ttwu_move_balance);
1562 schedstat_inc(p, se.nr_wakeups_passive);
1520 goto out_set_cpu; 1563 goto out_set_cpu;
1521 } 1564 }
1522 } 1565 }
@@ -1542,18 +1585,18 @@ out_set_cpu:
1542 1585
1543out_activate: 1586out_activate:
1544#endif /* CONFIG_SMP */ 1587#endif /* CONFIG_SMP */
1588 schedstat_inc(p, se.nr_wakeups);
1589 if (sync)
1590 schedstat_inc(p, se.nr_wakeups_sync);
1591 if (orig_cpu != cpu)
1592 schedstat_inc(p, se.nr_wakeups_migrate);
1593 if (cpu == this_cpu)
1594 schedstat_inc(p, se.nr_wakeups_local);
1595 else
1596 schedstat_inc(p, se.nr_wakeups_remote);
1545 update_rq_clock(rq); 1597 update_rq_clock(rq);
1546 activate_task(rq, p, 1); 1598 activate_task(rq, p, 1);
1547 /* 1599 check_preempt_curr(rq, p);
1548 * Sync wakeups (i.e. those types of wakeups where the waker
1549 * has indicated that it will leave the CPU in short order)
1550 * don't trigger a preemption, if the woken up task will run on
1551 * this cpu. (in this case the 'I will reschedule' promise of
1552 * the waker guarantees that the freshly woken up task is going
1553 * to be considered on this CPU.)
1554 */
1555 if (!sync || cpu != this_cpu)
1556 check_preempt_curr(rq, p);
1557 success = 1; 1600 success = 1;
1558 1601
1559out_running: 1602out_running:
@@ -1584,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1584 */ 1627 */
1585static void __sched_fork(struct task_struct *p) 1628static void __sched_fork(struct task_struct *p)
1586{ 1629{
1587 p->se.wait_start_fair = 0;
1588 p->se.exec_start = 0; 1630 p->se.exec_start = 0;
1589 p->se.sum_exec_runtime = 0; 1631 p->se.sum_exec_runtime = 0;
1590 p->se.prev_sum_exec_runtime = 0; 1632 p->se.prev_sum_exec_runtime = 0;
1591 p->se.delta_exec = 0;
1592 p->se.delta_fair_run = 0;
1593 p->se.delta_fair_sleep = 0;
1594 p->se.wait_runtime = 0;
1595 p->se.sleep_start_fair = 0;
1596 1633
1597#ifdef CONFIG_SCHEDSTATS 1634#ifdef CONFIG_SCHEDSTATS
1598 p->se.wait_start = 0; 1635 p->se.wait_start = 0;
1599 p->se.sum_wait_runtime = 0;
1600 p->se.sum_sleep_runtime = 0; 1636 p->se.sum_sleep_runtime = 0;
1601 p->se.sleep_start = 0; 1637 p->se.sleep_start = 0;
1602 p->se.block_start = 0; 1638 p->se.block_start = 0;
1603 p->se.sleep_max = 0; 1639 p->se.sleep_max = 0;
1604 p->se.block_max = 0; 1640 p->se.block_max = 0;
1605 p->se.exec_max = 0; 1641 p->se.exec_max = 0;
1642 p->se.slice_max = 0;
1606 p->se.wait_max = 0; 1643 p->se.wait_max = 0;
1607 p->se.wait_runtime_overruns = 0;
1608 p->se.wait_runtime_underruns = 0;
1609#endif 1644#endif
1610 1645
1611 INIT_LIST_HEAD(&p->run_list); 1646 INIT_LIST_HEAD(&p->run_list);
@@ -1636,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags)
1636#ifdef CONFIG_SMP 1671#ifdef CONFIG_SMP
1637 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1672 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1638#endif 1673#endif
1639 __set_task_cpu(p, cpu); 1674 set_task_cpu(p, cpu);
1640 1675
1641 /* 1676 /*
1642 * Make sure we do not leak PI boosting priority to the child: 1677 * Make sure we do not leak PI boosting priority to the child:
1643 */ 1678 */
1644 p->prio = current->normal_prio; 1679 p->prio = current->normal_prio;
1680 if (!rt_prio(p->prio))
1681 p->sched_class = &fair_sched_class;
1645 1682
1646#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1683#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1647 if (likely(sched_info_on())) 1684 if (likely(sched_info_on()))
@@ -1658,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
1658} 1695}
1659 1696
1660/* 1697/*
1661 * After fork, child runs first. (default) If set to 0 then
1662 * parent will (try to) run first.
1663 */
1664unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1665
1666/*
1667 * wake_up_new_task - wake up a newly created task for the first time. 1698 * wake_up_new_task - wake up a newly created task for the first time.
1668 * 1699 *
1669 * This function will do some initial scheduler statistics housekeeping 1700 * This function will do some initial scheduler statistics housekeeping
@@ -1674,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1674{ 1705{
1675 unsigned long flags; 1706 unsigned long flags;
1676 struct rq *rq; 1707 struct rq *rq;
1677 int this_cpu;
1678 1708
1679 rq = task_rq_lock(p, &flags); 1709 rq = task_rq_lock(p, &flags);
1680 BUG_ON(p->state != TASK_RUNNING); 1710 BUG_ON(p->state != TASK_RUNNING);
1681 this_cpu = smp_processor_id(); /* parent's CPU */
1682 update_rq_clock(rq); 1711 update_rq_clock(rq);
1683 1712
1684 p->prio = effective_prio(p); 1713 p->prio = effective_prio(p);
1685 1714
1686 if (rt_prio(p->prio)) 1715 if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) {
1687 p->sched_class = &rt_sched_class;
1688 else
1689 p->sched_class = &fair_sched_class;
1690
1691 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1692 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1693 !current->se.on_rq) {
1694
1695 activate_task(rq, p, 0); 1716 activate_task(rq, p, 0);
1696 } else { 1717 } else {
1697 /* 1718 /*
@@ -1800,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1800 * with the lock held can cause deadlocks; see schedule() for 1821 * with the lock held can cause deadlocks; see schedule() for
1801 * details.) 1822 * details.)
1802 */ 1823 */
1803static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1824static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1804 __releases(rq->lock) 1825 __releases(rq->lock)
1805{ 1826{
1806 struct mm_struct *mm = rq->prev_mm; 1827 struct mm_struct *mm = rq->prev_mm;
@@ -1982,42 +2003,10 @@ unsigned long nr_active(void)
1982 */ 2003 */
1983static void update_cpu_load(struct rq *this_rq) 2004static void update_cpu_load(struct rq *this_rq)
1984{ 2005{
1985 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; 2006 unsigned long this_load = this_rq->load.weight;
1986 unsigned long total_load = this_rq->ls.load.weight;
1987 unsigned long this_load = total_load;
1988 struct load_stat *ls = &this_rq->ls;
1989 int i, scale; 2007 int i, scale;
1990 2008
1991 this_rq->nr_load_updates++; 2009 this_rq->nr_load_updates++;
1992 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1993 goto do_avg;
1994
1995 /* Update delta_fair/delta_exec fields first */
1996 update_curr_load(this_rq);
1997
1998 fair_delta64 = ls->delta_fair + 1;
1999 ls->delta_fair = 0;
2000
2001 exec_delta64 = ls->delta_exec + 1;
2002 ls->delta_exec = 0;
2003
2004 sample_interval64 = this_rq->clock - ls->load_update_last;
2005 ls->load_update_last = this_rq->clock;
2006
2007 if ((s64)sample_interval64 < (s64)TICK_NSEC)
2008 sample_interval64 = TICK_NSEC;
2009
2010 if (exec_delta64 > sample_interval64)
2011 exec_delta64 = sample_interval64;
2012
2013 idle_delta64 = sample_interval64 - exec_delta64;
2014
2015 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
2016 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
2017
2018 this_load = (unsigned long)tmp64;
2019
2020do_avg:
2021 2010
2022 /* Update our load: */ 2011 /* Update our load: */
2023 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2012 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2027,7 +2016,13 @@ do_avg:
2027 2016
2028 old_load = this_rq->cpu_load[i]; 2017 old_load = this_rq->cpu_load[i];
2029 new_load = this_load; 2018 new_load = this_load;
2030 2019 /*
2020 * Round up the averaging division if load is increasing. This
2021 * prevents us from getting stuck on 9 if the load is 10, for
2022 * example.
2023 */
2024 if (new_load > old_load)
2025 new_load += scale-1;
2031 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2026 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2032 } 2027 }
2033} 2028}
@@ -2179,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2179 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2174 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2180 * 3) are cache-hot on their current CPU. 2175 * 3) are cache-hot on their current CPU.
2181 */ 2176 */
2182 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2177 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2178 schedstat_inc(p, se.nr_failed_migrations_affine);
2183 return 0; 2179 return 0;
2180 }
2184 *all_pinned = 0; 2181 *all_pinned = 0;
2185 2182
2186 if (task_running(rq, p)) 2183 if (task_running(rq, p)) {
2184 schedstat_inc(p, se.nr_failed_migrations_running);
2187 return 0; 2185 return 0;
2186 }
2187
2188 /*
2189 * Aggressive migration if:
2190 * 1) task is cache cold, or
2191 * 2) too many balance attempts have failed.
2192 */
2193
2194 if (!task_hot(p, rq->clock, sd) ||
2195 sd->nr_balance_failed > sd->cache_nice_tries) {
2196#ifdef CONFIG_SCHEDSTATS
2197 if (task_hot(p, rq->clock, sd)) {
2198 schedstat_inc(sd, lb_hot_gained[idle]);
2199 schedstat_inc(p, se.nr_forced_migrations);
2200 }
2201#endif
2202 return 1;
2203 }
2188 2204
2205 if (task_hot(p, rq->clock, sd)) {
2206 schedstat_inc(p, se.nr_failed_migrations_hot);
2207 return 0;
2208 }
2189 return 1; 2209 return 1;
2190} 2210}
2191 2211
@@ -2264,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2264 struct sched_domain *sd, enum cpu_idle_type idle, 2284 struct sched_domain *sd, enum cpu_idle_type idle,
2265 int *all_pinned) 2285 int *all_pinned)
2266{ 2286{
2267 struct sched_class *class = sched_class_highest; 2287 const struct sched_class *class = sched_class_highest;
2268 unsigned long total_load_moved = 0; 2288 unsigned long total_load_moved = 0;
2269 int this_best_prio = this_rq->curr->prio; 2289 int this_best_prio = this_rq->curr->prio;
2270 2290
@@ -2289,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2289static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 2309static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2290 struct sched_domain *sd, enum cpu_idle_type idle) 2310 struct sched_domain *sd, enum cpu_idle_type idle)
2291{ 2311{
2292 struct sched_class *class; 2312 const struct sched_class *class;
2293 int this_best_prio = MAX_PRIO; 2313 int this_best_prio = MAX_PRIO;
2294 2314
2295 for (class = sched_class_highest; class; class = class->next) 2315 for (class = sched_class_highest; class; class = class->next)
@@ -2653,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2653 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2673 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2654 sd_idle = 1; 2674 sd_idle = 1;
2655 2675
2656 schedstat_inc(sd, lb_cnt[idle]); 2676 schedstat_inc(sd, lb_count[idle]);
2657 2677
2658redo: 2678redo:
2659 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2679 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
@@ -2806,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2806 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2826 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2807 sd_idle = 1; 2827 sd_idle = 1;
2808 2828
2809 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); 2829 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
2810redo: 2830redo:
2811 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 2831 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2812 &sd_idle, &cpus, NULL); 2832 &sd_idle, &cpus, NULL);
@@ -2940,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2940 } 2960 }
2941 2961
2942 if (likely(sd)) { 2962 if (likely(sd)) {
2943 schedstat_inc(sd, alb_cnt); 2963 schedstat_inc(sd, alb_count);
2944 2964
2945 if (move_one_task(target_rq, target_cpu, busiest_rq, 2965 if (move_one_task(target_rq, target_cpu, busiest_rq,
2946 sd, CPU_IDLE)) 2966 sd, CPU_IDLE))
@@ -3033,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing);
3033 * 3053 *
3034 * Balancing parameters are set up in arch_init_sched_domains. 3054 * Balancing parameters are set up in arch_init_sched_domains.
3035 */ 3055 */
3036static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) 3056static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3037{ 3057{
3038 int balance = 1; 3058 int balance = 1;
3039 struct rq *rq = cpu_rq(cpu); 3059 struct rq *rq = cpu_rq(cpu);
@@ -3280,6 +3300,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3280} 3300}
3281 3301
3282/* 3302/*
3303 * Account guest cpu time to a process.
3304 * @p: the process that the cpu time gets accounted to
3305 * @cputime: the cpu time spent in virtual machine since the last update
3306 */
3307void account_guest_time(struct task_struct *p, cputime_t cputime)
3308{
3309 cputime64_t tmp;
3310 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3311
3312 tmp = cputime_to_cputime64(cputime);
3313
3314 p->utime = cputime_add(p->utime, cputime);
3315 p->gtime = cputime_add(p->gtime, cputime);
3316
3317 cpustat->user = cputime64_add(cpustat->user, tmp);
3318 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3319}
3320
3321/*
3283 * Account system cpu time to a process. 3322 * Account system cpu time to a process.
3284 * @p: the process that the cpu time gets accounted to 3323 * @p: the process that the cpu time gets accounted to
3285 * @hardirq_offset: the offset to subtract from hardirq_count() 3324 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3292,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3292 struct rq *rq = this_rq(); 3331 struct rq *rq = this_rq();
3293 cputime64_t tmp; 3332 cputime64_t tmp;
3294 3333
3334 if (p->flags & PF_VCPU) {
3335 account_guest_time(p, cputime);
3336 p->flags &= ~PF_VCPU;
3337 return;
3338 }
3339
3295 p->stime = cputime_add(p->stime, cputime); 3340 p->stime = cputime_add(p->stime, cputime);
3296 3341
3297 /* Add system time to cpustat. */ 3342 /* Add system time to cpustat. */
@@ -3430,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev)
3430 3475
3431 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3476 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3432 3477
3433 schedstat_inc(this_rq(), sched_cnt); 3478 schedstat_inc(this_rq(), sched_count);
3479#ifdef CONFIG_SCHEDSTATS
3480 if (unlikely(prev->lock_depth >= 0)) {
3481 schedstat_inc(this_rq(), bkl_count);
3482 schedstat_inc(prev, sched_info.bkl_count);
3483 }
3484#endif
3434} 3485}
3435 3486
3436/* 3487/*
@@ -3439,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev)
3439static inline struct task_struct * 3490static inline struct task_struct *
3440pick_next_task(struct rq *rq, struct task_struct *prev) 3491pick_next_task(struct rq *rq, struct task_struct *prev)
3441{ 3492{
3442 struct sched_class *class; 3493 const struct sched_class *class;
3443 struct task_struct *p; 3494 struct task_struct *p;
3444 3495
3445 /* 3496 /*
@@ -3488,9 +3539,13 @@ need_resched_nonpreemptible:
3488 3539
3489 schedule_debug(prev); 3540 schedule_debug(prev);
3490 3541
3491 spin_lock_irq(&rq->lock); 3542 /*
3492 clear_tsk_need_resched(prev); 3543 * Do the rq-clock update outside the rq lock:
3544 */
3545 local_irq_disable();
3493 __update_rq_clock(rq); 3546 __update_rq_clock(rq);
3547 spin_lock(&rq->lock);
3548 clear_tsk_need_resched(prev);
3494 3549
3495 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3550 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3496 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3551 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3550,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void)
3550 if (likely(ti->preempt_count || irqs_disabled())) 3605 if (likely(ti->preempt_count || irqs_disabled()))
3551 return; 3606 return;
3552 3607
3553need_resched: 3608 do {
3554 add_preempt_count(PREEMPT_ACTIVE); 3609 add_preempt_count(PREEMPT_ACTIVE);
3555 /* 3610
3556 * We keep the big kernel semaphore locked, but we 3611 /*
3557 * clear ->lock_depth so that schedule() doesnt 3612 * We keep the big kernel semaphore locked, but we
3558 * auto-release the semaphore: 3613 * clear ->lock_depth so that schedule() doesnt
3559 */ 3614 * auto-release the semaphore:
3615 */
3560#ifdef CONFIG_PREEMPT_BKL 3616#ifdef CONFIG_PREEMPT_BKL
3561 saved_lock_depth = task->lock_depth; 3617 saved_lock_depth = task->lock_depth;
3562 task->lock_depth = -1; 3618 task->lock_depth = -1;
3563#endif 3619#endif
3564 schedule(); 3620 schedule();
3565#ifdef CONFIG_PREEMPT_BKL 3621#ifdef CONFIG_PREEMPT_BKL
3566 task->lock_depth = saved_lock_depth; 3622 task->lock_depth = saved_lock_depth;
3567#endif 3623#endif
3568 sub_preempt_count(PREEMPT_ACTIVE); 3624 sub_preempt_count(PREEMPT_ACTIVE);
3569 3625
3570 /* we could miss a preemption opportunity between schedule and now */ 3626 /*
3571 barrier(); 3627 * Check again in case we missed a preemption opportunity
3572 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3628 * between schedule and now.
3573 goto need_resched; 3629 */
3630 barrier();
3631 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3574} 3632}
3575EXPORT_SYMBOL(preempt_schedule); 3633EXPORT_SYMBOL(preempt_schedule);
3576 3634
@@ -3590,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void)
3590 /* Catch callers which need to be fixed */ 3648 /* Catch callers which need to be fixed */
3591 BUG_ON(ti->preempt_count || !irqs_disabled()); 3649 BUG_ON(ti->preempt_count || !irqs_disabled());
3592 3650
3593need_resched: 3651 do {
3594 add_preempt_count(PREEMPT_ACTIVE); 3652 add_preempt_count(PREEMPT_ACTIVE);
3595 /* 3653
3596 * We keep the big kernel semaphore locked, but we 3654 /*
3597 * clear ->lock_depth so that schedule() doesnt 3655 * We keep the big kernel semaphore locked, but we
3598 * auto-release the semaphore: 3656 * clear ->lock_depth so that schedule() doesnt
3599 */ 3657 * auto-release the semaphore:
3658 */
3600#ifdef CONFIG_PREEMPT_BKL 3659#ifdef CONFIG_PREEMPT_BKL
3601 saved_lock_depth = task->lock_depth; 3660 saved_lock_depth = task->lock_depth;
3602 task->lock_depth = -1; 3661 task->lock_depth = -1;
3603#endif 3662#endif
3604 local_irq_enable(); 3663 local_irq_enable();
3605 schedule(); 3664 schedule();
3606 local_irq_disable(); 3665 local_irq_disable();
3607#ifdef CONFIG_PREEMPT_BKL 3666#ifdef CONFIG_PREEMPT_BKL
3608 task->lock_depth = saved_lock_depth; 3667 task->lock_depth = saved_lock_depth;
3609#endif 3668#endif
3610 sub_preempt_count(PREEMPT_ACTIVE); 3669 sub_preempt_count(PREEMPT_ACTIVE);
3611 3670
3612 /* we could miss a preemption opportunity between schedule and now */ 3671 /*
3613 barrier(); 3672 * Check again in case we missed a preemption opportunity
3614 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3673 * between schedule and now.
3615 goto need_resched; 3674 */
3675 barrier();
3676 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3616} 3677}
3617 3678
3618#endif /* CONFIG_PREEMPT */ 3679#endif /* CONFIG_PREEMPT */
@@ -3636,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function);
3636static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3697static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3637 int nr_exclusive, int sync, void *key) 3698 int nr_exclusive, int sync, void *key)
3638{ 3699{
3639 struct list_head *tmp, *next; 3700 wait_queue_t *curr, *next;
3640 3701
3641 list_for_each_safe(tmp, next, &q->task_list) { 3702 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3642 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3643 unsigned flags = curr->flags; 3703 unsigned flags = curr->flags;
3644 3704
3645 if (curr->func(curr, mode, sync, key) && 3705 if (curr->func(curr, mode, sync, key) &&
@@ -3729,206 +3789,116 @@ void fastcall complete_all(struct completion *x)
3729} 3789}
3730EXPORT_SYMBOL(complete_all); 3790EXPORT_SYMBOL(complete_all);
3731 3791
3732void fastcall __sched wait_for_completion(struct completion *x) 3792static inline long __sched
3733{ 3793do_wait_for_common(struct completion *x, long timeout, int state)
3734 might_sleep();
3735
3736 spin_lock_irq(&x->wait.lock);
3737 if (!x->done) {
3738 DECLARE_WAITQUEUE(wait, current);
3739
3740 wait.flags |= WQ_FLAG_EXCLUSIVE;
3741 __add_wait_queue_tail(&x->wait, &wait);
3742 do {
3743 __set_current_state(TASK_UNINTERRUPTIBLE);
3744 spin_unlock_irq(&x->wait.lock);
3745 schedule();
3746 spin_lock_irq(&x->wait.lock);
3747 } while (!x->done);
3748 __remove_wait_queue(&x->wait, &wait);
3749 }
3750 x->done--;
3751 spin_unlock_irq(&x->wait.lock);
3752}
3753EXPORT_SYMBOL(wait_for_completion);
3754
3755unsigned long fastcall __sched
3756wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3757{ 3794{
3758 might_sleep();
3759
3760 spin_lock_irq(&x->wait.lock);
3761 if (!x->done) { 3795 if (!x->done) {
3762 DECLARE_WAITQUEUE(wait, current); 3796 DECLARE_WAITQUEUE(wait, current);
3763 3797
3764 wait.flags |= WQ_FLAG_EXCLUSIVE; 3798 wait.flags |= WQ_FLAG_EXCLUSIVE;
3765 __add_wait_queue_tail(&x->wait, &wait); 3799 __add_wait_queue_tail(&x->wait, &wait);
3766 do { 3800 do {
3767 __set_current_state(TASK_UNINTERRUPTIBLE); 3801 if (state == TASK_INTERRUPTIBLE &&
3802 signal_pending(current)) {
3803 __remove_wait_queue(&x->wait, &wait);
3804 return -ERESTARTSYS;
3805 }
3806 __set_current_state(state);
3768 spin_unlock_irq(&x->wait.lock); 3807 spin_unlock_irq(&x->wait.lock);
3769 timeout = schedule_timeout(timeout); 3808 timeout = schedule_timeout(timeout);
3770 spin_lock_irq(&x->wait.lock); 3809 spin_lock_irq(&x->wait.lock);
3771 if (!timeout) { 3810 if (!timeout) {
3772 __remove_wait_queue(&x->wait, &wait); 3811 __remove_wait_queue(&x->wait, &wait);
3773 goto out; 3812 return timeout;
3774 } 3813 }
3775 } while (!x->done); 3814 } while (!x->done);
3776 __remove_wait_queue(&x->wait, &wait); 3815 __remove_wait_queue(&x->wait, &wait);
3777 } 3816 }
3778 x->done--; 3817 x->done--;
3779out:
3780 spin_unlock_irq(&x->wait.lock);
3781 return timeout; 3818 return timeout;
3782} 3819}
3783EXPORT_SYMBOL(wait_for_completion_timeout);
3784 3820
3785int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3821static long __sched
3822wait_for_common(struct completion *x, long timeout, int state)
3786{ 3823{
3787 int ret = 0;
3788
3789 might_sleep(); 3824 might_sleep();
3790 3825
3791 spin_lock_irq(&x->wait.lock); 3826 spin_lock_irq(&x->wait.lock);
3792 if (!x->done) { 3827 timeout = do_wait_for_common(x, timeout, state);
3793 DECLARE_WAITQUEUE(wait, current);
3794
3795 wait.flags |= WQ_FLAG_EXCLUSIVE;
3796 __add_wait_queue_tail(&x->wait, &wait);
3797 do {
3798 if (signal_pending(current)) {
3799 ret = -ERESTARTSYS;
3800 __remove_wait_queue(&x->wait, &wait);
3801 goto out;
3802 }
3803 __set_current_state(TASK_INTERRUPTIBLE);
3804 spin_unlock_irq(&x->wait.lock);
3805 schedule();
3806 spin_lock_irq(&x->wait.lock);
3807 } while (!x->done);
3808 __remove_wait_queue(&x->wait, &wait);
3809 }
3810 x->done--;
3811out:
3812 spin_unlock_irq(&x->wait.lock); 3828 spin_unlock_irq(&x->wait.lock);
3829 return timeout;
3830}
3813 3831
3814 return ret; 3832void fastcall __sched wait_for_completion(struct completion *x)
3833{
3834 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3815} 3835}
3816EXPORT_SYMBOL(wait_for_completion_interruptible); 3836EXPORT_SYMBOL(wait_for_completion);
3817 3837
3818unsigned long fastcall __sched 3838unsigned long fastcall __sched
3819wait_for_completion_interruptible_timeout(struct completion *x, 3839wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3820 unsigned long timeout)
3821{ 3840{
3822 might_sleep(); 3841 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3823
3824 spin_lock_irq(&x->wait.lock);
3825 if (!x->done) {
3826 DECLARE_WAITQUEUE(wait, current);
3827
3828 wait.flags |= WQ_FLAG_EXCLUSIVE;
3829 __add_wait_queue_tail(&x->wait, &wait);
3830 do {
3831 if (signal_pending(current)) {
3832 timeout = -ERESTARTSYS;
3833 __remove_wait_queue(&x->wait, &wait);
3834 goto out;
3835 }
3836 __set_current_state(TASK_INTERRUPTIBLE);
3837 spin_unlock_irq(&x->wait.lock);
3838 timeout = schedule_timeout(timeout);
3839 spin_lock_irq(&x->wait.lock);
3840 if (!timeout) {
3841 __remove_wait_queue(&x->wait, &wait);
3842 goto out;
3843 }
3844 } while (!x->done);
3845 __remove_wait_queue(&x->wait, &wait);
3846 }
3847 x->done--;
3848out:
3849 spin_unlock_irq(&x->wait.lock);
3850 return timeout;
3851} 3842}
3852EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3843EXPORT_SYMBOL(wait_for_completion_timeout);
3853 3844
3854static inline void 3845int __sched wait_for_completion_interruptible(struct completion *x)
3855sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3856{ 3846{
3857 spin_lock_irqsave(&q->lock, *flags); 3847 return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3858 __add_wait_queue(q, wait);
3859 spin_unlock(&q->lock);
3860} 3848}
3849EXPORT_SYMBOL(wait_for_completion_interruptible);
3861 3850
3862static inline void 3851unsigned long fastcall __sched
3863sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3852wait_for_completion_interruptible_timeout(struct completion *x,
3853 unsigned long timeout)
3864{ 3854{
3865 spin_lock_irq(&q->lock); 3855 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3866 __remove_wait_queue(q, wait);
3867 spin_unlock_irqrestore(&q->lock, *flags);
3868} 3856}
3857EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3869 3858
3870void __sched interruptible_sleep_on(wait_queue_head_t *q) 3859static long __sched
3860sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3871{ 3861{
3872 unsigned long flags; 3862 unsigned long flags;
3873 wait_queue_t wait; 3863 wait_queue_t wait;
3874 3864
3875 init_waitqueue_entry(&wait, current); 3865 init_waitqueue_entry(&wait, current);
3876 3866
3877 current->state = TASK_INTERRUPTIBLE; 3867 __set_current_state(state);
3878 3868
3879 sleep_on_head(q, &wait, &flags); 3869 spin_lock_irqsave(&q->lock, flags);
3880 schedule(); 3870 __add_wait_queue(q, &wait);
3881 sleep_on_tail(q, &wait, &flags); 3871 spin_unlock(&q->lock);
3872 timeout = schedule_timeout(timeout);
3873 spin_lock_irq(&q->lock);
3874 __remove_wait_queue(q, &wait);
3875 spin_unlock_irqrestore(&q->lock, flags);
3876
3877 return timeout;
3878}
3879
3880void __sched interruptible_sleep_on(wait_queue_head_t *q)
3881{
3882 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3882} 3883}
3883EXPORT_SYMBOL(interruptible_sleep_on); 3884EXPORT_SYMBOL(interruptible_sleep_on);
3884 3885
3885long __sched 3886long __sched
3886interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3887interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3887{ 3888{
3888 unsigned long flags; 3889 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3889 wait_queue_t wait;
3890
3891 init_waitqueue_entry(&wait, current);
3892
3893 current->state = TASK_INTERRUPTIBLE;
3894
3895 sleep_on_head(q, &wait, &flags);
3896 timeout = schedule_timeout(timeout);
3897 sleep_on_tail(q, &wait, &flags);
3898
3899 return timeout;
3900} 3890}
3901EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3891EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3902 3892
3903void __sched sleep_on(wait_queue_head_t *q) 3893void __sched sleep_on(wait_queue_head_t *q)
3904{ 3894{
3905 unsigned long flags; 3895 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3906 wait_queue_t wait;
3907
3908 init_waitqueue_entry(&wait, current);
3909
3910 current->state = TASK_UNINTERRUPTIBLE;
3911
3912 sleep_on_head(q, &wait, &flags);
3913 schedule();
3914 sleep_on_tail(q, &wait, &flags);
3915} 3896}
3916EXPORT_SYMBOL(sleep_on); 3897EXPORT_SYMBOL(sleep_on);
3917 3898
3918long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3899long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3919{ 3900{
3920 unsigned long flags; 3901 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3921 wait_queue_t wait;
3922
3923 init_waitqueue_entry(&wait, current);
3924
3925 current->state = TASK_UNINTERRUPTIBLE;
3926
3927 sleep_on_head(q, &wait, &flags);
3928 timeout = schedule_timeout(timeout);
3929 sleep_on_tail(q, &wait, &flags);
3930
3931 return timeout;
3932} 3902}
3933EXPORT_SYMBOL(sleep_on_timeout); 3903EXPORT_SYMBOL(sleep_on_timeout);
3934 3904
@@ -3947,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
3947void rt_mutex_setprio(struct task_struct *p, int prio) 3917void rt_mutex_setprio(struct task_struct *p, int prio)
3948{ 3918{
3949 unsigned long flags; 3919 unsigned long flags;
3950 int oldprio, on_rq; 3920 int oldprio, on_rq, running;
3951 struct rq *rq; 3921 struct rq *rq;
3952 3922
3953 BUG_ON(prio < 0 || prio > MAX_PRIO); 3923 BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3957,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3957 3927
3958 oldprio = p->prio; 3928 oldprio = p->prio;
3959 on_rq = p->se.on_rq; 3929 on_rq = p->se.on_rq;
3960 if (on_rq) 3930 running = task_running(rq, p);
3931 if (on_rq) {
3961 dequeue_task(rq, p, 0); 3932 dequeue_task(rq, p, 0);
3933 if (running)
3934 p->sched_class->put_prev_task(rq, p);
3935 }
3962 3936
3963 if (rt_prio(prio)) 3937 if (rt_prio(prio))
3964 p->sched_class = &rt_sched_class; 3938 p->sched_class = &rt_sched_class;
@@ -3968,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3968 p->prio = prio; 3942 p->prio = prio;
3969 3943
3970 if (on_rq) { 3944 if (on_rq) {
3945 if (running)
3946 p->sched_class->set_curr_task(rq);
3971 enqueue_task(rq, p, 0); 3947 enqueue_task(rq, p, 0);
3972 /* 3948 /*
3973 * Reschedule if we are currently running on this runqueue and 3949 * Reschedule if we are currently running on this runqueue and
3974 * our priority decreased, or if we are not currently running on 3950 * our priority decreased, or if we are not currently running on
3975 * this runqueue and our priority is higher than the current's 3951 * this runqueue and our priority is higher than the current's
3976 */ 3952 */
3977 if (task_running(rq, p)) { 3953 if (running) {
3978 if (p->prio > oldprio) 3954 if (p->prio > oldprio)
3979 resched_task(rq->curr); 3955 resched_task(rq->curr);
3980 } else { 3956 } else {
@@ -4138,7 +4114,7 @@ struct task_struct *idle_task(int cpu)
4138 * find_process_by_pid - find a process with a matching PID value. 4114 * find_process_by_pid - find a process with a matching PID value.
4139 * @pid: the pid in question. 4115 * @pid: the pid in question.
4140 */ 4116 */
4141static inline struct task_struct *find_process_by_pid(pid_t pid) 4117static struct task_struct *find_process_by_pid(pid_t pid)
4142{ 4118{
4143 return pid ? find_task_by_pid(pid) : current; 4119 return pid ? find_task_by_pid(pid) : current;
4144} 4120}
@@ -4180,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4180int sched_setscheduler(struct task_struct *p, int policy, 4156int sched_setscheduler(struct task_struct *p, int policy,
4181 struct sched_param *param) 4157 struct sched_param *param)
4182{ 4158{
4183 int retval, oldprio, oldpolicy = -1, on_rq; 4159 int retval, oldprio, oldpolicy = -1, on_rq, running;
4184 unsigned long flags; 4160 unsigned long flags;
4185 struct rq *rq; 4161 struct rq *rq;
4186 4162
@@ -4262,18 +4238,26 @@ recheck:
4262 } 4238 }
4263 update_rq_clock(rq); 4239 update_rq_clock(rq);
4264 on_rq = p->se.on_rq; 4240 on_rq = p->se.on_rq;
4265 if (on_rq) 4241 running = task_running(rq, p);
4242 if (on_rq) {
4266 deactivate_task(rq, p, 0); 4243 deactivate_task(rq, p, 0);
4244 if (running)
4245 p->sched_class->put_prev_task(rq, p);
4246 }
4247
4267 oldprio = p->prio; 4248 oldprio = p->prio;
4268 __setscheduler(rq, p, policy, param->sched_priority); 4249 __setscheduler(rq, p, policy, param->sched_priority);
4250
4269 if (on_rq) { 4251 if (on_rq) {
4252 if (running)
4253 p->sched_class->set_curr_task(rq);
4270 activate_task(rq, p, 0); 4254 activate_task(rq, p, 0);
4271 /* 4255 /*
4272 * Reschedule if we are currently running on this runqueue and 4256 * Reschedule if we are currently running on this runqueue and
4273 * our priority decreased, or if we are not currently running on 4257 * our priority decreased, or if we are not currently running on
4274 * this runqueue and our priority is higher than the current's 4258 * this runqueue and our priority is higher than the current's
4275 */ 4259 */
4276 if (task_running(rq, p)) { 4260 if (running) {
4277 if (p->prio > oldprio) 4261 if (p->prio > oldprio)
4278 resched_task(rq->curr); 4262 resched_task(rq->curr);
4279 } else { 4263 } else {
@@ -4344,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4344asmlinkage long sys_sched_getscheduler(pid_t pid) 4328asmlinkage long sys_sched_getscheduler(pid_t pid)
4345{ 4329{
4346 struct task_struct *p; 4330 struct task_struct *p;
4347 int retval = -EINVAL; 4331 int retval;
4348 4332
4349 if (pid < 0) 4333 if (pid < 0)
4350 goto out_nounlock; 4334 return -EINVAL;
4351 4335
4352 retval = -ESRCH; 4336 retval = -ESRCH;
4353 read_lock(&tasklist_lock); 4337 read_lock(&tasklist_lock);
@@ -4358,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
4358 retval = p->policy; 4342 retval = p->policy;
4359 } 4343 }
4360 read_unlock(&tasklist_lock); 4344 read_unlock(&tasklist_lock);
4361
4362out_nounlock:
4363 return retval; 4345 return retval;
4364} 4346}
4365 4347
@@ -4372,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4372{ 4354{
4373 struct sched_param lp; 4355 struct sched_param lp;
4374 struct task_struct *p; 4356 struct task_struct *p;
4375 int retval = -EINVAL; 4357 int retval;
4376 4358
4377 if (!param || pid < 0) 4359 if (!param || pid < 0)
4378 goto out_nounlock; 4360 return -EINVAL;
4379 4361
4380 read_lock(&tasklist_lock); 4362 read_lock(&tasklist_lock);
4381 p = find_process_by_pid(pid); 4363 p = find_process_by_pid(pid);
@@ -4395,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4395 */ 4377 */
4396 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4378 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4397 4379
4398out_nounlock:
4399 return retval; 4380 return retval;
4400 4381
4401out_unlock: 4382out_unlock:
@@ -4555,8 +4536,8 @@ asmlinkage long sys_sched_yield(void)
4555{ 4536{
4556 struct rq *rq = this_rq_lock(); 4537 struct rq *rq = this_rq_lock();
4557 4538
4558 schedstat_inc(rq, yld_cnt); 4539 schedstat_inc(rq, yld_count);
4559 current->sched_class->yield_task(rq, current); 4540 current->sched_class->yield_task(rq);
4560 4541
4561 /* 4542 /*
4562 * Since we are going to call schedule() anyway, there's 4543 * Since we are going to call schedule() anyway, there's
@@ -4750,11 +4731,12 @@ asmlinkage
4750long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4731long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4751{ 4732{
4752 struct task_struct *p; 4733 struct task_struct *p;
4753 int retval = -EINVAL; 4734 unsigned int time_slice;
4735 int retval;
4754 struct timespec t; 4736 struct timespec t;
4755 4737
4756 if (pid < 0) 4738 if (pid < 0)
4757 goto out_nounlock; 4739 return -EINVAL;
4758 4740
4759 retval = -ESRCH; 4741 retval = -ESRCH;
4760 read_lock(&tasklist_lock); 4742 read_lock(&tasklist_lock);
@@ -4766,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4766 if (retval) 4748 if (retval)
4767 goto out_unlock; 4749 goto out_unlock;
4768 4750
4769 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4751 if (p->policy == SCHED_FIFO)
4770 0 : static_prio_timeslice(p->static_prio), &t); 4752 time_slice = 0;
4753 else if (p->policy == SCHED_RR)
4754 time_slice = DEF_TIMESLICE;
4755 else {
4756 struct sched_entity *se = &p->se;
4757 unsigned long flags;
4758 struct rq *rq;
4759
4760 rq = task_rq_lock(p, &flags);
4761 time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
4762 task_rq_unlock(rq, &flags);
4763 }
4771 read_unlock(&tasklist_lock); 4764 read_unlock(&tasklist_lock);
4765 jiffies_to_timespec(time_slice, &t);
4772 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4766 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4773out_nounlock:
4774 return retval; 4767 return retval;
4768
4775out_unlock: 4769out_unlock:
4776 read_unlock(&tasklist_lock); 4770 read_unlock(&tasklist_lock);
4777 return retval; 4771 return retval;
@@ -4900,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4900 */ 4894 */
4901cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4895cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4902 4896
4903/*
4904 * Increase the granularity value when there are more CPUs,
4905 * because with more CPUs the 'effective latency' as visible
4906 * to users decreases. But the relationship is not linear,
4907 * so pick a second-best guess by going with the log2 of the
4908 * number of CPUs.
4909 *
4910 * This idea comes from the SD scheduler of Con Kolivas:
4911 */
4912static inline void sched_init_granularity(void)
4913{
4914 unsigned int factor = 1 + ilog2(num_online_cpus());
4915 const unsigned long limit = 100000000;
4916
4917 sysctl_sched_min_granularity *= factor;
4918 if (sysctl_sched_min_granularity > limit)
4919 sysctl_sched_min_granularity = limit;
4920
4921 sysctl_sched_latency *= factor;
4922 if (sysctl_sched_latency > limit)
4923 sysctl_sched_latency = limit;
4924
4925 sysctl_sched_runtime_limit = sysctl_sched_latency;
4926 sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
4927}
4928
4929#ifdef CONFIG_SMP 4897#ifdef CONFIG_SMP
4930/* 4898/*
4931 * This is how migration works: 4899 * This is how migration works:
@@ -5103,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5103 struct rq *rq; 5071 struct rq *rq;
5104 int dest_cpu; 5072 int dest_cpu;
5105 5073
5106restart: 5074 do {
5107 /* On same node? */ 5075 /* On same node? */
5108 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5076 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5109 cpus_and(mask, mask, p->cpus_allowed); 5077 cpus_and(mask, mask, p->cpus_allowed);
5110 dest_cpu = any_online_cpu(mask); 5078 dest_cpu = any_online_cpu(mask);
5111 5079
5112 /* On any allowed CPU? */ 5080 /* On any allowed CPU? */
5113 if (dest_cpu == NR_CPUS) 5081 if (dest_cpu == NR_CPUS)
5114 dest_cpu = any_online_cpu(p->cpus_allowed); 5082 dest_cpu = any_online_cpu(p->cpus_allowed);
5115 5083
5116 /* No more Mr. Nice Guy. */ 5084 /* No more Mr. Nice Guy. */
5117 if (dest_cpu == NR_CPUS) { 5085 if (dest_cpu == NR_CPUS) {
5118 rq = task_rq_lock(p, &flags); 5086 rq = task_rq_lock(p, &flags);
5119 cpus_setall(p->cpus_allowed); 5087 cpus_setall(p->cpus_allowed);
5120 dest_cpu = any_online_cpu(p->cpus_allowed); 5088 dest_cpu = any_online_cpu(p->cpus_allowed);
5121 task_rq_unlock(rq, &flags); 5089 task_rq_unlock(rq, &flags);
5122 5090
5123 /* 5091 /*
5124 * Don't tell them about moving exiting tasks or 5092 * Don't tell them about moving exiting tasks or
5125 * kernel threads (both mm NULL), since they never 5093 * kernel threads (both mm NULL), since they never
5126 * leave kernel. 5094 * leave kernel.
5127 */ 5095 */
5128 if (p->mm && printk_ratelimit()) 5096 if (p->mm && printk_ratelimit())
5129 printk(KERN_INFO "process %d (%s) no " 5097 printk(KERN_INFO "process %d (%s) no "
5130 "longer affine to cpu%d\n", 5098 "longer affine to cpu%d\n",
5131 p->pid, p->comm, dead_cpu); 5099 p->pid, p->comm, dead_cpu);
5132 } 5100 }
5133 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5101 } while (!__migrate_task(p, dead_cpu, dest_cpu));
5134 goto restart;
5135} 5102}
5136 5103
5137/* 5104/*
@@ -5173,6 +5140,20 @@ static void migrate_live_tasks(int src_cpu)
5173} 5140}
5174 5141
5175/* 5142/*
5143 * activate_idle_task - move idle task to the _front_ of runqueue.
5144 */
5145static void activate_idle_task(struct task_struct *p, struct rq *rq)
5146{
5147 update_rq_clock(rq);
5148
5149 if (p->state == TASK_UNINTERRUPTIBLE)
5150 rq->nr_uninterruptible--;
5151
5152 enqueue_task(rq, p, 0);
5153 inc_nr_running(p, rq);
5154}
5155
5156/*
5176 * Schedules idle task to be the next runnable task on current CPU. 5157 * Schedules idle task to be the next runnable task on current CPU.
5177 * It does so by boosting its priority to highest possible and adding it to 5158 * It does so by boosting its priority to highest possible and adding it to
5178 * the _front_ of the runqueue. Used by CPU offline code. 5159 * the _front_ of the runqueue. Used by CPU offline code.
@@ -5284,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = {
5284static struct ctl_table *sd_alloc_ctl_entry(int n) 5265static struct ctl_table *sd_alloc_ctl_entry(int n)
5285{ 5266{
5286 struct ctl_table *entry = 5267 struct ctl_table *entry =
5287 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); 5268 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5288
5289 BUG_ON(!entry);
5290 memset(entry, 0, n * sizeof(struct ctl_table));
5291 5269
5292 return entry; 5270 return entry;
5293} 5271}
5294 5272
5273static void sd_free_ctl_entry(struct ctl_table **tablep)
5274{
5275 struct ctl_table *entry = *tablep;
5276
5277 for (entry = *tablep; entry->procname; entry++)
5278 if (entry->child)
5279 sd_free_ctl_entry(&entry->child);
5280
5281 kfree(*tablep);
5282 *tablep = NULL;
5283}
5284
5295static void 5285static void
5296set_table_entry(struct ctl_table *entry, 5286set_table_entry(struct ctl_table *entry,
5297 const char *procname, void *data, int maxlen, 5287 const char *procname, void *data, int maxlen,
@@ -5307,7 +5297,10 @@ set_table_entry(struct ctl_table *entry,
5307static struct ctl_table * 5297static struct ctl_table *
5308sd_alloc_ctl_domain_table(struct sched_domain *sd) 5298sd_alloc_ctl_domain_table(struct sched_domain *sd)
5309{ 5299{
5310 struct ctl_table *table = sd_alloc_ctl_entry(14); 5300 struct ctl_table *table = sd_alloc_ctl_entry(12);
5301
5302 if (table == NULL)
5303 return NULL;
5311 5304
5312 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5305 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5313 sizeof(long), 0644, proc_doulongvec_minmax); 5306 sizeof(long), 0644, proc_doulongvec_minmax);
@@ -5327,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5327 sizeof(int), 0644, proc_dointvec_minmax); 5320 sizeof(int), 0644, proc_dointvec_minmax);
5328 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5321 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5329 sizeof(int), 0644, proc_dointvec_minmax); 5322 sizeof(int), 0644, proc_dointvec_minmax);
5330 set_table_entry(&table[10], "cache_nice_tries", 5323 set_table_entry(&table[9], "cache_nice_tries",
5331 &sd->cache_nice_tries, 5324 &sd->cache_nice_tries,
5332 sizeof(int), 0644, proc_dointvec_minmax); 5325 sizeof(int), 0644, proc_dointvec_minmax);
5333 set_table_entry(&table[12], "flags", &sd->flags, 5326 set_table_entry(&table[10], "flags", &sd->flags,
5334 sizeof(int), 0644, proc_dointvec_minmax); 5327 sizeof(int), 0644, proc_dointvec_minmax);
5328 /* &table[11] is terminator */
5335 5329
5336 return table; 5330 return table;
5337} 5331}
@@ -5346,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5346 for_each_domain(cpu, sd) 5340 for_each_domain(cpu, sd)
5347 domain_num++; 5341 domain_num++;
5348 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5342 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5343 if (table == NULL)
5344 return NULL;
5349 5345
5350 i = 0; 5346 i = 0;
5351 for_each_domain(cpu, sd) { 5347 for_each_domain(cpu, sd) {
@@ -5360,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5360} 5356}
5361 5357
5362static struct ctl_table_header *sd_sysctl_header; 5358static struct ctl_table_header *sd_sysctl_header;
5363static void init_sched_domain_sysctl(void) 5359static void register_sched_domain_sysctl(void)
5364{ 5360{
5365 int i, cpu_num = num_online_cpus(); 5361 int i, cpu_num = num_online_cpus();
5366 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5362 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5367 char buf[32]; 5363 char buf[32];
5368 5364
5365 if (entry == NULL)
5366 return;
5367
5369 sd_ctl_dir[0].child = entry; 5368 sd_ctl_dir[0].child = entry;
5370 5369
5371 for (i = 0; i < cpu_num; i++, entry++) { 5370 for_each_online_cpu(i) {
5372 snprintf(buf, 32, "cpu%d", i); 5371 snprintf(buf, 32, "cpu%d", i);
5373 entry->procname = kstrdup(buf, GFP_KERNEL); 5372 entry->procname = kstrdup(buf, GFP_KERNEL);
5374 entry->mode = 0555; 5373 entry->mode = 0555;
5375 entry->child = sd_alloc_ctl_cpu_table(i); 5374 entry->child = sd_alloc_ctl_cpu_table(i);
5375 entry++;
5376 } 5376 }
5377 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5377 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5378} 5378}
5379
5380static void unregister_sched_domain_sysctl(void)
5381{
5382 unregister_sysctl_table(sd_sysctl_header);
5383 sd_sysctl_header = NULL;
5384 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5385}
5379#else 5386#else
5380static void init_sched_domain_sysctl(void) 5387static void register_sched_domain_sysctl(void)
5388{
5389}
5390static void unregister_sched_domain_sysctl(void)
5381{ 5391{
5382} 5392}
5383#endif 5393#endif
@@ -5499,8 +5509,7 @@ int __init migration_init(void)
5499int nr_cpu_ids __read_mostly = NR_CPUS; 5509int nr_cpu_ids __read_mostly = NR_CPUS;
5500EXPORT_SYMBOL(nr_cpu_ids); 5510EXPORT_SYMBOL(nr_cpu_ids);
5501 5511
5502#undef SCHED_DOMAIN_DEBUG 5512#ifdef CONFIG_SCHED_DEBUG
5503#ifdef SCHED_DOMAIN_DEBUG
5504static void sched_domain_debug(struct sched_domain *sd, int cpu) 5513static void sched_domain_debug(struct sched_domain *sd, int cpu)
5505{ 5514{
5506 int level = 0; 5515 int level = 0;
@@ -5558,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5558 printk("\n"); 5567 printk("\n");
5559 printk(KERN_ERR "ERROR: domain->cpu_power not " 5568 printk(KERN_ERR "ERROR: domain->cpu_power not "
5560 "set\n"); 5569 "set\n");
5570 break;
5561 } 5571 }
5562 5572
5563 if (!cpus_weight(group->cpumask)) { 5573 if (!cpus_weight(group->cpumask)) {
5564 printk("\n"); 5574 printk("\n");
5565 printk(KERN_ERR "ERROR: empty group\n"); 5575 printk(KERN_ERR "ERROR: empty group\n");
5576 break;
5566 } 5577 }
5567 5578
5568 if (cpus_intersects(groupmask, group->cpumask)) { 5579 if (cpus_intersects(groupmask, group->cpumask)) {
5569 printk("\n"); 5580 printk("\n");
5570 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5581 printk(KERN_ERR "ERROR: repeated CPUs\n");
5582 break;
5571 } 5583 }
5572 5584
5573 cpus_or(groupmask, groupmask, group->cpumask); 5585 cpus_or(groupmask, groupmask, group->cpumask);
@@ -5701,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str)
5701 return 1; 5713 return 1;
5702} 5714}
5703 5715
5704__setup ("isolcpus=", isolated_cpu_setup); 5716__setup("isolcpus=", isolated_cpu_setup);
5705 5717
5706/* 5718/*
5707 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5719 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
@@ -5930,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
5930 5942
5931 if (!sg) 5943 if (!sg)
5932 return; 5944 return;
5933next_sg: 5945 do {
5934 for_each_cpu_mask(j, sg->cpumask) { 5946 for_each_cpu_mask(j, sg->cpumask) {
5935 struct sched_domain *sd; 5947 struct sched_domain *sd;
5936 5948
5937 sd = &per_cpu(phys_domains, j); 5949 sd = &per_cpu(phys_domains, j);
5938 if (j != first_cpu(sd->groups->cpumask)) { 5950 if (j != first_cpu(sd->groups->cpumask)) {
5939 /* 5951 /*
5940 * Only add "power" once for each 5952 * Only add "power" once for each
5941 * physical package. 5953 * physical package.
5942 */ 5954 */
5943 continue; 5955 continue;
5944 } 5956 }
5945 5957
5946 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 5958 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5947 } 5959 }
5948 sg = sg->next; 5960 sg = sg->next;
5949 if (sg != group_head) 5961 } while (sg != group_head);
5950 goto next_sg;
5951} 5962}
5952#endif 5963#endif
5953 5964
@@ -6058,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6058 /* 6069 /*
6059 * Allocate the per-node list of sched groups 6070 * Allocate the per-node list of sched groups
6060 */ 6071 */
6061 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, 6072 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6062 GFP_KERNEL); 6073 GFP_KERNEL);
6063 if (!sched_group_nodes) { 6074 if (!sched_group_nodes) {
6064 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6075 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6311,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
6311 6322
6312 err = build_sched_domains(&cpu_default_map); 6323 err = build_sched_domains(&cpu_default_map);
6313 6324
6325 register_sched_domain_sysctl();
6326
6314 return err; 6327 return err;
6315} 6328}
6316 6329
@@ -6327,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6327{ 6340{
6328 int i; 6341 int i;
6329 6342
6343 unregister_sched_domain_sysctl();
6344
6330 for_each_cpu_mask(i, *cpu_map) 6345 for_each_cpu_mask(i, *cpu_map)
6331 cpu_attach_domain(NULL, i); 6346 cpu_attach_domain(NULL, i);
6332 synchronize_sched(); 6347 synchronize_sched();
@@ -6357,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6357 if (!err && !cpus_empty(*partition2)) 6372 if (!err && !cpus_empty(*partition2))
6358 err = build_sched_domains(partition2); 6373 err = build_sched_domains(partition2);
6359 6374
6375 register_sched_domain_sysctl();
6376
6360 return err; 6377 return err;
6361} 6378}
6362 6379
@@ -6488,17 +6505,13 @@ void __init sched_init_smp(void)
6488 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6505 /* XXX: Theoretical race here - CPU may be hotplugged now */
6489 hotcpu_notifier(update_sched_domains, 0); 6506 hotcpu_notifier(update_sched_domains, 0);
6490 6507
6491 init_sched_domain_sysctl();
6492
6493 /* Move init over to a non-isolated CPU */ 6508 /* Move init over to a non-isolated CPU */
6494 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6509 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6495 BUG(); 6510 BUG();
6496 sched_init_granularity();
6497} 6511}
6498#else 6512#else
6499void __init sched_init_smp(void) 6513void __init sched_init_smp(void)
6500{ 6514{
6501 sched_init_granularity();
6502} 6515}
6503#endif /* CONFIG_SMP */ 6516#endif /* CONFIG_SMP */
6504 6517
@@ -6512,28 +6525,20 @@ int in_sched_functions(unsigned long addr)
6512 && addr < (unsigned long)__sched_text_end); 6525 && addr < (unsigned long)__sched_text_end);
6513} 6526}
6514 6527
6515static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 6528static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6516{ 6529{
6517 cfs_rq->tasks_timeline = RB_ROOT; 6530 cfs_rq->tasks_timeline = RB_ROOT;
6518 cfs_rq->fair_clock = 1;
6519#ifdef CONFIG_FAIR_GROUP_SCHED 6531#ifdef CONFIG_FAIR_GROUP_SCHED
6520 cfs_rq->rq = rq; 6532 cfs_rq->rq = rq;
6521#endif 6533#endif
6534 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6522} 6535}
6523 6536
6524void __init sched_init(void) 6537void __init sched_init(void)
6525{ 6538{
6526 u64 now = sched_clock();
6527 int highest_cpu = 0; 6539 int highest_cpu = 0;
6528 int i, j; 6540 int i, j;
6529 6541
6530 /*
6531 * Link up the scheduling class hierarchy:
6532 */
6533 rt_sched_class.next = &fair_sched_class;
6534 fair_sched_class.next = &idle_sched_class;
6535 idle_sched_class.next = NULL;
6536
6537 for_each_possible_cpu(i) { 6542 for_each_possible_cpu(i) {
6538 struct rt_prio_array *array; 6543 struct rt_prio_array *array;
6539 struct rq *rq; 6544 struct rq *rq;
@@ -6546,10 +6551,28 @@ void __init sched_init(void)
6546 init_cfs_rq(&rq->cfs, rq); 6551 init_cfs_rq(&rq->cfs, rq);
6547#ifdef CONFIG_FAIR_GROUP_SCHED 6552#ifdef CONFIG_FAIR_GROUP_SCHED
6548 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6553 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6549 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 6554 {
6555 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6556 struct sched_entity *se =
6557 &per_cpu(init_sched_entity, i);
6558
6559 init_cfs_rq_p[i] = cfs_rq;
6560 init_cfs_rq(cfs_rq, rq);
6561 cfs_rq->tg = &init_task_group;
6562 list_add(&cfs_rq->leaf_cfs_rq_list,
6563 &rq->leaf_cfs_rq_list);
6564
6565 init_sched_entity_p[i] = se;
6566 se->cfs_rq = &rq->cfs;
6567 se->my_q = cfs_rq;
6568 se->load.weight = init_task_group_load;
6569 se->load.inv_weight =
6570 div64_64(1ULL<<32, init_task_group_load);
6571 se->parent = NULL;
6572 }
6573 init_task_group.shares = init_task_group_load;
6574 spin_lock_init(&init_task_group.lock);
6550#endif 6575#endif
6551 rq->ls.load_update_last = now;
6552 rq->ls.load_update_start = now;
6553 6576
6554 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6577 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6555 rq->cpu_load[j] = 0; 6578 rq->cpu_load[j] = 0;
@@ -6634,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep);
6634#endif 6657#endif
6635 6658
6636#ifdef CONFIG_MAGIC_SYSRQ 6659#ifdef CONFIG_MAGIC_SYSRQ
6660static void normalize_task(struct rq *rq, struct task_struct *p)
6661{
6662 int on_rq;
6663 update_rq_clock(rq);
6664 on_rq = p->se.on_rq;
6665 if (on_rq)
6666 deactivate_task(rq, p, 0);
6667 __setscheduler(rq, p, SCHED_NORMAL, 0);
6668 if (on_rq) {
6669 activate_task(rq, p, 0);
6670 resched_task(rq->curr);
6671 }
6672}
6673
6637void normalize_rt_tasks(void) 6674void normalize_rt_tasks(void)
6638{ 6675{
6639 struct task_struct *g, *p; 6676 struct task_struct *g, *p;
6640 unsigned long flags; 6677 unsigned long flags;
6641 struct rq *rq; 6678 struct rq *rq;
6642 int on_rq;
6643 6679
6644 read_lock_irq(&tasklist_lock); 6680 read_lock_irq(&tasklist_lock);
6645 do_each_thread(g, p) { 6681 do_each_thread(g, p) {
6646 p->se.fair_key = 0; 6682 /*
6647 p->se.wait_runtime = 0; 6683 * Only normalize user tasks:
6684 */
6685 if (!p->mm)
6686 continue;
6687
6648 p->se.exec_start = 0; 6688 p->se.exec_start = 0;
6649 p->se.wait_start_fair = 0;
6650 p->se.sleep_start_fair = 0;
6651#ifdef CONFIG_SCHEDSTATS 6689#ifdef CONFIG_SCHEDSTATS
6652 p->se.wait_start = 0; 6690 p->se.wait_start = 0;
6653 p->se.sleep_start = 0; 6691 p->se.sleep_start = 0;
6654 p->se.block_start = 0; 6692 p->se.block_start = 0;
6655#endif 6693#endif
6656 task_rq(p)->cfs.fair_clock = 0;
6657 task_rq(p)->clock = 0; 6694 task_rq(p)->clock = 0;
6658 6695
6659 if (!rt_task(p)) { 6696 if (!rt_task(p)) {
@@ -6668,26 +6705,9 @@ void normalize_rt_tasks(void)
6668 6705
6669 spin_lock_irqsave(&p->pi_lock, flags); 6706 spin_lock_irqsave(&p->pi_lock, flags);
6670 rq = __task_rq_lock(p); 6707 rq = __task_rq_lock(p);
6671#ifdef CONFIG_SMP
6672 /*
6673 * Do not touch the migration thread:
6674 */
6675 if (p == rq->migration_thread)
6676 goto out_unlock;
6677#endif
6678 6708
6679 update_rq_clock(rq); 6709 normalize_task(rq, p);
6680 on_rq = p->se.on_rq; 6710
6681 if (on_rq)
6682 deactivate_task(rq, p, 0);
6683 __setscheduler(rq, p, SCHED_NORMAL, 0);
6684 if (on_rq) {
6685 activate_task(rq, p, 0);
6686 resched_task(rq->curr);
6687 }
6688#ifdef CONFIG_SMP
6689 out_unlock:
6690#endif
6691 __task_rq_unlock(rq); 6711 __task_rq_unlock(rq);
6692 spin_unlock_irqrestore(&p->pi_lock, flags); 6712 spin_unlock_irqrestore(&p->pi_lock, flags);
6693 } while_each_thread(g, p); 6713 } while_each_thread(g, p);
@@ -6740,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p)
6740} 6760}
6741 6761
6742#endif 6762#endif
6763
6764#ifdef CONFIG_FAIR_GROUP_SCHED
6765
6766/* allocate runqueue etc for a new task group */
6767struct task_group *sched_create_group(void)
6768{
6769 struct task_group *tg;
6770 struct cfs_rq *cfs_rq;
6771 struct sched_entity *se;
6772 struct rq *rq;
6773 int i;
6774
6775 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6776 if (!tg)
6777 return ERR_PTR(-ENOMEM);
6778
6779 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
6780 if (!tg->cfs_rq)
6781 goto err;
6782 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6783 if (!tg->se)
6784 goto err;
6785
6786 for_each_possible_cpu(i) {
6787 rq = cpu_rq(i);
6788
6789 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
6790 cpu_to_node(i));
6791 if (!cfs_rq)
6792 goto err;
6793
6794 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
6795 cpu_to_node(i));
6796 if (!se)
6797 goto err;
6798
6799 memset(cfs_rq, 0, sizeof(struct cfs_rq));
6800 memset(se, 0, sizeof(struct sched_entity));
6801
6802 tg->cfs_rq[i] = cfs_rq;
6803 init_cfs_rq(cfs_rq, rq);
6804 cfs_rq->tg = tg;
6805
6806 tg->se[i] = se;
6807 se->cfs_rq = &rq->cfs;
6808 se->my_q = cfs_rq;
6809 se->load.weight = NICE_0_LOAD;
6810 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
6811 se->parent = NULL;
6812 }
6813
6814 for_each_possible_cpu(i) {
6815 rq = cpu_rq(i);
6816 cfs_rq = tg->cfs_rq[i];
6817 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6818 }
6819
6820 tg->shares = NICE_0_LOAD;
6821 spin_lock_init(&tg->lock);
6822
6823 return tg;
6824
6825err:
6826 for_each_possible_cpu(i) {
6827 if (tg->cfs_rq)
6828 kfree(tg->cfs_rq[i]);
6829 if (tg->se)
6830 kfree(tg->se[i]);
6831 }
6832 kfree(tg->cfs_rq);
6833 kfree(tg->se);
6834 kfree(tg);
6835
6836 return ERR_PTR(-ENOMEM);
6837}
6838
6839/* rcu callback to free various structures associated with a task group */
6840static void free_sched_group(struct rcu_head *rhp)
6841{
6842 struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu);
6843 struct task_group *tg = cfs_rq->tg;
6844 struct sched_entity *se;
6845 int i;
6846
6847 /* now it should be safe to free those cfs_rqs */
6848 for_each_possible_cpu(i) {
6849 cfs_rq = tg->cfs_rq[i];
6850 kfree(cfs_rq);
6851
6852 se = tg->se[i];
6853 kfree(se);
6854 }
6855
6856 kfree(tg->cfs_rq);
6857 kfree(tg->se);
6858 kfree(tg);
6859}
6860
6861/* Destroy runqueue etc associated with a task group */
6862void sched_destroy_group(struct task_group *tg)
6863{
6864 struct cfs_rq *cfs_rq;
6865 int i;
6866
6867 for_each_possible_cpu(i) {
6868 cfs_rq = tg->cfs_rq[i];
6869 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
6870 }
6871
6872 cfs_rq = tg->cfs_rq[0];
6873
6874 /* wait for possible concurrent references to cfs_rqs complete */
6875 call_rcu(&cfs_rq->rcu, free_sched_group);
6876}
6877
6878/* change task's runqueue when it moves between groups.
6879 * The caller of this function should have put the task in its new group
6880 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
6881 * reflect its new group.
6882 */
6883void sched_move_task(struct task_struct *tsk)
6884{
6885 int on_rq, running;
6886 unsigned long flags;
6887 struct rq *rq;
6888
6889 rq = task_rq_lock(tsk, &flags);
6890
6891 if (tsk->sched_class != &fair_sched_class)
6892 goto done;
6893
6894 update_rq_clock(rq);
6895
6896 running = task_running(rq, tsk);
6897 on_rq = tsk->se.on_rq;
6898
6899 if (on_rq) {
6900 dequeue_task(rq, tsk, 0);
6901 if (unlikely(running))
6902 tsk->sched_class->put_prev_task(rq, tsk);
6903 }
6904
6905 set_task_cfs_rq(tsk);
6906
6907 if (on_rq) {
6908 if (unlikely(running))
6909 tsk->sched_class->set_curr_task(rq);
6910 enqueue_task(rq, tsk, 0);
6911 }
6912
6913done:
6914 task_rq_unlock(rq, &flags);
6915}
6916
6917static void set_se_shares(struct sched_entity *se, unsigned long shares)
6918{
6919 struct cfs_rq *cfs_rq = se->cfs_rq;
6920 struct rq *rq = cfs_rq->rq;
6921 int on_rq;
6922
6923 spin_lock_irq(&rq->lock);
6924
6925 on_rq = se->on_rq;
6926 if (on_rq)
6927 dequeue_entity(cfs_rq, se, 0);
6928
6929 se->load.weight = shares;
6930 se->load.inv_weight = div64_64((1ULL<<32), shares);
6931
6932 if (on_rq)
6933 enqueue_entity(cfs_rq, se, 0);
6934
6935 spin_unlock_irq(&rq->lock);
6936}
6937
6938int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6939{
6940 int i;
6941
6942 spin_lock(&tg->lock);
6943 if (tg->shares == shares)
6944 goto done;
6945
6946 tg->shares = shares;
6947 for_each_possible_cpu(i)
6948 set_se_shares(tg->se[i], shares);
6949
6950done:
6951 spin_unlock(&tg->lock);
6952 return 0;
6953}
6954
6955unsigned long sched_group_shares(struct task_group *tg)
6956{
6957 return tg->shares;
6958}
6959
6960#endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index c3ee38bd3426..a5e517ec07c3 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -28,6 +28,31 @@
28 printk(x); \ 28 printk(x); \
29 } while (0) 29 } while (0)
30 30
/*
 * Ease the printing of nsec fields:
 *
 * nsec_high() returns the signed whole part of @nsec divided by 1000000,
 * i.e. the part printed in front of the decimal point.
 */
static long long nsec_high(long long nsec)
{
	/*
	 * do_div() is applied to an unsigned 64-bit copy rather than to the
	 * signed argument.  Negating in unsigned arithmetic also keeps the
	 * most negative value well defined.
	 */
	unsigned long long whole = nsec;

	if (nsec < 0) {
		whole = -whole;
		do_div(whole, 1000000);
		return -(long long)whole;
	}
	do_div(whole, 1000000);

	return whole;
}
45
/*
 * nsec_low() returns the remainder of |@nsec| divided by 1000000, i.e. the
 * six digits printed after the decimal point.
 */
static unsigned long nsec_low(long long nsec)
{
	/* Work on an unsigned copy: see the matching note in nsec_high(). */
	unsigned long long magnitude = nsec;

	if (nsec < 0)
		magnitude = -magnitude;

	/* do_div() leaves the quotient in place and returns the remainder. */
	return do_div(magnitude, 1000000);
}

/*
 * Expands to TWO printf arguments (whole part, remainder) and evaluates
 * x twice, so x must be free of side effects.
 */
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55
31static void 56static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
33{ 58{
@@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
36 else 61 else
37 SEQ_printf(m, " "); 62 SEQ_printf(m, " ");
38 63
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", 64 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
40 p->comm, p->pid, 65 p->comm, p->pid,
41 (long long)p->se.fair_key, 66 SPLIT_NS(p->se.vruntime),
42 (long long)(p->se.fair_key - rq->cfs.fair_clock),
43 (long long)p->se.wait_runtime,
44 (long long)(p->nvcsw + p->nivcsw), 67 (long long)(p->nvcsw + p->nivcsw),
45 p->prio); 68 p->prio);
46#ifdef CONFIG_SCHEDSTATS 69#ifdef CONFIG_SCHEDSTATS
47 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n",
48 (long long)p->se.sum_exec_runtime, 71 SPLIT_NS(p->se.vruntime),
49 (long long)p->se.sum_wait_runtime, 72 SPLIT_NS(p->se.sum_exec_runtime),
50 (long long)p->se.sum_sleep_runtime, 73 SPLIT_NS(p->se.sum_sleep_runtime));
51 (long long)p->se.wait_runtime_overruns,
52 (long long)p->se.wait_runtime_underruns);
53#else 74#else
54 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n",
55 0LL, 0LL, 0LL, 0LL, 0LL); 76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
56#endif 77#endif
57} 78}
58 79
@@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
62 83
63 SEQ_printf(m, 84 SEQ_printf(m,
64 "\nrunnable tasks:\n" 85 "\nrunnable tasks:\n"
65 " task PID tree-key delta waiting" 86 " task PID tree-key switches prio"
66 " switches prio" 87 " exec-runtime sum-exec sum-sleep\n"
67 " sum-exec sum-wait sum-sleep" 88 "------------------------------------------------------"
68 " wait-overrun wait-underrun\n" 89 "----------------------------------------------------\n");
69 "------------------------------------------------------------------"
70 "----------------"
71 "------------------------------------------------"
72 "--------------------------------\n");
73 90
74 read_lock_irq(&tasklist_lock); 91 read_lock_irq(&tasklist_lock);
75 92
@@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
83 read_unlock_irq(&tasklist_lock); 100 read_unlock_irq(&tasklist_lock);
84} 101}
85 102
86static void 103void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
87print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
88{ 104{
89 s64 wait_runtime_rq_sum = 0; 105 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
90 struct task_struct *p; 106 spread, rq0_min_vruntime, spread0;
91 struct rb_node *curr;
92 unsigned long flags;
93 struct rq *rq = &per_cpu(runqueues, cpu); 107 struct rq *rq = &per_cpu(runqueues, cpu);
108 struct sched_entity *last;
109 unsigned long flags;
94 110
95 spin_lock_irqsave(&rq->lock, flags);
96 curr = first_fair(cfs_rq);
97 while (curr) {
98 p = rb_entry(curr, struct task_struct, se.run_node);
99 wait_runtime_rq_sum += p->se.wait_runtime;
100
101 curr = rb_next(curr);
102 }
103 spin_unlock_irqrestore(&rq->lock, flags);
104
105 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
106 (long long)wait_runtime_rq_sum);
107}
108
109void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
110{
111 SEQ_printf(m, "\ncfs_rq\n"); 111 SEQ_printf(m, "\ncfs_rq\n");
112 112
113#define P(x) \ 113 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
114 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) 114 SPLIT_NS(cfs_rq->exec_clock));
115
116 P(fair_clock);
117 P(exec_clock);
118 P(wait_runtime);
119 P(wait_runtime_overruns);
120 P(wait_runtime_underruns);
121 P(sleeper_bonus);
122#undef P
123 115
124 print_cfs_rq_runtime_sum(m, cpu, cfs_rq); 116 spin_lock_irqsave(&rq->lock, flags);
117 if (cfs_rq->rb_leftmost)
118 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
119 last = __pick_last_entity(cfs_rq);
120 if (last)
121 max_vruntime = last->vruntime;
122 min_vruntime = rq->cfs.min_vruntime;
123 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
124 spin_unlock_irqrestore(&rq->lock, flags);
125 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
126 SPLIT_NS(MIN_vruntime));
127 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
128 SPLIT_NS(min_vruntime));
129 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
130 SPLIT_NS(max_vruntime));
131 spread = max_vruntime - MIN_vruntime;
132 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
133 SPLIT_NS(spread));
134 spread0 = min_vruntime - rq0_min_vruntime;
135 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
136 SPLIT_NS(spread0));
137 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
138 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
139#ifdef CONFIG_SCHEDSTATS
140 SEQ_printf(m, " .%-30s: %ld\n", "bkl_count",
141 rq->bkl_count);
142#endif
143 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
144 cfs_rq->nr_spread_over);
125} 145}
126 146
127static void print_cpu(struct seq_file *m, int cpu) 147static void print_cpu(struct seq_file *m, int cpu)
@@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu)
141 161
142#define P(x) \ 162#define P(x) \
143 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 163 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
164#define PN(x) \
165 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
144 166
145 P(nr_running); 167 P(nr_running);
146 SEQ_printf(m, " .%-30s: %lu\n", "load", 168 SEQ_printf(m, " .%-30s: %lu\n", "load",
147 rq->ls.load.weight); 169 rq->load.weight);
148 P(ls.delta_fair);
149 P(ls.delta_exec);
150 P(nr_switches); 170 P(nr_switches);
151 P(nr_load_updates); 171 P(nr_load_updates);
152 P(nr_uninterruptible); 172 P(nr_uninterruptible);
153 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); 173 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
154 P(next_balance); 174 PN(next_balance);
155 P(curr->pid); 175 P(curr->pid);
156 P(clock); 176 PN(clock);
157 P(idle_clock); 177 PN(idle_clock);
158 P(prev_clock_raw); 178 PN(prev_clock_raw);
159 P(clock_warps); 179 P(clock_warps);
160 P(clock_overflows); 180 P(clock_overflows);
161 P(clock_deep_idle_events); 181 P(clock_deep_idle_events);
162 P(clock_max_delta); 182 PN(clock_max_delta);
163 P(cpu_load[0]); 183 P(cpu_load[0]);
164 P(cpu_load[1]); 184 P(cpu_load[1]);
165 P(cpu_load[2]); 185 P(cpu_load[2]);
166 P(cpu_load[3]); 186 P(cpu_load[3]);
167 P(cpu_load[4]); 187 P(cpu_load[4]);
168#undef P 188#undef P
189#undef PN
169 190
170 print_cfs_stats(m, cpu); 191 print_cfs_stats(m, cpu);
171 192
@@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v)
177 u64 now = ktime_to_ns(ktime_get()); 198 u64 now = ktime_to_ns(ktime_get());
178 int cpu; 199 int cpu;
179 200
180 SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", 201 SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n",
181 init_utsname()->release, 202 init_utsname()->release,
182 (int)strcspn(init_utsname()->version, " "), 203 (int)strcspn(init_utsname()->version, " "),
183 init_utsname()->version); 204 init_utsname()->version);
184 205
185 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); 206 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
207
208#define P(x) \
209 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
210#define PN(x) \
211 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
212 PN(sysctl_sched_latency);
213 PN(sysctl_sched_nr_latency);
214 PN(sysctl_sched_wakeup_granularity);
215 PN(sysctl_sched_batch_wakeup_granularity);
216 PN(sysctl_sched_child_runs_first);
217 P(sysctl_sched_features);
218#undef PN
219#undef P
186 220
187 for_each_online_cpu(cpu) 221 for_each_online_cpu(cpu)
188 print_cpu(m, cpu); 222 print_cpu(m, cpu);
@@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp)
202 return single_open(filp, sched_debug_show, NULL); 236 return single_open(filp, sched_debug_show, NULL);
203} 237}
204 238
205static struct file_operations sched_debug_fops = { 239static const struct file_operations sched_debug_fops = {
206 .open = sched_debug_open, 240 .open = sched_debug_open,
207 .read = seq_read, 241 .read = seq_read,
208 .llseek = seq_lseek, 242 .llseek = seq_lseek,
@@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs);
226 260
227void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 261void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
228{ 262{
263 unsigned long nr_switches;
229 unsigned long flags; 264 unsigned long flags;
230 int num_threads = 1; 265 int num_threads = 1;
231 266
@@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
237 rcu_read_unlock(); 272 rcu_read_unlock();
238 273
239 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 274 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
240 SEQ_printf(m, "----------------------------------------------\n"); 275 SEQ_printf(m,
276 "---------------------------------------------------------\n");
277#define __P(F) \
278 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
241#define P(F) \ 279#define P(F) \
242 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) 280 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
281#define __PN(F) \
282 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
283#define PN(F) \
284 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
243 285
244 P(se.wait_runtime); 286 PN(se.exec_start);
245 P(se.wait_start_fair); 287 PN(se.vruntime);
246 P(se.exec_start); 288 PN(se.sum_exec_runtime);
247 P(se.sleep_start_fair); 289
248 P(se.sum_exec_runtime); 290 nr_switches = p->nvcsw + p->nivcsw;
249 291
250#ifdef CONFIG_SCHEDSTATS 292#ifdef CONFIG_SCHEDSTATS
251 P(se.wait_start); 293 PN(se.wait_start);
252 P(se.sleep_start); 294 PN(se.sleep_start);
253 P(se.block_start); 295 PN(se.block_start);
254 P(se.sleep_max); 296 PN(se.sleep_max);
255 P(se.block_max); 297 PN(se.block_max);
256 P(se.exec_max); 298 PN(se.exec_max);
257 P(se.wait_max); 299 PN(se.slice_max);
258 P(se.wait_runtime_overruns); 300 PN(se.wait_max);
259 P(se.wait_runtime_underruns); 301 P(sched_info.bkl_count);
260 P(se.sum_wait_runtime); 302 P(se.nr_migrations);
303 P(se.nr_migrations_cold);
304 P(se.nr_failed_migrations_affine);
305 P(se.nr_failed_migrations_running);
306 P(se.nr_failed_migrations_hot);
307 P(se.nr_forced_migrations);
308 P(se.nr_forced2_migrations);
309 P(se.nr_wakeups);
310 P(se.nr_wakeups_sync);
311 P(se.nr_wakeups_migrate);
312 P(se.nr_wakeups_local);
313 P(se.nr_wakeups_remote);
314 P(se.nr_wakeups_affine);
315 P(se.nr_wakeups_affine_attempts);
316 P(se.nr_wakeups_passive);
317 P(se.nr_wakeups_idle);
318
319 {
320 u64 avg_atom, avg_per_cpu;
321
322 avg_atom = p->se.sum_exec_runtime;
323 if (nr_switches)
324 do_div(avg_atom, nr_switches);
325 else
326 avg_atom = -1LL;
327
328 avg_per_cpu = p->se.sum_exec_runtime;
329 if (p->se.nr_migrations)
330 avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
331 else
332 avg_per_cpu = -1LL;
333
334 __PN(avg_atom);
335 __PN(avg_per_cpu);
336 }
261#endif 337#endif
262 SEQ_printf(m, "%-25s:%20Ld\n", 338 __P(nr_switches);
263 "nr_switches", (long long)(p->nvcsw + p->nivcsw)); 339 SEQ_printf(m, "%-35s:%21Ld\n",
340 "nr_voluntary_switches", (long long)p->nvcsw);
341 SEQ_printf(m, "%-35s:%21Ld\n",
342 "nr_involuntary_switches", (long long)p->nivcsw);
343
264 P(se.load.weight); 344 P(se.load.weight);
265 P(policy); 345 P(policy);
266 P(prio); 346 P(prio);
347#undef PN
348#undef __PN
267#undef P 349#undef P
350#undef __P
268 351
269 { 352 {
270 u64 t0, t1; 353 u64 t0, t1;
271 354
272 t0 = sched_clock(); 355 t0 = sched_clock();
273 t1 = sched_clock(); 356 t1 = sched_clock();
274 SEQ_printf(m, "%-25s:%20Ld\n", 357 SEQ_printf(m, "%-35s:%21Ld\n",
275 "clock-delta", (long long)(t1-t0)); 358 "clock-delta", (long long)(t1-t0));
276 } 359 }
277} 360}
@@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
279void proc_sched_set_task(struct task_struct *p) 362void proc_sched_set_task(struct task_struct *p)
280{ 363{
281#ifdef CONFIG_SCHEDSTATS 364#ifdef CONFIG_SCHEDSTATS
282 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; 365 p->se.wait_max = 0;
283 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; 366 p->se.sleep_max = 0;
367 p->se.sum_sleep_runtime = 0;
368 p->se.block_max = 0;
369 p->se.exec_max = 0;
370 p->se.slice_max = 0;
371 p->se.nr_migrations = 0;
372 p->se.nr_migrations_cold = 0;
373 p->se.nr_failed_migrations_affine = 0;
374 p->se.nr_failed_migrations_running = 0;
375 p->se.nr_failed_migrations_hot = 0;
376 p->se.nr_forced_migrations = 0;
377 p->se.nr_forced2_migrations = 0;
378 p->se.nr_wakeups = 0;
379 p->se.nr_wakeups_sync = 0;
380 p->se.nr_wakeups_migrate = 0;
381 p->se.nr_wakeups_local = 0;
382 p->se.nr_wakeups_remote = 0;
383 p->se.nr_wakeups_affine = 0;
384 p->se.nr_wakeups_affine_attempts = 0;
385 p->se.nr_wakeups_passive = 0;
386 p->se.nr_wakeups_idle = 0;
387 p->sched_info.bkl_count = 0;
284#endif 388#endif
285 p->se.sum_exec_runtime = 0; 389 p->se.sum_exec_runtime = 0;
286 p->se.prev_sum_exec_runtime = 0; 390 p->se.prev_sum_exec_runtime = 0;
391 p->nvcsw = 0;
392 p->nivcsw = 0;
287} 393}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 67c67a87146e..a17b785d7000 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,22 +25,26 @@
25 * (default: 20ms, units: nanoseconds) 25 * (default: 20ms, units: nanoseconds)
26 * 26 *
27 * NOTE: this latency value is not the same as the concept of 27 * NOTE: this latency value is not the same as the concept of
28 * 'timeslice length' - timeslices in CFS are of variable length. 28 * 'timeslice length' - timeslices in CFS are of variable length
29 * (to see the precise effective timeslice length of your workload, 29 * and have no persistent notion like in traditional, time-slice
30 * run vmstat and monitor the context-switches field) 30 * based scheduling concepts.
31 * 31 *
32 * On SMP systems the value of this is multiplied by the log2 of the 32 * (to see the precise effective timeslice length of your workload,
33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 33 * run vmstat and monitor the context-switches (cs) field)
34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
35 * Targeted preemption latency for CPU-bound tasks:
36 */ 34 */
37unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; 35const_debug unsigned int sysctl_sched_latency = 20000000ULL;
36
37/*
38 * After fork, child runs first. (default) If set to 0 then
39 * parent will (try to) run first.
40 */
41const_debug unsigned int sysctl_sched_child_runs_first = 1;
38 42
39/* 43/*
40 * Minimal preemption granularity for CPU-bound tasks: 44 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 2 msec, units: nanoseconds) 45 * (default: 2 msec, units: nanoseconds)
42 */ 46 */
43unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; 47const_debug unsigned int sysctl_sched_nr_latency = 20;
44 48
45/* 49/*
46 * sys_sched_yield() compat mode 50 * sys_sched_yield() compat mode
@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
52 56
53/* 57/*
54 * SCHED_BATCH wake-up granularity. 58 * SCHED_BATCH wake-up granularity.
55 * (default: 25 msec, units: nanoseconds) 59 * (default: 10 msec, units: nanoseconds)
56 * 60 *
57 * This option delays the preemption effects of decoupled workloads 61 * This option delays the preemption effects of decoupled workloads
58 * and reduces their over-scheduling. Synchronous workloads will still 62 * and reduces their over-scheduling. Synchronous workloads will still
59 * have immediate wakeup/sleep latencies. 63 * have immediate wakeup/sleep latencies.
60 */ 64 */
61unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; 65const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
62 66
63/* 67/*
64 * SCHED_OTHER wake-up granularity. 68 * SCHED_OTHER wake-up granularity.
65 * (default: 1 msec, units: nanoseconds) 69 * (default: 10 msec, units: nanoseconds)
66 * 70 *
67 * This option delays the preemption effects of decoupled workloads 71 * This option delays the preemption effects of decoupled workloads
68 * and reduces their over-scheduling. Synchronous workloads will still 72 * and reduces their over-scheduling. Synchronous workloads will still
69 * have immediate wakeup/sleep latencies. 73 * have immediate wakeup/sleep latencies.
70 */ 74 */
71unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; 75const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
72
73unsigned int sysctl_sched_stat_granularity __read_mostly;
74
75/*
76 * Initialized in sched_init_granularity() [to 5 times the base granularity]:
77 */
78unsigned int sysctl_sched_runtime_limit __read_mostly;
79
80/*
81 * Debugging: various feature bits
82 */
83enum {
84 SCHED_FEAT_FAIR_SLEEPERS = 1,
85 SCHED_FEAT_SLEEPER_AVG = 2,
86 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
87 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
88 SCHED_FEAT_START_DEBIT = 16,
89 SCHED_FEAT_SKIP_INITIAL = 32,
90};
91 76
92unsigned int sysctl_sched_features __read_mostly = 77const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
93 SCHED_FEAT_FAIR_SLEEPERS *1 |
94 SCHED_FEAT_SLEEPER_AVG *0 |
95 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
96 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
97 SCHED_FEAT_START_DEBIT *1 |
98 SCHED_FEAT_SKIP_INITIAL *0;
99
100extern struct sched_class fair_sched_class;
101 78
102/************************************************************** 79/**************************************************************
103 * CFS operations on generic schedulable entities: 80 * CFS operations on generic schedulable entities:
@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
111 return cfs_rq->rq; 88 return cfs_rq->rq;
112} 89}
113 90
114/* currently running entity (if any) on this cfs_rq */
115static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
116{
117 return cfs_rq->curr;
118}
119
120/* An entity is a task if it doesn't "own" a runqueue */ 91/* An entity is a task if it doesn't "own" a runqueue */
121#define entity_is_task(se) (!se->my_q) 92#define entity_is_task(se) (!se->my_q)
122 93
123static inline void
124set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
125{
126 cfs_rq->curr = se;
127}
128
129#else /* CONFIG_FAIR_GROUP_SCHED */ 94#else /* CONFIG_FAIR_GROUP_SCHED */
130 95
131static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 96static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
133 return container_of(cfs_rq, struct rq, cfs); 98 return container_of(cfs_rq, struct rq, cfs);
134} 99}
135 100
136static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
137{
138 struct rq *rq = rq_of(cfs_rq);
139
140 if (unlikely(rq->curr->sched_class != &fair_sched_class))
141 return NULL;
142
143 return &rq->curr->se;
144}
145
146#define entity_is_task(se) 1 101#define entity_is_task(se) 1
147 102
148static inline void
149set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
150
151#endif /* CONFIG_FAIR_GROUP_SCHED */ 103#endif /* CONFIG_FAIR_GROUP_SCHED */
152 104
153static inline struct task_struct *task_of(struct sched_entity *se) 105static inline struct task_struct *task_of(struct sched_entity *se)
@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se)
160 * Scheduling class tree data structure manipulation methods: 112 * Scheduling class tree data structure manipulation methods:
161 */ 113 */
162 114
115static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
116{
117 s64 delta = (s64)(vruntime - min_vruntime);
118 if (delta > 0)
119 min_vruntime = vruntime;
120
121 return min_vruntime;
122}
123
124static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
125{
126 s64 delta = (s64)(vruntime - min_vruntime);
127 if (delta < 0)
128 min_vruntime = vruntime;
129
130 return min_vruntime;
131}
132
133static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
134{
135 return se->vruntime - cfs_rq->min_vruntime;
136}
137
163/* 138/*
164 * Enqueue an entity into the rb-tree: 139 * Enqueue an entity into the rb-tree:
165 */ 140 */
166static inline void 141static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
167__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
168{ 142{
169 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 143 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
170 struct rb_node *parent = NULL; 144 struct rb_node *parent = NULL;
171 struct sched_entity *entry; 145 struct sched_entity *entry;
172 s64 key = se->fair_key; 146 s64 key = entity_key(cfs_rq, se);
173 int leftmost = 1; 147 int leftmost = 1;
174 148
175 /* 149 /*
@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
182 * We don't care about collisions. Nodes with 156 * We don't care about collisions. Nodes with
183 * the same key stay together. 157 * the same key stay together.
184 */ 158 */
185 if (key - entry->fair_key < 0) { 159 if (key < entity_key(cfs_rq, entry)) {
186 link = &parent->rb_left; 160 link = &parent->rb_left;
187 } else { 161 } else {
188 link = &parent->rb_right; 162 link = &parent->rb_right;
@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
199 173
200 rb_link_node(&se->run_node, parent, link); 174 rb_link_node(&se->run_node, parent, link);
201 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 175 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
202 update_load_add(&cfs_rq->load, se->load.weight);
203 cfs_rq->nr_running++;
204 se->on_rq = 1;
205
206 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
207} 176}
208 177
209static inline void 178static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
210__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
211{ 179{
212 if (cfs_rq->rb_leftmost == &se->run_node) 180 if (cfs_rq->rb_leftmost == &se->run_node)
213 cfs_rq->rb_leftmost = rb_next(&se->run_node); 181 cfs_rq->rb_leftmost = rb_next(&se->run_node);
214 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
215 update_load_sub(&cfs_rq->load, se->load.weight);
216 cfs_rq->nr_running--;
217 se->on_rq = 0;
218 182
219 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 183 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
220} 184}
221 185
222static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 186static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
229 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 193 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
230} 194}
231 195
196static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
197{
198 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
199 struct sched_entity *se = NULL;
200 struct rb_node *parent;
201
202 while (*link) {
203 parent = *link;
204 se = rb_entry(parent, struct sched_entity, run_node);
205 link = &parent->rb_right;
206 }
207
208 return se;
209}
210
232/************************************************************** 211/**************************************************************
233 * Scheduling class statistics methods: 212 * Scheduling class statistics methods:
234 */ 213 */
235 214
215
236/* 216/*
237 * Calculate the preemption granularity needed to schedule every 217 * The idea is to set a period in which each task runs once.
238 * runnable task once per sysctl_sched_latency amount of time.
239 * (down to a sensible low limit on granularity)
240 *
241 * For example, if there are 2 tasks running and latency is 10 msecs,
242 * we switch tasks every 5 msecs. If we have 3 tasks running, we have
243 * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
244 * for each task. We do finer and finer scheduling up to until we
245 * reach the minimum granularity value.
246 *
247 * To achieve this we use the following dynamic-granularity rule:
248 * 218 *
249 * gran = lat/nr - lat/nr/nr 219 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
220 * this period because otherwise the slices get too small.
250 * 221 *
251 * This comes out of the following equations: 222 * p = (nr <= nl) ? l : l*nr/nl
252 *
253 * kA1 + gran = kB1
254 * kB2 + gran = kA2
255 * kA2 = kA1
256 * kB2 = kB1 - d + d/nr
257 * lat = d * nr
258 *
259 * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
260 * '1' is start of time, '2' is end of time, 'd' is delay between
261 * 1 and 2 (during which task B was running), 'nr' is number of tasks
262 * running, 'lat' is the period of each task. ('lat' is the
263 * sched_latency that we aim for.)
264 */ 223 */
265static long 224static u64 __sched_period(unsigned long nr_running)
266sched_granularity(struct cfs_rq *cfs_rq)
267{ 225{
268 unsigned int gran = sysctl_sched_latency; 226 u64 period = sysctl_sched_latency;
269 unsigned int nr = cfs_rq->nr_running; 227 unsigned long nr_latency = sysctl_sched_nr_latency;
270 228
271 if (nr > 1) { 229 if (unlikely(nr_running > nr_latency)) {
272 gran = gran/nr - gran/nr/nr; 230 period *= nr_running;
273 gran = max(gran, sysctl_sched_min_granularity); 231 do_div(period, nr_latency);
274 } 232 }
275 233
276 return gran; 234 return period;
277} 235}
278 236
279/* 237/*
280 * We rescale the rescheduling granularity of tasks according to their 238 * We calculate the wall-time slice from the period by taking a part
281 * nice level, but only linearly, not exponentially: 239 * proportional to the weight.
240 *
241 * s = p*w/rw
282 */ 242 */
283static long 243static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
284niced_granularity(struct sched_entity *curr, unsigned long granularity)
285{ 244{
286 u64 tmp; 245 u64 slice = __sched_period(cfs_rq->nr_running);
287 246
288 if (likely(curr->load.weight == NICE_0_LOAD)) 247 slice *= se->load.weight;
289 return granularity; 248 do_div(slice, cfs_rq->load.weight);
290 /*
291 * Positive nice levels get the same granularity as nice-0:
292 */
293 if (likely(curr->load.weight < NICE_0_LOAD)) {
294 tmp = curr->load.weight * (u64)granularity;
295 return (long) (tmp >> NICE_0_SHIFT);
296 }
297 /*
298 * Negative nice level tasks get linearly finer
299 * granularity:
300 */
301 tmp = curr->load.inv_weight * (u64)granularity;
302 249
303 /* 250 return slice;
304 * It will always fit into 'long':
305 */
306 return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
307} 251}
308 252
309static inline void 253/*
310limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) 254 * We calculate the vruntime slice.
255 *
256 * vs = s/w = p/rw
257 */
258static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
311{ 259{
312 long limit = sysctl_sched_runtime_limit; 260 u64 vslice = __sched_period(nr_running);
313 261
314 /* 262 do_div(vslice, rq_weight);
315 * Niced tasks have the same history dynamic range as 263
316 * non-niced tasks: 264 return vslice;
317 */
318 if (unlikely(se->wait_runtime > limit)) {
319 se->wait_runtime = limit;
320 schedstat_inc(se, wait_runtime_overruns);
321 schedstat_inc(cfs_rq, wait_runtime_overruns);
322 }
323 if (unlikely(se->wait_runtime < -limit)) {
324 se->wait_runtime = -limit;
325 schedstat_inc(se, wait_runtime_underruns);
326 schedstat_inc(cfs_rq, wait_runtime_underruns);
327 }
328} 265}
329 266
330static inline void 267static u64 sched_vslice(struct cfs_rq *cfs_rq)
331__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
332{ 268{
333 se->wait_runtime += delta; 269 return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
334 schedstat_add(se, sum_wait_runtime, delta);
335 limit_wait_runtime(cfs_rq, se);
336} 270}
337 271
338static void 272static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
339add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
340{ 273{
341 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 274 return __sched_vslice(cfs_rq->load.weight + se->load.weight,
342 __add_wait_runtime(cfs_rq, se, delta); 275 cfs_rq->nr_running + 1);
343 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
344} 276}
345 277
346/* 278/*
@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
348 * are not in our scheduling class. 280 * are not in our scheduling class.
349 */ 281 */
350static inline void 282static inline void
351__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) 283__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
284 unsigned long delta_exec)
352{ 285{
353 unsigned long delta, delta_exec, delta_fair, delta_mine; 286 unsigned long delta_exec_weighted;
354 struct load_weight *lw = &cfs_rq->load; 287 u64 vruntime;
355 unsigned long load = lw->weight;
356 288
357 delta_exec = curr->delta_exec;
358 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 289 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
359 290
360 curr->sum_exec_runtime += delta_exec; 291 curr->sum_exec_runtime += delta_exec;
361 cfs_rq->exec_clock += delta_exec; 292 schedstat_add(cfs_rq, exec_clock, delta_exec);
362 293 delta_exec_weighted = delta_exec;
363 if (unlikely(!load)) 294 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
364 return; 295 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
365 296 &curr->load);
366 delta_fair = calc_delta_fair(delta_exec, lw);
367 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
368
369 if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
370 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
371 delta = min(delta, (unsigned long)(
372 (long)sysctl_sched_runtime_limit - curr->wait_runtime));
373 cfs_rq->sleeper_bonus -= delta;
374 delta_mine -= delta;
375 } 297 }
298 curr->vruntime += delta_exec_weighted;
376 299
377 cfs_rq->fair_clock += delta_fair;
378 /* 300 /*
379 * We executed delta_exec amount of time on the CPU, 301 * maintain cfs_rq->min_vruntime to be a monotonically increasing
380 * but we were only entitled to delta_mine amount of 302 * value tracking the leftmost vruntime in the tree.
381 * time during that period (if nr_running == 1 then
382 * the two values are equal)
383 * [Note: delta_mine - delta_exec is negative]:
384 */ 303 */
385 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); 304 if (first_fair(cfs_rq)) {
305 vruntime = min_vruntime(curr->vruntime,
306 __pick_next_entity(cfs_rq)->vruntime);
307 } else
308 vruntime = curr->vruntime;
309
310 cfs_rq->min_vruntime =
311 max_vruntime(cfs_rq->min_vruntime, vruntime);
386} 312}
387 313
388static void update_curr(struct cfs_rq *cfs_rq) 314static void update_curr(struct cfs_rq *cfs_rq)
389{ 315{
390 struct sched_entity *curr = cfs_rq_curr(cfs_rq); 316 struct sched_entity *curr = cfs_rq->curr;
317 u64 now = rq_of(cfs_rq)->clock;
391 unsigned long delta_exec; 318 unsigned long delta_exec;
392 319
393 if (unlikely(!curr)) 320 if (unlikely(!curr))
@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq)
398 * since the last time we changed load (this cannot 325 * since the last time we changed load (this cannot
399 * overflow on 32 bits): 326 * overflow on 32 bits):
400 */ 327 */
401 delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); 328 delta_exec = (unsigned long)(now - curr->exec_start);
402 329
403 curr->delta_exec += delta_exec; 330 __update_curr(cfs_rq, curr, delta_exec);
404 331 curr->exec_start = now;
405 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
406 __update_curr(cfs_rq, curr);
407 curr->delta_exec = 0;
408 }
409 curr->exec_start = rq_of(cfs_rq)->clock;
410} 332}
411 333
412static inline void 334static inline void
413update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 335update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
414{ 336{
415 se->wait_start_fair = cfs_rq->fair_clock;
416 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 337 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
417} 338}
418 339
419/* 340/*
420 * We calculate fair deltas here, so protect against the random effects
421 * of a multiplication overflow by capping it to the runtime limit:
422 */
423#if BITS_PER_LONG == 32
424static inline unsigned long
425calc_weighted(unsigned long delta, unsigned long weight, int shift)
426{
427 u64 tmp = (u64)delta * weight >> shift;
428
429 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
430 return sysctl_sched_runtime_limit*2;
431 return tmp;
432}
433#else
434static inline unsigned long
435calc_weighted(unsigned long delta, unsigned long weight, int shift)
436{
437 return delta * weight >> shift;
438}
439#endif
440
441/*
442 * Task is being enqueued - update stats: 341 * Task is being enqueued - update stats:
443 */ 342 */
444static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 343static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
445{ 344{
446 s64 key;
447
448 /* 345 /*
449 * Are we enqueueing a waiting task? (for current tasks 346 * Are we enqueueing a waiting task? (for current tasks
450 * a dequeue/enqueue event is a NOP) 347 * a dequeue/enqueue event is a NOP)
451 */ 348 */
452 if (se != cfs_rq_curr(cfs_rq)) 349 if (se != cfs_rq->curr)
453 update_stats_wait_start(cfs_rq, se); 350 update_stats_wait_start(cfs_rq, se);
454 /*
455 * Update the key:
456 */
457 key = cfs_rq->fair_clock;
458
459 /*
460 * Optimize the common nice 0 case:
461 */
462 if (likely(se->load.weight == NICE_0_LOAD)) {
463 key -= se->wait_runtime;
464 } else {
465 u64 tmp;
466
467 if (se->wait_runtime < 0) {
468 tmp = -se->wait_runtime;
469 key += (tmp * se->load.inv_weight) >>
470 (WMULT_SHIFT - NICE_0_SHIFT);
471 } else {
472 tmp = se->wait_runtime;
473 key -= (tmp * se->load.inv_weight) >>
474 (WMULT_SHIFT - NICE_0_SHIFT);
475 }
476 }
477
478 se->fair_key = key;
479}
480
481/*
482 * Note: must be called with a freshly updated rq->fair_clock.
483 */
484static inline void
485__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
486{
487 unsigned long delta_fair = se->delta_fair_run;
488
489 schedstat_set(se->wait_max, max(se->wait_max,
490 rq_of(cfs_rq)->clock - se->wait_start));
491
492 if (unlikely(se->load.weight != NICE_0_LOAD))
493 delta_fair = calc_weighted(delta_fair, se->load.weight,
494 NICE_0_SHIFT);
495
496 add_wait_runtime(cfs_rq, se, delta_fair);
497} 351}
498 352
499static void 353static void
500update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 354update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
501{ 355{
502 unsigned long delta_fair; 356 schedstat_set(se->wait_max, max(se->wait_max,
503 357 rq_of(cfs_rq)->clock - se->wait_start));
504 if (unlikely(!se->wait_start_fair))
505 return;
506
507 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
508 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
509
510 se->delta_fair_run += delta_fair;
511 if (unlikely(abs(se->delta_fair_run) >=
512 sysctl_sched_stat_granularity)) {
513 __update_stats_wait_end(cfs_rq, se);
514 se->delta_fair_run = 0;
515 }
516
517 se->wait_start_fair = 0;
518 schedstat_set(se->wait_start, 0); 358 schedstat_set(se->wait_start, 0);
519} 359}
520 360
521static inline void 361static inline void
522update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 362update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{ 363{
524 update_curr(cfs_rq);
525 /* 364 /*
526 * Mark the end of the wait period if dequeueing a 365 * Mark the end of the wait period if dequeueing a
527 * waiting task: 366 * waiting task:
528 */ 367 */
529 if (se != cfs_rq_curr(cfs_rq)) 368 if (se != cfs_rq->curr)
530 update_stats_wait_end(cfs_rq, se); 369 update_stats_wait_end(cfs_rq, se);
531} 370}
532 371
@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 se->exec_start = rq_of(cfs_rq)->clock; 381 se->exec_start = rq_of(cfs_rq)->clock;
543} 382}
544 383
545/*
546 * We are descheduling a task - update its stats:
547 */
548static inline void
549update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{
551 se->exec_start = 0;
552}
553
554/************************************************** 384/**************************************************
555 * Scheduling class queueing methods: 385 * Scheduling class queueing methods:
556 */ 386 */
557 387
558static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 388static void
389account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
559{ 390{
560 unsigned long load = cfs_rq->load.weight, delta_fair; 391 update_load_add(&cfs_rq->load, se->load.weight);
561 long prev_runtime; 392 cfs_rq->nr_running++;
562 393 se->on_rq = 1;
563 /* 394}
564 * Do not boost sleepers if there's too much bonus 'in flight'
565 * already:
566 */
567 if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
568 return;
569
570 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
571 load = rq_of(cfs_rq)->cpu_load[2];
572
573 delta_fair = se->delta_fair_sleep;
574
575 /*
576 * Fix up delta_fair with the effect of us running
577 * during the whole sleep period:
578 */
579 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
580 delta_fair = div64_likely32((u64)delta_fair * load,
581 load + se->load.weight);
582
583 if (unlikely(se->load.weight != NICE_0_LOAD))
584 delta_fair = calc_weighted(delta_fair, se->load.weight,
585 NICE_0_SHIFT);
586
587 prev_runtime = se->wait_runtime;
588 __add_wait_runtime(cfs_rq, se, delta_fair);
589 delta_fair = se->wait_runtime - prev_runtime;
590 395
591 /* 396static void
592 * Track the amount of bonus we've given to sleepers: 397account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
593 */ 398{
594 cfs_rq->sleeper_bonus += delta_fair; 399 update_load_sub(&cfs_rq->load, se->load.weight);
400 cfs_rq->nr_running--;
401 se->on_rq = 0;
595} 402}
596 403
597static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 404static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
598{ 405{
599 struct task_struct *tsk = task_of(se);
600 unsigned long delta_fair;
601
602 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
603 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
604 return;
605
606 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
607 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
608
609 se->delta_fair_sleep += delta_fair;
610 if (unlikely(abs(se->delta_fair_sleep) >=
611 sysctl_sched_stat_granularity)) {
612 __enqueue_sleeper(cfs_rq, se);
613 se->delta_fair_sleep = 0;
614 }
615
616 se->sleep_start_fair = 0;
617
618#ifdef CONFIG_SCHEDSTATS 406#ifdef CONFIG_SCHEDSTATS
619 if (se->sleep_start) { 407 if (se->sleep_start) {
620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 408 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
646 * time that the task spent sleeping: 434 * time that the task spent sleeping:
647 */ 435 */
648 if (unlikely(prof_on == SLEEP_PROFILING)) { 436 if (unlikely(prof_on == SLEEP_PROFILING)) {
437 struct task_struct *tsk = task_of(se);
438
649 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 439 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
650 delta >> 20); 440 delta >> 20);
651 } 441 }
@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
653#endif 443#endif
654} 444}
655 445
446static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
447{
448#ifdef CONFIG_SCHED_DEBUG
449 s64 d = se->vruntime - cfs_rq->min_vruntime;
450
451 if (d < 0)
452 d = -d;
453
454 if (d > 3*sysctl_sched_latency)
455 schedstat_inc(cfs_rq, nr_spread_over);
456#endif
457}
458
459static void
460place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
461{
462 u64 vruntime;
463
464 vruntime = cfs_rq->min_vruntime;
465
466 if (sched_feat(TREE_AVG)) {
467 struct sched_entity *last = __pick_last_entity(cfs_rq);
468 if (last) {
469 vruntime += last->vruntime;
470 vruntime >>= 1;
471 }
472 } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
473 vruntime += sched_vslice(cfs_rq)/2;
474
475 if (initial && sched_feat(START_DEBIT))
476 vruntime += sched_vslice_add(cfs_rq, se);
477
478 if (!initial) {
479 if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
480 task_of(se)->policy != SCHED_BATCH)
481 vruntime -= sysctl_sched_latency;
482
483 vruntime = max_t(s64, vruntime, se->vruntime);
484 }
485
486 se->vruntime = vruntime;
487
488}
489
656static void 490static void
657enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 491enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
658{ 492{
659 /* 493 /*
660 * Update the fair clock. 494 * Update run-time statistics of the 'current'.
661 */ 495 */
662 update_curr(cfs_rq); 496 update_curr(cfs_rq);
663 497
664 if (wakeup) 498 if (wakeup) {
499 place_entity(cfs_rq, se, 0);
665 enqueue_sleeper(cfs_rq, se); 500 enqueue_sleeper(cfs_rq, se);
501 }
666 502
667 update_stats_enqueue(cfs_rq, se); 503 update_stats_enqueue(cfs_rq, se);
668 __enqueue_entity(cfs_rq, se); 504 check_spread(cfs_rq, se);
505 if (se != cfs_rq->curr)
506 __enqueue_entity(cfs_rq, se);
507 account_entity_enqueue(cfs_rq, se);
669} 508}
670 509
671static void 510static void
672dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 511dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
673{ 512{
513 /*
514 * Update run-time statistics of the 'current'.
515 */
516 update_curr(cfs_rq);
517
674 update_stats_dequeue(cfs_rq, se); 518 update_stats_dequeue(cfs_rq, se);
675 if (sleep) { 519 if (sleep) {
676 se->sleep_start_fair = cfs_rq->fair_clock; 520 se->peer_preempt = 0;
677#ifdef CONFIG_SCHEDSTATS 521#ifdef CONFIG_SCHEDSTATS
678 if (entity_is_task(se)) { 522 if (entity_is_task(se)) {
679 struct task_struct *tsk = task_of(se); 523 struct task_struct *tsk = task_of(se);
@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
685 } 529 }
686#endif 530#endif
687 } 531 }
688 __dequeue_entity(cfs_rq, se); 532
533 if (se != cfs_rq->curr)
534 __dequeue_entity(cfs_rq, se);
535 account_entity_dequeue(cfs_rq, se);
689} 536}
690 537
691/* 538/*
692 * Preempt the current task with a newly woken task if needed: 539 * Preempt the current task with a newly woken task if needed:
693 */ 540 */
694static void 541static void
695__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, 542check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
696 struct sched_entity *curr, unsigned long granularity)
697{ 543{
698 s64 __delta = curr->fair_key - se->fair_key;
699 unsigned long ideal_runtime, delta_exec; 544 unsigned long ideal_runtime, delta_exec;
700 545
701 /* 546 ideal_runtime = sched_slice(cfs_rq, curr);
702 * ideal_runtime is compared against sum_exec_runtime, which is
703 * walltime, hence do not scale.
704 */
705 ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
706 (unsigned long)sysctl_sched_min_granularity);
707
708 /*
709 * If we executed more than what the latency constraint suggests,
710 * reduce the rescheduling granularity. This way the total latency
711 * of how much a task is not scheduled converges to
712 * sysctl_sched_latency:
713 */
714 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 547 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
715 if (delta_exec > ideal_runtime) 548 if (delta_exec > ideal_runtime ||
716 granularity = 0; 549 (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
717
718 /*
719 * Take scheduling granularity into account - do not
720 * preempt the current task unless the best task has
721 * a larger than sched_granularity fairness advantage:
722 *
723 * scale granularity as key space is in fair_clock.
724 */
725 if (__delta > niced_granularity(curr, granularity))
726 resched_task(rq_of(cfs_rq)->curr); 550 resched_task(rq_of(cfs_rq)->curr);
551 curr->peer_preempt = 0;
727} 552}
728 553
729static inline void 554static void
730set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 555set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
731{ 556{
557 /* 'current' is not kept within the tree. */
558 if (se->on_rq) {
559 /*
560 * Any task has to be enqueued before it get to execute on
561 * a CPU. So account for the time it spent waiting on the
562 * runqueue.
563 */
564 update_stats_wait_end(cfs_rq, se);
565 __dequeue_entity(cfs_rq, se);
566 }
567
568 update_stats_curr_start(cfs_rq, se);
569 cfs_rq->curr = se;
570#ifdef CONFIG_SCHEDSTATS
732 /* 571 /*
733 * Any task has to be enqueued before it get to execute on 572 * Track our maximum slice length, if the CPU's load is at
734 * a CPU. So account for the time it spent waiting on the 573 * least twice that of our own weight (i.e. dont track it
735 * runqueue. (note, here we rely on pick_next_task() having 574 * when there are only lesser-weight tasks around):
736 * done a put_prev_task_fair() shortly before this, which
737 * updated rq->fair_clock - used by update_stats_wait_end())
738 */ 575 */
739 update_stats_wait_end(cfs_rq, se); 576 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
740 update_stats_curr_start(cfs_rq, se); 577 se->slice_max = max(se->slice_max,
741 set_cfs_rq_curr(cfs_rq, se); 578 se->sum_exec_runtime - se->prev_sum_exec_runtime);
579 }
580#endif
742 se->prev_sum_exec_runtime = se->sum_exec_runtime; 581 se->prev_sum_exec_runtime = se->sum_exec_runtime;
743} 582}
744 583
745static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 584static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
746{ 585{
747 struct sched_entity *se = __pick_next_entity(cfs_rq); 586 struct sched_entity *se = NULL;
748 587
749 set_next_entity(cfs_rq, se); 588 if (first_fair(cfs_rq)) {
589 se = __pick_next_entity(cfs_rq);
590 set_next_entity(cfs_rq, se);
591 }
750 592
751 return se; 593 return se;
752} 594}
@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
760 if (prev->on_rq) 602 if (prev->on_rq)
761 update_curr(cfs_rq); 603 update_curr(cfs_rq);
762 604
763 update_stats_curr_end(cfs_rq, prev); 605 check_spread(cfs_rq, prev);
764 606 if (prev->on_rq) {
765 if (prev->on_rq)
766 update_stats_wait_start(cfs_rq, prev); 607 update_stats_wait_start(cfs_rq, prev);
767 set_cfs_rq_curr(cfs_rq, NULL); 608 /* Put 'current' back into the tree. */
609 __enqueue_entity(cfs_rq, prev);
610 }
611 cfs_rq->curr = NULL;
768} 612}
769 613
770static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 614static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
771{ 615{
772 struct sched_entity *next;
773
774 /* 616 /*
775 * Dequeue and enqueue the task to update its 617 * Update run-time statistics of the 'current'.
776 * position within the tree:
777 */ 618 */
778 dequeue_entity(cfs_rq, curr, 0); 619 update_curr(cfs_rq);
779 enqueue_entity(cfs_rq, curr, 0);
780
781 /*
782 * Reschedule if another task tops the current one.
783 */
784 next = __pick_next_entity(cfs_rq);
785 if (next == curr)
786 return;
787 620
788 __check_preempt_curr_fair(cfs_rq, next, curr, 621 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
789 sched_granularity(cfs_rq)); 622 check_preempt_tick(cfs_rq, curr);
790} 623}
791 624
792/************************************************** 625/**************************************************
@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
821 */ 654 */
822static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 655static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
823{ 656{
824 /* A later patch will take group into account */ 657 return cfs_rq->tg->cfs_rq[this_cpu];
825 return &cpu_rq(this_cpu)->cfs;
826} 658}
827 659
828/* Iterate thr' all leaf cfs_rq's on a runqueue */ 660/* Iterate thr' all leaf cfs_rq's on a runqueue */
829#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 661#define for_each_leaf_cfs_rq(rq, cfs_rq) \
830 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 662 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
831 663
832/* Do the two (enqueued) tasks belong to the same group ? */ 664/* Do the two (enqueued) entities belong to the same group ? */
833static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 665static inline int
666is_same_group(struct sched_entity *se, struct sched_entity *pse)
834{ 667{
835 if (curr->se.cfs_rq == p->se.cfs_rq) 668 if (se->cfs_rq == pse->cfs_rq)
836 return 1; 669 return 1;
837 670
838 return 0; 671 return 0;
839} 672}
840 673
674static inline struct sched_entity *parent_entity(struct sched_entity *se)
675{
676 return se->parent;
677}
678
841#else /* CONFIG_FAIR_GROUP_SCHED */ 679#else /* CONFIG_FAIR_GROUP_SCHED */
842 680
843#define for_each_sched_entity(se) \ 681#define for_each_sched_entity(se) \
@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
870#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 708#define for_each_leaf_cfs_rq(rq, cfs_rq) \
871 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 709 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
872 710
873static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 711static inline int
712is_same_group(struct sched_entity *se, struct sched_entity *pse)
874{ 713{
875 return 1; 714 return 1;
876} 715}
877 716
717static inline struct sched_entity *parent_entity(struct sched_entity *se)
718{
719 return NULL;
720}
721
878#endif /* CONFIG_FAIR_GROUP_SCHED */ 722#endif /* CONFIG_FAIR_GROUP_SCHED */
879 723
880/* 724/*
@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
892 break; 736 break;
893 cfs_rq = cfs_rq_of(se); 737 cfs_rq = cfs_rq_of(se);
894 enqueue_entity(cfs_rq, se, wakeup); 738 enqueue_entity(cfs_rq, se, wakeup);
739 wakeup = 1;
895 } 740 }
896} 741}
897 742
@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
911 /* Don't dequeue parent if it has other entities besides us */ 756 /* Don't dequeue parent if it has other entities besides us */
912 if (cfs_rq->load.weight) 757 if (cfs_rq->load.weight)
913 break; 758 break;
759 sleep = 1;
914 } 760 }
915} 761}
916 762
@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
919 * 765 *
920 * If compat_yield is turned on then we requeue to the end of the tree. 766 * If compat_yield is turned on then we requeue to the end of the tree.
921 */ 767 */
922static void yield_task_fair(struct rq *rq, struct task_struct *p) 768static void yield_task_fair(struct rq *rq)
923{ 769{
924 struct cfs_rq *cfs_rq = task_cfs_rq(p); 770 struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
925 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 771 struct sched_entity *rightmost, *se = &rq->curr->se;
926 struct sched_entity *rightmost, *se = &p->se;
927 struct rb_node *parent;
928 772
929 /* 773 /*
930 * Are we the only task in the tree? 774 * Are we the only task in the tree?
@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p)
935 if (likely(!sysctl_sched_compat_yield)) { 779 if (likely(!sysctl_sched_compat_yield)) {
936 __update_rq_clock(rq); 780 __update_rq_clock(rq);
937 /* 781 /*
938 * Dequeue and enqueue the task to update its 782 * Update run-time statistics of the 'current'.
939 * position within the tree:
940 */ 783 */
941 dequeue_entity(cfs_rq, &p->se, 0); 784 update_curr(cfs_rq);
942 enqueue_entity(cfs_rq, &p->se, 0);
943 785
944 return; 786 return;
945 } 787 }
946 /* 788 /*
947 * Find the rightmost entry in the rbtree: 789 * Find the rightmost entry in the rbtree:
948 */ 790 */
949 do { 791 rightmost = __pick_last_entity(cfs_rq);
950 parent = *link;
951 link = &parent->rb_right;
952 } while (*link);
953
954 rightmost = rb_entry(parent, struct sched_entity, run_node);
955 /* 792 /*
956 * Already in the rightmost position? 793 * Already in the rightmost position?
957 */ 794 */
958 if (unlikely(rightmost == se)) 795 if (unlikely(rightmost->vruntime < se->vruntime))
959 return; 796 return;
960 797
961 /* 798 /*
962 * Minimally necessary key value to be last in the tree: 799 * Minimally necessary key value to be last in the tree:
800 * Upon rescheduling, sched_class::put_prev_task() will place
801 * 'current' within the tree based on its new key value.
963 */ 802 */
964 se->fair_key = rightmost->fair_key + 1; 803 se->vruntime = rightmost->vruntime + 1;
965
966 if (cfs_rq->rb_leftmost == &se->run_node)
967 cfs_rq->rb_leftmost = rb_next(&se->run_node);
968 /*
969 * Relink the task to the rightmost position:
970 */
971 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
972 rb_link_node(&se->run_node, parent, link);
973 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
974} 804}
975 805
976/* 806/*
977 * Preempt the current task with a newly woken task if needed: 807 * Preempt the current task with a newly woken task if needed:
978 */ 808 */
979static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) 809static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
980{ 810{
981 struct task_struct *curr = rq->curr; 811 struct task_struct *curr = rq->curr;
982 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 812 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
983 unsigned long gran; 813 struct sched_entity *se = &curr->se, *pse = &p->se;
814 s64 delta, gran;
984 815
985 if (unlikely(rt_prio(p->prio))) { 816 if (unlikely(rt_prio(p->prio))) {
986 update_rq_clock(rq); 817 update_rq_clock(rq);
@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
988 resched_task(curr); 819 resched_task(curr);
989 return; 820 return;
990 } 821 }
991
992 gran = sysctl_sched_wakeup_granularity;
993 /* 822 /*
994 * Batch tasks prefer throughput over latency: 823 * Batch tasks do not preempt (their preemption is driven by
824 * the tick):
995 */ 825 */
996 if (unlikely(p->policy == SCHED_BATCH)) 826 if (unlikely(p->policy == SCHED_BATCH))
997 gran = sysctl_sched_batch_wakeup_granularity; 827 return;
828
829 if (sched_feat(WAKEUP_PREEMPT)) {
830 while (!is_same_group(se, pse)) {
831 se = parent_entity(se);
832 pse = parent_entity(pse);
833 }
998 834
999 if (is_same_group(curr, p)) 835 delta = se->vruntime - pse->vruntime;
1000 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); 836 gran = sysctl_sched_wakeup_granularity;
837 if (unlikely(se->load.weight != NICE_0_LOAD))
838 gran = calc_delta_fair(gran, &se->load);
839
840 if (delta > gran) {
841 int now = !sched_feat(PREEMPT_RESTRICT);
842
843 if (now || p->prio < curr->prio || !se->peer_preempt++)
844 resched_task(curr);
845 }
846 }
1001} 847}
1002 848
1003static struct task_struct *pick_next_task_fair(struct rq *rq) 849static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1041 * achieve that by always pre-iterating before returning 887 * achieve that by always pre-iterating before returning
1042 * the current task: 888 * the current task:
1043 */ 889 */
1044static inline struct task_struct * 890static struct task_struct *
1045__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 891__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
1046{ 892{
1047 struct task_struct *p; 893 struct task_struct *p;
@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1078 if (!cfs_rq->nr_running) 924 if (!cfs_rq->nr_running)
1079 return MAX_PRIO; 925 return MAX_PRIO;
1080 926
1081 curr = __pick_next_entity(cfs_rq); 927 curr = cfs_rq->curr;
928 if (!curr)
929 curr = __pick_next_entity(cfs_rq);
930
1082 p = task_of(curr); 931 p = task_of(curr);
1083 932
1084 return p->prio; 933 return p->prio;
@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1153 } 1002 }
1154} 1003}
1155 1004
1005#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1006
1156/* 1007/*
1157 * Share the fairness runtime between parent and child, thus the 1008 * Share the fairness runtime between parent and child, thus the
1158 * total amount of pressure for CPU stays equal - new tasks 1009 * total amount of pressure for CPU stays equal - new tasks
@@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1163static void task_new_fair(struct rq *rq, struct task_struct *p) 1014static void task_new_fair(struct rq *rq, struct task_struct *p)
1164{ 1015{
1165 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1016 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1166 struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); 1017 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1018 int this_cpu = smp_processor_id();
1167 1019
1168 sched_info_queued(p); 1020 sched_info_queued(p);
1169 1021
1170 update_curr(cfs_rq); 1022 update_curr(cfs_rq);
1171 update_stats_enqueue(cfs_rq, se); 1023 place_entity(cfs_rq, se, 1);
1172 /*
1173 * Child runs first: we let it run before the parent
1174 * until it reschedules once. We set up the key so that
1175 * it will preempt the parent:
1176 */
1177 se->fair_key = curr->fair_key -
1178 niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
1179 /*
1180 * The first wait is dominated by the child-runs-first logic,
1181 * so do not credit it with that waiting time yet:
1182 */
1183 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1184 se->wait_start_fair = 0;
1185 1024
1186 /* 1025 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1187 * The statistical average of wait_runtime is about 1026 curr->vruntime < se->vruntime) {
1188 * -granularity/2, so initialize the task with that: 1027 /*
1189 */ 1028 * Upon rescheduling, sched_class::put_prev_task() will place
1190 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1029 * 'current' within the tree based on its new key value.
1191 se->wait_runtime = -(sched_granularity(cfs_rq) / 2); 1030 */
1031 swap(curr->vruntime, se->vruntime);
1032 }
1192 1033
1034 update_stats_enqueue(cfs_rq, se);
1035 check_spread(cfs_rq, se);
1036 check_spread(cfs_rq, curr);
1193 __enqueue_entity(cfs_rq, se); 1037 __enqueue_entity(cfs_rq, se);
1038 account_entity_enqueue(cfs_rq, se);
1039 se->peer_preempt = 0;
1040 resched_task(rq->curr);
1194} 1041}
1195 1042
1196#ifdef CONFIG_FAIR_GROUP_SCHED
1197/* Account for a task changing its policy or group. 1043/* Account for a task changing its policy or group.
1198 * 1044 *
1199 * This routine is mostly called to set cfs_rq->curr field when a task 1045 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq)
1206 for_each_sched_entity(se) 1052 for_each_sched_entity(se)
1207 set_next_entity(cfs_rq_of(se), se); 1053 set_next_entity(cfs_rq_of(se), se);
1208} 1054}
1209#else
1210static void set_curr_task_fair(struct rq *rq)
1211{
1212}
1213#endif
1214 1055
1215/* 1056/*
1216 * All the scheduling class methods: 1057 * All the scheduling class methods:
1217 */ 1058 */
1218struct sched_class fair_sched_class __read_mostly = { 1059static const struct sched_class fair_sched_class = {
1060 .next = &idle_sched_class,
1219 .enqueue_task = enqueue_task_fair, 1061 .enqueue_task = enqueue_task_fair,
1220 .dequeue_task = dequeue_task_fair, 1062 .dequeue_task = dequeue_task_fair,
1221 .yield_task = yield_task_fair, 1063 .yield_task = yield_task_fair,
1222 1064
1223 .check_preempt_curr = check_preempt_curr_fair, 1065 .check_preempt_curr = check_preempt_wakeup,
1224 1066
1225 .pick_next_task = pick_next_task_fair, 1067 .pick_next_task = pick_next_task_fair,
1226 .put_prev_task = put_prev_task_fair, 1068 .put_prev_task = put_prev_task_fair,
@@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1237{ 1079{
1238 struct cfs_rq *cfs_rq; 1080 struct cfs_rq *cfs_rq;
1239 1081
1082#ifdef CONFIG_FAIR_GROUP_SCHED
1083 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1084#endif
1240 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1085 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1241 print_cfs_rq(m, cpu, cfs_rq); 1086 print_cfs_rq(m, cpu, cfs_rq);
1242} 1087}
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3503fb2d9f96..6e2ead41516e 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{ 50{
51} 51}
52 52
53static void set_curr_task_idle(struct rq *rq)
54{
55}
56
53/* 57/*
54 * Simple, special scheduling class for the per-CPU idle tasks: 58 * Simple, special scheduling class for the per-CPU idle tasks:
55 */ 59 */
56static struct sched_class idle_sched_class __read_mostly = { 60const struct sched_class idle_sched_class = {
61 /* .next is NULL */
57 /* no enqueue/yield_task for idle tasks */ 62 /* no enqueue/yield_task for idle tasks */
58 63
59 /* dequeue is not valid, we print a debug message there: */ 64 /* dequeue is not valid, we print a debug message there: */
@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = {
66 71
67 .load_balance = load_balance_idle, 72 .load_balance = load_balance_idle,
68 73
74 .set_curr_task = set_curr_task_idle,
69 .task_tick = task_tick_idle, 75 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */ 76 /* no .task_new for idle tasks */
71}; 77};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4b87476a02d0..d0097a0634e5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -7,7 +7,7 @@
7 * Update the current task's runtime statistics. Skip current tasks that 7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 8 * are not in our scheduling class.
9 */ 9 */
10static inline void update_curr_rt(struct rq *rq) 10static void update_curr_rt(struct rq *rq)
11{ 11{
12 struct task_struct *curr = rq->curr; 12 struct task_struct *curr = rq->curr;
13 u64 delta_exec; 13 u64 delta_exec;
@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p)
59} 59}
60 60
61static void 61static void
62yield_task_rt(struct rq *rq, struct task_struct *p) 62yield_task_rt(struct rq *rq)
63{ 63{
64 requeue_task_rt(rq, p); 64 requeue_task_rt(rq, rq->curr);
65} 65}
66 66
67/* 67/*
@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
206 if (--p->time_slice) 206 if (--p->time_slice)
207 return; 207 return;
208 208
209 p->time_slice = static_prio_timeslice(p->static_prio); 209 p->time_slice = DEF_TIMESLICE;
210 210
211 /* 211 /*
212 * Requeue to the end of queue if we are not the only element 212 * Requeue to the end of queue if we are not the only element
@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
218 } 218 }
219} 219}
220 220
221static struct sched_class rt_sched_class __read_mostly = { 221static void set_curr_task_rt(struct rq *rq)
222{
223 struct task_struct *p = rq->curr;
224
225 p->se.exec_start = rq->clock;
226}
227
228const struct sched_class rt_sched_class = {
229 .next = &fair_sched_class,
222 .enqueue_task = enqueue_task_rt, 230 .enqueue_task = enqueue_task_rt,
223 .dequeue_task = dequeue_task_rt, 231 .dequeue_task = dequeue_task_rt,
224 .yield_task = yield_task_rt, 232 .yield_task = yield_task_rt,
@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = {
230 238
231 .load_balance = load_balance_rt, 239 .load_balance = load_balance_rt,
232 240
241 .set_curr_task = set_curr_task_rt,
233 .task_tick = task_tick_rt, 242 .task_tick = task_tick_rt,
234}; 243};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c20a94dda61e..1c084842c3e7 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v)
16 struct rq *rq = cpu_rq(cpu); 16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 struct sched_domain *sd; 18 struct sched_domain *sd;
19 int dcnt = 0; 19 int dcount = 0;
20#endif 20#endif
21 21
22 /* runqueue-specific stats */ 22 /* runqueue-specific stats */
23 seq_printf(seq, 23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", 24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty, 25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 27 rq->sched_switch, rq->sched_count, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local, 28 rq->ttwu_count, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time, 29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
31 31
32 seq_printf(seq, "\n"); 32 seq_printf(seq, "\n");
33 33
@@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
39 char mask_str[NR_CPUS]; 39 char mask_str[NR_CPUS];
40 40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 42 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " 45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu", 46 "%lu",
47 sd->lb_cnt[itype], 47 sd->lb_count[itype],
48 sd->lb_balanced[itype], 48 sd->lb_balanced[itype],
49 sd->lb_failed[itype], 49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype], 50 sd->lb_imbalance[itype],
@@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
55 } 55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" 56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n", 57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 58 sd->alb_count, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 59 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 60 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine, 61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance); 62 sd->ttwu_move_balance);
63 } 63 }
@@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{ 101{
102 if (rq) { 102 if (rq) {
103 rq->rq_sched_info.run_delay += delta; 103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++; 104 rq->rq_sched_info.pcount++;
105 } 105 }
106} 106}
107 107
@@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
129# define schedstat_set(var, val) do { } while (0) 129# define schedstat_set(var, val) do { } while (0)
130#endif 130#endif
131 131
132#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 132#ifdef CONFIG_SCHEDSTATS
133/* 133/*
134 * Called when a process is dequeued from the active array and given 134 * Called when a process is dequeued from the active array and given
135 * the cpu. We should note that with the exception of interactive 135 * the cpu. We should note that with the exception of interactive
@@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t)
164 sched_info_dequeued(t); 164 sched_info_dequeued(t);
165 t->sched_info.run_delay += delta; 165 t->sched_info.run_delay += delta;
166 t->sched_info.last_arrival = now; 166 t->sched_info.last_arrival = now;
167 t->sched_info.pcnt++; 167 t->sched_info.pcount++;
168 168
169 rq_sched_info_arrive(task_rq(t), delta); 169 rq_sched_info_arrive(task_rq(t), delta);
170} 170}
@@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
233#else 233#else
234#define sched_info_queued(t) do { } while (0) 234#define sched_info_queued(t) do { } while (0)
235#define sched_info_switch(t, next) do { } while (0) 235#define sched_info_switch(t, next) do { } while (0)
236#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 236#endif /* CONFIG_SCHEDSTATS */
237 237
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6c97259e863e..ec14aa8ac51f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -222,14 +222,11 @@ static ctl_table kern_table[] = {
222#ifdef CONFIG_SCHED_DEBUG 222#ifdef CONFIG_SCHED_DEBUG
223 { 223 {
224 .ctl_name = CTL_UNNUMBERED, 224 .ctl_name = CTL_UNNUMBERED,
225 .procname = "sched_min_granularity_ns", 225 .procname = "sched_nr_latency",
226 .data = &sysctl_sched_min_granularity, 226 .data = &sysctl_sched_nr_latency,
227 .maxlen = sizeof(unsigned int), 227 .maxlen = sizeof(unsigned int),
228 .mode = 0644, 228 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax, 229 .proc_handler = &proc_dointvec,
230 .strategy = &sysctl_intvec,
231 .extra1 = &min_sched_granularity_ns,
232 .extra2 = &max_sched_granularity_ns,
233 }, 230 },
234 { 231 {
235 .ctl_name = CTL_UNNUMBERED, 232 .ctl_name = CTL_UNNUMBERED,
@@ -266,38 +263,24 @@ static ctl_table kern_table[] = {
266 }, 263 },
267 { 264 {
268 .ctl_name = CTL_UNNUMBERED, 265 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_stat_granularity_ns", 266 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_stat_granularity, 267 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int),
272 .mode = 0644,
273 .proc_handler = &proc_dointvec_minmax,
274 .strategy = &sysctl_intvec,
275 .extra1 = &min_wakeup_granularity_ns,
276 .extra2 = &max_wakeup_granularity_ns,
277 },
278 {
279 .ctl_name = CTL_UNNUMBERED,
280 .procname = "sched_runtime_limit_ns",
281 .data = &sysctl_sched_runtime_limit,
282 .maxlen = sizeof(unsigned int), 268 .maxlen = sizeof(unsigned int),
283 .mode = 0644, 269 .mode = 0644,
284 .proc_handler = &proc_dointvec_minmax, 270 .proc_handler = &proc_dointvec,
285 .strategy = &sysctl_intvec,
286 .extra1 = &min_sched_granularity_ns,
287 .extra2 = &max_sched_granularity_ns,
288 }, 271 },
289 { 272 {
290 .ctl_name = CTL_UNNUMBERED, 273 .ctl_name = CTL_UNNUMBERED,
291 .procname = "sched_child_runs_first", 274 .procname = "sched_features",
292 .data = &sysctl_sched_child_runs_first, 275 .data = &sysctl_sched_features,
293 .maxlen = sizeof(unsigned int), 276 .maxlen = sizeof(unsigned int),
294 .mode = 0644, 277 .mode = 0644,
295 .proc_handler = &proc_dointvec, 278 .proc_handler = &proc_dointvec,
296 }, 279 },
297 { 280 {
298 .ctl_name = CTL_UNNUMBERED, 281 .ctl_name = CTL_UNNUMBERED,
299 .procname = "sched_features", 282 .procname = "sched_migration_cost",
300 .data = &sysctl_sched_features, 283 .data = &sysctl_sched_migration_cost,
301 .maxlen = sizeof(unsigned int), 284 .maxlen = sizeof(unsigned int),
302 .mode = 0644, 285 .mode = 0644,
303 .proc_handler = &proc_dointvec, 286 .proc_handler = &proc_dointvec,
diff --git a/kernel/user.c b/kernel/user.c
index 9ca2848fc356..f0e561e6d085 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,12 +50,16 @@ struct user_struct root_user = {
50 .uid_keyring = &root_user_keyring, 50 .uid_keyring = &root_user_keyring,
51 .session_keyring = &root_session_keyring, 51 .session_keyring = &root_session_keyring,
52#endif 52#endif
53#ifdef CONFIG_FAIR_USER_SCHED
54 .tg = &init_task_group,
55#endif
53}; 56};
54 57
55/* 58/*
56 * These routines must be called with the uidhash spinlock held! 59 * These routines must be called with the uidhash spinlock held!
57 */ 60 */
58static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) 61static inline void uid_hash_insert(struct user_struct *up,
62 struct hlist_head *hashent)
59{ 63{
60 hlist_add_head(&up->uidhash_node, hashent); 64 hlist_add_head(&up->uidhash_node, hashent);
61} 65}
@@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up)
65 hlist_del_init(&up->uidhash_node); 69 hlist_del_init(&up->uidhash_node);
66} 70}
67 71
68static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 72static inline struct user_struct *uid_hash_find(uid_t uid,
73 struct hlist_head *hashent)
69{ 74{
70 struct user_struct *user; 75 struct user_struct *user;
71 struct hlist_node *h; 76 struct hlist_node *h;
72 77
73 hlist_for_each_entry(user, h, hashent, uidhash_node) { 78 hlist_for_each_entry(user, h, hashent, uidhash_node) {
74 if(user->uid == uid) { 79 if (user->uid == uid) {
75 atomic_inc(&user->__count); 80 atomic_inc(&user->__count);
76 return user; 81 return user;
77 } 82 }
@@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha
80 return NULL; 85 return NULL;
81} 86}
82 87
88#ifdef CONFIG_FAIR_USER_SCHED
89
90static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
91static DEFINE_MUTEX(uids_mutex);
92
93static void sched_destroy_user(struct user_struct *up)
94{
95 sched_destroy_group(up->tg);
96}
97
98static int sched_create_user(struct user_struct *up)
99{
100 int rc = 0;
101
102 up->tg = sched_create_group();
103 if (IS_ERR(up->tg))
104 rc = -ENOMEM;
105
106 return rc;
107}
108
/* Forward task @p to sched_move_task(); called from switch_uid(). */
static void sched_switch_user(struct task_struct *p)
{
	sched_move_task(p);
}
113
114static inline void uids_mutex_lock(void)
115{
116 mutex_lock(&uids_mutex);
117}
118
119static inline void uids_mutex_unlock(void)
120{
121 mutex_unlock(&uids_mutex);
122}
123
124/* return cpu shares held by the user */
125ssize_t cpu_shares_show(struct kset *kset, char *buffer)
126{
127 struct user_struct *up = container_of(kset, struct user_struct, kset);
128
129 return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
130}
131
132/* modify cpu shares held by the user */
133ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
134{
135 struct user_struct *up = container_of(kset, struct user_struct, kset);
136 unsigned long shares;
137 int rc;
138
139 sscanf(buffer, "%lu", &shares);
140
141 rc = sched_group_set_shares(up->tg, shares);
142
143 return (rc ? rc : size);
144}
145
146static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
147{
148 sa->attr.name = name;
149 sa->attr.mode = mode;
150 sa->show = cpu_shares_show;
151 sa->store = cpu_shares_store;
152}
153
154/* Create "/sys/kernel/uids/<uid>" directory and
155 * "/sys/kernel/uids/<uid>/cpu_share" file for this user.
156 */
157static int user_kobject_create(struct user_struct *up)
158{
159 struct kset *kset = &up->kset;
160 struct kobject *kobj = &kset->kobj;
161 int error;
162
163 memset(kset, 0, sizeof(struct kset));
164 kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
165 kobject_set_name(kobj, "%d", up->uid);
166 kset_init(kset);
167 user_attr_init(&up->user_attr, "cpu_share", 0644);
168
169 error = kobject_add(kobj);
170 if (error)
171 goto done;
172
173 error = sysfs_create_file(kobj, &up->user_attr.attr);
174 if (error)
175 kobject_del(kobj);
176
177 kobject_uevent(kobj, KOBJ_ADD);
178
179done:
180 return error;
181}
182
183/* create these in sysfs filesystem:
184 * "/sys/kernel/uids" directory
185 * "/sys/kernel/uids/0" directory (for root user)
186 * "/sys/kernel/uids/0/cpu_share" file (for root user)
187 */
188int __init uids_kobject_init(void)
189{
190 int error;
191
192 /* create under /sys/kernel dir */
193 uids_kobject.parent = &kernel_subsys.kobj;
194 uids_kobject.kset = &kernel_subsys;
195 kobject_set_name(&uids_kobject, "uids");
196 kobject_init(&uids_kobject);
197
198 error = kobject_add(&uids_kobject);
199 if (!error)
200 error = user_kobject_create(&root_user);
201
202 return error;
203}
204
205/* work function to remove sysfs directory for a user and free up
206 * corresponding structures.
207 */
208static void remove_user_sysfs_dir(struct work_struct *w)
209{
210 struct user_struct *up = container_of(w, struct user_struct, work);
211 struct kobject *kobj = &up->kset.kobj;
212 unsigned long flags;
213 int remove_user = 0;
214
215 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
216 * atomic.
217 */
218 uids_mutex_lock();
219
220 local_irq_save(flags);
221
222 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
223 uid_hash_remove(up);
224 remove_user = 1;
225 spin_unlock_irqrestore(&uidhash_lock, flags);
226 } else {
227 local_irq_restore(flags);
228 }
229
230 if (!remove_user)
231 goto done;
232
233 sysfs_remove_file(kobj, &up->user_attr.attr);
234 kobject_uevent(kobj, KOBJ_REMOVE);
235 kobject_del(kobj);
236
237 sched_destroy_user(up);
238 key_put(up->uid_keyring);
239 key_put(up->session_keyring);
240 kmem_cache_free(uid_cachep, up);
241
242done:
243 uids_mutex_unlock();
244}
245
246/* IRQs are disabled and uidhash_lock is held upon function entry.
247 * IRQ state (as stored in flags) is restored and uidhash_lock released
248 * upon function exit.
249 */
250static inline void free_user(struct user_struct *up, unsigned long flags)
251{
252 /* restore back the count */
253 atomic_inc(&up->__count);
254 spin_unlock_irqrestore(&uidhash_lock, flags);
255
256 INIT_WORK(&up->work, remove_user_sysfs_dir);
257 schedule_work(&up->work);
258}
259
260#else /* CONFIG_FAIR_USER_SCHED */
261
/*
 * Stubs used when CONFIG_FAIR_USER_SCHED is not set: no task group,
 * no sysfs entries and no uids_mutex, so each of these does nothing
 * (the int-returning ones report success).
 */
static void sched_destroy_user(struct user_struct *up)
{
}

static int sched_create_user(struct user_struct *up)
{
	return 0;
}

static void sched_switch_user(struct task_struct *p)
{
}

static inline int user_kobject_create(struct user_struct *up)
{
	return 0;
}

static inline void uids_mutex_lock(void)
{
}

static inline void uids_mutex_unlock(void)
{
}
268
269/* IRQs are disabled and uidhash_lock is held upon function entry.
270 * IRQ state (as stored in flags) is restored and uidhash_lock released
271 * upon function exit.
272 */
273static inline void free_user(struct user_struct *up, unsigned long flags)
274{
275 uid_hash_remove(up);
276 spin_unlock_irqrestore(&uidhash_lock, flags);
277 sched_destroy_user(up);
278 key_put(up->uid_keyring);
279 key_put(up->session_keyring);
280 kmem_cache_free(uid_cachep, up);
281}
282
283#endif /* CONFIG_FAIR_USER_SCHED */
284
83/* 285/*
84 * Locate the user_struct for the passed UID. If found, take a ref on it. The 286 * Locate the user_struct for the passed UID. If found, take a ref on it. The
85 * caller must undo that ref with free_uid(). 287 * caller must undo that ref with free_uid().
@@ -106,15 +308,10 @@ void free_uid(struct user_struct *up)
106 return; 308 return;
107 309
108 local_irq_save(flags); 310 local_irq_save(flags);
109 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { 311 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
110 uid_hash_remove(up); 312 free_user(up, flags);
111 spin_unlock_irqrestore(&uidhash_lock, flags); 313 else
112 key_put(up->uid_keyring);
113 key_put(up->session_keyring);
114 kmem_cache_free(uid_cachep, up);
115 } else {
116 local_irq_restore(flags); 314 local_irq_restore(flags);
117 }
118} 315}
119 316
120struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 317struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
122 struct hlist_head *hashent = uidhashentry(ns, uid); 319 struct hlist_head *hashent = uidhashentry(ns, uid);
123 struct user_struct *up; 320 struct user_struct *up;
124 321
322 /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
323 * atomic.
324 */
325 uids_mutex_lock();
326
125 spin_lock_irq(&uidhash_lock); 327 spin_lock_irq(&uidhash_lock);
126 up = uid_hash_find(uid, hashent); 328 up = uid_hash_find(uid, hashent);
127 spin_unlock_irq(&uidhash_lock); 329 spin_unlock_irq(&uidhash_lock);
@@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
150 return NULL; 352 return NULL;
151 } 353 }
152 354
355 if (sched_create_user(new) < 0) {
356 key_put(new->uid_keyring);
357 key_put(new->session_keyring);
358 kmem_cache_free(uid_cachep, new);
359 return NULL;
360 }
361
362 if (user_kobject_create(new)) {
363 sched_destroy_user(new);
364 key_put(new->uid_keyring);
365 key_put(new->session_keyring);
366 kmem_cache_free(uid_cachep, new);
367 uids_mutex_unlock();
368 return NULL;
369 }
370
153 /* 371 /*
154 * Before adding this, check whether we raced 372 * Before adding this, check whether we raced
155 * on adding the same user already.. 373 * on adding the same user already..
@@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
157 spin_lock_irq(&uidhash_lock); 375 spin_lock_irq(&uidhash_lock);
158 up = uid_hash_find(uid, hashent); 376 up = uid_hash_find(uid, hashent);
159 if (up) { 377 if (up) {
378 /* This case is not possible when CONFIG_FAIR_USER_SCHED
379 * is defined, since we serialize alloc_uid() using
380 * uids_mutex. Hence no need to call
381 * sched_destroy_user() or remove_user_sysfs_dir().
382 */
160 key_put(new->uid_keyring); 383 key_put(new->uid_keyring);
161 key_put(new->session_keyring); 384 key_put(new->session_keyring);
162 kmem_cache_free(uid_cachep, new); 385 kmem_cache_free(uid_cachep, new);
@@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
167 spin_unlock_irq(&uidhash_lock); 390 spin_unlock_irq(&uidhash_lock);
168 391
169 } 392 }
393
394 uids_mutex_unlock();
395
170 return up; 396 return up;
171} 397}
172 398
@@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user)
184 atomic_dec(&old_user->processes); 410 atomic_dec(&old_user->processes);
185 switch_uid_keyring(new_user); 411 switch_uid_keyring(new_user);
186 current->user = new_user; 412 current->user = new_user;
413 sched_switch_user(current);
187 414
188 /* 415 /*
189 * We need to synchronize with __sigqueue_alloc() 416 * We need to synchronize with __sigqueue_alloc()
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2b57eaf66abc..6996cba5aa96 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk)
334 read_lock(&sk->sk_callback_lock); 334 read_lock(&sk->sk_callback_lock);
335 if (unix_writable(sk)) { 335 if (unix_writable(sk)) {
336 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 336 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
337 wake_up_interruptible(sk->sk_sleep); 337 wake_up_interruptible_sync(sk->sk_sleep);
338 sk_wake_async(sk, 2, POLL_OUT); 338 sk_wake_async(sk, 2, POLL_OUT);
339 } 339 }
340 read_unlock(&sk->sk_callback_lock); 340 read_unlock(&sk->sk_callback_lock);
@@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1639 if (!skb) 1639 if (!skb)
1640 goto out_unlock; 1640 goto out_unlock;
1641 1641
1642 wake_up_interruptible(&u->peer_wait); 1642 wake_up_interruptible_sync(&u->peer_wait);
1643 1643
1644 if (msg->msg_name) 1644 if (msg->msg_name)
1645 unix_copy_addr(msg, skb->sk); 1645 unix_copy_addr(msg, skb->sk);