Diffstat (limited to 'kernel')
 kernel/delayacct.c        |   10
 kernel/exit.c             |    5
 kernel/fork.c             |    4
 kernel/posix-cpu-timers.c |   34
 kernel/relay.c            |  205
 kernel/sched.c            | 3021
 kernel/sched_debug.c      |  275
 kernel/sched_fair.c       | 1131
 kernel/sched_idletask.c   |   71
 kernel/sched_rt.c         |  255
 kernel/sched_stats.h      |  235
 kernel/softirq.c          |    1
 kernel/sysctl.c           |   80
 13 files changed, 3374 insertions(+), 1953 deletions(-)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c0148ae992..81e6978296 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void)
99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
100{ 100{
101 s64 tmp; 101 s64 tmp;
102 struct timespec ts; 102 unsigned long t1;
103 unsigned long t1,t2,t3; 103 unsigned long long t2, t3;
104 unsigned long flags; 104 unsigned long flags;
105 struct timespec ts;
105 106
106 /* Though tsk->delays accessed later, early exit avoids 107 /* Though tsk->delays accessed later, early exit avoids
107 * unnecessary returning of other data 108 * unnecessary returning of other data
@@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
124 125
125 d->cpu_count += t1; 126 d->cpu_count += t1;
126 127
127 jiffies_to_timespec(t2, &ts); 128 tmp = (s64)d->cpu_delay_total + t2;
128 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; 129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
130 130
131 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; 131 tmp = (s64)d->cpu_run_virtual_total + t3;
132 d->cpu_run_virtual_total = 132 d->cpu_run_virtual_total =
133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; 133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
134 134
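
With the scheduler now reporting runtime directly in nanoseconds, the hunk above lets __delayacct_add_tsk() add t2 and t3 into the 64-bit totals as-is, dropping the jiffies-to-timespec conversion while keeping the check that restarts a counter at zero if the signed sum overflows. A minimal user-space sketch of that saturating accumulation pattern (the function and variable names are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

/* Add delta_ns to total; on signed-64-bit overflow, restart from zero
 * instead of wrapping, mirroring the pattern used in delayacct.c. */
static uint64_t accumulate_delay(uint64_t total, uint64_t delta_ns)
{
	int64_t tmp = (int64_t)total + (int64_t)delta_ns;

	return (tmp < (int64_t)total) ? 0 : (uint64_t)tmp;
}

int main(void)
{
	uint64_t cpu_delay_total = 0;

	cpu_delay_total = accumulate_delay(cpu_delay_total, 1500000);	/* 1.5 ms runqueue delay */
	cpu_delay_total = accumulate_delay(cpu_delay_total, 250000);	/* 0.25 ms more */
	printf("cpu_delay_total = %llu ns\n", (unsigned long long)cpu_delay_total);
	return 0;
}
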
diff --git a/kernel/exit.c b/kernel/exit.c
index 5c8ecbaa19..ca6a11b730 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->maj_flt += tsk->maj_flt; 122 sig->maj_flt += tsk->maj_flt;
123 sig->nvcsw += tsk->nvcsw; 123 sig->nvcsw += tsk->nvcsw;
124 sig->nivcsw += tsk->nivcsw; 124 sig->nivcsw += tsk->nivcsw;
125 sig->sched_time += tsk->sched_time;
126 sig->inblock += task_io_get_inblock(tsk); 125 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 126 sig->oublock += task_io_get_oublock(tsk);
127 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
128 sig = NULL; /* Marker for below. */ 128 sig = NULL; /* Marker for below. */
129 } 129 }
130 130
@@ -182,7 +182,6 @@ repeat:
182 zap_leader = (leader->exit_signal == -1); 182 zap_leader = (leader->exit_signal == -1);
183 } 183 }
184 184
185 sched_exit(p);
186 write_unlock_irq(&tasklist_lock); 185 write_unlock_irq(&tasklist_lock);
187 proc_flush_task(p); 186 proc_flush_task(p);
188 release_thread(p); 187 release_thread(p);
@@ -291,7 +290,7 @@ static void reparent_to_kthreadd(void)
291 /* Set the exit signal to SIGCHLD so we signal init on exit */ 290 /* Set the exit signal to SIGCHLD so we signal init on exit */
292 current->exit_signal = SIGCHLD; 291 current->exit_signal = SIGCHLD;
293 292
294 if (!has_rt_policy(current) && (task_nice(current) < 0)) 293 if (task_nice(current) < 0)
295 set_user_nice(current, 0); 294 set_user_nice(current, 0);
296 /* cpus_allowed? */ 295 /* cpus_allowed? */
297 /* rt_priority? */ 296 /* rt_priority? */
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cda1b..da3a155bba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,7 +877,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
880 sig->sched_time = 0; 880 sig->sum_sched_runtime = 0;
881 INIT_LIST_HEAD(&sig->cpu_timers[0]); 881 INIT_LIST_HEAD(&sig->cpu_timers[0]);
882 INIT_LIST_HEAD(&sig->cpu_timers[1]); 882 INIT_LIST_HEAD(&sig->cpu_timers[1]);
883 INIT_LIST_HEAD(&sig->cpu_timers[2]); 883 INIT_LIST_HEAD(&sig->cpu_timers[2]);
@@ -1040,7 +1040,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1040 1040
1041 p->utime = cputime_zero; 1041 p->utime = cputime_zero;
1042 p->stime = cputime_zero; 1042 p->stime = cputime_zero;
1043 p->sched_time = 0; 1043
1044#ifdef CONFIG_TASK_XACCT 1044#ifdef CONFIG_TASK_XACCT
1045 p->rchar = 0; /* I/O counter: bytes read */ 1045 p->rchar = 0; /* I/O counter: bytes read */
1046 p->wchar = 0; /* I/O counter: bytes written */ 1046 p->wchar = 0; /* I/O counter: bytes written */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e183..b53c8fcd9d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
161} 161}
162static inline unsigned long long sched_ns(struct task_struct *p) 162static inline unsigned long long sched_ns(struct task_struct *p)
163{ 163{
164 return (p == current) ? current_sched_time(p) : p->sched_time; 164 return task_sched_runtime(p);
165} 165}
166 166
167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
246 } while (t != p); 246 } while (t != p);
247 break; 247 break;
248 case CPUCLOCK_SCHED: 248 case CPUCLOCK_SCHED:
249 cpu->sched = p->signal->sched_time; 249 cpu->sched = p->signal->sum_sched_runtime;
250 /* Add in each other live thread. */ 250 /* Add in each other live thread. */
251 while ((t = next_thread(t)) != p) { 251 while ((t = next_thread(t)) != p) {
252 cpu->sched += t->sched_time; 252 cpu->sched += t->se.sum_exec_runtime;
253 } 253 }
254 cpu->sched += sched_ns(p); 254 cpu->sched += sched_ns(p);
255 break; 255 break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
422 */ 422 */
423static void cleanup_timers(struct list_head *head, 423static void cleanup_timers(struct list_head *head,
424 cputime_t utime, cputime_t stime, 424 cputime_t utime, cputime_t stime,
425 unsigned long long sched_time) 425 unsigned long long sum_exec_runtime)
426{ 426{
427 struct cpu_timer_list *timer, *next; 427 struct cpu_timer_list *timer, *next;
428 cputime_t ptime = cputime_add(utime, stime); 428 cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
451 ++head; 451 ++head;
452 list_for_each_entry_safe(timer, next, head, entry) { 452 list_for_each_entry_safe(timer, next, head, entry) {
453 list_del_init(&timer->entry); 453 list_del_init(&timer->entry);
454 if (timer->expires.sched < sched_time) { 454 if (timer->expires.sched < sum_exec_runtime) {
455 timer->expires.sched = 0; 455 timer->expires.sched = 0;
456 } else { 456 } else {
457 timer->expires.sched -= sched_time; 457 timer->expires.sched -= sum_exec_runtime;
458 } 458 }
459 } 459 }
460} 460}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
467void posix_cpu_timers_exit(struct task_struct *tsk) 467void posix_cpu_timers_exit(struct task_struct *tsk)
468{ 468{
469 cleanup_timers(tsk->cpu_timers, 469 cleanup_timers(tsk->cpu_timers,
470 tsk->utime, tsk->stime, tsk->sched_time); 470 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
471 471
472} 472}
473void posix_cpu_timers_exit_group(struct task_struct *tsk) 473void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
475 cleanup_timers(tsk->signal->cpu_timers, 475 cleanup_timers(tsk->signal->cpu_timers,
476 cputime_add(tsk->utime, tsk->signal->utime), 476 cputime_add(tsk->utime, tsk->signal->utime),
477 cputime_add(tsk->stime, tsk->signal->stime), 477 cputime_add(tsk->stime, tsk->signal->stime),
478 tsk->sched_time + tsk->signal->sched_time); 478 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
479} 479}
480 480
481 481
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
536 nsleft = max_t(unsigned long long, nsleft, 1); 536 nsleft = max_t(unsigned long long, nsleft, 1);
537 do { 537 do {
538 if (likely(!(t->flags & PF_EXITING))) { 538 if (likely(!(t->flags & PF_EXITING))) {
539 ns = t->sched_time + nsleft; 539 ns = t->se.sum_exec_runtime + nsleft;
540 if (t->it_sched_expires == 0 || 540 if (t->it_sched_expires == 0 ||
541 t->it_sched_expires > ns) { 541 t->it_sched_expires > ns) {
542 t->it_sched_expires = ns; 542 t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
1004 struct cpu_timer_list *t = list_first_entry(timers, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 1008 tsk->it_sched_expires = t->expires.sched;
1009 break; 1009 break;
1010 } 1010 }
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
1024 int maxfire; 1024 int maxfire;
1025 struct signal_struct *const sig = tsk->signal; 1025 struct signal_struct *const sig = tsk->signal;
1026 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1026 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1027 unsigned long long sched_time, sched_expires; 1027 unsigned long long sum_sched_runtime, sched_expires;
1028 struct task_struct *t; 1028 struct task_struct *t;
1029 struct list_head *timers = sig->cpu_timers; 1029 struct list_head *timers = sig->cpu_timers;
1030 1030
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
1044 */ 1044 */
1045 utime = sig->utime; 1045 utime = sig->utime;
1046 stime = sig->stime; 1046 stime = sig->stime;
1047 sched_time = sig->sched_time; 1047 sum_sched_runtime = sig->sum_sched_runtime;
1048 t = tsk; 1048 t = tsk;
1049 do { 1049 do {
1050 utime = cputime_add(utime, t->utime); 1050 utime = cputime_add(utime, t->utime);
1051 stime = cputime_add(stime, t->stime); 1051 stime = cputime_add(stime, t->stime);
1052 sched_time += t->sched_time; 1052 sum_sched_runtime += t->se.sum_exec_runtime;
1053 t = next_thread(t); 1053 t = next_thread(t);
1054 } while (t != tsk); 1054 } while (t != tsk);
1055 ptime = cputime_add(utime, stime); 1055 ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
1090 struct cpu_timer_list *t = list_first_entry(timers, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sum_sched_runtime < t->expires.sched) {
1094 sched_expires = t->expires.sched; 1094 sched_expires = t->expires.sched;
1095 break; 1095 break;
1096 } 1096 }
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
1182 virt_left = cputime_sub(virt_expires, utime); 1182 virt_left = cputime_sub(virt_expires, utime);
1183 virt_left = cputime_div_non_zero(virt_left, nthreads); 1183 virt_left = cputime_div_non_zero(virt_left, nthreads);
1184 if (sched_expires) { 1184 if (sched_expires) {
1185 sched_left = sched_expires - sched_time; 1185 sched_left = sched_expires - sum_sched_runtime;
1186 do_div(sched_left, nthreads); 1186 do_div(sched_left, nthreads);
1187 sched_left = max_t(unsigned long long, sched_left, 1); 1187 sched_left = max_t(unsigned long long, sched_left, 1);
1188 } else { 1188 } else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
1208 t->it_virt_expires = ticks; 1208 t->it_virt_expires = ticks;
1209 } 1209 }
1210 1210
1211 sched = t->sched_time + sched_left; 1211 sched = t->se.sum_exec_runtime + sched_left;
1212 if (sched_expires && (t->it_sched_expires == 0 || 1212 if (sched_expires && (t->it_sched_expires == 0 ||
1213 t->it_sched_expires > sched)) { 1213 t->it_sched_expires > sched)) {
1214 t->it_sched_expires = sched; 1214 t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1300 1300
1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1302 (tsk->it_sched_expires == 0 || 1302 (tsk->it_sched_expires == 0 ||
1303 tsk->sched_time < tsk->it_sched_expires)) 1303 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1304 return; 1304 return;
1305 1305
1306#undef UNEXPIRED 1306#undef UNEXPIRED
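
The posix-cpu-timers changes above switch the CPUCLOCK_SCHED clock from the old jiffies-granular sched_time to the scheduler's nanosecond sum_exec_runtime (plus task_sched_runtime() for a live task). From user space these are the per-thread and per-process CPU-time clocks read through clock_gettime(); a minimal reader, assuming the usual mapping of CLOCK_THREAD_CPUTIME_ID onto the scheduler runtime clock, might look like this:

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* Burn a little CPU so there is a non-trivial runtime to report. */
	for (volatile unsigned long i = 0; i < 50000000UL; i++)
		;

	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts)) {
		perror("clock_gettime");
		return 1;
	}
	printf("thread CPU time: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}

On older glibc this needs to be linked with -lrt.
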
diff --git a/kernel/relay.c b/kernel/relay.c
index 95db8c79fe..3b299fb385 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -21,6 +21,7 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/splice.h>
24 25
25/* list of open channels, for cpu hotplug */ 26/* list of open channels, for cpu hotplug */
26static DEFINE_MUTEX(relay_channels_mutex); 27static DEFINE_MUTEX(relay_channels_mutex);
@@ -121,6 +122,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
121 buf->page_array[i] = alloc_page(GFP_KERNEL); 122 buf->page_array[i] = alloc_page(GFP_KERNEL);
122 if (unlikely(!buf->page_array[i])) 123 if (unlikely(!buf->page_array[i]))
123 goto depopulate; 124 goto depopulate;
125 set_page_private(buf->page_array[i], (unsigned long)buf);
124 } 126 }
125 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); 127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
126 if (!mem) 128 if (!mem)
@@ -970,43 +972,6 @@ static int subbuf_read_actor(size_t read_start,
970 return ret; 972 return ret;
971} 973}
972 974
973/*
974 * subbuf_send_actor - send up to one subbuf's worth of data
975 */
976static int subbuf_send_actor(size_t read_start,
977 struct rchan_buf *buf,
978 size_t avail,
979 read_descriptor_t *desc,
980 read_actor_t actor)
981{
982 unsigned long pidx, poff;
983 unsigned int subbuf_pages;
984 int ret = 0;
985
986 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
987 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
988 poff = read_start & ~PAGE_MASK;
989 while (avail) {
990 struct page *p = buf->page_array[pidx];
991 unsigned int len;
992
993 len = PAGE_SIZE - poff;
994 if (len > avail)
995 len = avail;
996
997 len = actor(desc, p, poff, len);
998 if (desc->error)
999 break;
1000
1001 avail -= len;
1002 ret += len;
1003 poff = 0;
1004 pidx = (pidx + 1) % subbuf_pages;
1005 }
1006
1007 return ret;
1008}
1009
1010typedef int (*subbuf_actor_t) (size_t read_start, 975typedef int (*subbuf_actor_t) (size_t read_start,
1011 struct rchan_buf *buf, 976 struct rchan_buf *buf,
1012 size_t avail, 977 size_t avail,
@@ -1067,19 +1032,159 @@ static ssize_t relay_file_read(struct file *filp,
1067 NULL, &desc); 1032 NULL, &desc);
1068} 1033}
1069 1034
1070static ssize_t relay_file_sendfile(struct file *filp, 1035static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
1071 loff_t *ppos,
1072 size_t count,
1073 read_actor_t actor,
1074 void *target)
1075{ 1036{
1076 read_descriptor_t desc; 1037 rbuf->bytes_consumed += bytes_consumed;
1077 desc.written = 0; 1038
1078 desc.count = count; 1039 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
1079 desc.arg.data = target; 1040 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
1080 desc.error = 0; 1041 rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
1081 return relay_file_read_subbufs(filp, ppos, subbuf_send_actor, 1042 }
1082 actor, &desc); 1043}
1044
1045static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1046 struct pipe_buffer *buf)
1047{
1048 struct rchan_buf *rbuf;
1049
1050 rbuf = (struct rchan_buf *)page_private(buf->page);
1051 relay_consume_bytes(rbuf, buf->private);
1052}
1053
1054static struct pipe_buf_operations relay_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = relay_pipe_buf_release,
1060 .steal = generic_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064/**
1065 * subbuf_splice_actor - splice up to one subbuf's worth of data
1066 */
1067static int subbuf_splice_actor(struct file *in,
1068 loff_t *ppos,
1069 struct pipe_inode_info *pipe,
1070 size_t len,
1071 unsigned int flags,
1072 int *nonpad_ret)
1073{
1074 unsigned int pidx, poff, total_len, subbuf_pages, ret;
1075 struct rchan_buf *rbuf = in->private_data;
1076 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1077 size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size;
1078 size_t read_subbuf = read_start / subbuf_size;
1079 size_t padding = rbuf->padding[read_subbuf];
1080 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1081 struct page *pages[PIPE_BUFFERS];
1082 struct partial_page partial[PIPE_BUFFERS];
1083 struct splice_pipe_desc spd = {
1084 .pages = pages,
1085 .nr_pages = 0,
1086 .partial = partial,
1087 .flags = flags,
1088 .ops = &relay_pipe_buf_ops,
1089 };
1090
1091 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1092 return 0;
1093
1094 /*
1095 * Adjust read len, if longer than what is available
1096 */
1097 if (len > (subbuf_size - read_start % subbuf_size))
1098 len = subbuf_size - read_start % subbuf_size;
1099
1100 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1101 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1102 poff = read_start & ~PAGE_MASK;
1103
1104 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
1105 unsigned int this_len, this_end, private;
1106 unsigned int cur_pos = read_start + total_len;
1107
1108 if (!len)
1109 break;
1110
1111 this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
1112 private = this_len;
1113
1114 spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
1115 spd.partial[spd.nr_pages].offset = poff;
1116
1117 this_end = cur_pos + this_len;
1118 if (this_end >= nonpad_end) {
1119 this_len = nonpad_end - cur_pos;
1120 private = this_len + padding;
1121 }
1122 spd.partial[spd.nr_pages].len = this_len;
1123 spd.partial[spd.nr_pages].private = private;
1124
1125 len -= this_len;
1126 total_len += this_len;
1127 poff = 0;
1128 pidx = (pidx + 1) % subbuf_pages;
1129
1130 if (this_end >= nonpad_end) {
1131 spd.nr_pages++;
1132 break;
1133 }
1134 }
1135
1136 if (!spd.nr_pages)
1137 return 0;
1138
1139 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1140 if (ret < 0 || ret < total_len)
1141 return ret;
1142
1143 if (read_start + ret == nonpad_end)
1144 ret += padding;
1145
1146 return ret;
1147}
1148
1149static ssize_t relay_file_splice_read(struct file *in,
1150 loff_t *ppos,
1151 struct pipe_inode_info *pipe,
1152 size_t len,
1153 unsigned int flags)
1154{
1155 ssize_t spliced;
1156 int ret;
1157 int nonpad_ret = 0;
1158
1159 ret = 0;
1160 spliced = 0;
1161
1162 while (len) {
1163 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1164 if (ret < 0)
1165 break;
1166 else if (!ret) {
1167 if (spliced)
1168 break;
1169 if (flags & SPLICE_F_NONBLOCK) {
1170 ret = -EAGAIN;
1171 break;
1172 }
1173 }
1174
1175 *ppos += ret;
1176 if (ret > len)
1177 len = 0;
1178 else
1179 len -= ret;
1180 spliced += nonpad_ret;
1181 nonpad_ret = 0;
1182 }
1183
1184 if (spliced)
1185 return spliced;
1186
1187 return ret;
1083} 1188}
1084 1189
1085const struct file_operations relay_file_operations = { 1190const struct file_operations relay_file_operations = {
@@ -1089,7 +1194,7 @@ const struct file_operations relay_file_operations = {
1089 .read = relay_file_read, 1194 .read = relay_file_read,
1090 .llseek = no_llseek, 1195 .llseek = no_llseek,
1091 .release = relay_file_release, 1196 .release = relay_file_release,
1092 .sendfile = relay_file_sendfile, 1197 .splice_read = relay_file_splice_read,
1093}; 1198};
1094EXPORT_SYMBOL_GPL(relay_file_operations); 1199EXPORT_SYMBOL_GPL(relay_file_operations);
1095 1200
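
With .sendfile replaced by .splice_read, relay consumers can move whole sub-buffer pages into a pipe instead of copying them through a read buffer; the pipe_buffer release hook then credits the consumed bytes back via relay_subbufs_consumed(). A user-space sketch of draining a relay file with splice(2); the path is purely illustrative, since real relay files are created by whatever subsystem owns the channel:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/example/cpu0", O_RDONLY);	/* example path */
	int pfd[2];
	char buf[4096];
	ssize_t n;

	if (fd < 0 || pipe(pfd) < 0) {
		perror("setup");
		return 1;
	}

	/* Move up to 64KB of relay data into the pipe without an intermediate
	 * user-space copy, then drain the pipe conventionally. */
	n = splice(fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
	while (n > 0) {
		ssize_t got = read(pfd[0], buf, sizeof(buf));

		if (got <= 0)
			break;
		fwrite(buf, 1, (size_t)got, stdout);
		n -= got;
	}

	close(pfd[0]);
	close(pfd[1]);
	close(fd);
	return 0;
}
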
diff --git a/kernel/sched.c b/kernel/sched.c
index 50e1a31226..9fbced64bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
151
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
214 158
215static inline unsigned int task_timeslice(struct task_struct *p) 159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
223 198
224struct prio_array { 199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
225 unsigned int nr_active; 200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 201 * (like users, containers etc.)
227 struct list_head queue[MAX_PRIO]; 202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
209
210/* Real-Time classes' related field in a runqueue: */
211struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
229 216
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -307,6 +304,11 @@ struct rq {
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
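
__rq_clock() above filters raw sched_clock() values so the per-runqueue clock never moves backwards and never jumps forward by more than about two ticks at once, counting how often each clamp fires. A small user-space model of that filter, fed with synthetic raw timestamps (the 2 ms tick length is an assumption for the example, standing in for TICK_NSEC):

#include <stdint.h>
#include <stdio.h>

#define TICK_NS 2000000LL	/* example tick length, in place of TICK_NSEC */

struct model_clock {
	uint64_t clock, prev_raw;
	unsigned int warps, overflows;
};

/* Same clamping as __rq_clock(): ignore backward jumps, cap large forward jumps. */
static uint64_t advance(struct model_clock *c, uint64_t raw_now)
{
	int64_t delta = (int64_t)(raw_now - c->prev_raw);

	if (delta < 0) {
		c->clock++;
		c->warps++;
	} else if (delta > 2 * TICK_NS) {
		c->clock++;
		c->overflows++;
	} else {
		c->clock += (uint64_t)delta;
	}
	c->prev_raw = raw_now;
	return c->clock;
}

int main(void)
{
	struct model_clock c = { 0, 0, 0, 0 };
	uint64_t raw[] = { 1000000, 3000000, 2500000 /* backwards */, 90000000 /* huge jump */ };

	for (unsigned int i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
		printf("raw=%-9llu clock=%-8llu warps=%u overflows=%u\n",
		       (unsigned long long)raw[i],
		       (unsigned long long)advance(&c, raw[i]),
		       c.warps, c.overflows);
	return 0;
}
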
@@ -331,6 +379,18 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
334#ifndef prepare_arch_switch 394#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 395# define prepare_arch_switch(next) do { } while (0)
336#endif 396#endif
@@ -460,134 +520,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
461} 521}
462 522
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
591/* 523/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 524 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 525 */
@@ -603,177 +535,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 535 return rq;
604} 536}
605 537
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 538/*
608 * Called when a process is dequeued from the active array and given 539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 540 */
622static inline void sched_info_dequeued(struct task_struct *t) 541void sched_clock_unstable_event(void)
623{ 542{
624 t->sched_info.last_queued = 0; 543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
625} 550}
626 551
627/* 552/*
628 * Called when a task finally hits the cpu. We can now calculate how 553 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 554 *
630 * can keep stats on how long its timeslice is. 555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
631 */ 558 */
632static void sched_info_arrive(struct task_struct *t) 559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
633{ 566{
634 unsigned long now = jiffies, delta_jiffies = 0; 567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
635 573
636 if (t->sched_info.last_queued) 574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
637 delta_jiffies = now - t->sched_info.last_queued; 575
638 sched_info_dequeued(t); 576 cpu = task_cpu(p);
639 t->sched_info.run_delay += delta_jiffies; 577 if (cpu == smp_processor_id())
640 t->sched_info.last_arrival = now; 578 return;
641 t->sched_info.pcnt++;
642 579
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
644} 584}
645 585
646/* 586static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired
648 * array. The time is noted and later used to determine how long we
649 * had to wait for us to reach the cpu. Since the expired queue will
650 * become the active queue after active queue is empty, without dequeuing
651 * and requeuing any tasks, we are interested in queuing to either. It
652 * is unusual but not impossible for tasks to be dequeued and immediately
653 * requeued in the same or another array: this can happen in sched_yield(),
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
655 * to runqueue.
656 *
657 * This function is only called from enqueue_task(), but also only updates
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 587{
663 if (unlikely(sched_info_on())) 588 struct rq *rq = cpu_rq(cpu);
664 if (!t->sched_info.last_queued) 589 unsigned long flags;
665 t->sched_info.last_queued = jiffies; 590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
666} 595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601}
602#endif
667 603
668/* 604static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 605{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 606#if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
675 610
676 t->sched_info.cpu_time += delta_jiffies; 611 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 612#else
613 return divident / divisor;
614#endif
678} 615}
679 616
680/* 617#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 618# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 619#else
683 * the idle task.) We are only called when prev != next. 620# define WMULT_CONST (1UL << 32)
684 */ 621#endif
685static inline void 622
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
687{ 628{
688 struct rq *rq = task_rq(prev); 629 u64 tmp;
689 630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
690 /* 635 /*
691 * prev now departs the cpu. It's not interesting to record 636 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 637 */
695 if (prev != rq->idle) 638 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
697 644
698 if (next != rq->idle) 645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706} 646}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 647
712/* 648static inline unsigned long
713 * Adding/removing a task to/from a priority array: 649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 650{
717 array->nr_active--; 651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 652}
722 653
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 654static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 655{
725 sched_info_queued(p); 656 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 657 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 658}
731 659
732/* 660static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 661{
738 list_move_tail(&p->run_list, array->queue + p->prio); 662 lw->weight -= dec;
663 lw->inv_weight = 0;
739} 664}
740 665
741static inline void 666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 667{
744 list_add(&p->run_list, array->queue + p->prio); 668 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 669 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 671 ls->delta_stat = 0;
672 }
748} 673}
749 674
750/* 675/*
751 * __normal_prio - return the priority that is based on the static 676 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 * 677 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 678 * delta_fair clock advances at a rate inversely proportional to
755 * into the -5 ... 0 ... +5 bonus/penalty range. 679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
756 * 682 *
757 * We use 25% of the full 0...39 priority range so that: 683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
758 * 686 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 687 * This function is called /before/ updating rq->ls.load
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 688 * and when switching tasks.
761 *
762 * Both properties are important to certain workloads.
763 */ 689 */
764 690static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 691{
767 int bonus, prio; 692 struct load_stat *ls = &rq->ls;
768 693 u64 start;
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
770 694
771 prio = p->static_prio - bonus; 695 start = ls->load_update_start;
772 if (prio < MAX_RT_PRIO) 696 ls->load_update_start = now;
773 prio = MAX_RT_PRIO; 697 ls->delta_stat += now - start;
774 if (prio > MAX_PRIO-1) 698 /*
775 prio = MAX_PRIO-1; 699 * Stagger updates to ls->delta_fair. Very frequent updates
776 return prio; 700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
777} 704}
778 705
779/* 706/*
@@ -791,53 +718,146 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 718 * this code will need modification
792 */ 719 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 721#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 723#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 724 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 727
801static void set_load_weight(struct task_struct *p) 728#define WEIGHT_IDLEPRIO 2
802{ 729#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 730
804#ifdef CONFIG_SMP 731/*
805 if (p == task_rq(p)->migration_thread) 732 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 735 * that remained on nice 0.
809 * adversely. 736 *
810 */ 737 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 739 * it's +10% CPU usage.
813#endif 740 */
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 741static const int prio_to_weight[40] = {
815 } else 742/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 743/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817} 744/* 0 */ NICE_0_LOAD /* 1024 */,
745/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
746/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
747};
748
749static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573,
751 184467, 230589, 288233, 360285, 450347,
752 562979, 703746, 879575, 1099582, 1374389,
 753 1717986, 2147483, 2684354, 3355443, 4194304,
 754 5244160, 6557201, 8196502, 10250518, 12782640,
755 16025997, 19976592, 24970740, 31350126, 39045157,
756 49367440, 61356675, 76695844, 95443717, 119304647,
757 148102320, 186737708, 238609294, 286331153,
758};
818 759
819static inline void 760static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 761inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 762{
822 rq->raw_weighted_load += p->load_weight; 763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
823} 765}
824 766
825static inline void 767static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 768dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 769{
828 rq->raw_weighted_load -= p->load_weight; 770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 772}
830 773
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 774static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 775{
833 rq->nr_running++; 776 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 777 inc_load(rq, p, now);
835} 778}
836 779
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 780static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 781{
839 rq->nr_running--; 782 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 783 dec_load(rq, p, now);
784}
785
786static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
787
788/*
789 * runqueue iterator, to support SMP load-balancing between different
790 * scheduling classes, without having to expose their internal data
791 * structures to the load-balancing proper:
792 */
793struct rq_iterator {
794 void *arg;
795 struct task_struct *(*start)(void *);
796 struct task_struct *(*next)(void *);
797};
798
799static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
805
806#include "sched_stats.h"
807#include "sched_rt.c"
808#include "sched_fair.c"
809#include "sched_idletask.c"
810#ifdef CONFIG_SCHED_DEBUG
811# include "sched_debug.c"
812#endif
813
814#define sched_class_highest (&rt_sched_class)
815
816static void set_load_weight(struct task_struct *p)
817{
818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
820
821 if (task_has_rt_policy(p)) {
822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
825 }
826
827 /*
828 * SCHED_IDLE tasks get minimal weight:
829 */
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
834 }
835
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
838}
839
840static void
841enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
842{
843 sched_info_queued(p);
844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
846}
847
848static void
849dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
850{
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
853}
854
855/*
856 * __normal_prio - return the priority that is based on the static prio
857 */
858static inline int __normal_prio(struct task_struct *p)
859{
860 return p->static_prio;
841} 861}
842 862
843/* 863/*
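
Two pieces of the hunk above are worth seeing with numbers: the prio_to_weight[] table implements the "10% effect" described in its comment, and calc_delta_mine() replaces a per-call 64-bit division with a cached inverse (inv_weight = 2^32 / weight) plus a multiply and shift. The sketch below is user-space arithmetic only, using the table values shown in the patch (1024 for nice 0, 819 for nice 1); it omits the overflow guard the kernel version has:

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST	(1ULL << 32)
#define WMULT_SHIFT	32
#define NICE_0_LOAD	1024ULL

/* delta * NICE_0_LOAD / weight, done the calc_delta_mine() way:
 * one division to build inv_weight, then multiply + shift per call. */
static uint64_t calc_delta(uint64_t delta_exec, uint64_t weight)
{
	uint64_t inv_weight = WMULT_CONST / weight;

	return (delta_exec * NICE_0_LOAD * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	const double w_nice0 = 1024.0, w_nice1 = 819.0;	/* prio_to_weight[] entries */
	double total = w_nice0 + w_nice1;

	/* Two CPU-bound tasks, nice 0 vs nice 1: roughly 55.6% vs 44.4% of the
	 * CPU, about a ten-point gap, which is one way to read the "10% effect". */
	printf("CPU shares: nice 0 %.1f%%, nice 1 %.1f%%\n",
	       100.0 * w_nice0 / total, 100.0 * w_nice1 / total);

	/* The fixed-point scaling agrees with exact division to within rounding: */
	printf("10ms scaled by weight 819: fixed-point %llu ns, exact %llu ns\n",
	       (unsigned long long)calc_delta(10000000, 819),
	       (unsigned long long)(10000000ULL * NICE_0_LOAD / 819));
	return 0;
}
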
@@ -851,7 +871,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 871{
852 int prio; 872 int prio;
853 873
854 if (has_rt_policy(p)) 874 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 875 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 876 else
857 prio = __normal_prio(p); 877 prio = __normal_prio(p);
@@ -879,222 +899,47 @@ static int effective_prio(struct task_struct *p)
879} 899}
880 900
881/* 901/*
882 * __activate_task - move a task to the runqueue. 902 * activate_task - move a task to the runqueue.
883 */
884static void __activate_task(struct task_struct *p, struct rq *rq)
885{
886 struct prio_array *target = rq->active;
887
888 if (batch_task(p))
889 target = rq->expired;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
898{
899 enqueue_task_head(p, rq->active);
900 inc_nr_running(p, rq);
901}
902
903/*
904 * Recalculate p->normal_prio and p->prio after having slept,
905 * updating the sleep-average too:
906 */ 903 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now) 904static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
908{ 905{
909 /* Caller must always ensure 'now >= p->timestamp' */ 906 u64 now = rq_clock(rq);
910 unsigned long sleep_time = now - p->timestamp;
911
912 if (batch_task(p))
913 sleep_time = 0;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954 907
955 /* 908 if (p->state == TASK_UNINTERRUPTIBLE)
956 * This code gives a bonus to interactive tasks. 909 rq->nr_uninterruptible--;
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 910
970 return effective_prio(p); 911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
971} 913}
972 914
973/* 915/*
974 * activate_task - move a task to the runqueue and do priority recalculation 916 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 917 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 918static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 919{
981 unsigned long long now; 920 u64 now = rq_clock(rq);
982 921
983 if (rt_task(p)) 922 if (p->state == TASK_UNINTERRUPTIBLE)
984 goto out; 923 rq->nr_uninterruptible--;
985
986 now = sched_clock();
987#ifdef CONFIG_SMP
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995
996 /*
997 * Sleep time is in units of nanosecs, so shift by 20 to get a
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006
1007 p->prio = recalc_task_prio(p, now);
1008 924
1009 /* 925 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 926 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 927}
1035 928
1036/* 929/*
1037 * deactivate_task - remove a task from the runqueue. 930 * deactivate_task - remove a task from the runqueue.
1038 */ 931 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 932static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{
1041 dec_nr_running(p, rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045
1046/*
1047 * resched_task - mark a task 'to be rescheduled now'.
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054
1055#ifndef tsk_is_polling
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{ 933{
1061 int cpu; 934 u64 now = rq_clock(rq);
1062 935
1063 assert_spin_locked(&task_rq(p)->lock); 936 if (p->state == TASK_UNINTERRUPTIBLE)
937 rq->nr_uninterruptible++;
1064 938
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 939 dequeue_task(rq, p, sleep, now);
1066 return; 940 dec_nr_running(p, rq, now);
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078} 941}
1079 942
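The rewritten activate_task()/deactivate_task() above reduce to bookkeeping: adjust rq->nr_uninterruptible when an uninterruptible sleeper enters or leaves the runqueue, then delegate to enqueue_task()/dequeue_task() plus the nr_running counters. A self-contained sketch of just that accounting, with toy types standing in for the kernel's:

/* Toy bookkeeping, not kernel code: the counters maintained around
 * enqueue/dequeue by activate_task() and deactivate_task() above. */
#include <stdio.h>

enum toy_state { TOY_RUNNING, TOY_UNINTERRUPTIBLE };

struct toy_task { enum toy_state state; int on_rq; };
struct toy_rq { long nr_running, nr_uninterruptible; };

static void toy_activate(struct toy_rq *rq, struct toy_task *p)
{
	if (p->state == TOY_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;	/* an uninterruptible sleeper wakes */
	p->on_rq = 1;
	rq->nr_running++;			/* stands in for enqueue + inc_nr_running */
}

static void toy_deactivate(struct toy_rq *rq, struct toy_task *p)
{
	if (p->state == TOY_UNINTERRUPTIBLE)
		rq->nr_uninterruptible++;	/* going into uninterruptible sleep */
	p->on_rq = 0;
	rq->nr_running--;			/* stands in for dequeue + dec_nr_running */
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };
	struct toy_task t = { TOY_RUNNING, 0 };

	toy_activate(&rq, &t);			/* task becomes runnable */
	t.state = TOY_UNINTERRUPTIBLE;
	toy_deactivate(&rq, &t);		/* it blocks, e.g. on I/O */
	printf("nr_running=%ld nr_uninterruptible=%ld\n",
	       rq.nr_running, rq.nr_uninterruptible);
	return 0;
}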
1080static void resched_cpu(int cpu)
1081{
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084
1085 if (!spin_trylock_irqsave(&rq->lock, flags))
1086 return;
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095}
1096#endif
1097
1098/** 943/**
1099 * task_curr - is this task currently executing on a CPU? 944 * task_curr - is this task currently executing on a CPU?
1100 * @p: the task in question. 945 * @p: the task in question.
@@ -1107,10 +952,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 952/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 953unsigned long weighted_cpuload(const int cpu)
1109{ 954{
1110 return cpu_rq(cpu)->raw_weighted_load; 955 return cpu_rq(cpu)->ls.load.weight;
956}
957
958static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
959{
960#ifdef CONFIG_SMP
961 task_thread_info(p)->cpu = cpu;
962 set_task_cfs_rq(p);
963#endif
1111} 964}
1112 965
1113#ifdef CONFIG_SMP 966#ifdef CONFIG_SMP
967
968void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
969{
970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
973
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
987
988 __set_task_cpu(p, new_cpu);
989}
990
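The new set_task_cpu() above rebases each per-task timestamp by the difference between the old and new runqueue clocks, so a value recorded against one CPU's clock stays the same distance in the past on the other CPU. A standalone sketch of that rebasing; the clock values and field names are made up for illustration:

/* Sketch of timestamp rebasing between two drifting per-CPU clocks. */
#include <stdio.h>
#include <stdint.h>

struct toy_rq { uint64_t clock; };

/* Rebase a timestamp taken on old_rq's clock onto new_rq's clock.
 * A value of 0 means "not set" and is left alone, like the kernel's
 * "if (p->se.wait_start)" guards. */
static uint64_t rebase(uint64_t stamp, const struct toy_rq *old_rq,
		       const struct toy_rq *new_rq)
{
	uint64_t offset = old_rq->clock - new_rq->clock;

	return stamp ? stamp - offset : 0;
}

int main(void)
{
	struct toy_rq cpu0 = { .clock = 1000000 };
	struct toy_rq cpu1 = { .clock =  400000 };
	uint64_t wait_start = 990000;	/* recorded 10000 ns ago on cpu0 */

	uint64_t migrated = rebase(wait_start, &cpu0, &cpu1);

	printf("wait_start on cpu1: %llu (still %llu ns in the past)\n",
	       (unsigned long long)migrated,
	       (unsigned long long)(cpu1.clock - migrated));
	return 0;
}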
1114struct migration_req { 991struct migration_req {
1115 struct list_head list; 992 struct list_head list;
1116 993
@@ -1133,7 +1010,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1010 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1011 * it is sufficient to simply update the task's cpu field.
1135 */ 1012 */
1136 if (!p->array && !task_running(rq, p)) { 1013 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1014 set_task_cpu(p, dest_cpu);
1138 return 0; 1015 return 0;
1139 } 1016 }
@@ -1158,9 +1035,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1035void wait_task_inactive(struct task_struct *p)
1159{ 1036{
1160 unsigned long flags; 1037 unsigned long flags;
1038 int running, on_rq;
1161 struct rq *rq; 1039 struct rq *rq;
1162 struct prio_array *array;
1163 int running;
1164 1040
1165repeat: 1041repeat:
1166 /* 1042 /*
@@ -1192,7 +1068,7 @@ repeat:
1192 */ 1068 */
1193 rq = task_rq_lock(p, &flags); 1069 rq = task_rq_lock(p, &flags);
1194 running = task_running(rq, p); 1070 running = task_running(rq, p);
1195 array = p->array; 1071 on_rq = p->se.on_rq;
1196 task_rq_unlock(rq, &flags); 1072 task_rq_unlock(rq, &flags);
1197 1073
1198 /* 1074 /*
@@ -1215,7 +1091,7 @@ repeat:
1215 * running right now), it's preempted, and we should 1091 * running right now), it's preempted, and we should
1216 * yield - it could be a while. 1092 * yield - it could be a while.
1217 */ 1093 */
1218 if (unlikely(array)) { 1094 if (unlikely(on_rq)) {
1219 yield(); 1095 yield();
1220 goto repeat; 1096 goto repeat;
1221 } 1097 }
@@ -1261,11 +1137,12 @@ void kick_process(struct task_struct *p)
1261static inline unsigned long source_load(int cpu, int type) 1137static inline unsigned long source_load(int cpu, int type)
1262{ 1138{
1263 struct rq *rq = cpu_rq(cpu); 1139 struct rq *rq = cpu_rq(cpu);
1140 unsigned long total = weighted_cpuload(cpu);
1264 1141
1265 if (type == 0) 1142 if (type == 0)
1266 return rq->raw_weighted_load; 1143 return total;
1267 1144
1268 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1145 return min(rq->cpu_load[type-1], total);
1269} 1146}
1270 1147
1271/* 1148/*
@@ -1275,11 +1152,12 @@ static inline unsigned long source_load(int cpu, int type)
1275static inline unsigned long target_load(int cpu, int type) 1152static inline unsigned long target_load(int cpu, int type)
1276{ 1153{
1277 struct rq *rq = cpu_rq(cpu); 1154 struct rq *rq = cpu_rq(cpu);
1155 unsigned long total = weighted_cpuload(cpu);
1278 1156
1279 if (type == 0) 1157 if (type == 0)
1280 return rq->raw_weighted_load; 1158 return total;
1281 1159
1282 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1160 return max(rq->cpu_load[type-1], total);
1283} 1161}
1284 1162
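source_load()/target_load() above keep the balancer conservative: when estimating a CPU we might pull from, take the smaller of the decayed cpu_load[] history and the instantaneous weighted load; when estimating a CPU we might push to, take the larger. A compact sketch of that asymmetry over toy data:

/* Toy data and helpers only; mirrors the min/max bias of the helpers above. */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

struct toy_cpu { unsigned long cpu_load[3]; unsigned long weighted_load; };

/* type 0 means "use the raw instantaneous load". */
static unsigned long toy_source_load(const struct toy_cpu *c, int type)
{
	if (type == 0)
		return c->weighted_load;
	return MIN(c->cpu_load[type - 1], c->weighted_load);
}

static unsigned long toy_target_load(const struct toy_cpu *c, int type)
{
	if (type == 0)
		return c->weighted_load;
	return MAX(c->cpu_load[type - 1], c->weighted_load);
}

int main(void)
{
	struct toy_cpu c = { { 2048, 1536, 1024 }, 3072 };

	/* Underestimate a pull source, overestimate a push target. */
	printf("source(type=2)=%lu target(type=2)=%lu\n",
	       toy_source_load(&c, 2), toy_target_load(&c, 2));
	return 0;
}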
1285/* 1163/*
@@ -1288,9 +1166,10 @@ static inline unsigned long target_load(int cpu, int type)
1288static inline unsigned long cpu_avg_load_per_task(int cpu) 1166static inline unsigned long cpu_avg_load_per_task(int cpu)
1289{ 1167{
1290 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu);
1291 unsigned long n = rq->nr_running; 1170 unsigned long n = rq->nr_running;
1292 1171
1293 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1172 return n ? total / n : SCHED_LOAD_SCALE;
1294} 1173}
1295 1174
1296/* 1175/*
@@ -1392,9 +1271,9 @@ static int sched_balance_self(int cpu, int flag)
1392 struct sched_domain *tmp, *sd = NULL; 1271 struct sched_domain *tmp, *sd = NULL;
1393 1272
1394 for_each_domain(cpu, tmp) { 1273 for_each_domain(cpu, tmp) {
1395 /* 1274 /*
1396 * If power savings logic is enabled for a domain, stop there. 1275 * If power savings logic is enabled for a domain, stop there.
1397 */ 1276 */
1398 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1399 break; 1278 break;
1400 if (tmp->flags & flag) 1279 if (tmp->flags & flag)
@@ -1477,9 +1356,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1477 if (idle_cpu(i)) 1356 if (idle_cpu(i))
1478 return i; 1357 return i;
1479 } 1358 }
1480 } 1359 } else {
1481 else
1482 break; 1360 break;
1361 }
1483 } 1362 }
1484 return cpu; 1363 return cpu;
1485} 1364}
@@ -1521,7 +1400,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (!(old_state & state)) 1400 if (!(old_state & state))
1522 goto out; 1401 goto out;
1523 1402
1524 if (p->array) 1403 if (p->se.on_rq)
1525 goto out_running; 1404 goto out_running;
1526 1405
1527 cpu = task_cpu(p); 1406 cpu = task_cpu(p);
@@ -1576,11 +1455,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1576 * of the current CPU: 1455 * of the current CPU:
1577 */ 1456 */
1578 if (sync) 1457 if (sync)
1579 tl -= current->load_weight; 1458 tl -= current->se.load.weight;
1580 1459
1581 if ((tl <= load && 1460 if ((tl <= load &&
1582 tl + target_load(cpu, idx) <= tl_per_task) || 1461 tl + target_load(cpu, idx) <= tl_per_task) ||
1583 100*(tl + p->load_weight) <= imbalance*load) { 1462 100*(tl + p->se.load.weight) <= imbalance*load) {
1584 /* 1463 /*
1585 * This domain has SD_WAKE_AFFINE and 1464 * This domain has SD_WAKE_AFFINE and
1586 * p is cache cold in this domain, and 1465 * p is cache cold in this domain, and
@@ -1614,7 +1493,7 @@ out_set_cpu:
1614 old_state = p->state; 1493 old_state = p->state;
1615 if (!(old_state & state)) 1494 if (!(old_state & state))
1616 goto out; 1495 goto out;
1617 if (p->array) 1496 if (p->se.on_rq)
1618 goto out_running; 1497 goto out_running;
1619 1498
1620 this_cpu = smp_processor_id(); 1499 this_cpu = smp_processor_id();
@@ -1623,25 +1502,7 @@ out_set_cpu:
1623 1502
1624out_activate: 1503out_activate:
1625#endif /* CONFIG_SMP */ 1504#endif /* CONFIG_SMP */
1626 if (old_state == TASK_UNINTERRUPTIBLE) { 1505 activate_task(rq, p, 1);
1627 rq->nr_uninterruptible--;
1628 /*
1629 * Tasks on involuntary sleep don't earn
1630 * sleep_avg beyond just interactive state.
1631 */
1632 p->sleep_type = SLEEP_NONINTERACTIVE;
1633 } else
1634
1635 /*
1636 * Tasks that have marked their sleep as noninteractive get
1637 * woken up with their sleep average not weighted in an
1638 * interactive way.
1639 */
1640 if (old_state & TASK_NONINTERACTIVE)
1641 p->sleep_type = SLEEP_NONINTERACTIVE;
1642
1643
1644 activate_task(p, rq, cpu == this_cpu);
1645 /* 1506 /*
1646 * Sync wakeups (i.e. those types of wakeups where the waker 1507 * Sync wakeups (i.e. those types of wakeups where the waker
1647 * has indicated that it will leave the CPU in short order) 1508 * has indicated that it will leave the CPU in short order)
@@ -1650,10 +1511,8 @@ out_activate:
1650 * the waker guarantees that the freshly woken up task is going 1511 * the waker guarantees that the freshly woken up task is going
1651 * to be considered on this CPU.) 1512 * to be considered on this CPU.)
1652 */ 1513 */
1653 if (!sync || cpu != this_cpu) { 1514 if (!sync || cpu != this_cpu)
1654 if (TASK_PREEMPTS_CURR(p, rq)) 1515 check_preempt_curr(rq, p);
1655 resched_task(rq->curr);
1656 }
1657 success = 1; 1516 success = 1;
1658 1517
1659out_running: 1518out_running:
@@ -1676,19 +1535,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1676 return try_to_wake_up(p, state, 0); 1535 return try_to_wake_up(p, state, 0);
1677} 1536}
1678 1537
1679static void task_running_tick(struct rq *rq, struct task_struct *p);
1680/* 1538/*
1681 * Perform scheduler related setup for a newly forked process p. 1539 * Perform scheduler related setup for a newly forked process p.
1682 * p is forked by current. 1540 * p is forked by current.
1683 */ 1541 *
1684void fastcall sched_fork(struct task_struct *p, int clone_flags) 1542 * __sched_fork() is basic setup used by init_idle() too:
1685{ 1543 */
1686 int cpu = get_cpu(); 1544static void __sched_fork(struct task_struct *p)
1545{
1546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
1687 1565
1688#ifdef CONFIG_SMP 1566 INIT_LIST_HEAD(&p->run_list);
1689 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1567 p->se.on_rq = 0;
1690#endif
1691 set_task_cpu(p, cpu);
1692 1568
1693 /* 1569 /*
1694 * We mark the process as running here, but have not actually 1570 * We mark the process as running here, but have not actually
@@ -1697,16 +1573,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1697 * event cannot wake it up and insert it on the runqueue either. 1573 * event cannot wake it up and insert it on the runqueue either.
1698 */ 1574 */
1699 p->state = TASK_RUNNING; 1575 p->state = TASK_RUNNING;
1576}
1577
1578/*
1579 * fork()/clone()-time setup:
1580 */
1581void sched_fork(struct task_struct *p, int clone_flags)
1582{
1583 int cpu = get_cpu();
1584
1585 __sched_fork(p);
1586
1587#ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589#endif
1590 __set_task_cpu(p, cpu);
1700 1591
1701 /* 1592 /*
1702 * Make sure we do not leak PI boosting priority to the child: 1593 * Make sure we do not leak PI boosting priority to the child:
1703 */ 1594 */
1704 p->prio = current->normal_prio; 1595 p->prio = current->normal_prio;
1705 1596
1706 INIT_LIST_HEAD(&p->run_list);
1707 p->array = NULL;
1708#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1709 if (unlikely(sched_info_on())) 1598 if (likely(sched_info_on()))
1710 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1711#endif 1600#endif
1712#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1601#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,34 +1605,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1716 /* Want to start with kernel preemption disabled. */ 1605 /* Want to start with kernel preemption disabled. */
1717 task_thread_info(p)->preempt_count = 1; 1606 task_thread_info(p)->preempt_count = 1;
1718#endif 1607#endif
1719 /*
1720 * Share the timeslice between parent and child, thus the
1721 * total amount of pending timeslices in the system doesn't change,
1722 * resulting in more scheduling fairness.
1723 */
1724 local_irq_disable();
1725 p->time_slice = (current->time_slice + 1) >> 1;
1726 /*
1727 * The remainder of the first timeslice might be recovered by
1728 * the parent if the child exits early enough.
1729 */
1730 p->first_time_slice = 1;
1731 current->time_slice >>= 1;
1732 p->timestamp = sched_clock();
1733 if (unlikely(!current->time_slice)) {
1734 /*
1735 * This case is rare, it happens when the parent has only
1736 * a single jiffy left from its timeslice. Taking the
1737 * runqueue lock is not a problem.
1738 */
1739 current->time_slice = 1;
1740 task_running_tick(cpu_rq(cpu), current);
1741 }
1742 local_irq_enable();
1743 put_cpu(); 1608 put_cpu();
1744} 1609}
1745 1610
1746/* 1611/*
1612 * After fork, child runs first. (default) If set to 0 then
1613 * parent will (try to) run first.
1614 */
1615unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617/*
1747 * wake_up_new_task - wake up a newly created task for the first time. 1618 * wake_up_new_task - wake up a newly created task for the first time.
1748 * 1619 *
1749 * This function will do some initial scheduler statistics housekeeping 1620 * This function will do some initial scheduler statistics housekeeping
@@ -1752,107 +1623,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1752 */ 1623 */
1753void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1624void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1754{ 1625{
1755 struct rq *rq, *this_rq;
1756 unsigned long flags; 1626 unsigned long flags;
1757 int this_cpu, cpu; 1627 struct rq *rq;
1628 int this_cpu;
1758 1629
1759 rq = task_rq_lock(p, &flags); 1630 rq = task_rq_lock(p, &flags);
1760 BUG_ON(p->state != TASK_RUNNING); 1631 BUG_ON(p->state != TASK_RUNNING);
1761 this_cpu = smp_processor_id(); 1632 this_cpu = smp_processor_id(); /* parent's CPU */
1762 cpu = task_cpu(p);
1763
1764 /*
1765 * We decrease the sleep average of forking parents
1766 * and children as well, to keep max-interactive tasks
1767 * from forking tasks that are max-interactive. The parent
1768 * (current) is done further down, under its lock.
1769 */
1770 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1771 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1772 1633
1773 p->prio = effective_prio(p); 1634 p->prio = effective_prio(p);
1774 1635
1775 if (likely(cpu == this_cpu)) { 1636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1776 if (!(clone_flags & CLONE_VM)) { 1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1777 /* 1638 activate_task(rq, p, 0);
1778 * The VM isn't cloned, so we're in a good position to
1779 * do child-runs-first in anticipation of an exec. This
1780 * usually avoids a lot of COW overhead.
1781 */
1782 if (unlikely(!current->array))
1783 __activate_task(p, rq);
1784 else {
1785 p->prio = current->prio;
1786 p->normal_prio = current->normal_prio;
1787 list_add_tail(&p->run_list, &current->run_list);
1788 p->array = current->array;
1789 p->array->nr_active++;
1790 inc_nr_running(p, rq);
1791 }
1792 set_need_resched();
1793 } else
1794 /* Run child last */
1795 __activate_task(p, rq);
1796 /*
1797 * We skip the following code due to cpu == this_cpu
1798 *
1799 * task_rq_unlock(rq, &flags);
1800 * this_rq = task_rq_lock(current, &flags);
1801 */
1802 this_rq = rq;
1803 } else { 1639 } else {
1804 this_rq = cpu_rq(this_cpu);
1805
1806 /* 1640 /*
1807 * Not the local CPU - must adjust timestamp. This should 1641 * Let the scheduling class do new task startup
1808 * get optimised away in the !CONFIG_SMP case. 1642 * management (if any):
1809 */ 1643 */
1810 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1644 p->sched_class->task_new(rq, p);
1811 + rq->most_recent_timestamp;
1812 __activate_task(p, rq);
1813 if (TASK_PREEMPTS_CURR(p, rq))
1814 resched_task(rq->curr);
1815
1816 /*
1817 * Parent and child are on different CPUs, now get the
1818 * parent runqueue to update the parent's ->sleep_avg:
1819 */
1820 task_rq_unlock(rq, &flags);
1821 this_rq = task_rq_lock(current, &flags);
1822 }
1823 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1824 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1825 task_rq_unlock(this_rq, &flags);
1826}
1827
1828/*
1829 * Potentially available exiting-child timeslices are
1830 * retrieved here - this way the parent does not get
1831 * penalized for creating too many threads.
1832 *
1833 * (this cannot be used to 'generate' timeslices
1834 * artificially, because any timeslice recovered here
1835 * was given away by the parent in the first place.)
1836 */
1837void fastcall sched_exit(struct task_struct *p)
1838{
1839 unsigned long flags;
1840 struct rq *rq;
1841
1842 /*
1843 * If the child was a (relative-) CPU hog then decrease
1844 * the sleep_avg of the parent as well.
1845 */
1846 rq = task_rq_lock(p->parent, &flags);
1847 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1848 p->parent->time_slice += p->time_slice;
1849 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1850 p->parent->time_slice = task_timeslice(p);
1851 } 1645 }
1852 if (p->sleep_avg < p->parent->sleep_avg) 1646 check_preempt_curr(rq, p);
1853 p->parent->sleep_avg = p->parent->sleep_avg /
1854 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1855 (EXIT_WEIGHT + 1);
1856 task_rq_unlock(rq, &flags); 1647 task_rq_unlock(rq, &flags);
1857} 1648}
1858 1649
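The reworked wake_up_new_task() above either enqueues the child like an ordinary wakeup or hands it to the class's task_new() hook, which is what lets the fair class place the child ahead of its parent. A small sketch of just that decision; the sysctl value and flag below are stand-ins, not the kernel's symbols:

/* Sketch of the child-runs-first decision in wake_up_new_task() above. */
#include <stdio.h>

#define TOY_CLONE_VM 0x100	/* stand-in for CLONE_VM */

/*
 * Returns 1 when the plain activate path is taken, 0 when the class's
 * task_new() hook should place the child (the child-runs-first case).
 */
static int plain_activate(int child_runs_first, unsigned long clone_flags,
			  int same_cpu, int parent_on_rq)
{
	return !child_runs_first || (clone_flags & TOY_CLONE_VM) ||
	       !same_cpu || !parent_on_rq;
}

int main(void)
{
	/* fork() of a running parent on the same CPU: let the child go first */
	printf("%s\n", plain_activate(1, 0, 1, 1) ?
	       "activate_task()" : "sched_class->task_new()");
	return 0;
}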
@@ -1917,7 +1708,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1917 /* 1708 /*
1918 * Remove function-return probe instances associated with this 1709 * Remove function-return probe instances associated with this
1919 * task and put them back on the free list. 1710 * task and put them back on the free list.
1920 */ 1711 */
1921 kprobe_flush_task(prev); 1712 kprobe_flush_task(prev);
1922 put_task_struct(prev); 1713 put_task_struct(prev);
1923 } 1714 }
@@ -1945,13 +1736,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1945 * context_switch - switch to the new MM and the new 1736 * context_switch - switch to the new MM and the new
1946 * thread's register state. 1737 * thread's register state.
1947 */ 1738 */
1948static inline struct task_struct * 1739static inline void
1949context_switch(struct rq *rq, struct task_struct *prev, 1740context_switch(struct rq *rq, struct task_struct *prev,
1950 struct task_struct *next) 1741 struct task_struct *next)
1951{ 1742{
1952 struct mm_struct *mm = next->mm; 1743 struct mm_struct *mm, *oldmm;
1953 struct mm_struct *oldmm = prev->active_mm;
1954 1744
1745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
1955 /* 1748 /*
1956 * For paravirt, this is coupled with an exit in switch_to to 1749 * For paravirt, this is coupled with an exit in switch_to to
1957 * combine the page table reload and the switch backend into 1750 * combine the page table reload and the switch backend into
@@ -1959,16 +1752,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1959 */ 1752 */
1960 arch_enter_lazy_cpu_mode(); 1753 arch_enter_lazy_cpu_mode();
1961 1754
1962 if (!mm) { 1755 if (unlikely(!mm)) {
1963 next->active_mm = oldmm; 1756 next->active_mm = oldmm;
1964 atomic_inc(&oldmm->mm_count); 1757 atomic_inc(&oldmm->mm_count);
1965 enter_lazy_tlb(oldmm, next); 1758 enter_lazy_tlb(oldmm, next);
1966 } else 1759 } else
1967 switch_mm(oldmm, mm, next); 1760 switch_mm(oldmm, mm, next);
1968 1761
1969 if (!prev->mm) { 1762 if (unlikely(!prev->mm)) {
1970 prev->active_mm = NULL; 1763 prev->active_mm = NULL;
1971 WARN_ON(rq->prev_mm);
1972 rq->prev_mm = oldmm; 1764 rq->prev_mm = oldmm;
1973 } 1765 }
1974 /* 1766 /*
@@ -1984,7 +1776,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1984 /* Here we just switch the register state and the stack. */ 1776 /* Here we just switch the register state and the stack. */
1985 switch_to(prev, next, prev); 1777 switch_to(prev, next, prev);
1986 1778
1987 return prev; 1779 barrier();
1780 /*
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1784 */
1785 finish_task_switch(this_rq(), prev);
1988} 1786}
1989 1787
1990/* 1788/*
@@ -2057,17 +1855,65 @@ unsigned long nr_active(void)
2057 return running + uninterruptible; 1855 return running + uninterruptible;
2058} 1856}
2059 1857
2060#ifdef CONFIG_SMP
2061
2062/* 1858/*
2063 * Is this task likely cache-hot: 1859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
2064 */ 1861 */
2065static inline int 1862static void update_cpu_load(struct rq *this_rq)
2066task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2067{ 1863{
2068 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1870
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1874
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1877
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1880
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1883
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1886
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1889
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1892
1893 idle_delta64 = sample_interval64 - exec_delta64;
1894
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898 this_load = (unsigned long)tmp64;
1899
1900do_avg:
1901
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1905
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1910
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 }
2069} 1913}
2070 1914
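The tail of update_cpu_load() above maintains several exponentially decaying views of the runqueue load, one per index, each reacting more slowly than the last: cpu_load[i] = (old * (2^i - 1) + new) >> i. A runnable sketch of that filter with made-up samples:

/* Sketch of the decaying cpu_load[] filter; sample values are invented. */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

/* One tick's worth of decay toward this_load, exactly the >> i filter. */
static void toy_update_cpu_load(unsigned long this_load)
{
	unsigned long scale = 1;
	int i;

	for (i = 0; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];

		/* scale == 1 << i, so index 0 tracks this_load exactly */
		cpu_load[i] = (old_load * (scale - 1) + this_load) >> i;
	}
}

int main(void)
{
	unsigned long samples[] = { 2048, 2048, 0, 0, 0, 0 };
	unsigned int t, i;

	for (t = 0; t < sizeof(samples) / sizeof(samples[0]); t++) {
		toy_update_cpu_load(samples[t]);
		printf("tick %u:", t);
		for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
			printf(" %5lu", cpu_load[i]);
		printf("\n");
	}
	return 0;
}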
1915#ifdef CONFIG_SMP
1916
2071/* 1917/*
2072 * double_rq_lock - safely lock two runqueues 1918 * double_rq_lock - safely lock two runqueues
2073 * 1919 *
@@ -2184,23 +2030,17 @@ void sched_exec(void)
2184 * pull_task - move a task from a remote runqueue to the local runqueue. 2030 * pull_task - move a task from a remote runqueue to the local runqueue.
2185 * Both runqueues must be locked. 2031 * Both runqueues must be locked.
2186 */ 2032 */
2187static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2033static void pull_task(struct rq *src_rq, struct task_struct *p,
2188 struct task_struct *p, struct rq *this_rq, 2034 struct rq *this_rq, int this_cpu)
2189 struct prio_array *this_array, int this_cpu)
2190{ 2035{
2191 dequeue_task(p, src_array); 2036 deactivate_task(src_rq, p, 0);
2192 dec_nr_running(p, src_rq);
2193 set_task_cpu(p, this_cpu); 2037 set_task_cpu(p, this_cpu);
2194 inc_nr_running(p, this_rq); 2038 activate_task(this_rq, p, 0);
2195 enqueue_task(p, this_array);
2196 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2197 + this_rq->most_recent_timestamp;
2198 /* 2039 /*
2199 * Note that idle threads have a prio of MAX_PRIO, for this test 2040 * Note that idle threads have a prio of MAX_PRIO, for this test
2200 * to be always true for them. 2041 * to be always true for them.
2201 */ 2042 */
2202 if (TASK_PREEMPTS_CURR(p, this_rq)) 2043 check_preempt_curr(this_rq, p);
2203 resched_task(this_rq->curr);
2204} 2044}
2205 2045
2206/* 2046/*
@@ -2208,7 +2048,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2208 */ 2048 */
2209static 2049static
2210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2050int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2211 struct sched_domain *sd, enum idle_type idle, 2051 struct sched_domain *sd, enum cpu_idle_type idle,
2212 int *all_pinned) 2052 int *all_pinned)
2213{ 2053{
2214 /* 2054 /*
@@ -2225,132 +2065,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2225 return 0; 2065 return 0;
2226 2066
2227 /* 2067 /*
2228 * Aggressive migration if: 2068 * Aggressive migration if too many balance attempts have failed:
2229 * 1) task is cache cold, or
2230 * 2) too many balance attempts have failed.
2231 */ 2069 */
2232 2070 if (sd->nr_balance_failed > sd->cache_nice_tries)
2233 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2234#ifdef CONFIG_SCHEDSTATS
2235 if (task_hot(p, rq->most_recent_timestamp, sd))
2236 schedstat_inc(sd, lb_hot_gained[idle]);
2237#endif
2238 return 1; 2071 return 1;
2239 }
2240 2072
2241 if (task_hot(p, rq->most_recent_timestamp, sd))
2242 return 0;
2243 return 1; 2073 return 1;
2244} 2074}
2245 2075
2246#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2247
2248/*
2249 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2250 * load from busiest to this_rq, as part of a balancing operation within
2251 * "domain". Returns the number of tasks moved.
2252 *
2253 * Called with both runqueues locked.
2254 */
2255static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2256 unsigned long max_nr_move, unsigned long max_load_move, 2077 unsigned long max_nr_move, unsigned long max_load_move,
2257 struct sched_domain *sd, enum idle_type idle, 2078 struct sched_domain *sd, enum cpu_idle_type idle,
2258 int *all_pinned) 2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2259{ 2082{
2260 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2083 int pulled = 0, pinned = 0, skip_for_load;
2261 best_prio_seen, skip_for_load; 2084 struct task_struct *p;
2262 struct prio_array *array, *dst_array; 2085 long rem_load_move = max_load_move;
2263 struct list_head *head, *curr;
2264 struct task_struct *tmp;
2265 long rem_load_move;
2266 2086
2267 if (max_nr_move == 0 || max_load_move == 0) 2087 if (max_nr_move == 0 || max_load_move == 0)
2268 goto out; 2088 goto out;
2269 2089
2270 rem_load_move = max_load_move;
2271 pinned = 1; 2090 pinned = 1;
2272 this_best_prio = rq_best_prio(this_rq);
2273 best_prio = rq_best_prio(busiest);
2274 /*
2275 * Enable handling of the case where there is more than one task
2276 * with the best priority. If the current running task is one
2277 * of those with prio==best_prio we know it won't be moved
2278 * and therefore it's safe to override the skip (based on load) of
2279 * any task we find with that prio.
2280 */
2281 best_prio_seen = best_prio == busiest->curr->prio;
2282 2091
2283 /* 2092 /*
2284 * We first consider expired tasks. Those will likely not be 2093 * Start the load-balancing iterator:
2285 * executed in the near future, and they are most likely to
2286 * be cache-cold, thus switching CPUs has the least effect
2287 * on them.
2288 */ 2094 */
2289 if (busiest->expired->nr_active) { 2095 p = iterator->start(iterator->arg);
2290 array = busiest->expired; 2096next:
2291 dst_array = this_rq->expired; 2097 if (!p)
2292 } else {
2293 array = busiest->active;
2294 dst_array = this_rq->active;
2295 }
2296
2297new_array:
2298 /* Start searching at priority 0: */
2299 idx = 0;
2300skip_bitmap:
2301 if (!idx)
2302 idx = sched_find_first_bit(array->bitmap);
2303 else
2304 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2305 if (idx >= MAX_PRIO) {
2306 if (array == busiest->expired && busiest->active->nr_active) {
2307 array = busiest->active;
2308 dst_array = this_rq->active;
2309 goto new_array;
2310 }
2311 goto out; 2098 goto out;
2312 }
2313
2314 head = array->queue + idx;
2315 curr = head->prev;
2316skip_queue:
2317 tmp = list_entry(curr, struct task_struct, run_list);
2318
2319 curr = curr->prev;
2320
2321 /* 2099 /*
2322 * To help distribute high priority tasks across CPUs we don't 2100 * To help distribute high priority tasks across CPUs we don't
2323 * skip a task if it will be the highest priority task (i.e. smallest 2101 * skip a task if it will be the highest priority task (i.e. smallest
2324 * prio value) on its new queue regardless of its load weight 2102 * prio value) on its new queue regardless of its load weight
2325 */ 2103 */
2326 skip_for_load = tmp->load_weight > rem_load_move; 2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2327 if (skip_for_load && idx < this_best_prio) 2105 SCHED_LOAD_SCALE_FUZZ;
2328 skip_for_load = !best_prio_seen && idx == best_prio; 2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2329 if (skip_for_load || 2108 if (skip_for_load ||
2330 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2331 2110
2332 best_prio_seen |= idx == best_prio; 2111 best_prio_seen |= p->prio == best_prio;
2333 if (curr != head) 2112 p = iterator->next(iterator->arg);
2334 goto skip_queue; 2113 goto next;
2335 idx++;
2336 goto skip_bitmap;
2337 } 2114 }
2338 2115
2339 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2116 pull_task(busiest, p, this_rq, this_cpu);
2340 pulled++; 2117 pulled++;
2341 rem_load_move -= tmp->load_weight; 2118 rem_load_move -= p->se.load.weight;
2342 2119
2343 /* 2120 /*
2344 * We only want to steal up to the prescribed number of tasks 2121 * We only want to steal up to the prescribed number of tasks
2345 * and the prescribed amount of weighted load. 2122 * and the prescribed amount of weighted load.
2346 */ 2123 */
2347 if (pulled < max_nr_move && rem_load_move > 0) { 2124 if (pulled < max_nr_move && rem_load_move > 0) {
2348 if (idx < this_best_prio) 2125 if (p->prio < this_best_prio)
2349 this_best_prio = idx; 2126 this_best_prio = p->prio;
2350 if (curr != head) 2127 p = iterator->next(iterator->arg);
2351 goto skip_queue; 2128 goto next;
2352 idx++;
2353 goto skip_bitmap;
2354 } 2129 }
2355out: 2130out:
2356 /* 2131 /*
@@ -2362,18 +2137,48 @@ out:
2362 2137
2363 if (all_pinned) 2138 if (all_pinned)
2364 *all_pinned = pinned; 2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2365 return pulled; 2141 return pulled;
2366} 2142}
2367 2143
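balance_tasks() above is written against an iterator supplied by the scheduling class (start/next callbacks plus an opaque argument), so the same pull loop works regardless of how a class stores its runnable tasks. A toy rendering of that pattern over a plain array; the skip rule is deliberately simplified to the load budget only, whereas the real code also weighs priorities and pinning:

/* Toy rq_iterator-style loop; all names and data are invented. */
#include <stdio.h>

struct toy_task { const char *name; unsigned long weight; };

/* Mirrors the shape of an iterator: opaque arg + start/next. */
struct toy_iterator {
	void *arg;
	struct toy_task *(*start)(void *arg);
	struct toy_task *(*next)(void *arg);
};

struct array_cursor { struct toy_task *tasks; int n, pos; };

static struct toy_task *array_start(void *arg)
{
	struct array_cursor *c = arg;

	c->pos = 0;
	return c->n ? &c->tasks[c->pos++] : NULL;
}

static struct toy_task *array_next(void *arg)
{
	struct array_cursor *c = arg;

	return c->pos < c->n ? &c->tasks[c->pos++] : NULL;
}

/* Pull tasks until the weighted-load budget is spent. */
static int toy_balance(struct toy_iterator *it, long rem_load)
{
	struct toy_task *p;
	int pulled = 0;

	for (p = it->start(it->arg); p; p = it->next(it->arg)) {
		if ((long)p->weight > rem_load)
			continue;		/* too heavy for what is left */
		rem_load -= p->weight;
		pulled++;
		printf("pulled %s (weight %lu)\n", p->name, p->weight);
	}
	return pulled;
}

int main(void)
{
	struct toy_task tasks[] = { { "a", 1024 }, { "b", 3072 }, { "c", 512 } };
	struct array_cursor cur = { tasks, 3, 0 };
	struct toy_iterator it = { &cur, array_start, array_next };

	printf("moved %d tasks\n", toy_balance(&it, 2048));
	return 0;
}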
2368/* 2144/*
2145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
2148 *
2149 * Called with both runqueues locked.
2150 */
2151static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152 unsigned long max_nr_move, unsigned long max_load_move,
2153 struct sched_domain *sd, enum cpu_idle_type idle,
2154 int *all_pinned)
2155{
2156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
2159
2160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
2169
2170 return total_nr_moved;
2171}
2172
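The new move_tasks() above no longer walks priority arrays itself; it asks each scheduling class in turn, starting at sched_class_highest and following the ->next chain, to move tasks until either budget (task count or weighted load) is exhausted. A toy version of that loop; the classes and their callbacks are invented for illustration:

/* Sketch of the per-class load-balance walk in move_tasks() above. */
#include <stdio.h>

struct toy_class {
	const char *name;
	const struct toy_class *next;
	/* returns tasks moved, reports weighted load moved via *load_moved */
	unsigned long (*load_balance)(unsigned long max_nr, long max_load,
				      unsigned long *load_moved);
};

static unsigned long rt_balance(unsigned long max_nr, long max_load,
				unsigned long *load_moved)
{
	*load_moved = 0;		/* pretend no RT tasks are movable */
	return 0;
}

static unsigned long fair_balance(unsigned long max_nr, long max_load,
				  unsigned long *load_moved)
{
	unsigned long moved = max_nr > 2 ? 2 : max_nr;	/* fake: move 2 tasks */

	*load_moved = moved * 1024;
	return moved;
}

static const struct toy_class fair_class = { "fair", NULL, fair_balance };
static const struct toy_class rt_class   = { "rt", &fair_class, rt_balance };

static unsigned long toy_move_tasks(unsigned long max_nr_move,
				    long max_load_move)
{
	const struct toy_class *class = &rt_class;	/* "highest" class */
	unsigned long load_moved, total_nr_moved = 0, nr_moved;
	long rem_load_move = max_load_move;

	do {
		nr_moved = class->load_balance(max_nr_move, rem_load_move,
					       &load_moved);
		total_nr_moved += nr_moved;
		max_nr_move -= nr_moved;
		rem_load_move -= load_moved;
		class = class->next;
	} while (class && max_nr_move && rem_load_move > 0);

	return total_nr_moved;
}

int main(void)
{
	printf("moved %lu tasks\n", toy_move_tasks(4, 4096));
	return 0;
}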
2173/*
2369 * find_busiest_group finds and returns the busiest CPU group within the 2174 * find_busiest_group finds and returns the busiest CPU group within the
2370 * domain. It calculates and returns the amount of weighted load which 2175 * domain. It calculates and returns the amount of weighted load which
2371 * should be moved to restore balance via the imbalance parameter. 2176 * should be moved to restore balance via the imbalance parameter.
2372 */ 2177 */
2373static struct sched_group * 2178static struct sched_group *
2374find_busiest_group(struct sched_domain *sd, int this_cpu, 2179find_busiest_group(struct sched_domain *sd, int this_cpu,
2375 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2180 unsigned long *imbalance, enum cpu_idle_type idle,
2376 cpumask_t *cpus, int *balance) 2181 int *sd_idle, cpumask_t *cpus, int *balance)
2377{ 2182{
2378 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2379 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2391,9 +2196,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2391 max_load = this_load = total_load = total_pwr = 0; 2196 max_load = this_load = total_load = total_pwr = 0;
2392 busiest_load_per_task = busiest_nr_running = 0; 2197 busiest_load_per_task = busiest_nr_running = 0;
2393 this_load_per_task = this_nr_running = 0; 2198 this_load_per_task = this_nr_running = 0;
2394 if (idle == NOT_IDLE) 2199 if (idle == CPU_NOT_IDLE)
2395 load_idx = sd->busy_idx; 2200 load_idx = sd->busy_idx;
2396 else if (idle == NEWLY_IDLE) 2201 else if (idle == CPU_NEWLY_IDLE)
2397 load_idx = sd->newidle_idx; 2202 load_idx = sd->newidle_idx;
2398 else 2203 else
2399 load_idx = sd->idle_idx; 2204 load_idx = sd->idle_idx;
@@ -2437,7 +2242,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 2242
2438 avg_load += load; 2243 avg_load += load;
2439 sum_nr_running += rq->nr_running; 2244 sum_nr_running += rq->nr_running;
2440 sum_weighted_load += rq->raw_weighted_load; 2245 sum_weighted_load += weighted_cpuload(i);
2441 } 2246 }
2442 2247
2443 /* 2248 /*
@@ -2477,8 +2282,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2477 * Busy processors will not participate in power savings 2282 * Busy processors will not participate in power savings
2478 * balance. 2283 * balance.
2479 */ 2284 */
2480 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2285 if (idle == CPU_NOT_IDLE ||
2481 goto group_next; 2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
2482 2288
2483 /* 2289 /*
2484 * If the local group is idle or completely loaded 2290 * If the local group is idle or completely loaded
@@ -2488,42 +2294,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2488 !this_nr_running)) 2294 !this_nr_running))
2489 power_savings_balance = 0; 2295 power_savings_balance = 0;
2490 2296
2491 /* 2297 /*
2492 * If a group is already running at full capacity or idle, 2298 * If a group is already running at full capacity or idle,
2493 * don't include that group in power savings calculations 2299 * don't include that group in power savings calculations
2494 */ 2300 */
2495 if (!power_savings_balance || sum_nr_running >= group_capacity 2301 if (!power_savings_balance || sum_nr_running >= group_capacity
2496 || !sum_nr_running) 2302 || !sum_nr_running)
2497 goto group_next; 2303 goto group_next;
2498 2304
2499 /* 2305 /*
2500 * Calculate the group which has the least non-idle load. 2306 * Calculate the group which has the least non-idle load.
2501 * This is the group from where we need to pick up the load 2307 * This is the group from where we need to pick up the load
2502 * for saving power 2308 * for saving power
2503 */ 2309 */
2504 if ((sum_nr_running < min_nr_running) || 2310 if ((sum_nr_running < min_nr_running) ||
2505 (sum_nr_running == min_nr_running && 2311 (sum_nr_running == min_nr_running &&
2506 first_cpu(group->cpumask) < 2312 first_cpu(group->cpumask) <
2507 first_cpu(group_min->cpumask))) { 2313 first_cpu(group_min->cpumask))) {
2508 group_min = group; 2314 group_min = group;
2509 min_nr_running = sum_nr_running; 2315 min_nr_running = sum_nr_running;
2510 min_load_per_task = sum_weighted_load / 2316 min_load_per_task = sum_weighted_load /
2511 sum_nr_running; 2317 sum_nr_running;
2512 } 2318 }
2513 2319
2514 /* 2320 /*
2515 * Calculate the group which is almost near its 2321 * Calculate the group which is almost near its
2516 * capacity but still has some space to pick up some load 2322 * capacity but still has some space to pick up some load
2517 * from other group and save more power 2323 * from other group and save more power
2518 */ 2324 */
2519 if (sum_nr_running <= group_capacity - 1) { 2325 if (sum_nr_running <= group_capacity - 1) {
2520 if (sum_nr_running > leader_nr_running || 2326 if (sum_nr_running > leader_nr_running ||
2521 (sum_nr_running == leader_nr_running && 2327 (sum_nr_running == leader_nr_running &&
2522 first_cpu(group->cpumask) > 2328 first_cpu(group->cpumask) >
2523 first_cpu(group_leader->cpumask))) { 2329 first_cpu(group_leader->cpumask))) {
2524 group_leader = group; 2330 group_leader = group;
2525 leader_nr_running = sum_nr_running; 2331 leader_nr_running = sum_nr_running;
2526 } 2332 }
2527 } 2333 }
2528group_next: 2334group_next:
2529#endif 2335#endif
@@ -2578,7 +2384,7 @@ group_next:
2578 * a think about bumping its value to force at least one task to be 2384 * a think about bumping its value to force at least one task to be
2579 * moved 2385 * moved
2580 */ 2386 */
2581 if (*imbalance < busiest_load_per_task) { 2387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2582 unsigned long tmp, pwr_now, pwr_move; 2388 unsigned long tmp, pwr_now, pwr_move;
2583 unsigned int imbn; 2389 unsigned int imbn;
2584 2390
@@ -2592,7 +2398,8 @@ small_imbalance:
2592 } else 2398 } else
2593 this_load_per_task = SCHED_LOAD_SCALE; 2399 this_load_per_task = SCHED_LOAD_SCALE;
2594 2400
2595 if (max_load - this_load >= busiest_load_per_task * imbn) { 2401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
2596 *imbalance = busiest_load_per_task; 2403 *imbalance = busiest_load_per_task;
2597 return busiest; 2404 return busiest;
2598 } 2405 }
@@ -2639,7 +2446,7 @@ small_imbalance:
2639 2446
2640out_balanced: 2447out_balanced:
2641#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2448#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2642 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2643 goto ret; 2450 goto ret;
2644 2451
2645 if (this == group_leader && group_leader != group_min) { 2452 if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2463,7 @@ ret:
2656 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2657 */ 2464 */
2658static struct rq * 2465static struct rq *
2659find_busiest_queue(struct sched_group *group, enum idle_type idle, 2466find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2660 unsigned long imbalance, cpumask_t *cpus) 2467 unsigned long imbalance, cpumask_t *cpus)
2661{ 2468{
2662 struct rq *busiest = NULL, *rq; 2469 struct rq *busiest = NULL, *rq;
@@ -2664,17 +2471,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2664 int i; 2471 int i;
2665 2472
2666 for_each_cpu_mask(i, group->cpumask) { 2473 for_each_cpu_mask(i, group->cpumask) {
2474 unsigned long wl;
2667 2475
2668 if (!cpu_isset(i, *cpus)) 2476 if (!cpu_isset(i, *cpus))
2669 continue; 2477 continue;
2670 2478
2671 rq = cpu_rq(i); 2479 rq = cpu_rq(i);
2480 wl = weighted_cpuload(i);
2672 2481
2673 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2482 if (rq->nr_running == 1 && wl > imbalance)
2674 continue; 2483 continue;
2675 2484
2676 if (rq->raw_weighted_load > max_load) { 2485 if (wl > max_load) {
2677 max_load = rq->raw_weighted_load; 2486 max_load = wl;
2678 busiest = rq; 2487 busiest = rq;
2679 } 2488 }
2680 } 2489 }
@@ -2698,7 +2507,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2698 * tasks if there is an imbalance. 2507 * tasks if there is an imbalance.
2699 */ 2508 */
2700static int load_balance(int this_cpu, struct rq *this_rq, 2509static int load_balance(int this_cpu, struct rq *this_rq,
2701 struct sched_domain *sd, enum idle_type idle, 2510 struct sched_domain *sd, enum cpu_idle_type idle,
2702 int *balance) 2511 int *balance)
2703{ 2512{
2704 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2711,10 +2520,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2711 /* 2520 /*
2712 * When power savings policy is enabled for the parent domain, idle 2521 * When power savings policy is enabled for the parent domain, idle
2713 * sibling can pick up load irrespective of busy siblings. In this case, 2522 * sibling can pick up load irrespective of busy siblings. In this case,
2714 * let the state of idle sibling percolate up as IDLE, instead of 2523 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2715 * portraying it as NOT_IDLE. 2524 * portraying it as CPU_NOT_IDLE.
2716 */ 2525 */
2717 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2718 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd_idle = 1; 2528 sd_idle = 1;
2720 2529
@@ -2848,7 +2657,7 @@ out_one_pinned:
2848 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2849 * tasks if there is an imbalance. 2658 * tasks if there is an imbalance.
2850 * 2659 *
2851 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2852 * this_rq is locked. 2661 * this_rq is locked.
2853 */ 2662 */
2854static int 2663static int
@@ -2865,31 +2674,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2865 * When power savings policy is enabled for the parent domain, idle 2674 * When power savings policy is enabled for the parent domain, idle
2866 * sibling can pick up load irrespective of busy siblings. In this case, 2675 * sibling can pick up load irrespective of busy siblings. In this case,
2867 * let the state of idle sibling percolate up as IDLE, instead of 2676 * let the state of idle sibling percolate up as IDLE, instead of
2868 * portraying it as NOT_IDLE. 2677 * portraying it as CPU_NOT_IDLE.
2869 */ 2678 */
2870 if (sd->flags & SD_SHARE_CPUPOWER && 2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2871 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2872 sd_idle = 1; 2681 sd_idle = 1;
2873 2682
2874 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2875redo: 2684redo:
2876 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2877 &sd_idle, &cpus, NULL); 2686 &sd_idle, &cpus, NULL);
2878 if (!group) { 2687 if (!group) {
2879 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2880 goto out_balanced; 2689 goto out_balanced;
2881 } 2690 }
2882 2691
2883 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2884 &cpus); 2693 &cpus);
2885 if (!busiest) { 2694 if (!busiest) {
2886 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2887 goto out_balanced; 2696 goto out_balanced;
2888 } 2697 }
2889 2698
2890 BUG_ON(busiest == this_rq); 2699 BUG_ON(busiest == this_rq);
2891 2700
2892 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2893 2702
2894 nr_moved = 0; 2703 nr_moved = 0;
2895 if (busiest->nr_running > 1) { 2704 if (busiest->nr_running > 1) {
@@ -2897,7 +2706,7 @@ redo:
2897 double_lock_balance(this_rq, busiest); 2706 double_lock_balance(this_rq, busiest);
2898 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2899 minus_1_or_zero(busiest->nr_running), 2708 minus_1_or_zero(busiest->nr_running),
2900 imbalance, sd, NEWLY_IDLE, NULL); 2709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2901 spin_unlock(&busiest->lock); 2710 spin_unlock(&busiest->lock);
2902 2711
2903 if (!nr_moved) { 2712 if (!nr_moved) {
@@ -2908,7 +2717,7 @@ redo:
2908 } 2717 }
2909 2718
2910 if (!nr_moved) { 2719 if (!nr_moved) {
2911 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2912 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2913 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2914 return -1; 2723 return -1;
@@ -2918,7 +2727,7 @@ redo:
2918 return nr_moved; 2727 return nr_moved;
2919 2728
2920out_balanced: 2729out_balanced:
2921 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2922 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2923 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2924 return -1; 2733 return -1;
@@ -2934,8 +2743,8 @@ out_balanced:
2934static void idle_balance(int this_cpu, struct rq *this_rq) 2743static void idle_balance(int this_cpu, struct rq *this_rq)
2935{ 2744{
2936 struct sched_domain *sd; 2745 struct sched_domain *sd;
2937 int pulled_task = 0; 2746 int pulled_task = -1;
2938 unsigned long next_balance = jiffies + 60 * HZ; 2747 unsigned long next_balance = jiffies + HZ;
2939 2748
2940 for_each_domain(this_cpu, sd) { 2749 for_each_domain(this_cpu, sd) {
2941 unsigned long interval; 2750 unsigned long interval;
@@ -2954,12 +2763,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
2954 if (pulled_task) 2763 if (pulled_task)
2955 break; 2764 break;
2956 } 2765 }
2957 if (!pulled_task) 2766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2958 /* 2767 /*
2959 * We are going idle. next_balance may be set based on 2768 * We are going idle. next_balance may be set based on
2960 * a busy processor. So reset next_balance. 2769 * a busy processor. So reset next_balance.
2961 */ 2770 */
2962 this_rq->next_balance = next_balance; 2771 this_rq->next_balance = next_balance;
2772 }
2963} 2773}
2964 2774
2965/* 2775/*
@@ -3003,7 +2813,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3003 schedstat_inc(sd, alb_cnt); 2813 schedstat_inc(sd, alb_cnt);
3004 2814
3005 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
3006 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
3007 NULL)) 2817 NULL))
3008 schedstat_inc(sd, alb_pushed); 2818 schedstat_inc(sd, alb_pushed);
3009 else 2819 else
@@ -3012,32 +2822,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3012 spin_unlock(&target_rq->lock); 2822 spin_unlock(&target_rq->lock);
3013} 2823}
3014 2824
3015static void update_load(struct rq *this_rq)
3016{
3017 unsigned long this_load;
3018 unsigned int i, scale;
3019
3020 this_load = this_rq->raw_weighted_load;
3021
3022 /* Update our load: */
3023 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
3024 unsigned long old_load, new_load;
3025
3026 /* scale is effectively 1 << i now, and >> i divides by scale */
3027
3028 old_load = this_rq->cpu_load[i];
3029 new_load = this_load;
3030 /*
3031 * Round up the averaging division if load is increasing. This
3032 * prevents us from getting stuck on 9 if the load is 10, for
3033 * example.
3034 */
3035 if (new_load > old_load)
3036 new_load += scale-1;
3037 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3038 }
3039}
3040
3041#ifdef CONFIG_NO_HZ 2825#ifdef CONFIG_NO_HZ
3042static struct { 2826static struct {
3043 atomic_t load_balancer; 2827 atomic_t load_balancer;
@@ -3120,7 +2904,7 @@ static DEFINE_SPINLOCK(balancing);
3120 * 2904 *
3121 * Balancing parameters are set up in arch_init_sched_domains. 2905 * Balancing parameters are set up in arch_init_sched_domains.
3122 */ 2906 */
3123static inline void rebalance_domains(int cpu, enum idle_type idle) 2907static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3124{ 2908{
3125 int balance = 1; 2909 int balance = 1;
3126 struct rq *rq = cpu_rq(cpu); 2910 struct rq *rq = cpu_rq(cpu);
@@ -3134,13 +2918,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3134 continue; 2918 continue;
3135 2919
3136 interval = sd->balance_interval; 2920 interval = sd->balance_interval;
3137 if (idle != SCHED_IDLE) 2921 if (idle != CPU_IDLE)
3138 interval *= sd->busy_factor; 2922 interval *= sd->busy_factor;
3139 2923
3140 /* scale ms to jiffies */ 2924 /* scale ms to jiffies */
3141 interval = msecs_to_jiffies(interval); 2925 interval = msecs_to_jiffies(interval);
3142 if (unlikely(!interval)) 2926 if (unlikely(!interval))
3143 interval = 1; 2927 interval = 1;
2928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2930
3144 2931
3145 if (sd->flags & SD_SERIALIZE) { 2932 if (sd->flags & SD_SERIALIZE) {
3146 if (!spin_trylock(&balancing)) 2933 if (!spin_trylock(&balancing))
@@ -3154,7 +2941,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3154 * longer idle, or one of our SMT siblings is 2941 * longer idle, or one of our SMT siblings is
3155 * not idle. 2942 * not idle.
3156 */ 2943 */
3157 idle = NOT_IDLE; 2944 idle = CPU_NOT_IDLE;
3158 } 2945 }
3159 sd->last_balance = jiffies; 2946 sd->last_balance = jiffies;
3160 } 2947 }
@@ -3182,11 +2969,12 @@ out:
3182 */ 2969 */
3183static void run_rebalance_domains(struct softirq_action *h) 2970static void run_rebalance_domains(struct softirq_action *h)
3184{ 2971{
3185 int local_cpu = smp_processor_id(); 2972 int this_cpu = smp_processor_id();
3186 struct rq *local_rq = cpu_rq(local_cpu); 2973 struct rq *this_rq = cpu_rq(this_cpu);
3187 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
3188 2976
3189 rebalance_domains(local_cpu, idle); 2977 rebalance_domains(this_cpu, idle);
3190 2978
3191#ifdef CONFIG_NO_HZ 2979#ifdef CONFIG_NO_HZ
3192 /* 2980 /*
@@ -3194,13 +2982,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3194 * balancing on behalf of the other idle cpus whose ticks are 2982 * balancing on behalf of the other idle cpus whose ticks are
3195 * stopped. 2983 * stopped.
3196 */ 2984 */
3197 if (local_rq->idle_at_tick && 2985 if (this_rq->idle_at_tick &&
3198 atomic_read(&nohz.load_balancer) == local_cpu) { 2986 atomic_read(&nohz.load_balancer) == this_cpu) {
3199 cpumask_t cpus = nohz.cpu_mask; 2987 cpumask_t cpus = nohz.cpu_mask;
3200 struct rq *rq; 2988 struct rq *rq;
3201 int balance_cpu; 2989 int balance_cpu;
3202 2990
3203 cpu_clear(local_cpu, cpus); 2991 cpu_clear(this_cpu, cpus);
3204 for_each_cpu_mask(balance_cpu, cpus) { 2992 for_each_cpu_mask(balance_cpu, cpus) {
3205 /* 2993 /*
3206 * If this cpu gets work to do, stop the load balancing 2994 * If this cpu gets work to do, stop the load balancing
@@ -3213,8 +3001,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3213 rebalance_domains(balance_cpu, SCHED_IDLE); 3001 rebalance_domains(balance_cpu, SCHED_IDLE);
3214 3002
3215 rq = cpu_rq(balance_cpu); 3003 rq = cpu_rq(balance_cpu);
3216 if (time_after(local_rq->next_balance, rq->next_balance)) 3004 if (time_after(this_rq->next_balance, rq->next_balance))
3217 local_rq->next_balance = rq->next_balance; 3005 this_rq->next_balance = rq->next_balance;
3218 } 3006 }
3219 } 3007 }
3220#endif 3008#endif
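To make the CONFIG_NO_HZ block above concrete: the CPU elected as nohz.load_balancer walks the mask of tick-stopped CPUs, rebalances each of them, and then adopts the earliest next_balance deadline it saw so it wakes up in time on their behalf. The interesting detail is the jiffies-wrap-safe comparison; a small model of it (the deadline values are made up):

        #include <stdio.h>

        /* Wrap-safe "a is later than b" check, same idea as the kernel's time_after(). */
        static int time_after_model(unsigned long a, unsigned long b)
        {
                return (long)(b - a) < 0;
        }

        int main(void)
        {
                /* Hypothetical next_balance deadlines (jiffies) of three tick-stopped CPUs. */
                unsigned long next_balance[] = { 1000, 940, 985 };
                unsigned long this_next = 1020;         /* the balancer's own deadline */
                int i;

                for (i = 0; i < 3; i++)
                        if (time_after_model(this_next, next_balance[i]))
                                this_next = next_balance[i];

                printf("idle load balancer must wake by jiffy %lu\n", this_next);
                return 0;
        }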
@@ -3227,9 +3015,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3227 * idle load balancing owner or decide to stop the periodic load balancing, 3015 * idle load balancing owner or decide to stop the periodic load balancing,
3228 * if the whole system is idle. 3016 * if the whole system is idle.
3229 */ 3017 */
3230static inline void trigger_load_balance(int cpu) 3018static inline void trigger_load_balance(struct rq *rq, int cpu)
3231{ 3019{
3232 struct rq *rq = cpu_rq(cpu);
3233#ifdef CONFIG_NO_HZ 3020#ifdef CONFIG_NO_HZ
3234 /* 3021 /*
3235 * If we were in the nohz mode recently and busy at the current 3022 * If we were in the nohz mode recently and busy at the current
@@ -3281,13 +3068,29 @@ static inline void trigger_load_balance(int cpu)
3281 if (time_after_eq(jiffies, rq->next_balance)) 3068 if (time_after_eq(jiffies, rq->next_balance))
3282 raise_softirq(SCHED_SOFTIRQ); 3069 raise_softirq(SCHED_SOFTIRQ);
3283} 3070}
3284#else 3071
3072#else /* CONFIG_SMP */
3073
3285/* 3074/*
3286 * on UP we do not need to balance between CPUs: 3075 * on UP we do not need to balance between CPUs:
3287 */ 3076 */
3288static inline void idle_balance(int cpu, struct rq *rq) 3077static inline void idle_balance(int cpu, struct rq *rq)
3289{ 3078{
3290} 3079}
3080
3081/* Avoid "used but not defined" warning on UP */
3082static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3088{
3089 *load_moved = 0;
3090
3091 return 0;
3092}
3093
3291#endif 3094#endif
3292 3095
3293DEFINE_PER_CPU(struct kernel_stat, kstat); 3096DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3295,54 +3098,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3295EXPORT_PER_CPU_SYMBOL(kstat); 3098EXPORT_PER_CPU_SYMBOL(kstat);
3296 3099
3297/* 3100/*
3298 * This is called on clock ticks and on context switches. 3101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3299 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3102 * that have not yet been banked in case the task is currently running.
3300 */
3301static inline void
3302update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3303{
3304 p->sched_time += now - p->last_ran;
3305 p->last_ran = rq->most_recent_timestamp = now;
3306}
3307
3308/*
3309 * Return current->sched_time plus any more ns on the sched_clock
3310 * that have not yet been banked.
3311 */ 3103 */
3312unsigned long long current_sched_time(const struct task_struct *p) 3104unsigned long long task_sched_runtime(struct task_struct *p)
3313{ 3105{
3314 unsigned long long ns;
3315 unsigned long flags; 3106 unsigned long flags;
3107 u64 ns, delta_exec;
3108 struct rq *rq;
3316 3109
3317 local_irq_save(flags); 3110 rq = task_rq_lock(p, &flags);
3318 ns = p->sched_time + sched_clock() - p->last_ran; 3111 ns = p->se.sum_exec_runtime;
3319 local_irq_restore(flags); 3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3116 }
3117 task_rq_unlock(rq, &flags);
3320 3118
3321 return ns; 3119 return ns;
3322} 3120}
3323 3121
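task_sched_runtime() above replaces current_sched_time(): the banked nanoseconds live in p->se.sum_exec_runtime, and if the task happens to be on a CPU right now, the not-yet-banked slice since se.exec_start is added under the runqueue lock. A user-space model of the arithmetic (struct and field names are illustrative, the locking is omitted):

        #include <stdio.h>

        typedef unsigned long long u64;

        struct task_model {
                u64 sum_exec_runtime;   /* ns banked at the last accounting point */
                u64 exec_start;         /* clock value when the current slice began */
                int running;            /* currently on a CPU? */
        };

        /* Banked runtime plus the still-unbanked part of the current slice. */
        static u64 task_sched_runtime_model(const struct task_model *p, u64 clock_now)
        {
                u64 ns = p->sum_exec_runtime;

                if (p->running) {
                        long long delta_exec = (long long)(clock_now - p->exec_start);

                        if (delta_exec > 0)
                                ns += delta_exec;
                }
                return ns;
        }

        int main(void)
        {
                struct task_model p = { 5000000ULL, 120000000ULL, 1 };

                printf("%llu ns\n", task_sched_runtime_model(&p, 120300000ULL));
                return 0;
        }

The signed delta check mirrors the (s64)delta_exec > 0 test in the patch, which guards against a clock that has not advanced since exec_start.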
3324/* 3122/*
3325 * We place interactive tasks back into the active array, if possible.
3326 *
3327 * To guarantee that this does not starve expired tasks we ignore the
3328 * interactivity of a task if the first expired task had to wait more
3329 * than a 'reasonable' amount of time. This deadline timeout is
3330 * load-dependent, as the frequency of array switched decreases with
3331 * increasing number of running tasks. We also ignore the interactivity
3332 * if a better static_prio task has expired:
3333 */
3334static inline int expired_starving(struct rq *rq)
3335{
3336 if (rq->curr->static_prio > rq->best_expired_prio)
3337 return 1;
3338 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3339 return 0;
3340 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3341 return 1;
3342 return 0;
3343}
3344
3345/*
3346 * Account user cpu time to a process. 3123 * Account user cpu time to a process.
3347 * @p: the process that the cpu time gets accounted to 3124 * @p: the process that the cpu time gets accounted to
3348 * @hardirq_offset: the offset to subtract from hardirq_count() 3125 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3415,81 +3192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3415 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3416} 3193}
3417 3194
3418static void task_running_tick(struct rq *rq, struct task_struct *p)
3419{
3420 if (p->array != rq->active) {
3421 /* Task has expired but was not scheduled yet */
3422 set_tsk_need_resched(p);
3423 return;
3424 }
3425 spin_lock(&rq->lock);
3426 /*
3427 * The task was running during this tick - update the
3428 * time slice counter. Note: we do not update a thread's
3429 * priority until it either goes to sleep or uses up its
3430 * timeslice. This makes it possible for interactive tasks
3431 * to use up their timeslices at their highest priority levels.
3432 */
3433 if (rt_task(p)) {
3434 /*
3435 * RR tasks need a special form of timeslice management.
3436 * FIFO tasks have no timeslices.
3437 */
3438 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3439 p->time_slice = task_timeslice(p);
3440 p->first_time_slice = 0;
3441 set_tsk_need_resched(p);
3442
3443 /* put it at the end of the queue: */
3444 requeue_task(p, rq->active);
3445 }
3446 goto out_unlock;
3447 }
3448 if (!--p->time_slice) {
3449 dequeue_task(p, rq->active);
3450 set_tsk_need_resched(p);
3451 p->prio = effective_prio(p);
3452 p->time_slice = task_timeslice(p);
3453 p->first_time_slice = 0;
3454
3455 if (!rq->expired_timestamp)
3456 rq->expired_timestamp = jiffies;
3457 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3458 enqueue_task(p, rq->expired);
3459 if (p->static_prio < rq->best_expired_prio)
3460 rq->best_expired_prio = p->static_prio;
3461 } else
3462 enqueue_task(p, rq->active);
3463 } else {
3464 /*
3465 * Prevent a too long timeslice allowing a task to monopolize
3466 * the CPU. We do this by splitting up the timeslice into
3467 * smaller pieces.
3468 *
3469 * Note: this does not mean the task's timeslices expire or
3470 * get lost in any way, they just might be preempted by
3471 * another task of equal priority. (one with higher
3472 * priority would have preempted this task already.) We
3473 * requeue this task to the end of the list on this priority
3474 * level, which is in essence a round-robin of tasks with
3475 * equal priority.
3476 *
3477 * This only applies to tasks in the interactive
3478 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3479 */
3480 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3481 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3482 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3483 (p->array == rq->active)) {
3484
3485 requeue_task(p, rq->active);
3486 set_tsk_need_resched(p);
3487 }
3488 }
3489out_unlock:
3490 spin_unlock(&rq->lock);
3491}
3492
3493/* 3195/*
3494 * This function gets called by the timer code, with HZ frequency. 3196 * This function gets called by the timer code, with HZ frequency.
3495 * We call it with interrupts disabled. 3197 * We call it with interrupts disabled.
@@ -3499,20 +3201,19 @@ out_unlock:
3499 */ 3201 */
3500void scheduler_tick(void) 3202void scheduler_tick(void)
3501{ 3203{
3502 unsigned long long now = sched_clock();
3503 struct task_struct *p = current;
3504 int cpu = smp_processor_id(); 3204 int cpu = smp_processor_id();
3505 int idle_at_tick = idle_cpu(cpu);
3506 struct rq *rq = cpu_rq(cpu); 3205 struct rq *rq = cpu_rq(cpu);
3206 struct task_struct *curr = rq->curr;
3507 3207
3508 update_cpu_clock(p, rq, now); 3208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3509 3213
3510 if (!idle_at_tick)
3511 task_running_tick(rq, p);
3512#ifdef CONFIG_SMP 3214#ifdef CONFIG_SMP
3513 update_load(rq); 3215 rq->idle_at_tick = idle_cpu(cpu);
3514 rq->idle_at_tick = idle_at_tick; 3216 trigger_load_balance(rq, cpu);
3515 trigger_load_balance(cpu);
3516#endif 3217#endif
3517} 3218}
3518 3219
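The rewritten scheduler_tick() above no longer fiddles with timeslices itself: it takes the runqueue lock, lets the running task's scheduling class do its own per-tick accounting through the task_tick hook, refreshes the cpu_load averages, and on SMP records idle_at_tick and arms the balancing softirq. A toy model of that per-class dispatch (class names and the printed messages are illustrative, not the real hooks):

        #include <stdio.h>

        struct rq_model;

        struct sched_class_model {
                const char *name;
                void (*task_tick)(struct rq_model *rq);
        };

        struct rq_model {
                const struct sched_class_model *curr_class;
                unsigned long nr_ticks;
        };

        static void fair_task_tick(struct rq_model *rq)
        {
                (void)rq;
                printf("fair class: update the running task's runtime statistics\n");
        }

        static void rt_task_tick(struct rq_model *rq)
        {
                (void)rq;
                printf("rt class: decrement the RR timeslice, requeue if it ran out\n");
        }

        static const struct sched_class_model fair_class = { "fair", fair_task_tick };
        static const struct sched_class_model rt_class   = { "rt",   rt_task_tick };

        /* Tick handler: all policy-specific work goes through the class hook. */
        static void scheduler_tick_model(struct rq_model *rq)
        {
                rq->nr_ticks++;
                rq->curr_class->task_tick(rq);
        }

        int main(void)
        {
                struct rq_model rq = { &fair_class, 0 };

                scheduler_tick_model(&rq);
                rq.curr_class = &rt_class;
                scheduler_tick_model(&rq);
                return 0;
        }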
@@ -3554,170 +3255,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3554 3255
3555#endif 3256#endif
3556 3257
3557static inline int interactive_sleep(enum sleep_type sleep_type) 3258/*
3259 * Print scheduling while atomic bug:
3260 */
3261static noinline void __schedule_bug(struct task_struct *prev)
3558{ 3262{
3559 return (sleep_type == SLEEP_INTERACTIVE || 3263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3560 sleep_type == SLEEP_INTERRUPTED); 3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3561} 3269}
3562 3270
3563/* 3271/*
3564 * schedule() is the main scheduler function. 3272 * Various schedule()-time debugging checks and statistics:
3565 */ 3273 */
3566asmlinkage void __sched schedule(void) 3274static inline void schedule_debug(struct task_struct *prev)
3567{ 3275{
3568 struct task_struct *prev, *next;
3569 struct prio_array *array;
3570 struct list_head *queue;
3571 unsigned long long now;
3572 unsigned long run_time;
3573 int cpu, idx, new_prio;
3574 long *switch_count;
3575 struct rq *rq;
3576
3577 /* 3276 /*
3578 * Test if we are atomic. Since do_exit() needs to call into 3277 * Test if we are atomic. Since do_exit() needs to call into
3579 * schedule() atomically, we ignore that path for now. 3278 * schedule() atomically, we ignore that path for now.
3580 * Otherwise, whine if we are scheduling when we should not be. 3279 * Otherwise, whine if we are scheduling when we should not be.
3581 */ 3280 */
3582 if (unlikely(in_atomic() && !current->exit_state)) { 3281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3583 printk(KERN_ERR "BUG: scheduling while atomic: " 3282 __schedule_bug(prev);
3584 "%s/0x%08x/%d\n",
3585 current->comm, preempt_count(), current->pid);
3586 debug_show_held_locks(current);
3587 if (irqs_disabled())
3588 print_irqtrace_events(current);
3589 dump_stack();
3590 }
3591 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3592 3283
3593need_resched: 3284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3594 preempt_disable();
3595 prev = current;
3596 release_kernel_lock(prev);
3597need_resched_nonpreemptible:
3598 rq = this_rq();
3599 3285
3600 /* 3286 schedstat_inc(this_rq(), sched_cnt);
3601 * The idle thread is not allowed to schedule! 3287}
3602 * Remove this check after it has been exercised a bit.
3603 */
3604 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3605 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3606 dump_stack();
3607 }
3608 3288
3609 schedstat_inc(rq, sched_cnt); 3289/*
3610 now = sched_clock(); 3290 * Pick up the highest-prio task:
3611 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3291 */
3612 run_time = now - prev->timestamp; 3292static inline struct task_struct *
3613 if (unlikely((long long)(now - prev->timestamp) < 0)) 3293pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3614 run_time = 0; 3294{
3615 } else 3295 struct sched_class *class;
3616 run_time = NS_MAX_SLEEP_AVG; 3296 struct task_struct *p;
3617 3297
3618 /* 3298 /*
3619 * Tasks charged proportionately less run_time at high sleep_avg to 3299 * Optimization: we know that if all tasks are in
3620 * delay them losing their interactive status 3300 * the fair class we can call that function directly:
3621 */ 3301 */
3622 run_time /= (CURRENT_BONUS(prev) ? : 1); 3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3623 3303 p = fair_sched_class.pick_next_task(rq, now);
3624 spin_lock_irq(&rq->lock); 3304 if (likely(p))
3625 3305 return p;
3626 switch_count = &prev->nivcsw;
3627 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3628 switch_count = &prev->nvcsw;
3629 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3630 unlikely(signal_pending(prev))))
3631 prev->state = TASK_RUNNING;
3632 else {
3633 if (prev->state == TASK_UNINTERRUPTIBLE)
3634 rq->nr_uninterruptible++;
3635 deactivate_task(prev, rq);
3636 }
3637 }
3638
3639 cpu = smp_processor_id();
3640 if (unlikely(!rq->nr_running)) {
3641 idle_balance(cpu, rq);
3642 if (!rq->nr_running) {
3643 next = rq->idle;
3644 rq->expired_timestamp = 0;
3645 goto switch_tasks;
3646 }
3647 } 3306 }
3648 3307
3649 array = rq->active; 3308 class = sched_class_highest;
3650 if (unlikely(!array->nr_active)) { 3309 for ( ; ; ) {
3310 p = class->pick_next_task(rq, now);
3311 if (p)
3312 return p;
3651 /* 3313 /*
3652 * Switch the active and expired arrays. 3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3653 */ 3316 */
3654 schedstat_inc(rq, sched_switch); 3317 class = class->next;
3655 rq->active = rq->expired;
3656 rq->expired = array;
3657 array = rq->active;
3658 rq->expired_timestamp = 0;
3659 rq->best_expired_prio = MAX_PRIO;
3660 } 3318 }
3319}
3320
3321/*
3322 * schedule() is the main scheduler function.
3323 */
3324asmlinkage void __sched schedule(void)
3325{
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3661 3331
3662 idx = sched_find_first_bit(array->bitmap); 3332need_resched:
3663 queue = array->queue + idx; 3333 preempt_disable();
3664 next = list_entry(queue->next, struct task_struct, run_list); 3334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3665 3339
3666 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3340 release_kernel_lock(prev);
3667 unsigned long long delta = now - next->timestamp; 3341need_resched_nonpreemptible:
3668 if (unlikely((long long)(now - next->timestamp) < 0))
3669 delta = 0;
3670 3342
3671 if (next->sleep_type == SLEEP_INTERACTIVE) 3343 schedule_debug(prev);
3672 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3673 3344
3674 array = next->array; 3345 spin_lock_irq(&rq->lock);
3675 new_prio = recalc_task_prio(next, next->timestamp + delta); 3346 clear_tsk_need_resched(prev);
3676 3347
3677 if (unlikely(next->prio != new_prio)) { 3348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3678 dequeue_task(next, array); 3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3679 next->prio = new_prio; 3350 unlikely(signal_pending(prev)))) {
3680 enqueue_task(next, array); 3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3681 } 3354 }
3355 switch_count = &prev->nvcsw;
3682 } 3356 }
3683 next->sleep_type = SLEEP_NORMAL;
3684switch_tasks:
3685 if (next == rq->idle)
3686 schedstat_inc(rq, sched_goidle);
3687 prefetch(next);
3688 prefetch_stack(next);
3689 clear_tsk_need_resched(prev);
3690 rcu_qsctr_inc(task_cpu(prev));
3691 3357
3692 update_cpu_clock(prev, rq, now); 3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3693 3360
3694 prev->sleep_avg -= run_time; 3361 now = __rq_clock(rq);
3695 if ((long)prev->sleep_avg <= 0) 3362 prev->sched_class->put_prev_task(rq, prev, now);
3696 prev->sleep_avg = 0; 3363 next = pick_next_task(rq, prev, now);
3697 prev->timestamp = prev->last_ran = now;
3698 3364
3699 sched_info_switch(prev, next); 3365 sched_info_switch(prev, next);
3366
3700 if (likely(prev != next)) { 3367 if (likely(prev != next)) {
3701 next->timestamp = next->last_ran = now;
3702 rq->nr_switches++; 3368 rq->nr_switches++;
3703 rq->curr = next; 3369 rq->curr = next;
3704 ++*switch_count; 3370 ++*switch_count;
3705 3371
3706 prepare_task_switch(rq, next); 3372 context_switch(rq, prev, next); /* unlocks the rq */
3707 prev = context_switch(rq, prev, next);
3708 barrier();
3709 /*
3710 * this_rq must be evaluated again because prev may have moved
3711 * CPUs since it called schedule(), thus the 'rq' on its stack
3712 * frame will be invalid.
3713 */
3714 finish_task_switch(this_rq(), prev);
3715 } else 3373 } else
3716 spin_unlock_irq(&rq->lock); 3374 spin_unlock_irq(&rq->lock);
3717 3375
3718 prev = current; 3376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3719 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
3720 goto need_resched_nonpreemptible; 3379 goto need_resched_nonpreemptible;
3380 }
3721 preempt_enable_no_resched(); 3381 preempt_enable_no_resched();
3722 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3723 goto need_resched; 3383 goto need_resched;
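The core of the new schedule() above: clear TIF_NEED_RESCHED, decide whether prev is blocking (and whether a pending signal keeps it runnable), pull work with idle_balance() if the queue went empty, let prev's class put it back with put_prev_task(), pick the next task across the classes, and context-switch. The voluntary/involuntary switch accounting is easy to miss; a small model of just that part (field names are illustrative, the PREEMPT_ACTIVE handling is left out):

        #include <stdio.h>

        #define TASK_RUNNING            0
        #define TASK_INTERRUPTIBLE      1

        struct task_model {
                int state;
                int signal_pending;
                unsigned long nvcsw;    /* voluntary context switches */
                unsigned long nivcsw;   /* involuntary context switches */
                int on_rq;
        };

        /* Prev-task handling at the top of schedule(); returns the counter to credit. */
        static unsigned long *handle_prev(struct task_model *prev)
        {
                unsigned long *switch_count = &prev->nivcsw;

                if (prev->state != TASK_RUNNING) {
                        /* A pending signal turns an interruptible sleep back into running. */
                        if (prev->state == TASK_INTERRUPTIBLE && prev->signal_pending)
                                prev->state = TASK_RUNNING;
                        else
                                prev->on_rq = 0;                /* deactivate_task() */
                        switch_count = &prev->nvcsw;            /* voluntary switch */
                }
                return switch_count;
        }

        int main(void)
        {
                struct task_model prev = { TASK_INTERRUPTIBLE, 0, 0, 0, 1 };
                unsigned long *switch_count = handle_prev(&prev);
                int switched = 1;       /* assume pick_next_task() chose someone else */

                if (switched)
                        ++*switch_count;
                printf("on_rq=%d nvcsw=%lu nivcsw=%lu\n",
                       prev.on_rq, prev.nvcsw, prev.nivcsw);
                return 0;
        }

As in the patch, the counter is only bumped when a different task is actually picked; the model makes that explicit with the switched flag.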
@@ -4045,74 +3705,85 @@ out:
4045} 3705}
4046EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3706EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4047 3707
4048 3708static inline void
4049#define SLEEP_ON_VAR \ 3709sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4050 unsigned long flags; \ 3710{
4051 wait_queue_t wait; \ 3711 spin_lock_irqsave(&q->lock, *flags);
4052 init_waitqueue_entry(&wait, current); 3712 __add_wait_queue(q, wait);
4053
4054#define SLEEP_ON_HEAD \
4055 spin_lock_irqsave(&q->lock,flags); \
4056 __add_wait_queue(q, &wait); \
4057 spin_unlock(&q->lock); 3713 spin_unlock(&q->lock);
3714}
4058 3715
4059#define SLEEP_ON_TAIL \ 3716static inline void
4060 spin_lock_irq(&q->lock); \ 3717sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4061 __remove_wait_queue(q, &wait); \ 3718{
4062 spin_unlock_irqrestore(&q->lock, flags); 3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3722}
4063 3723
4064void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3724void __sched interruptible_sleep_on(wait_queue_head_t *q)
4065{ 3725{
4066 SLEEP_ON_VAR 3726 unsigned long flags;
3727 wait_queue_t wait;
3728
3729 init_waitqueue_entry(&wait, current);
4067 3730
4068 current->state = TASK_INTERRUPTIBLE; 3731 current->state = TASK_INTERRUPTIBLE;
4069 3732
4070 SLEEP_ON_HEAD 3733 sleep_on_head(q, &wait, &flags);
4071 schedule(); 3734 schedule();
4072 SLEEP_ON_TAIL 3735 sleep_on_tail(q, &wait, &flags);
4073} 3736}
4074EXPORT_SYMBOL(interruptible_sleep_on); 3737EXPORT_SYMBOL(interruptible_sleep_on);
4075 3738
4076long fastcall __sched 3739long __sched
4077interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3740interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4078{ 3741{
4079 SLEEP_ON_VAR 3742 unsigned long flags;
3743 wait_queue_t wait;
3744
3745 init_waitqueue_entry(&wait, current);
4080 3746
4081 current->state = TASK_INTERRUPTIBLE; 3747 current->state = TASK_INTERRUPTIBLE;
4082 3748
4083 SLEEP_ON_HEAD 3749 sleep_on_head(q, &wait, &flags);
4084 timeout = schedule_timeout(timeout); 3750 timeout = schedule_timeout(timeout);
4085 SLEEP_ON_TAIL 3751 sleep_on_tail(q, &wait, &flags);
4086 3752
4087 return timeout; 3753 return timeout;
4088} 3754}
4089EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3755EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4090 3756
4091void fastcall __sched sleep_on(wait_queue_head_t *q) 3757void __sched sleep_on(wait_queue_head_t *q)
4092{ 3758{
4093 SLEEP_ON_VAR 3759 unsigned long flags;
3760 wait_queue_t wait;
3761
3762 init_waitqueue_entry(&wait, current);
4094 3763
4095 current->state = TASK_UNINTERRUPTIBLE; 3764 current->state = TASK_UNINTERRUPTIBLE;
4096 3765
4097 SLEEP_ON_HEAD 3766 sleep_on_head(q, &wait, &flags);
4098 schedule(); 3767 schedule();
4099 SLEEP_ON_TAIL 3768 sleep_on_tail(q, &wait, &flags);
4100} 3769}
4101EXPORT_SYMBOL(sleep_on); 3770EXPORT_SYMBOL(sleep_on);
4102 3771
4103long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3772long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4104{ 3773{
4105 SLEEP_ON_VAR 3774 unsigned long flags;
3775 wait_queue_t wait;
3776
3777 init_waitqueue_entry(&wait, current);
4106 3778
4107 current->state = TASK_UNINTERRUPTIBLE; 3779 current->state = TASK_UNINTERRUPTIBLE;
4108 3780
4109 SLEEP_ON_HEAD 3781 sleep_on_head(q, &wait, &flags);
4110 timeout = schedule_timeout(timeout); 3782 timeout = schedule_timeout(timeout);
4111 SLEEP_ON_TAIL 3783 sleep_on_tail(q, &wait, &flags);
4112 3784
4113 return timeout; 3785 return timeout;
4114} 3786}
4115
4116EXPORT_SYMBOL(sleep_on_timeout); 3787EXPORT_SYMBOL(sleep_on_timeout);
4117 3788
4118#ifdef CONFIG_RT_MUTEXES 3789#ifdef CONFIG_RT_MUTEXES
@@ -4129,29 +3800,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4129 */ 3800 */
4130void rt_mutex_setprio(struct task_struct *p, int prio) 3801void rt_mutex_setprio(struct task_struct *p, int prio)
4131{ 3802{
4132 struct prio_array *array;
4133 unsigned long flags; 3803 unsigned long flags;
3804 int oldprio, on_rq;
4134 struct rq *rq; 3805 struct rq *rq;
4135 int oldprio; 3806 u64 now;
4136 3807
4137 BUG_ON(prio < 0 || prio > MAX_PRIO); 3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
4138 3809
4139 rq = task_rq_lock(p, &flags); 3810 rq = task_rq_lock(p, &flags);
3811 now = rq_clock(rq);
4140 3812
4141 oldprio = p->prio; 3813 oldprio = p->prio;
4142 array = p->array; 3814 on_rq = p->se.on_rq;
4143 if (array) 3815 if (on_rq)
4144 dequeue_task(p, array); 3816 dequeue_task(rq, p, 0, now);
3817
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3822
4145 p->prio = prio; 3823 p->prio = prio;
4146 3824
4147 if (array) { 3825 if (on_rq) {
4148 /* 3826 enqueue_task(rq, p, 0, now);
4149 * If changing to an RT priority then queue it
4150 * in the active array!
4151 */
4152 if (rt_task(p))
4153 array = rq->active;
4154 enqueue_task(p, array);
4155 /* 3827 /*
4156 * Reschedule if we are currently running on this runqueue and 3828 * Reschedule if we are currently running on this runqueue and
4157 * our priority decreased, or if we are not currently running on 3829 * our priority decreased, or if we are not currently running on
@@ -4160,8 +3832,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4160 if (task_running(rq, p)) { 3832 if (task_running(rq, p)) {
4161 if (p->prio > oldprio) 3833 if (p->prio > oldprio)
4162 resched_task(rq->curr); 3834 resched_task(rq->curr);
4163 } else if (TASK_PREEMPTS_CURR(p, rq)) 3835 } else {
4164 resched_task(rq->curr); 3836 check_preempt_curr(rq, p);
3837 }
4165 } 3838 }
4166 task_rq_unlock(rq, &flags); 3839 task_rq_unlock(rq, &flags);
4167} 3840}
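rt_mutex_setprio() now follows the pattern used throughout this patch: never change a queued task in place. It dequeues the task if it is on a runqueue, switches its sched_class (rt below MAX_RT_PRIO, fair otherwise), writes the new prio, re-enqueues, and then either reschedules the current task or runs the preemption check. A model of that dequeue/modify/requeue invariant (MAX_RT_PRIO's value and the field names are stated assumptions):

        #include <stdio.h>

        #define MAX_RT_PRIO     100     /* assumed; matches the usual kernel value */

        struct task_model {
                int prio;
                int on_rq;
                const char *sched_class;
        };

        /* Priority-change pattern: never mutate a task while it sits on a runqueue. */
        static void setprio_model(struct task_model *p, int prio)
        {
                int was_queued = p->on_rq;

                if (was_queued)
                        p->on_rq = 0;                           /* dequeue_task() */

                p->sched_class = (prio < MAX_RT_PRIO) ? "rt" : "fair";
                p->prio = prio;

                if (was_queued)
                        p->on_rq = 1;                           /* enqueue_task() */
                /* ...followed by resched_task() or a check_preempt_curr()-style check */
        }

        int main(void)
        {
                struct task_model p = { 120, 1, "fair" };

                setprio_model(&p, 10);  /* boosted through a PI chain */
                printf("prio=%d class=%s on_rq=%d\n", p.prio, p.sched_class, p.on_rq);
                return 0;
        }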
@@ -4170,10 +3843,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4170 3843
4171void set_user_nice(struct task_struct *p, long nice) 3844void set_user_nice(struct task_struct *p, long nice)
4172{ 3845{
4173 struct prio_array *array; 3846 int old_prio, delta, on_rq;
4174 int old_prio, delta;
4175 unsigned long flags; 3847 unsigned long flags;
4176 struct rq *rq; 3848 struct rq *rq;
3849 u64 now;
4177 3850
4178 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4179 return; 3852 return;
@@ -4182,20 +3855,21 @@ void set_user_nice(struct task_struct *p, long nice)
4182 * the task might be in the middle of scheduling on another CPU. 3855 * the task might be in the middle of scheduling on another CPU.
4183 */ 3856 */
4184 rq = task_rq_lock(p, &flags); 3857 rq = task_rq_lock(p, &flags);
3858 now = rq_clock(rq);
4185 /* 3859 /*
4186 * The RT priorities are set via sched_setscheduler(), but we still 3860 * The RT priorities are set via sched_setscheduler(), but we still
4187 * allow the 'normal' nice value to be set - but as expected 3861 * allow the 'normal' nice value to be set - but as expected
4188 * it wont have any effect on scheduling until the task is 3862 * it wont have any effect on scheduling until the task is
4189 * not SCHED_NORMAL/SCHED_BATCH: 3863 * SCHED_FIFO/SCHED_RR:
4190 */ 3864 */
4191 if (has_rt_policy(p)) { 3865 if (task_has_rt_policy(p)) {
4192 p->static_prio = NICE_TO_PRIO(nice); 3866 p->static_prio = NICE_TO_PRIO(nice);
4193 goto out_unlock; 3867 goto out_unlock;
4194 } 3868 }
4195 array = p->array; 3869 on_rq = p->se.on_rq;
4196 if (array) { 3870 if (on_rq) {
4197 dequeue_task(p, array); 3871 dequeue_task(rq, p, 0, now);
4198 dec_raw_weighted_load(rq, p); 3872 dec_load(rq, p, now);
4199 } 3873 }
4200 3874
4201 p->static_prio = NICE_TO_PRIO(nice); 3875 p->static_prio = NICE_TO_PRIO(nice);
@@ -4204,9 +3878,9 @@ void set_user_nice(struct task_struct *p, long nice)
4204 p->prio = effective_prio(p); 3878 p->prio = effective_prio(p);
4205 delta = p->prio - old_prio; 3879 delta = p->prio - old_prio;
4206 3880
4207 if (array) { 3881 if (on_rq) {
4208 enqueue_task(p, array); 3882 enqueue_task(rq, p, 0, now);
4209 inc_raw_weighted_load(rq, p); 3883 inc_load(rq, p, now);
4210 /* 3884 /*
4211 * If the task increased its priority or is running and 3885 * If the task increased its priority or is running and
4212 * lowered its priority, then reschedule its CPU: 3886 * lowered its priority, then reschedule its CPU:
@@ -4326,20 +4000,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4326} 4000}
4327 4001
4328/* Actually do priority change: must hold rq lock. */ 4002/* Actually do priority change: must hold rq lock. */
4329static void __setscheduler(struct task_struct *p, int policy, int prio) 4003static void
4004__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4330{ 4005{
4331 BUG_ON(p->array); 4006 BUG_ON(p->se.on_rq);
4332 4007
4333 p->policy = policy; 4008 p->policy = policy;
4009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4019 }
4020
4334 p->rt_priority = prio; 4021 p->rt_priority = prio;
4335 p->normal_prio = normal_prio(p); 4022 p->normal_prio = normal_prio(p);
4336 /* we are holding p->pi_lock already */ 4023 /* we are holding p->pi_lock already */
4337 p->prio = rt_mutex_getprio(p); 4024 p->prio = rt_mutex_getprio(p);
4338 /*
4339 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4340 */
4341 if (policy == SCHED_BATCH)
4342 p->sleep_avg = 0;
4343 set_load_weight(p); 4025 set_load_weight(p);
4344} 4026}
4345 4027
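__setscheduler() now derives the scheduling class from the policy in one switch: SCHED_NORMAL, SCHED_BATCH and the new SCHED_IDLE all map to the fair class, while SCHED_FIFO and SCHED_RR map to the rt class; the old sleep_avg reset for SCHED_BATCH disappears because CFS keeps no interactivity estimate. A compile-and-run sketch of the mapping (the enum values here are illustrative and do not match the kernel's numeric policy values):

        #include <stdio.h>

        /* Numeric values are illustrative; they are not the kernel's policy numbers. */
        enum policy { SCHED_NORMAL, SCHED_FIFO, SCHED_RR, SCHED_BATCH, SCHED_IDLE };

        static const char *class_for_policy(enum policy policy)
        {
                switch (policy) {
                case SCHED_NORMAL:
                case SCHED_BATCH:
                case SCHED_IDLE:
                        return "fair_sched_class";
                case SCHED_FIFO:
                case SCHED_RR:
                        return "rt_sched_class";
                }
                return "fair_sched_class";      /* not reached for valid policies */
        }

        int main(void)
        {
                printf("SCHED_BATCH -> %s\n", class_for_policy(SCHED_BATCH));
                printf("SCHED_RR    -> %s\n", class_for_policy(SCHED_RR));
                printf("SCHED_IDLE  -> %s\n", class_for_policy(SCHED_IDLE));
                return 0;
        }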
@@ -4354,8 +4036,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4354int sched_setscheduler(struct task_struct *p, int policy, 4036int sched_setscheduler(struct task_struct *p, int policy,
4355 struct sched_param *param) 4037 struct sched_param *param)
4356{ 4038{
4357 int retval, oldprio, oldpolicy = -1; 4039 int retval, oldprio, oldpolicy = -1, on_rq;
4358 struct prio_array *array;
4359 unsigned long flags; 4040 unsigned long flags;
4360 struct rq *rq; 4041 struct rq *rq;
4361 4042
@@ -4366,27 +4047,27 @@ recheck:
4366 if (policy < 0) 4047 if (policy < 0)
4367 policy = oldpolicy = p->policy; 4048 policy = oldpolicy = p->policy;
4368 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4369 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
4370 return -EINVAL; 4052 return -EINVAL;
4371 /* 4053 /*
4372 * Valid priorities for SCHED_FIFO and SCHED_RR are 4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
4373 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4374 * SCHED_BATCH is 0. 4056 * SCHED_BATCH and SCHED_IDLE is 0.
4375 */ 4057 */
4376 if (param->sched_priority < 0 || 4058 if (param->sched_priority < 0 ||
4377 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4378 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4379 return -EINVAL; 4061 return -EINVAL;
4380 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4062 if (rt_policy(policy) != (param->sched_priority != 0))
4381 return -EINVAL; 4063 return -EINVAL;
4382 4064
4383 /* 4065 /*
4384 * Allow unprivileged RT tasks to decrease priority: 4066 * Allow unprivileged RT tasks to decrease priority:
4385 */ 4067 */
4386 if (!capable(CAP_SYS_NICE)) { 4068 if (!capable(CAP_SYS_NICE)) {
4387 if (is_rt_policy(policy)) { 4069 if (rt_policy(policy)) {
4388 unsigned long rlim_rtprio; 4070 unsigned long rlim_rtprio;
4389 unsigned long flags;
4390 4071
4391 if (!lock_task_sighand(p, &flags)) 4072 if (!lock_task_sighand(p, &flags))
4392 return -ESRCH; 4073 return -ESRCH;
@@ -4402,6 +4083,12 @@ recheck:
4402 param->sched_priority > rlim_rtprio) 4083 param->sched_priority > rlim_rtprio)
4403 return -EPERM; 4084 return -EPERM;
4404 } 4085 }
4086 /*
4087 * Like positive nice levels, dont allow tasks to
4088 * move out of SCHED_IDLE either:
4089 */
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
4405 4092
4406 /* can't change other user's priorities */ 4093 /* can't change other user's priorities */
4407 if ((current->euid != p->euid) && 4094 if ((current->euid != p->euid) &&
@@ -4429,13 +4116,13 @@ recheck:
4429 spin_unlock_irqrestore(&p->pi_lock, flags); 4116 spin_unlock_irqrestore(&p->pi_lock, flags);
4430 goto recheck; 4117 goto recheck;
4431 } 4118 }
4432 array = p->array; 4119 on_rq = p->se.on_rq;
4433 if (array) 4120 if (on_rq)
4434 deactivate_task(p, rq); 4121 deactivate_task(rq, p, 0);
4435 oldprio = p->prio; 4122 oldprio = p->prio;
4436 __setscheduler(p, policy, param->sched_priority); 4123 __setscheduler(rq, p, policy, param->sched_priority);
4437 if (array) { 4124 if (on_rq) {
4438 __activate_task(p, rq); 4125 activate_task(rq, p, 0);
4439 /* 4126 /*
4440 * Reschedule if we are currently running on this runqueue and 4127 * Reschedule if we are currently running on this runqueue and
4441 * our priority decreased, or if we are not currently running on 4128 * our priority decreased, or if we are not currently running on
@@ -4444,8 +4131,9 @@ recheck:
4444 if (task_running(rq, p)) { 4131 if (task_running(rq, p)) {
4445 if (p->prio > oldprio) 4132 if (p->prio > oldprio)
4446 resched_task(rq->curr); 4133 resched_task(rq->curr);
4447 } else if (TASK_PREEMPTS_CURR(p, rq)) 4134 } else {
4448 resched_task(rq->curr); 4135 check_preempt_curr(rq, p);
4136 }
4449 } 4137 }
4450 __task_rq_unlock(rq); 4138 __task_rq_unlock(rq);
4451 spin_unlock_irqrestore(&p->pi_lock, flags); 4139 spin_unlock_irqrestore(&p->pi_lock, flags);
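sched_setscheduler() keeps the same deactivate/__setscheduler/activate dance shown above, but the parameter validation earlier in the function is easy to gloss over: the priority must lie inside the policy's legal range, rt_policy(policy) must agree with whether a non-zero priority was passed, and the new rule forbids an unprivileged SCHED_IDLE task from escaping to another policy. A simplified model of the user-task checks (the kernel-thread range and the capability checks are omitted; constants and names are illustrative):

        #include <stdio.h>

        #define MAX_USER_RT_PRIO        100     /* assumed; matches the usual kernel value */

        enum { POL_NORMAL, POL_FIFO, POL_RR, POL_BATCH, POL_IDLE };     /* illustrative */

        static int rt_policy(int policy)
        {
                return policy == POL_FIFO || policy == POL_RR;
        }

        /* Validity rules applied to a user task before anything is changed. */
        static int check_param(int policy, int prio)
        {
                if (prio < 0 || prio > MAX_USER_RT_PRIO - 1)
                        return -1;                      /* -EINVAL */
                if (rt_policy(policy) != (prio != 0))
                        return -1;                      /* RT needs prio > 0, others need 0 */
                return 0;
        }

        int main(void)
        {
                printf("FIFO/50:   %d\n", check_param(POL_FIFO, 50));
                printf("NORMAL/10: %d\n", check_param(POL_NORMAL, 10));
                printf("IDLE/0:    %d\n", check_param(POL_IDLE, 0));
                return 0;
        }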
@@ -4717,41 +4405,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4717/** 4405/**
4718 * sys_sched_yield - yield the current processor to other threads. 4406 * sys_sched_yield - yield the current processor to other threads.
4719 * 4407 *
4720 * This function yields the current CPU by moving the calling thread 4408 * This function yields the current CPU to other tasks. If there are no
4721 * to the expired array. If there are no other threads running on this 4409 * other threads running on this CPU then this function will return.
4722 * CPU then this function will return.
4723 */ 4410 */
4724asmlinkage long sys_sched_yield(void) 4411asmlinkage long sys_sched_yield(void)
4725{ 4412{
4726 struct rq *rq = this_rq_lock(); 4413 struct rq *rq = this_rq_lock();
4727 struct prio_array *array = current->array, *target = rq->expired;
4728 4414
4729 schedstat_inc(rq, yld_cnt); 4415 schedstat_inc(rq, yld_cnt);
4730 /* 4416 if (unlikely(rq->nr_running == 1))
4731 * We implement yielding by moving the task into the expired
4732 * queue.
4733 *
4734 * (special rule: RT tasks will just roundrobin in the active
4735 * array.)
4736 */
4737 if (rt_task(current))
4738 target = rq->active;
4739
4740 if (array->nr_active == 1) {
4741 schedstat_inc(rq, yld_act_empty); 4417 schedstat_inc(rq, yld_act_empty);
4742 if (!rq->expired->nr_active) 4418 else
4743 schedstat_inc(rq, yld_both_empty); 4419 current->sched_class->yield_task(rq, current);
4744 } else if (!rq->expired->nr_active)
4745 schedstat_inc(rq, yld_exp_empty);
4746
4747 if (array != target) {
4748 dequeue_task(current, array);
4749 enqueue_task(current, target);
4750 } else
4751 /*
4752 * requeue_task is cheaper so perform that if possible.
4753 */
4754 requeue_task(current, array);
4755 4420
4756 /* 4421 /*
4757 * Since we are going to call schedule() anyway, there's 4422 * Since we are going to call schedule() anyway, there's
@@ -4902,6 +4567,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4902 break; 4567 break;
4903 case SCHED_NORMAL: 4568 case SCHED_NORMAL:
4904 case SCHED_BATCH: 4569 case SCHED_BATCH:
4570 case SCHED_IDLE:
4905 ret = 0; 4571 ret = 0;
4906 break; 4572 break;
4907 } 4573 }
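Since SCHED_IDLE is now accepted by the priority-range syscalls above, an easy way to see the resulting ranges from user space is sched_get_priority_max(2)/sched_get_priority_min(2). The value 5 for SCHED_IDLE below is an assumption taken from the kernel definition, since older libc headers may not provide the macro:

        #include <stdio.h>
        #include <sched.h>

        #ifndef SCHED_IDLE
        #define SCHED_IDLE 5    /* kernel value; may be missing from older libc headers */
        #endif

        int main(void)
        {
                const struct { const char *name; int policy; } pol[] = {
                        { "SCHED_FIFO",  SCHED_FIFO },
                        { "SCHED_RR",    SCHED_RR },
                        { "SCHED_OTHER", SCHED_OTHER },
                        { "SCHED_IDLE",  SCHED_IDLE },
                };
                unsigned int i;

                for (i = 0; i < sizeof(pol) / sizeof(pol[0]); i++)
                        printf("%-11s min=%d max=%d\n", pol[i].name,
                               sched_get_priority_min(pol[i].policy),
                               sched_get_priority_max(pol[i].policy));
                return 0;
        }

On a kernel without this patch the two SCHED_IDLE queries simply fail (returning -1 with errno set to EINVAL), which matches the switch statements in the hunks above.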
@@ -4926,6 +4592,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4926 break; 4592 break;
4927 case SCHED_NORMAL: 4593 case SCHED_NORMAL:
4928 case SCHED_BATCH: 4594 case SCHED_BATCH:
4595 case SCHED_IDLE:
4929 ret = 0; 4596 ret = 0;
4930 } 4597 }
4931 return ret; 4598 return ret;
@@ -4960,7 +4627,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4960 goto out_unlock; 4627 goto out_unlock;
4961 4628
4962 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4963 0 : task_timeslice(p), &t); 4630 0 : static_prio_timeslice(p->static_prio), &t);
4964 read_unlock(&tasklist_lock); 4631 read_unlock(&tasklist_lock);
4965 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4966out_nounlock: 4633out_nounlock:
@@ -5035,6 +4702,9 @@ void show_state_filter(unsigned long state_filter)
5035 4702
5036 touch_all_softlockup_watchdogs(); 4703 touch_all_softlockup_watchdogs();
5037 4704
4705#ifdef CONFIG_SCHED_DEBUG
4706 sysrq_sched_debug_show();
4707#endif
5038 read_unlock(&tasklist_lock); 4708 read_unlock(&tasklist_lock);
5039 /* 4709 /*
5040 * Only show locks if all tasks are dumped: 4710 * Only show locks if all tasks are dumped:
@@ -5043,6 +4713,11 @@ void show_state_filter(unsigned long state_filter)
5043 debug_show_all_locks(); 4713 debug_show_all_locks();
5044} 4714}
5045 4715
4716void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717{
4718 idle->sched_class = &idle_sched_class;
4719}
4720
5046/** 4721/**
5047 * init_idle - set up an idle thread for a given CPU 4722 * init_idle - set up an idle thread for a given CPU
5048 * @idle: task in question 4723 * @idle: task in question
@@ -5056,13 +4731,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5056 struct rq *rq = cpu_rq(cpu); 4731 struct rq *rq = cpu_rq(cpu);
5057 unsigned long flags; 4732 unsigned long flags;
5058 4733
5059 idle->timestamp = sched_clock(); 4734 __sched_fork(idle);
5060 idle->sleep_avg = 0; 4735 idle->se.exec_start = sched_clock();
5061 idle->array = NULL; 4736
5062 idle->prio = idle->normal_prio = MAX_PRIO; 4737 idle->prio = idle->normal_prio = MAX_PRIO;
5063 idle->state = TASK_RUNNING;
5064 idle->cpus_allowed = cpumask_of_cpu(cpu); 4738 idle->cpus_allowed = cpumask_of_cpu(cpu);
5065 set_task_cpu(idle, cpu); 4739 __set_task_cpu(idle, cpu);
5066 4740
5067 spin_lock_irqsave(&rq->lock, flags); 4741 spin_lock_irqsave(&rq->lock, flags);
5068 rq->curr = rq->idle = idle; 4742 rq->curr = rq->idle = idle;
@@ -5077,6 +4751,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5077#else 4751#else
5078 task_thread_info(idle)->preempt_count = 0; 4752 task_thread_info(idle)->preempt_count = 0;
5079#endif 4753#endif
4754 /*
4755 * The idle tasks have their own, simple scheduling class:
4756 */
4757 idle->sched_class = &idle_sched_class;
5080} 4758}
5081 4759
5082/* 4760/*
@@ -5088,6 +4766,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5088 */ 4766 */
5089cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4767cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5090 4768
4769/*
4770 * Increase the granularity value when there are more CPUs,
4771 * because with more CPUs the 'effective latency' as visible
4772 * to users decreases. But the relationship is not linear,
4773 * so pick a second-best guess by going with the log2 of the
4774 * number of CPUs.
4775 *
4776 * This idea comes from the SD scheduler of Con Kolivas:
4777 */
4778static inline void sched_init_granularity(void)
4779{
4780 unsigned int factor = 1 + ilog2(num_online_cpus());
4781 const unsigned long gran_limit = 10000000;
4782
4783 sysctl_sched_granularity *= factor;
4784 if (sysctl_sched_granularity > gran_limit)
4785 sysctl_sched_granularity = gran_limit;
4786
4787 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4788 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4789}
4790
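sched_init_granularity() above scales the base granularity by 1 + ilog2(num_online_cpus()) and caps it at 10 ms, then derives the runtime limit (four times the granularity) and the wakeup granularity (half of it). A small model of the resulting values; the 2 ms base is an assumption, the real default is whatever sysctl_sched_granularity was compiled in as:

        #include <stdio.h>

        /* Integer log2, the same result ilog2() gives for the values used here. */
        static unsigned int ilog2_model(unsigned int n)
        {
                unsigned int r = 0;

                while (n >>= 1)
                        r++;
                return r;
        }

        int main(void)
        {
                const unsigned long base_gran = 2000000UL;      /* assumed base, in ns */
                const unsigned long gran_limit = 10000000UL;    /* 10 ms cap from the patch */
                unsigned int cpus;

                for (cpus = 1; cpus <= 64; cpus *= 4) {
                        unsigned long gran = base_gran * (1 + ilog2_model(cpus));

                        if (gran > gran_limit)
                                gran = gran_limit;
                        printf("%2u CPUs: granularity %lu ns, runtime limit %lu ns, "
                               "wakeup granularity %lu ns\n",
                               cpus, gran, gran * 4, gran / 2);
                }
                return 0;
        }

With these assumptions the cap kicks in at 16 CPUs, which illustrates why the limit exists: the log2 scaling would otherwise keep growing the effective latency on very large machines.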
5091#ifdef CONFIG_SMP 4791#ifdef CONFIG_SMP
5092/* 4792/*
5093 * This is how migration works: 4793 * This is how migration works:
@@ -5161,7 +4861,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5161static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4861static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5162{ 4862{
5163 struct rq *rq_dest, *rq_src; 4863 struct rq *rq_dest, *rq_src;
5164 int ret = 0; 4864 int ret = 0, on_rq;
5165 4865
5166 if (unlikely(cpu_is_offline(dest_cpu))) 4866 if (unlikely(cpu_is_offline(dest_cpu)))
5167 return ret; 4867 return ret;
@@ -5177,20 +4877,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5177 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4877 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5178 goto out; 4878 goto out;
5179 4879
4880 on_rq = p->se.on_rq;
4881 if (on_rq)
4882 deactivate_task(rq_src, p, 0);
5180 set_task_cpu(p, dest_cpu); 4883 set_task_cpu(p, dest_cpu);
5181 if (p->array) { 4884 if (on_rq) {
5182 /* 4885 activate_task(rq_dest, p, 0);
5183 * Sync timestamp with rq_dest's before activating. 4886 check_preempt_curr(rq_dest, p);
5184 * The same thing could be achieved by doing this step
5185 * afterwards, and pretending it was a local activate.
5186 * This way is cleaner and logically correct.
5187 */
5188 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5189 + rq_dest->most_recent_timestamp;
5190 deactivate_task(p, rq_src);
5191 __activate_task(p, rq_dest);
5192 if (TASK_PREEMPTS_CURR(p, rq_dest))
5193 resched_task(rq_dest->curr);
5194 } 4887 }
5195 ret = 1; 4888 ret = 1;
5196out: 4889out:
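__migrate_task() now reuses the generic queueing primitives instead of hand-syncing timestamps between runqueues: dequeue from the source runqueue if the task was queued, point it at the destination CPU, enqueue there, and let check_preempt_curr() decide whether to preempt. A toy model of that sequence (struct layout and counters are illustrative, locking is omitted):

        #include <stdio.h>

        struct rq_model {
                int cpu;
                unsigned int nr_running;
        };

        struct task_model {
                int cpu;
                int on_rq;
        };

        /* Core of the migration: dequeue, retarget, requeue, preemption check. */
        static void migrate_task_model(struct task_model *p,
                                       struct rq_model *src, struct rq_model *dst)
        {
                int on_rq = p->on_rq;

                if (on_rq) {
                        p->on_rq = 0;           /* deactivate_task(rq_src, p, 0) */
                        src->nr_running--;
                }
                p->cpu = dst->cpu;              /* set_task_cpu() */
                if (on_rq) {
                        p->on_rq = 1;           /* activate_task(rq_dest, p, 0) */
                        dst->nr_running++;
                        /* check_preempt_curr(rq_dest, p) would run here */
                }
        }

        int main(void)
        {
                struct rq_model rq0 = { 0, 3 }, rq1 = { 1, 1 };
                struct task_model p = { 0, 1 };

                migrate_task_model(&p, &rq0, &rq1);
                printf("task now on cpu%d, rq0 nr_running=%u, rq1 nr_running=%u\n",
                       p.cpu, rq0.nr_running, rq1.nr_running);
                return 0;
        }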
@@ -5342,7 +5035,8 @@ static void migrate_live_tasks(int src_cpu)
5342 write_unlock_irq(&tasklist_lock); 5035 write_unlock_irq(&tasklist_lock);
5343} 5036}
5344 5037
5345/* Schedules idle task to be the next runnable task on current CPU. 5038/*
5039 * Schedules idle task to be the next runnable task on current CPU.
5346 * It does so by boosting its priority to highest possible and adding it to 5040 * It does so by boosting its priority to highest possible and adding it to
5347 * the _front_ of the runqueue. Used by CPU offline code. 5041 * the _front_ of the runqueue. Used by CPU offline code.
5348 */ 5042 */
@@ -5362,10 +5056,10 @@ void sched_idle_next(void)
5362 */ 5056 */
5363 spin_lock_irqsave(&rq->lock, flags); 5057 spin_lock_irqsave(&rq->lock, flags);
5364 5058
5365 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5059 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5366 5060
5367 /* Add idle task to the _front_ of its priority queue: */ 5061 /* Add idle task to the _front_ of its priority queue: */
5368 __activate_idle_task(p, rq); 5062 activate_idle_task(p, rq);
5369 5063
5370 spin_unlock_irqrestore(&rq->lock, flags); 5064 spin_unlock_irqrestore(&rq->lock, flags);
5371} 5065}
@@ -5415,16 +5109,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5415static void migrate_dead_tasks(unsigned int dead_cpu) 5109static void migrate_dead_tasks(unsigned int dead_cpu)
5416{ 5110{
5417 struct rq *rq = cpu_rq(dead_cpu); 5111 struct rq *rq = cpu_rq(dead_cpu);
5418 unsigned int arr, i; 5112 struct task_struct *next;
5419 5113
5420 for (arr = 0; arr < 2; arr++) { 5114 for ( ; ; ) {
5421 for (i = 0; i < MAX_PRIO; i++) { 5115 if (!rq->nr_running)
5422 struct list_head *list = &rq->arrays[arr].queue[i]; 5116 break;
5423 5117 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5424 while (!list_empty(list)) 5118 if (!next)
5425 migrate_dead(dead_cpu, list_entry(list->next, 5119 break;
5426 struct task_struct, run_list)); 5120 migrate_dead(dead_cpu, next);
5427 }
5428 } 5121 }
5429} 5122}
5430#endif /* CONFIG_HOTPLUG_CPU */ 5123#endif /* CONFIG_HOTPLUG_CPU */
@@ -5448,14 +5141,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5448 5141
5449 case CPU_UP_PREPARE: 5142 case CPU_UP_PREPARE:
5450 case CPU_UP_PREPARE_FROZEN: 5143 case CPU_UP_PREPARE_FROZEN:
5451 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5144 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5452 if (IS_ERR(p)) 5145 if (IS_ERR(p))
5453 return NOTIFY_BAD; 5146 return NOTIFY_BAD;
5454 p->flags |= PF_NOFREEZE; 5147 p->flags |= PF_NOFREEZE;
5455 kthread_bind(p, cpu); 5148 kthread_bind(p, cpu);
5456 /* Must be high prio: stop_machine expects to yield to it. */ 5149 /* Must be high prio: stop_machine expects to yield to it. */
5457 rq = task_rq_lock(p, &flags); 5150 rq = task_rq_lock(p, &flags);
5458 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5151 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5459 task_rq_unlock(rq, &flags); 5152 task_rq_unlock(rq, &flags);
5460 cpu_rq(cpu)->migration_thread = p; 5153 cpu_rq(cpu)->migration_thread = p;
5461 break; 5154 break;
@@ -5486,9 +5179,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5486 rq->migration_thread = NULL; 5179 rq->migration_thread = NULL;
5487 /* Idle task back to normal (off runqueue, low prio) */ 5180 /* Idle task back to normal (off runqueue, low prio) */
5488 rq = task_rq_lock(rq->idle, &flags); 5181 rq = task_rq_lock(rq->idle, &flags);
5489 deactivate_task(rq->idle, rq); 5182 deactivate_task(rq, rq->idle, 0);
5490 rq->idle->static_prio = MAX_PRIO; 5183 rq->idle->static_prio = MAX_PRIO;
5491 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185 rq->idle->sched_class = &idle_sched_class;
5492 migrate_dead_tasks(cpu); 5186 migrate_dead_tasks(cpu);
5493 task_rq_unlock(rq, &flags); 5187 task_rq_unlock(rq, &flags);
5494 migrate_nr_uninterruptible(rq); 5188 migrate_nr_uninterruptible(rq);
@@ -5797,483 +5491,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797 5491
5798#define SD_NODES_PER_DOMAIN 16 5492#define SD_NODES_PER_DOMAIN 16
5799 5493
5800/*
5801 * Self-tuning task migration cost measurement between source and target CPUs.
5802 *
5803 * This is done by measuring the cost of manipulating buffers of varying
5804 * sizes. For a given buffer-size here are the steps that are taken:
5805 *
5806 * 1) the source CPU reads+dirties a shared buffer
5807 * 2) the target CPU reads+dirties the same shared buffer
5808 *
5809 * We measure how long they take, in the following 4 scenarios:
5810 *
5811 * - source: CPU1, target: CPU2 | cost1
5812 * - source: CPU2, target: CPU1 | cost2
5813 * - source: CPU1, target: CPU1 | cost3
5814 * - source: CPU2, target: CPU2 | cost4
5815 *
5816 * We then calculate the cost3+cost4-cost1-cost2 difference - this is
5817 * the cost of migration.
5818 *
5819 * We then start off from a small buffer-size and iterate up to larger
5820 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5821 * doing a maximum search for the cost. (The maximum cost for a migration
5822 * normally occurs when the working set size is around the effective cache
5823 * size.)
5824 */
5825#define SEARCH_SCOPE 2
5826#define MIN_CACHE_SIZE (64*1024U)
5827#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5828#define ITERATIONS 1
5829#define SIZE_THRESH 130
5830#define COST_THRESH 130
5831
5832/*
5833 * The migration cost is a function of 'domain distance'. Domain
5834 * distance is the number of steps a CPU has to iterate down its
5835 * domain tree to share a domain with the other CPU. The farther
5836 * two CPUs are from each other, the larger the distance gets.
5837 *
5838 * Note that we use the distance only to cache measurement results,
5839 * the distance value is not used numerically otherwise. When two
5840 * CPUs have the same distance it is assumed that the migration
5841 * cost is the same. (this is a simplification but quite practical)
5842 */
5843#define MAX_DOMAIN_DISTANCE 32
5844
5845static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847/*
5848 * Architectures may override the migration cost and thus avoid
5849 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5850 * virtualized hardware:
5851 */
5852#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853 CONFIG_DEFAULT_MIGRATION_COST
5854#else
5855 -1LL
5856#endif
5857};
5858
5859/*
5860 * Allow override of migration cost - in units of microseconds.
5861 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863 */
5864static int __init migration_cost_setup(char *str)
5865{
5866 int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868 str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870 printk("#ints: %d\n", ints[0]);
5871 for (i = 1; i <= ints[0]; i++) {
5872 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874 }
5875 return 1;
5876}
5877
5878__setup ("migration_cost=", migration_cost_setup);
5879
5880/*
5881 * Global multiplier (divisor) for migration-cutoff values,
5882 * in percentiles. E.g. use a value of 150 to get 1.5 times
5883 * longer cache-hot cutoff times.
5884 *
5885 * (We scale it from 100 to 128 to long long handling easier.)
5886 */
5887
5888#define MIGRATION_FACTOR_SCALE 128
5889
5890static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892static int __init setup_migration_factor(char *str)
5893{
5894 get_option(&str, &migration_factor);
5895 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896 return 1;
5897}
5898
5899__setup("migration_factor=", setup_migration_factor);
5900
5901/*
5902 * Estimated distance of two CPUs, measured via the number of domains
5903 * we have to pass for the two CPUs to be in the same span:
5904 */
5905static unsigned long domain_distance(int cpu1, int cpu2)
5906{
5907 unsigned long distance = 0;
5908 struct sched_domain *sd;
5909
5910 for_each_domain(cpu1, sd) {
5911 WARN_ON(!cpu_isset(cpu1, sd->span));
5912 if (cpu_isset(cpu2, sd->span))
5913 return distance;
5914 distance++;
5915 }
5916 if (distance >= MAX_DOMAIN_DISTANCE) {
5917 WARN_ON(1);
5918 distance = MAX_DOMAIN_DISTANCE-1;
5919 }
5920
5921 return distance;
5922}
5923
5924static unsigned int migration_debug;
5925
5926static int __init setup_migration_debug(char *str)
5927{
5928 get_option(&str, &migration_debug);
5929 return 1;
5930}
5931
5932__setup("migration_debug=", setup_migration_debug);
5933
5934/*
5935 * Maximum cache-size that the scheduler should try to measure.
5936 * Architectures with larger caches should tune this up during
5937 * bootup. Gets used in the domain-setup code (i.e. during SMP
5938 * bootup).
5939 */
5940unsigned int max_cache_size;
5941
5942static int __init setup_max_cache_size(char *str)
5943{
5944 get_option(&str, &max_cache_size);
5945 return 1;
5946}
5947
5948__setup("max_cache_size=", setup_max_cache_size);
5949
5950/*
5951 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952 * is the operation that is timed, so we try to generate unpredictable
5953 * cachemisses that still end up filling the L2 cache:
5954 */
5955static void touch_cache(void *__cache, unsigned long __size)
5956{
5957 unsigned long size = __size / sizeof(long);
5958 unsigned long chunk1 = size / 3;
5959 unsigned long chunk2 = 2 * size / 3;
5960 unsigned long *cache = __cache;
5961 int i;
5962
5963 for (i = 0; i < size/6; i += 8) {
5964 switch (i % 6) {
5965 case 0: cache[i]++;
5966 case 1: cache[size-1-i]++;
5967 case 2: cache[chunk1-i]++;
5968 case 3: cache[chunk1+i]++;
5969 case 4: cache[chunk2-i]++;
5970 case 5: cache[chunk2+i]++;
5971 }
5972 }
5973}
5974
5975/*
5976 * Measure the cache-cost of one task migration. Returns in units of nsec.
5977 */
5978static unsigned long long
5979measure_one(void *cache, unsigned long size, int source, int target)
5980{
5981 cpumask_t mask, saved_mask;
5982 unsigned long long t0, t1, t2, t3, cost;
5983
5984 saved_mask = current->cpus_allowed;
5985
5986 /*
5987 * Flush source caches to RAM and invalidate them:
5988 */
5989 sched_cacheflush();
5990
5991 /*
5992 * Migrate to the source CPU:
5993 */
5994 mask = cpumask_of_cpu(source);
5995 set_cpus_allowed(current, mask);
5996 WARN_ON(smp_processor_id() != source);
5997
5998 /*
5999 * Dirty the working set:
6000 */
6001 t0 = sched_clock();
6002 touch_cache(cache, size);
6003 t1 = sched_clock();
6004
6005 /*
6006 * Migrate to the target CPU, dirty the L2 cache and access
6007 * the shared buffer. (which represents the working set
6008 * of a migrated task.)
6009 */
6010 mask = cpumask_of_cpu(target);
6011 set_cpus_allowed(current, mask);
6012 WARN_ON(smp_processor_id() != target);
6013
6014 t2 = sched_clock();
6015 touch_cache(cache, size);
6016 t3 = sched_clock();
6017
6018 cost = t1-t0 + t3-t2;
6019
6020 if (migration_debug >= 2)
6021 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022 source, target, t1-t0, t1-t0, t3-t2, cost);
6023 /*
6024 * Flush target caches to RAM and invalidate them:
6025 */
6026 sched_cacheflush();
6027
6028 set_cpus_allowed(current, saved_mask);
6029
6030 return cost;
6031}
6032
6033/*
6034 * Measure a series of task migrations and return the average
6035 * result. Since this code runs early during bootup the system
6036 * is 'undisturbed' and the average latency makes sense.
6037 *
6038 * The algorithm in essence auto-detects the relevant cache-size,
6039 * so it will properly detect different cachesizes for different
6040 * cache-hierarchies, depending on how the CPUs are connected.
6041 *
6042 * Architectures can prime the upper limit of the search range via
6043 * max_cache_size, otherwise the search range defaults to 20MB...64K.
6044 */
6045static unsigned long long
6046measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047{
6048 unsigned long long cost1, cost2;
6049 int i;
6050
6051 /*
6052 * Measure the migration cost of 'size' bytes, over an
6053 * average of 10 runs:
6054 *
6055 * (We perturb the cache size by a small (0..4k)
6056 * value to compensate size/alignment related artifacts.
6057 * We also subtract the cost of the operation done on
6058 * the same CPU.)
6059 */
6060 cost1 = 0;
6061
6062 /*
6063 * dry run, to make sure we start off cache-cold on cpu1,
6064 * and to get any vmalloc pagefaults in advance:
6065 */
6066 measure_one(cache, size, cpu1, cpu2);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070 measure_one(cache, size, cpu2, cpu1);
6071 for (i = 0; i < ITERATIONS; i++)
6072 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074 /*
6075 * (We measure the non-migrating [cached] cost on both
6076 * cpu1 and cpu2, to handle CPUs with different speeds)
6077 */
6078 cost2 = 0;
6079
6080 measure_one(cache, size, cpu1, cpu1);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084 measure_one(cache, size, cpu2, cpu2);
6085 for (i = 0; i < ITERATIONS; i++)
6086 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088 /*
6089 * Get the per-iteration migration cost:
6090 */
6091 do_div(cost1, 2 * ITERATIONS);
6092 do_div(cost2, 2 * ITERATIONS);
6093
6094 return cost1 - cost2;
6095}
6096
6097static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098{
6099 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100 unsigned int max_size, size, size_found = 0;
6101 long long cost = 0, prev_cost;
6102 void *cache;
6103
6104 /*
6105 * Search from max_cache_size*5 down to 64K - the real relevant
6106 * cachesize has to lie somewhere inbetween.
6107 */
6108 if (max_cache_size) {
6109 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111 } else {
6112 /*
6113 * Since we have no estimation about the relevant
6114 * search range
6115 */
6116 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117 size = MIN_CACHE_SIZE;
6118 }
6119
6120 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6121 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122 return 0;
6123 }
6124
6125 /*
6126 * Allocate the working set:
6127 */
6128 cache = vmalloc(max_size);
6129 if (!cache) {
6130 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6131 return 1000000; /* return 1 msec on very small boxen */
6132 }
6133
6134 while (size <= max_size) {
6135 prev_cost = cost;
6136 cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138 /*
6139 * Update the max:
6140 */
6141 if (cost > 0) {
6142 if (max_cost < cost) {
6143 max_cost = cost;
6144 size_found = size;
6145 }
6146 }
6147 /*
6148 * Calculate average fluctuation, we use this to prevent
6149 * noise from triggering an early break out of the loop:
6150 */
6151 fluct = abs(cost - prev_cost);
6152 avg_fluct = (avg_fluct + fluct)/2;
6153
6154 if (migration_debug)
6155 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156 "(%8Ld %8Ld)\n",
6157 cpu1, cpu2, size,
6158 (long)cost / 1000000,
6159 ((long)cost / 100000) % 10,
6160 (long)max_cost / 1000000,
6161 ((long)max_cost / 100000) % 10,
6162 domain_distance(cpu1, cpu2),
6163 cost, avg_fluct);
6164
6165 /*
6166 * If we iterated at least 20% past the previous maximum,
6167 * and the cost has dropped by more than 20% already,
6168 * (taking fluctuations into account) then we assume to
6169 * have found the maximum and break out of the loop early:
6170 */
6171 if (size_found && (size*100 > size_found*SIZE_THRESH))
6172 if (cost+avg_fluct <= 0 ||
6173 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6174
6175 if (migration_debug)
6176 printk("-> found max.\n");
6177 break;
6178 }
6179 /*
6180 * Increase the cachesize in 10% steps:
6181 */
6182 size = size * 10 / 9;
6183 }
6184
6185 if (migration_debug)
6186 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187 cpu1, cpu2, size_found, max_cost);
6188
6189 vfree(cache);
6190
6191 /*
6192 * A task is considered 'cache cold' if at least 2 times
6193 * the worst-case cost of migration has passed.
6194 *
6195 * (this limit is only listened to if the load-balancing
6196 * situation is 'nice' - if there is a large imbalance we
6197 * ignore it for the sake of CPU utilization and
6198 * processing fairness.)
6199 */
6200 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201}
6202
6203static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204{
6205 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206 unsigned long j0, j1, distance, max_distance = 0;
6207 struct sched_domain *sd;
6208
6209 j0 = jiffies;
6210
6211 /*
6212 * First pass - calculate the cacheflush times:
6213 */
6214 for_each_cpu_mask(cpu1, *cpu_map) {
6215 for_each_cpu_mask(cpu2, *cpu_map) {
6216 if (cpu1 == cpu2)
6217 continue;
6218 distance = domain_distance(cpu1, cpu2);
6219 max_distance = max(max_distance, distance);
6220 /*
6221 * No result cached yet?
6222 */
6223 if (migration_cost[distance] == -1LL)
6224 migration_cost[distance] =
6225 measure_migration_cost(cpu1, cpu2);
6226 }
6227 }
6228 /*
6229 * Second pass - update the sched domain hierarchy with
6230 * the new cache-hot-time estimations:
6231 */
6232 for_each_cpu_mask(cpu, *cpu_map) {
6233 distance = 0;
6234 for_each_domain(cpu, sd) {
6235 sd->cache_hot_time = migration_cost[distance];
6236 distance++;
6237 }
6238 }
6239 /*
6240 * Print the matrix:
6241 */
6242 if (migration_debug)
6243 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244 max_cache_size,
6245#ifdef CONFIG_X86
6246 cpu_khz/1000
6247#else
6248 -1
6249#endif
6250 );
6251 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252 printk("migration_cost=");
6253 for (distance = 0; distance <= max_distance; distance++) {
6254 if (distance)
6255 printk(",");
6256 printk("%ld", (long)migration_cost[distance] / 1000);
6257 }
6258 printk("\n");
6259 }
6260 j1 = jiffies;
6261 if (migration_debug)
6262 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264 /*
6265 * Move back to the original CPU. NUMA-Q gets confused
6266 * if we migrate to another quad during bootup.
6267 */
6268 if (raw_smp_processor_id() != orig_cpu) {
6269 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270 saved_mask = current->cpus_allowed;
6271
6272 set_cpus_allowed(current, mask);
6273 set_cpus_allowed(current, saved_mask);
6274 }
6275}
6276
6277#ifdef CONFIG_NUMA 5494#ifdef CONFIG_NUMA
6278 5495
6279/** 5496/**
@@ -6574,7 +5791,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6574static int build_sched_domains(const cpumask_t *cpu_map) 5791static int build_sched_domains(const cpumask_t *cpu_map)
6575{ 5792{
6576 int i; 5793 int i;
6577 struct sched_domain *sd;
6578#ifdef CONFIG_NUMA 5794#ifdef CONFIG_NUMA
6579 struct sched_group **sched_group_nodes = NULL; 5795 struct sched_group **sched_group_nodes = NULL;
6580 int sd_allnodes = 0; 5796 int sd_allnodes = 0;
@@ -6582,7 +5798,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6582 /* 5798 /*
6583 * Allocate the per-node list of sched groups 5799 * Allocate the per-node list of sched groups
6584 */ 5800 */
6585 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5801 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6586 GFP_KERNEL); 5802 GFP_KERNEL);
6587 if (!sched_group_nodes) { 5803 if (!sched_group_nodes) {
6588 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5804 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6601,8 +5817,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 cpus_and(nodemask, nodemask, *cpu_map); 5817 cpus_and(nodemask, nodemask, *cpu_map);
6602 5818
6603#ifdef CONFIG_NUMA 5819#ifdef CONFIG_NUMA
6604 if (cpus_weight(*cpu_map) 5820 if (cpus_weight(*cpu_map) >
6605 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5821 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6606 sd = &per_cpu(allnodes_domains, i); 5822 sd = &per_cpu(allnodes_domains, i);
6607 *sd = SD_ALLNODES_INIT; 5823 *sd = SD_ALLNODES_INIT;
6608 sd->span = *cpu_map; 5824 sd->span = *cpu_map;
@@ -6661,7 +5877,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6661 if (i != first_cpu(this_sibling_map)) 5877 if (i != first_cpu(this_sibling_map))
6662 continue; 5878 continue;
6663 5879
6664 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5880 init_sched_build_groups(this_sibling_map, cpu_map,
5881 &cpu_to_cpu_group);
6665 } 5882 }
6666#endif 5883#endif
6667 5884
@@ -6672,11 +5889,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6672 cpus_and(this_core_map, this_core_map, *cpu_map); 5889 cpus_and(this_core_map, this_core_map, *cpu_map);
6673 if (i != first_cpu(this_core_map)) 5890 if (i != first_cpu(this_core_map))
6674 continue; 5891 continue;
6675 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5892 init_sched_build_groups(this_core_map, cpu_map,
5893 &cpu_to_core_group);
6676 } 5894 }
6677#endif 5895#endif
6678 5896
6679
6680 /* Set up physical groups */ 5897 /* Set up physical groups */
6681 for (i = 0; i < MAX_NUMNODES; i++) { 5898 for (i = 0; i < MAX_NUMNODES; i++) {
6682 cpumask_t nodemask = node_to_cpumask(i); 5899 cpumask_t nodemask = node_to_cpumask(i);
@@ -6691,7 +5908,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6691#ifdef CONFIG_NUMA 5908#ifdef CONFIG_NUMA
6692 /* Set up node groups */ 5909 /* Set up node groups */
6693 if (sd_allnodes) 5910 if (sd_allnodes)
6694 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5911 init_sched_build_groups(*cpu_map, cpu_map,
5912 &cpu_to_allnodes_group);
6695 5913
6696 for (i = 0; i < MAX_NUMNODES; i++) { 5914 for (i = 0; i < MAX_NUMNODES; i++) {
6697 /* Set up node groups */ 5915 /* Set up node groups */
@@ -6719,6 +5937,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6719 sched_group_nodes[i] = sg; 5937 sched_group_nodes[i] = sg;
6720 for_each_cpu_mask(j, nodemask) { 5938 for_each_cpu_mask(j, nodemask) {
6721 struct sched_domain *sd; 5939 struct sched_domain *sd;
5940
6722 sd = &per_cpu(node_domains, j); 5941 sd = &per_cpu(node_domains, j);
6723 sd->groups = sg; 5942 sd->groups = sg;
6724 } 5943 }
@@ -6763,19 +5982,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6763 /* Calculate CPU power for physical packages and nodes */ 5982 /* Calculate CPU power for physical packages and nodes */
6764#ifdef CONFIG_SCHED_SMT 5983#ifdef CONFIG_SCHED_SMT
6765 for_each_cpu_mask(i, *cpu_map) { 5984 for_each_cpu_mask(i, *cpu_map) {
6766 sd = &per_cpu(cpu_domains, i); 5985 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5986
6767 init_sched_groups_power(i, sd); 5987 init_sched_groups_power(i, sd);
6768 } 5988 }
6769#endif 5989#endif
6770#ifdef CONFIG_SCHED_MC 5990#ifdef CONFIG_SCHED_MC
6771 for_each_cpu_mask(i, *cpu_map) { 5991 for_each_cpu_mask(i, *cpu_map) {
6772 sd = &per_cpu(core_domains, i); 5992 struct sched_domain *sd = &per_cpu(core_domains, i);
5993
6773 init_sched_groups_power(i, sd); 5994 init_sched_groups_power(i, sd);
6774 } 5995 }
6775#endif 5996#endif
6776 5997
6777 for_each_cpu_mask(i, *cpu_map) { 5998 for_each_cpu_mask(i, *cpu_map) {
6778 sd = &per_cpu(phys_domains, i); 5999 struct sched_domain *sd = &per_cpu(phys_domains, i);
6000
6779 init_sched_groups_power(i, sd); 6001 init_sched_groups_power(i, sd);
6780 } 6002 }
6781 6003
@@ -6803,10 +6025,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803#endif 6025#endif
6804 cpu_attach_domain(sd, i); 6026 cpu_attach_domain(sd, i);
6805 } 6027 }
6806 /*
6807 * Tune cache-hot values:
6808 */
6809 calibrate_migration_costs(cpu_map);
6810 6028
6811 return 0; 6029 return 0;
6812 6030
@@ -7013,10 +6231,12 @@ void __init sched_init_smp(void)
7013 /* Move init over to a non-isolated CPU */ 6231 /* Move init over to a non-isolated CPU */
7014 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7015 BUG(); 6233 BUG();
6234 sched_init_granularity();
7016} 6235}
7017#else 6236#else
7018void __init sched_init_smp(void) 6237void __init sched_init_smp(void)
7019{ 6238{
6239 sched_init_granularity();
7020} 6240}
7021#endif /* CONFIG_SMP */ 6241#endif /* CONFIG_SMP */
7022 6242
@@ -7030,28 +6250,51 @@ int in_sched_functions(unsigned long addr)
7030 && addr < (unsigned long)__sched_text_end); 6250 && addr < (unsigned long)__sched_text_end);
7031} 6251}
7032 6252
6253static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254{
6255 cfs_rq->tasks_timeline = RB_ROOT;
6256 cfs_rq->fair_clock = 1;
6257#ifdef CONFIG_FAIR_GROUP_SCHED
6258 cfs_rq->rq = rq;
6259#endif
6260}
6261
7033void __init sched_init(void) 6262void __init sched_init(void)
7034{ 6263{
7035 int i, j, k; 6264 u64 now = sched_clock();
7036 int highest_cpu = 0; 6265 int highest_cpu = 0;
6266 int i, j;
6267
6268 /*
6269 * Link up the scheduling class hierarchy:
6270 */
6271 rt_sched_class.next = &fair_sched_class;
6272 fair_sched_class.next = &idle_sched_class;
6273 idle_sched_class.next = NULL;
7037 6274
7038 for_each_possible_cpu(i) { 6275 for_each_possible_cpu(i) {
7039 struct prio_array *array; 6276 struct rt_prio_array *array;
7040 struct rq *rq; 6277 struct rq *rq;
7041 6278
7042 rq = cpu_rq(i); 6279 rq = cpu_rq(i);
7043 spin_lock_init(&rq->lock); 6280 spin_lock_init(&rq->lock);
7044 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7045 rq->nr_running = 0; 6282 rq->nr_running = 0;
7046 rq->active = rq->arrays; 6283 rq->clock = 1;
7047 rq->expired = rq->arrays + 1; 6284 init_cfs_rq(&rq->cfs, rq);
7048 rq->best_expired_prio = MAX_PRIO; 6285#ifdef CONFIG_FAIR_GROUP_SCHED
6286 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288#endif
6289 rq->ls.load_update_last = now;
6290 rq->ls.load_update_start = now;
7049 6291
6292 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293 rq->cpu_load[j] = 0;
7050#ifdef CONFIG_SMP 6294#ifdef CONFIG_SMP
7051 rq->sd = NULL; 6295 rq->sd = NULL;
7052 for (j = 1; j < 3; j++)
7053 rq->cpu_load[j] = 0;
7054 rq->active_balance = 0; 6296 rq->active_balance = 0;
6297 rq->next_balance = jiffies;
7055 rq->push_cpu = 0; 6298 rq->push_cpu = 0;
7056 rq->cpu = i; 6299 rq->cpu = i;
7057 rq->migration_thread = NULL; 6300 rq->migration_thread = NULL;
@@ -7059,16 +6302,14 @@ void __init sched_init(void)
7059#endif 6302#endif
7060 atomic_set(&rq->nr_iowait, 0); 6303 atomic_set(&rq->nr_iowait, 0);
7061 6304
7062 for (j = 0; j < 2; j++) { 6305 array = &rq->rt.active;
7063 array = rq->arrays + j; 6306 for (j = 0; j < MAX_RT_PRIO; j++) {
7064 for (k = 0; k < MAX_PRIO; k++) { 6307 INIT_LIST_HEAD(array->queue + j);
7065 INIT_LIST_HEAD(array->queue + k); 6308 __clear_bit(j, array->bitmap);
7066 __clear_bit(k, array->bitmap);
7067 }
7068 // delimiter for bitsearch
7069 __set_bit(MAX_PRIO, array->bitmap);
7070 } 6309 }
7071 highest_cpu = i; 6310 highest_cpu = i;
6311 /* delimiter for bitsearch: */
6312 __set_bit(MAX_RT_PRIO, array->bitmap);
7072 } 6313 }
7073 6314
7074 set_load_weight(&init_task); 6315 set_load_weight(&init_task);
@@ -7095,6 +6336,10 @@ void __init sched_init(void)
7095 * when this runqueue becomes "idle". 6336 * when this runqueue becomes "idle".
7096 */ 6337 */
7097 init_idle(current, smp_processor_id()); 6338 init_idle(current, smp_processor_id());
6339 /*
6340 * During early bootup we pretend to be a normal task:
6341 */
6342 current->sched_class = &fair_sched_class;
7098} 6343}
7099 6344
7100#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6345#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7125,29 +6370,55 @@ EXPORT_SYMBOL(__might_sleep);
7125#ifdef CONFIG_MAGIC_SYSRQ 6370#ifdef CONFIG_MAGIC_SYSRQ
7126void normalize_rt_tasks(void) 6371void normalize_rt_tasks(void)
7127{ 6372{
7128 struct prio_array *array;
7129 struct task_struct *g, *p; 6373 struct task_struct *g, *p;
7130 unsigned long flags; 6374 unsigned long flags;
7131 struct rq *rq; 6375 struct rq *rq;
6376 int on_rq;
7132 6377
7133 read_lock_irq(&tasklist_lock); 6378 read_lock_irq(&tasklist_lock);
7134
7135 do_each_thread(g, p) { 6379 do_each_thread(g, p) {
7136 if (!rt_task(p)) 6380 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0;
6382 p->se.wait_start_fair = 0;
6383 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0;
6388 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0;
6390
6391 if (!rt_task(p)) {
6392 /*
6393 * Renice negative nice level userspace
6394 * tasks back to 0:
6395 */
6396 if (TASK_NICE(p) < 0 && p->mm)
6397 set_user_nice(p, 0);
7137 continue; 6398 continue;
6399 }
7138 6400
7139 spin_lock_irqsave(&p->pi_lock, flags); 6401 spin_lock_irqsave(&p->pi_lock, flags);
7140 rq = __task_rq_lock(p); 6402 rq = __task_rq_lock(p);
6403#ifdef CONFIG_SMP
6404 /*
6405 * Do not touch the migration thread:
6406 */
6407 if (p == rq->migration_thread)
6408 goto out_unlock;
6409#endif
7141 6410
7142 array = p->array; 6411 on_rq = p->se.on_rq;
7143 if (array) 6412 if (on_rq)
7144 deactivate_task(p, task_rq(p)); 6413 deactivate_task(task_rq(p), p, 0);
7145 __setscheduler(p, SCHED_NORMAL, 0); 6414 __setscheduler(rq, p, SCHED_NORMAL, 0);
7146 if (array) { 6415 if (on_rq) {
7147 __activate_task(p, task_rq(p)); 6416 activate_task(task_rq(p), p, 0);
7148 resched_task(rq->curr); 6417 resched_task(rq->curr);
7149 } 6418 }
7150 6419#ifdef CONFIG_SMP
6420 out_unlock:
6421#endif
7151 __task_rq_unlock(rq); 6422 __task_rq_unlock(rq);
7152 spin_unlock_irqrestore(&p->pi_lock, flags); 6423 spin_unlock_irqrestore(&p->pi_lock, flags);
7153 } while_each_thread(g, p); 6424 } while_each_thread(g, p);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
new file mode 100644
index 0000000000..1baf87cceb
--- /dev/null
+++ b/kernel/sched_debug.c
@@ -0,0 +1,275 @@
1/*
2 * kernel/sched_debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19/*
20 * This allows printing both to /proc/sched_debug and
21 * to the console
22 */
23#define SEQ_printf(m, x...) \
24 do { \
25 if (m) \
26 seq_printf(m, x); \
27 else \
28 printk(x); \
29 } while (0)
30
31static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
33{
34 if (rq->curr == p)
35 SEQ_printf(m, "R");
36 else
37 SEQ_printf(m, " ");
38
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d "
40 "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
41 p->comm, p->pid,
42 (long long)p->se.fair_key,
43 (long long)(p->se.fair_key - rq->cfs.fair_clock),
44 (long long)p->se.wait_runtime,
45 (long long)(p->nvcsw + p->nivcsw),
46 p->prio,
47 (long long)p->se.sum_exec_runtime,
48 (long long)p->se.sum_wait_runtime,
49 (long long)p->se.sum_sleep_runtime,
50 (long long)p->se.wait_runtime_overruns,
51 (long long)p->se.wait_runtime_underruns);
52}
53
54static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
55{
56 struct task_struct *g, *p;
57
58 SEQ_printf(m,
59 "\nrunnable tasks:\n"
60 " task PID tree-key delta waiting"
61 " switches prio"
62 " sum-exec sum-wait sum-sleep"
63 " wait-overrun wait-underrun\n"
64 "------------------------------------------------------------------"
65 "----------------"
66 "------------------------------------------------"
67 "--------------------------------\n");
68
69 read_lock_irq(&tasklist_lock);
70
71 do_each_thread(g, p) {
72 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
73 continue;
74
75 print_task(m, rq, p, now);
76 } while_each_thread(g, p);
77
78 read_unlock_irq(&tasklist_lock);
79}
80
81static void
82print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
83{
84 s64 wait_runtime_rq_sum = 0;
85 struct task_struct *p;
86 struct rb_node *curr;
87 unsigned long flags;
88 struct rq *rq = &per_cpu(runqueues, cpu);
89
90 spin_lock_irqsave(&rq->lock, flags);
91 curr = first_fair(cfs_rq);
92 while (curr) {
93 p = rb_entry(curr, struct task_struct, se.run_node);
94 wait_runtime_rq_sum += p->se.wait_runtime;
95
96 curr = rb_next(curr);
97 }
98 spin_unlock_irqrestore(&rq->lock, flags);
99
100 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
101 (long long)wait_runtime_rq_sum);
102}
103
104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
105{
106 SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
107
108#define P(x) \
109 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
110
111 P(fair_clock);
112 P(exec_clock);
113 P(wait_runtime);
114 P(wait_runtime_overruns);
115 P(wait_runtime_underruns);
116 P(sleeper_bonus);
117#undef P
118
119 print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
120}
121
122static void print_cpu(struct seq_file *m, int cpu, u64 now)
123{
124 struct rq *rq = &per_cpu(runqueues, cpu);
125
126#ifdef CONFIG_X86
127 {
128 unsigned int freq = cpu_khz ? : 1;
129
130 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
131 cpu, freq / 1000, (freq % 1000));
132 }
133#else
134 SEQ_printf(m, "\ncpu#%d\n", cpu);
135#endif
136
137#define P(x) \
138 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
139
140 P(nr_running);
141 SEQ_printf(m, " .%-30s: %lu\n", "load",
142 rq->ls.load.weight);
143 P(ls.delta_fair);
144 P(ls.delta_exec);
145 P(nr_switches);
146 P(nr_load_updates);
147 P(nr_uninterruptible);
148 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
149 P(next_balance);
150 P(curr->pid);
151 P(clock);
152 P(prev_clock_raw);
153 P(clock_warps);
154 P(clock_overflows);
155 P(clock_unstable_events);
156 P(clock_max_delta);
157 P(cpu_load[0]);
158 P(cpu_load[1]);
159 P(cpu_load[2]);
160 P(cpu_load[3]);
161 P(cpu_load[4]);
162#undef P
163
164 print_cfs_stats(m, cpu, now);
165
166 print_rq(m, rq, cpu, now);
167}
168
169static int sched_debug_show(struct seq_file *m, void *v)
170{
171 u64 now = ktime_to_ns(ktime_get());
172 int cpu;
173
174 SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n",
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version);
178
179 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
180
181 for_each_online_cpu(cpu)
182 print_cpu(m, cpu, now);
183
184 SEQ_printf(m, "\n");
185
186 return 0;
187}
188
189void sysrq_sched_debug_show(void)
190{
191 sched_debug_show(NULL, NULL);
192}
193
194static int sched_debug_open(struct inode *inode, struct file *filp)
195{
196 return single_open(filp, sched_debug_show, NULL);
197}
198
199static struct file_operations sched_debug_fops = {
200 .open = sched_debug_open,
201 .read = seq_read,
202 .llseek = seq_lseek,
203 .release = seq_release,
204};
205
206static int __init init_sched_debug_procfs(void)
207{
208 struct proc_dir_entry *pe;
209
210 pe = create_proc_entry("sched_debug", 0644, NULL);
211 if (!pe)
212 return -ENOMEM;
213
214 pe->proc_fops = &sched_debug_fops;
215
216 return 0;
217}
218
219__initcall(init_sched_debug_procfs);
220
221void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
222{
223 unsigned long flags;
224 int num_threads = 1;
225
226 rcu_read_lock();
227 if (lock_task_sighand(p, &flags)) {
228 num_threads = atomic_read(&p->signal->count);
229 unlock_task_sighand(p, &flags);
230 }
231 rcu_read_unlock();
232
233 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
234 SEQ_printf(m, "----------------------------------------------\n");
235#define P(F) \
236 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
237
238 P(se.wait_start);
239 P(se.wait_start_fair);
240 P(se.exec_start);
241 P(se.sleep_start);
242 P(se.sleep_start_fair);
243 P(se.block_start);
244 P(se.sleep_max);
245 P(se.block_max);
246 P(se.exec_max);
247 P(se.wait_max);
248 P(se.wait_runtime);
249 P(se.wait_runtime_overruns);
250 P(se.wait_runtime_underruns);
251 P(se.sum_wait_runtime);
252 P(se.sum_exec_runtime);
253 SEQ_printf(m, "%-25s:%20Ld\n",
254 "nr_switches", (long long)(p->nvcsw + p->nivcsw));
255 P(se.load.weight);
256 P(policy);
257 P(prio);
258#undef P
259
260 {
261 u64 t0, t1;
262
263 t0 = sched_clock();
264 t1 = sched_clock();
265 SEQ_printf(m, "%-25s:%20Ld\n",
266 "clock-delta", (long long)(t1-t0));
267 }
268}
269
270void proc_sched_set_task(struct task_struct *p)
271{
272 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
273 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
274 p->se.sum_exec_runtime = 0;
275}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
new file mode 100644
index 0000000000..6971db0a71
--- /dev/null
+++ b/kernel/sched_fair.c
@@ -0,0 +1,1131 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */
19
20/*
21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds)
23 *
24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches
28 * field)
29 *
30 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
35
36/*
37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds)
39 *
40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies.
43 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ;
46
47/*
48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds)
50 *
51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies.
54 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56
57unsigned int sysctl_sched_stat_granularity __read_mostly;
58
59/*
60 * Initialized in sched_init_granularity():
61 */
62unsigned int sysctl_sched_runtime_limit __read_mostly;
63
64/*
65 * Debugging: various feature bits
66 */
67enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32,
74};
75
76unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0;
83
84extern struct sched_class fair_sched_class;
85
86/**************************************************************
87 * CFS operations on generic schedulable entities:
88 */
89
90#ifdef CONFIG_FAIR_GROUP_SCHED
91
92/* cpu runqueue to which this cfs_rq is attached */
93static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94{
95 return cfs_rq->rq;
96}
97
98/* currently running entity (if any) on this cfs_rq */
99static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100{
101 return cfs_rq->curr;
102}
103
104/* An entity is a task if it doesn't "own" a runqueue */
105#define entity_is_task(se) (!se->my_q)
106
107static inline void
108set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109{
110 cfs_rq->curr = se;
111}
112
113#else /* CONFIG_FAIR_GROUP_SCHED */
114
115static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116{
117 return container_of(cfs_rq, struct rq, cfs);
118}
119
120static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121{
122 struct rq *rq = rq_of(cfs_rq);
123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL;
126
127 return &rq->curr->se;
128}
129
130#define entity_is_task(se) 1
131
132static inline void
133set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134
135#endif /* CONFIG_FAIR_GROUP_SCHED */
136
137static inline struct task_struct *task_of(struct sched_entity *se)
138{
139 return container_of(se, struct task_struct, se);
140}
141
142
143/**************************************************************
144 * Scheduling class tree data structure manipulation methods:
145 */
146
147/*
148 * Enqueue an entity into the rb-tree:
149 */
150static inline void
151__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152{
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL;
155 struct sched_entity *entry;
156 s64 key = se->fair_key;
157 int leftmost = 1;
158
159 /*
160 * Find the right place in the rbtree:
161 */
162 while (*link) {
163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /*
166		 * We don't care about collisions. Nodes with
167 * the same key stay together.
168 */
169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left;
171 } else {
172 link = &parent->rb_right;
173 leftmost = 0;
174 }
175 }
176
177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used):
180 */
181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node;
183
184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++;
188 se->on_rq = 1;
189}
190
191static inline void
192__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193{
194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--;
199 se->on_rq = 0;
200}
201
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203{
204 return cfs_rq->rb_leftmost;
205}
206
207static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208{
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210}
211
212/**************************************************************
213 * Scheduling class statistics methods:
214 */
215
216/*
217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially:
219 */
220static long
221niced_granularity(struct sched_entity *curr, unsigned long granularity)
222{
223 u64 tmp;
224
225 /*
226 * Negative nice levels get the same granularity as nice-0:
227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity;
230 /*
231 * Positive nice level tasks get linearly finer
232 * granularity:
233 */
234 tmp = curr->load.weight * (u64)granularity;
235
236 /*
237 * It will always fit into 'long':
238 */
239 return (long) (tmp >> NICE_0_SHIFT);
240}
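A small userspace sketch of the linear rescaling done by niced_granularity() above (not part of the patch): positive-nice tasks, whose load weight is below NICE_0_LOAD, get a proportionally finer granularity. The shift value and the weight of 110 for a nice +10 task are assumptions used only for this example:

#include <stdio.h>

#define EX_NICE_0_SHIFT	10
#define EX_NICE_0_LOAD	(1UL << EX_NICE_0_SHIFT)	/* 1024 */

static long example_granularity(unsigned long weight, unsigned long gran)
{
	unsigned long long tmp;

	if (weight >= EX_NICE_0_LOAD)		/* nice <= 0: unchanged */
		return gran;

	tmp = (unsigned long long)weight * gran;
	return (long)(tmp >> EX_NICE_0_SHIFT);	/* weight/1024 of the granularity */
}

int main(void)
{
	printf("%ld\n", example_granularity(110, 2000000));		/* ~214843 ns */
	printf("%ld\n", example_granularity(EX_NICE_0_LOAD, 2000000));	/* 2000000 ns */
	return 0;
}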
241
242static inline void
243limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244{
245 long limit = sysctl_sched_runtime_limit;
246
247 /*
248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks:
250 */
251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 }
256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 }
261}
262
263static inline void
264__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265{
266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se);
269}
270
271static void
272add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273{
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277}
278
279/*
280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class.
282 */
283static inline void
284__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285{
286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight;
290
291 if (unlikely(!load))
292 return;
293
294 delta_exec = curr->delta_exec;
295#ifdef CONFIG_SCHEDSTATS
296 if (unlikely(delta_exec > curr->exec_max))
297 curr->exec_max = delta_exec;
298#endif
299
300 curr->sum_exec_runtime += delta_exec;
301 cfs_rq->exec_clock += delta_exec;
302
303 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305
306 if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
307 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
308 curr->load.weight, lw);
309 if (unlikely(delta > cfs_rq->sleeper_bonus))
310 delta = cfs_rq->sleeper_bonus;
311
312 cfs_rq->sleeper_bonus -= delta;
313 delta_mine -= delta;
314 }
315
316 cfs_rq->fair_clock += delta_fair;
317 /*
318 * We executed delta_exec amount of time on the CPU,
319 * but we were only entitled to delta_mine amount of
320 * time during that period (if nr_running == 1 then
321 * the two values are equal)
322 * [Note: delta_mine - delta_exec is negative]:
323 */
324 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
325}
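To make the delta_exec/delta_mine bookkeeping above concrete, here is a toy userspace calculation (example numbers only, not kernel code): with two equally weighted runnable tasks, a task that runs for 4 ms was only entitled to 2 ms of it, so its wait_runtime drops by the other 2 ms:

#include <stdio.h>

int main(void)
{
	long long delta_exec = 4000000;				/* ran for 4 ms, in ns */
	long long my_weight = 1024, queue_weight = 2048;	/* two nice-0 tasks */

	/* the share of the elapsed time this task was entitled to */
	long long delta_mine = delta_exec * my_weight / queue_weight;

	long long wait_runtime = 0;
	wait_runtime += delta_mine - delta_exec;	/* goes to -2000000 ns */

	printf("delta_mine=%lld wait_runtime=%lld\n", delta_mine, wait_runtime);
	return 0;
}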
326
327static void update_curr(struct cfs_rq *cfs_rq, u64 now)
328{
329 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
330 unsigned long delta_exec;
331
332 if (unlikely(!curr))
333 return;
334
335 /*
336 * Get the amount of time the current task was running
337 * since the last time we changed load (this cannot
338 * overflow on 32 bits):
339 */
340 delta_exec = (unsigned long)(now - curr->exec_start);
341
342 curr->delta_exec += delta_exec;
343
344 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
345 __update_curr(cfs_rq, curr, now);
346 curr->delta_exec = 0;
347 }
348 curr->exec_start = now;
349}
350
351static inline void
352update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
353{
354 se->wait_start_fair = cfs_rq->fair_clock;
355 se->wait_start = now;
356}
357
358/*
359 * We calculate fair deltas here, so protect against the random effects
360 * of a multiplication overflow by capping it to the runtime limit:
361 */
362#if BITS_PER_LONG == 32
363static inline unsigned long
364calc_weighted(unsigned long delta, unsigned long weight, int shift)
365{
366 u64 tmp = (u64)delta * weight >> shift;
367
368 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
369 return sysctl_sched_runtime_limit*2;
370 return tmp;
371}
372#else
373static inline unsigned long
374calc_weighted(unsigned long delta, unsigned long weight, int shift)
375{
376 return delta * weight >> shift;
377}
378#endif
379
380/*
381 * Task is being enqueued - update stats:
382 */
383static void
384update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
385{
386 s64 key;
387
388 /*
389 * Are we enqueueing a waiting task? (for current tasks
390 * a dequeue/enqueue event is a NOP)
391 */
392 if (se != cfs_rq_curr(cfs_rq))
393 update_stats_wait_start(cfs_rq, se, now);
394 /*
395 * Update the key:
396 */
397 key = cfs_rq->fair_clock;
398
399 /*
400 * Optimize the common nice 0 case:
401 */
402 if (likely(se->load.weight == NICE_0_LOAD)) {
403 key -= se->wait_runtime;
404 } else {
405 u64 tmp;
406
407 if (se->wait_runtime < 0) {
408 tmp = -se->wait_runtime;
409 key += (tmp * se->load.inv_weight) >>
410 (WMULT_SHIFT - NICE_0_SHIFT);
411 } else {
412 tmp = se->wait_runtime;
413 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
414 }
415 }
416
417 se->fair_key = key;
418}
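A quick userspace illustration of the nice-0 key placement above (example values, not kernel code): a task that is still owed CPU time (positive wait_runtime) gets a key behind the fair clock and therefore sorts further left in the rbtree, while a task that overran sorts to the right:

#include <stdio.h>

int main(void)
{
	long long fair_clock = 10000000;	/* current cfs_rq->fair_clock, in ns */
	long long owed = 2000000;		/* wait_runtime of a task owed CPU time */
	long long overran = -1000000;		/* wait_runtime of a task that overran */

	printf("owed task key:    %lld\n", fair_clock - owed);		/*  8000000 */
	printf("overrun task key: %lld\n", fair_clock - overran);	/* 11000000 */
	return 0;
}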
419
420/*
421 * Note: must be called with a freshly updated rq->fair_clock.
422 */
423static inline void
424__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
425{
426 unsigned long delta_fair = se->delta_fair_run;
427
428#ifdef CONFIG_SCHEDSTATS
429 {
430 s64 delta_wait = now - se->wait_start;
431 if (unlikely(delta_wait > se->wait_max))
432 se->wait_max = delta_wait;
433 }
434#endif
435
436 if (unlikely(se->load.weight != NICE_0_LOAD))
437 delta_fair = calc_weighted(delta_fair, se->load.weight,
438 NICE_0_SHIFT);
439
440 add_wait_runtime(cfs_rq, se, delta_fair);
441}
442
443static void
444update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
445{
446 unsigned long delta_fair;
447
448 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
449 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
450
451 se->delta_fair_run += delta_fair;
452 if (unlikely(abs(se->delta_fair_run) >=
453 sysctl_sched_stat_granularity)) {
454 __update_stats_wait_end(cfs_rq, se, now);
455 se->delta_fair_run = 0;
456 }
457
458 se->wait_start_fair = 0;
459 se->wait_start = 0;
460}
461
462static inline void
463update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
464{
465 update_curr(cfs_rq, now);
466 /*
467 * Mark the end of the wait period if dequeueing a
468 * waiting task:
469 */
470 if (se != cfs_rq_curr(cfs_rq))
471 update_stats_wait_end(cfs_rq, se, now);
472}
473
474/*
475 * We are picking a new current task - update its stats:
476 */
477static inline void
478update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
479{
480 /*
481 * We are starting a new run period:
482 */
483 se->exec_start = now;
484}
485
486/*
487 * We are descheduling a task - update its stats:
488 */
489static inline void
490update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
491{
492 se->exec_start = 0;
493}
494
495/**************************************************
496 * Scheduling class queueing methods:
497 */
498
499static void
500__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
501{
502 unsigned long load = cfs_rq->load.weight, delta_fair;
503 long prev_runtime;
504
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
506 load = rq_of(cfs_rq)->cpu_load[2];
507
508 delta_fair = se->delta_fair_sleep;
509
510 /*
511 * Fix up delta_fair with the effect of us running
512 * during the whole sleep period:
513 */
514 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
515 delta_fair = div64_likely32((u64)delta_fair * load,
516 load + se->load.weight);
517
518 if (unlikely(se->load.weight != NICE_0_LOAD))
519 delta_fair = calc_weighted(delta_fair, se->load.weight,
520 NICE_0_SHIFT);
521
522 prev_runtime = se->wait_runtime;
523 __add_wait_runtime(cfs_rq, se, delta_fair);
524 delta_fair = se->wait_runtime - prev_runtime;
525
526 /*
527 * Track the amount of bonus we've given to sleepers:
528 */
529 cfs_rq->sleeper_bonus += delta_fair;
530
531 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
532}
533
534static void
535enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
536{
537 struct task_struct *tsk = task_of(se);
538 unsigned long delta_fair;
539
540 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
541 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
542 return;
543
544 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
545 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
546
547 se->delta_fair_sleep += delta_fair;
548 if (unlikely(abs(se->delta_fair_sleep) >=
549 sysctl_sched_stat_granularity)) {
550 __enqueue_sleeper(cfs_rq, se, now);
551 se->delta_fair_sleep = 0;
552 }
553
554 se->sleep_start_fair = 0;
555
556#ifdef CONFIG_SCHEDSTATS
557 if (se->sleep_start) {
558 u64 delta = now - se->sleep_start;
559
560 if ((s64)delta < 0)
561 delta = 0;
562
563 if (unlikely(delta > se->sleep_max))
564 se->sleep_max = delta;
565
566 se->sleep_start = 0;
567 se->sum_sleep_runtime += delta;
568 }
569 if (se->block_start) {
570 u64 delta = now - se->block_start;
571
572 if ((s64)delta < 0)
573 delta = 0;
574
575 if (unlikely(delta > se->block_max))
576 se->block_max = delta;
577
578 se->block_start = 0;
579 se->sum_sleep_runtime += delta;
580 }
581#endif
582}
583
584static void
585enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
586 int wakeup, u64 now)
587{
588 /*
589 * Update the fair clock.
590 */
591 update_curr(cfs_rq, now);
592
593 if (wakeup)
594 enqueue_sleeper(cfs_rq, se, now);
595
596 update_stats_enqueue(cfs_rq, se, now);
597 __enqueue_entity(cfs_rq, se);
598}
599
600static void
601dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
602 int sleep, u64 now)
603{
604 update_stats_dequeue(cfs_rq, se, now);
605 if (sleep) {
606 se->sleep_start_fair = cfs_rq->fair_clock;
607#ifdef CONFIG_SCHEDSTATS
608 if (entity_is_task(se)) {
609 struct task_struct *tsk = task_of(se);
610
611 if (tsk->state & TASK_INTERRUPTIBLE)
612 se->sleep_start = now;
613 if (tsk->state & TASK_UNINTERRUPTIBLE)
614 se->block_start = now;
615 }
616 cfs_rq->wait_runtime -= se->wait_runtime;
617#endif
618 }
619 __dequeue_entity(cfs_rq, se);
620}
621
622/*
623 * Preempt the current task with a newly woken task if needed:
624 */
625static void
626__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
627 struct sched_entity *curr, unsigned long granularity)
628{
629 s64 __delta = curr->fair_key - se->fair_key;
630
631 /*
632 * Take scheduling granularity into account - do not
633 * preempt the current task unless the best task has
634 * a larger than sched_granularity fairness advantage:
635 */
636 if (__delta > niced_granularity(curr, granularity))
637 resched_task(rq_of(cfs_rq)->curr);
638}
639
640static inline void
641set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
642{
643 /*
644	 * Any task has to be enqueued before it gets to execute on
645 * a CPU. So account for the time it spent waiting on the
646 * runqueue. (note, here we rely on pick_next_task() having
647 * done a put_prev_task_fair() shortly before this, which
648 * updated rq->fair_clock - used by update_stats_wait_end())
649 */
650 update_stats_wait_end(cfs_rq, se, now);
651 update_stats_curr_start(cfs_rq, se, now);
652 set_cfs_rq_curr(cfs_rq, se);
653}
654
655static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
656{
657 struct sched_entity *se = __pick_next_entity(cfs_rq);
658
659 set_next_entity(cfs_rq, se, now);
660
661 return se;
662}
663
664static void
665put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
666{
667 /*
668 * If still on the runqueue then deactivate_task()
669 * was not called and update_curr() has to be done:
670 */
671 if (prev->on_rq)
672 update_curr(cfs_rq, now);
673
674 update_stats_curr_end(cfs_rq, prev, now);
675
676 if (prev->on_rq)
677 update_stats_wait_start(cfs_rq, prev, now);
678 set_cfs_rq_curr(cfs_rq, NULL);
679}
680
681static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
682{
683 struct rq *rq = rq_of(cfs_rq);
684 struct sched_entity *next;
685 u64 now = __rq_clock(rq);
686
687 /*
688 * Dequeue and enqueue the task to update its
689 * position within the tree:
690 */
691 dequeue_entity(cfs_rq, curr, 0, now);
692 enqueue_entity(cfs_rq, curr, 0, now);
693
694 /*
695 * Reschedule if another task tops the current one.
696 */
697 next = __pick_next_entity(cfs_rq);
698 if (next == curr)
699 return;
700
701 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
702}
703
704/**************************************************
705 * CFS operations on tasks:
706 */
707
708#ifdef CONFIG_FAIR_GROUP_SCHED
709
710/* Walk up scheduling entities hierarchy */
711#define for_each_sched_entity(se) \
712 for (; se; se = se->parent)
713
714static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
715{
716 return p->se.cfs_rq;
717}
718
719/* runqueue on which this entity is (to be) queued */
720static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
721{
722 return se->cfs_rq;
723}
724
725/* runqueue "owned" by this group */
726static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
727{
728 return grp->my_q;
729}
730
731/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
732 * another cpu ('this_cpu')
733 */
734static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
735{
736 /* A later patch will take group into account */
737 return &cpu_rq(this_cpu)->cfs;
738}
739
740/* Iterate through all leaf cfs_rqs on a runqueue */
741#define for_each_leaf_cfs_rq(rq, cfs_rq) \
742 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
743
744/* Do the two (enqueued) tasks belong to the same group? */
745static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
746{
747 if (curr->se.cfs_rq == p->se.cfs_rq)
748 return 1;
749
750 return 0;
751}
752
753#else /* CONFIG_FAIR_GROUP_SCHED */
754
755#define for_each_sched_entity(se) \
756 for (; se; se = NULL)
757
758static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
759{
760 return &task_rq(p)->cfs;
761}
762
763static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
764{
765 struct task_struct *p = task_of(se);
766 struct rq *rq = task_rq(p);
767
768 return &rq->cfs;
769}
770
771/* runqueue "owned" by this group */
772static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
773{
774 return NULL;
775}
776
777static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
778{
779 return &cpu_rq(this_cpu)->cfs;
780}
781
782#define for_each_leaf_cfs_rq(rq, cfs_rq) \
783 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
784
785static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
786{
787 return 1;
788}
789
790#endif /* CONFIG_FAIR_GROUP_SCHED */
791
792/*
793 * The enqueue_task method is called before nr_running is
794 * increased. Here we update the fair scheduling stats and
795 * then put the task into the rbtree:
796 */
797static void
798enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
799{
800 struct cfs_rq *cfs_rq;
801 struct sched_entity *se = &p->se;
802
803 for_each_sched_entity(se) {
804 if (se->on_rq)
805 break;
806 cfs_rq = cfs_rq_of(se);
807 enqueue_entity(cfs_rq, se, wakeup, now);
808 }
809}
810
811/*
812 * The dequeue_task method is called before nr_running is
813 * decreased. We remove the task from the rbtree and
814 * update the fair scheduling stats:
815 */
816static void
817dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
818{
819 struct cfs_rq *cfs_rq;
820 struct sched_entity *se = &p->se;
821
822 for_each_sched_entity(se) {
823 cfs_rq = cfs_rq_of(se);
824 dequeue_entity(cfs_rq, se, sleep, now);
825 /* Don't dequeue parent if it has other entities besides us */
826 if (cfs_rq->load.weight)
827 break;
828 }
829}
830
831/*
832 * sched_yield() support is very simple - we dequeue and enqueue
833 */
834static void yield_task_fair(struct rq *rq, struct task_struct *p)
835{
836 struct cfs_rq *cfs_rq = task_cfs_rq(p);
837 u64 now = __rq_clock(rq);
838
839 /*
840 * Dequeue and enqueue the task to update its
841 * position within the tree:
842 */
843 dequeue_entity(cfs_rq, &p->se, 0, now);
844 enqueue_entity(cfs_rq, &p->se, 0, now);
845}
846
847/*
848 * Preempt the current task with a newly woken task if needed:
849 */
850static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
851{
852 struct task_struct *curr = rq->curr;
853 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
854 unsigned long gran;
855
856 if (unlikely(rt_prio(p->prio))) {
857 update_curr(cfs_rq, rq_clock(rq));
858 resched_task(curr);
859 return;
860 }
861
862 gran = sysctl_sched_wakeup_granularity;
863 /*
864 * Batch tasks prefer throughput over latency:
865 */
866 if (unlikely(p->policy == SCHED_BATCH))
867 gran = sysctl_sched_batch_wakeup_granularity;
868
869 if (is_same_group(curr, p))
870 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
871}
872
873static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
874{
875 struct cfs_rq *cfs_rq = &rq->cfs;
876 struct sched_entity *se;
877
878 if (unlikely(!cfs_rq->nr_running))
879 return NULL;
880
881 do {
882 se = pick_next_entity(cfs_rq, now);
883 cfs_rq = group_cfs_rq(se);
884 } while (cfs_rq);
885
886 return task_of(se);
887}
888
889/*
890 * Account for a descheduled task:
891 */
892static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
893{
894 struct sched_entity *se = &prev->se;
895 struct cfs_rq *cfs_rq;
896
897 for_each_sched_entity(se) {
898 cfs_rq = cfs_rq_of(se);
899 put_prev_entity(cfs_rq, se, now);
900 }
901}
902
903/**************************************************
904 * Fair scheduling class load-balancing methods:
905 */
906
907/*
908 * Load-balancing iterator. Note: while the runqueue stays locked
909 * during the whole iteration, the current task might be
910 * dequeued so the iterator has to be dequeue-safe. Here we
911 * achieve that by always pre-iterating before returning
912 * the current task:
913 */
914static inline struct task_struct *
915__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
916{
917 struct task_struct *p;
918
919 if (!curr)
920 return NULL;
921
922 p = rb_entry(curr, struct task_struct, se.run_node);
923 cfs_rq->rb_load_balance_curr = rb_next(curr);
924
925 return p;
926}
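The dequeue-safe pattern described above - advance the cursor before handing the current element back - can be shown with a plain singly linked list in userspace (only an analogy; the kernel code walks the CFS rbtree):

#include <stddef.h>
#include <stdio.h>

struct node { int val; struct node *next; };

static struct node *iter_next(struct node **cursor)
{
	struct node *cur = *cursor;

	if (!cur)
		return NULL;
	*cursor = cur->next;	/* pre-iterate before returning */
	return cur;
}

int main(void)
{
	struct node c = { 3, NULL };
	struct node b = { 2, &c };
	struct node a = { 1, &b };
	struct node *cursor = &a, *n;

	while ((n = iter_next(&cursor)))
		printf("%d\n", n->val);	/* n could be unlinked here safely */
	return 0;
}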
927
928static struct task_struct *load_balance_start_fair(void *arg)
929{
930 struct cfs_rq *cfs_rq = arg;
931
932 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
933}
934
935static struct task_struct *load_balance_next_fair(void *arg)
936{
937 struct cfs_rq *cfs_rq = arg;
938
939 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
940}
941
942static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
943{
944 struct sched_entity *curr;
945 struct task_struct *p;
946
947 if (!cfs_rq->nr_running)
948 return MAX_PRIO;
949
950 curr = __pick_next_entity(cfs_rq);
951 p = task_of(curr);
952
953 return p->prio;
954}
955
956static int
957load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
958 unsigned long max_nr_move, unsigned long max_load_move,
959 struct sched_domain *sd, enum cpu_idle_type idle,
960 int *all_pinned, unsigned long *total_load_moved)
961{
962 struct cfs_rq *busy_cfs_rq;
963 unsigned long load_moved, total_nr_moved = 0, nr_moved;
964 long rem_load_move = max_load_move;
965 struct rq_iterator cfs_rq_iterator;
966
967 cfs_rq_iterator.start = load_balance_start_fair;
968 cfs_rq_iterator.next = load_balance_next_fair;
969
970 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
971 struct cfs_rq *this_cfs_rq;
972 long imbalance;
973 unsigned long maxload;
974 int this_best_prio, best_prio, best_prio_seen = 0;
975
976 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
977
978 imbalance = busy_cfs_rq->load.weight -
979 this_cfs_rq->load.weight;
980 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
981 if (imbalance <= 0)
982 continue;
983
984 /* Don't pull more than imbalance/2 */
985 imbalance /= 2;
986 maxload = min(rem_load_move, imbalance);
987
988 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
989 best_prio = cfs_rq_best_prio(busy_cfs_rq);
990
991 /*
992 * Enable handling of the case where there is more than one task
993		 * with the best priority. If the currently running task is one
994 * of those with prio==best_prio we know it won't be moved
995 * and therefore it's safe to override the skip (based on load)
996 * of any task we find with that prio.
997 */
998 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
999 best_prio_seen = 1;
1000
1001 /* pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators
1003 */
1004 cfs_rq_iterator.arg = busy_cfs_rq;
1005 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 max_nr_move, maxload, sd, idle, all_pinned,
1007 &load_moved, this_best_prio, best_prio,
1008 best_prio_seen, &cfs_rq_iterator);
1009
1010 total_nr_moved += nr_moved;
1011 max_nr_move -= nr_moved;
1012 rem_load_move -= load_moved;
1013
1014 if (max_nr_move <= 0 || rem_load_move <= 0)
1015 break;
1016 }
1017
1018 *total_load_moved = max_load_move - rem_load_move;
1019
1020 return total_nr_moved;
1021}
1022
1023/*
1024 * scheduler tick hitting a task of our scheduling class:
1025 */
1026static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1027{
1028 struct cfs_rq *cfs_rq;
1029 struct sched_entity *se = &curr->se;
1030
1031 for_each_sched_entity(se) {
1032 cfs_rq = cfs_rq_of(se);
1033 entity_tick(cfs_rq, se);
1034 }
1035}
1036
1037/*
1038 * Share the fairness runtime between parent and child, so that
1039 * the total CPU pressure stays the same - new tasks
1040 * get a chance to run but frequent forkers are not allowed to
1041 * monopolize the CPU. Note: the parent runqueue is locked,
1042 * the child is not running yet.
1043 */
1044static void task_new_fair(struct rq *rq, struct task_struct *p)
1045{
1046 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1047 struct sched_entity *se = &p->se;
1048 u64 now = rq_clock(rq);
1049
1050 sched_info_queued(p);
1051
1052 update_stats_enqueue(cfs_rq, se, now);
1053 /*
1054 * Child runs first: we let it run before the parent
1055 * until it reschedules once. We set up the key so that
1056 * it will preempt the parent:
1057 */
1058 p->se.fair_key = current->se.fair_key -
1059 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1060 /*
1061 * The first wait is dominated by the child-runs-first logic,
1062 * so do not credit it with that waiting time yet:
1063 */
1064 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1065 p->se.wait_start_fair = 0;
1066
1067 /*
1068 * The statistical average of wait_runtime is about
1069 * -granularity/2, so initialize the task with that:
1070 */
1071 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1072 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1073
1074 __enqueue_entity(cfs_rq, se);
1075 inc_nr_running(p, rq, now);
1076}
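For clarity, a userspace sketch of the child placement arithmetic above (example numbers; the real granularity comes from the sysctl): the child's key is set just below the parent's so it sorts left and is picked first, and with START_DEBIT it starts half a granularity "in debt":

#include <stdio.h>

int main(void)
{
	long long parent_key = 5000000;		/* parent's se.fair_key, in ns */
	long long granularity = 2000000;	/* assumed nice-0 granularity */

	long long child_key = parent_key - granularity - 1;	/* 2999999 */
	long long child_wait_runtime = -(granularity / 2);	/* -1000000 */

	printf("child key=%lld wait_runtime=%lld\n",
	       child_key, child_wait_runtime);
	return 0;
}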
1077
1078#ifdef CONFIG_FAIR_GROUP_SCHED
1079/* Account for a task changing its policy or group.
1080 *
1081 * This routine is mostly called to set the cfs_rq->curr field when a task
1082 * migrates between groups/classes.
1083 */
1084static void set_curr_task_fair(struct rq *rq)
1085{
1086 struct task_struct *curr = rq->curr;
1087 struct sched_entity *se = &curr->se;
1088 u64 now = rq_clock(rq);
1089 struct cfs_rq *cfs_rq;
1090
1091 for_each_sched_entity(se) {
1092 cfs_rq = cfs_rq_of(se);
1093 set_next_entity(cfs_rq, se, now);
1094 }
1095}
1096#else
1097static void set_curr_task_fair(struct rq *rq)
1098{
1099}
1100#endif
1101
1102/*
1103 * All the scheduling class methods:
1104 */
1105struct sched_class fair_sched_class __read_mostly = {
1106 .enqueue_task = enqueue_task_fair,
1107 .dequeue_task = dequeue_task_fair,
1108 .yield_task = yield_task_fair,
1109
1110 .check_preempt_curr = check_preempt_curr_fair,
1111
1112 .pick_next_task = pick_next_task_fair,
1113 .put_prev_task = put_prev_task_fair,
1114
1115 .load_balance = load_balance_fair,
1116
1117 .set_curr_task = set_curr_task_fair,
1118 .task_tick = task_tick_fair,
1119 .task_new = task_new_fair,
1120};
1121
1122#ifdef CONFIG_SCHED_DEBUG
1123void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1124{
1125 struct rq *rq = cpu_rq(cpu);
1126 struct cfs_rq *cfs_rq;
1127
1128 for_each_leaf_cfs_rq(rq, cfs_rq)
1129 print_cfs_rq(m, cpu, cfs_rq, now);
1130}
1131#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
new file mode 100644
index 0000000000..41841e741c
--- /dev/null
+++ b/kernel/sched_idletask.c
@@ -0,0 +1,71 @@
1/*
2 * idle-task scheduling class.
3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c)
6 */
7
8/*
9 * Idle tasks are unconditionally rescheduled:
10 */
11static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12{
13 resched_task(rq->idle);
14}
15
16static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17{
18 schedstat_inc(rq, sched_goidle);
19
20 return rq->idle;
21}
22
23/*
24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it:
26 */
27static void
28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29{
30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack();
33 spin_lock_irq(&rq->lock);
34}
35
36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37{
38}
39
40static int
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved)
45{
46 return 0;
47}
48
49static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{
51}
52
53/*
54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */
56static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */
58
59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle,
61
62 .check_preempt_curr = check_preempt_curr_idle,
63
64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle,
66
67 .load_balance = load_balance_idle,
68
69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */
71};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
new file mode 100644
index 0000000000..1192a2741b
--- /dev/null
+++ b/kernel/sched_rt.c
@@ -0,0 +1,255 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6/*
7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class.
9 */
10static inline void update_curr_rt(struct rq *rq, u64 now)
11{
12 struct task_struct *curr = rq->curr;
13 u64 delta_exec;
14
15 if (!task_has_rt_policy(curr))
16 return;
17
18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0;
21 if (unlikely(delta_exec > curr->se.exec_max))
22 curr->se.exec_max = delta_exec;
23
24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now;
26}
27
28static void
29enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30{
31 struct rt_prio_array *array = &rq->rt.active;
32
33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap);
35}
36
37/*
38 * Adding/removing a task to/from a priority array:
39 */
40static void
41dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42{
43 struct rt_prio_array *array = &rq->rt.active;
44
45 update_curr_rt(rq, now);
46
47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap);
50}
51
52/*
53 * Put the task at the end of the run list without the overhead of
54 * a dequeue followed by an enqueue.
55 */
56static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57{
58 struct rt_prio_array *array = &rq->rt.active;
59
60 list_move_tail(&p->run_list, array->queue + p->prio);
61}
62
63static void
64yield_task_rt(struct rq *rq, struct task_struct *p)
65{
66 requeue_task_rt(rq, p);
67}
68
69/*
70 * Preempt the current task with a newly woken task if needed:
71 */
72static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73{
74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr);
76}
77
78static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79{
80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next;
82 struct list_head *queue;
83 int idx;
84
85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO)
87 return NULL;
88
89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list);
91
92 next->se.exec_start = now;
93
94 return next;
95}
96
97static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98{
99 update_curr_rt(rq, now);
100 p->se.exec_start = 0;
101}
102
103/*
104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning
108 * the current task:
109 */
110static struct task_struct *load_balance_start_rt(void *arg)
111{
112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr;
115 struct task_struct *p;
116 int idx;
117
118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO)
120 return NULL;
121
122 head = array->queue + idx;
123 curr = head->prev;
124
125 p = list_entry(curr, struct task_struct, run_list);
126
127 curr = curr->prev;
128
129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132
133 return p;
134}
135
136static struct task_struct *load_balance_next_rt(void *arg)
137{
138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr;
141 struct task_struct *p;
142 int idx;
143
144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr;
147
148 /*
149	 * If we arrived back at the head again then
150 * iterate to the next queue (if any):
151 */
152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154
155 if (next_idx >= MAX_RT_PRIO)
156 return NULL;
157
158 idx = next_idx;
159 head = array->queue + idx;
160 curr = head->prev;
161
162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head;
164 }
165
166 p = list_entry(curr, struct task_struct, run_list);
167
168 curr = curr->prev;
169
170 rq->rt.rt_load_balance_curr = curr;
171
172 return p;
173}
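
The start/next pair above walks each priority queue from the tail and always advances the cursor before handing back an element, which is what lets the caller dequeue the element it just received without breaking the walk. A small sketch of that pre-iterate pattern over one circular doubly linked list; the node type and helpers are stand-ins, not the kernel's list_head API:

#include <stdio.h>
#include <stddef.h>

struct node { struct node *prev, *next; int val; };

static struct node *lb_head;            /* sentinel of the list being walked */
static struct node *lb_curr;            /* cursor: element to hand out next time */

/* begin at the tail and pre-step the cursor before returning */
static struct node *iter_start(struct node *head)
{
	struct node *p;

	if (head->prev == head)
		return NULL;                /* empty list */
	lb_head = head;
	lb_curr = head->prev;           /* last element */
	p = lb_curr;
	lb_curr = lb_curr->prev;        /* pre-iterate: safe even if p is removed */
	return p;
}

static struct node *iter_next(void)
{
	struct node *p;

	if (lb_curr == lb_head)
		return NULL;                /* wrapped around to the sentinel */
	p = lb_curr;
	lb_curr = lb_curr->prev;        /* advance before the caller can unlink p */
	return p;
}

static void unlink_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int main(void)
{
	struct node head = { &head, &head, 0 };
	struct node a = { NULL, NULL, 1 }, b = { NULL, NULL, 2 }, c = { NULL, NULL, 3 };
	struct node *nodes[] = { &a, &b, &c };

	for (int i = 0; i < 3; i++) {       /* build head <-> a <-> b <-> c */
		nodes[i]->prev = head.prev;
		nodes[i]->next = &head;
		head.prev->next = nodes[i];
		head.prev = nodes[i];
	}

	/* removing the returned element mid-walk does not break the walk */
	for (struct node *p = iter_start(&head); p; p = iter_next()) {
		printf("visit %d\n", p->val);
		if (p->val == 2)
			unlink_node(p);
	}
	return 0;
}
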
174
175static int
176load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved)
180{
181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved;
183 struct rq_iterator rt_rq_iterator;
184
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187
188 /*
189 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio.
194 */
195 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1;
197
198 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators
202 */
203 rt_rq_iterator.arg = busiest;
204
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved,
207 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator);
209
210 return nr_moved;
211}
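
load_balance_rt() itself moves nothing: it fills in an iterator (start and next callbacks, with the busiest runqueue as the opaque argument) and hands the actual migration to the generic balance_tasks(). A compact sketch of that callbacks-plus-opaque-argument shape; move_items() and the array-backed iterator are illustrative names, not kernel API:

#include <stdio.h>

/* generic iterator: start/next callbacks plus an opaque argument */
struct item_iterator {
	int *(*start)(void *arg);
	int *(*next)(void *arg);
	void *arg;
};

/* generic consumer: pulls at most max_nr items through the callbacks */
static int move_items(struct item_iterator *it, int max_nr)
{
	int moved = 0;

	for (int *p = it->start(it->arg); p && moved < max_nr; p = it->next(it->arg)) {
		printf("moved %d\n", *p);
		moved++;
	}
	return moved;
}

/* one concrete source: walks a plain array passed in as 'arg' */
struct array_src { int *data; int len, pos; };

static int *array_start(void *arg)
{
	struct array_src *s = arg;

	s->pos = 0;
	return s->pos < s->len ? &s->data[s->pos++] : NULL;
}

static int *array_next(void *arg)
{
	struct array_src *s = arg;

	return s->pos < s->len ? &s->data[s->pos++] : NULL;
}

int main(void)
{
	int vals[] = { 7, 8, 9 };
	struct array_src src = { vals, 3, 0 };
	struct item_iterator it = { array_start, array_next, &src };

	return move_items(&it, 2) == 2 ? 0 : 1;   /* moves 7 and 8 only */
}
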
212
213static void task_tick_rt(struct rq *rq, struct task_struct *p)
214{
215 /*
216 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices.
218 */
219 if (p->policy != SCHED_RR)
220 return;
221
222 if (--p->time_slice)
223 return;
224
225 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p);
227
228 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p);
230}
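
task_tick_rt() above is the whole round-robin policy: FIFO tasks are left alone, an RR task burns one tick of its slice, and once the slice hits zero it gets a fresh slice, a resched flag, and a trip to the back of its queue. A compressed user-space sketch of that decision, assuming a fixed refill constant in place of static_prio_timeslice():

#include <stdio.h>
#include <stdbool.h>

enum policy { POL_FIFO, POL_RR };

struct task {
	enum policy policy;
	int time_slice;
	bool need_resched;
};

#define RR_TIMESLICE 100                  /* assumed fixed refill, in ticks */

/* called once per timer tick for the currently running RT task */
static void rt_task_tick(struct task *p)
{
	if (p->policy != POL_RR)          /* FIFO: no timeslice at all */
		return;

	if (--p->time_slice)              /* slice not yet exhausted */
		return;

	p->time_slice = RR_TIMESLICE;     /* refill ... */
	p->need_resched = true;           /* ... ask for a reschedule ... */
	/* ... and a requeue would move the task to the tail here */
}

int main(void)
{
	struct task t = { POL_RR, 2, false };

	rt_task_tick(&t);                 /* 2 -> 1, nothing else happens */
	rt_task_tick(&t);                 /* 1 -> 0: refill + resched */
	printf("slice=%d resched=%d\n", t.time_slice, t.need_resched);
	return 0;
}
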
231
232/*
233 * No parent/child timeslice management necessary for RT tasks,
234 * just activate them:
235 */
236static void task_new_rt(struct rq *rq, struct task_struct *p)
237{
238 activate_task(rq, p, 1);
239}
240
241static struct sched_class rt_sched_class __read_mostly = {
242 .enqueue_task = enqueue_task_rt,
243 .dequeue_task = dequeue_task_rt,
244 .yield_task = yield_task_rt,
245
246 .check_preempt_curr = check_preempt_curr_rt,
247
248 .pick_next_task = pick_next_task_rt,
249 .put_prev_task = put_prev_task_rt,
250
251 .load_balance = load_balance_rt,
252
253 .task_tick = task_tick_rt,
254 .task_new = task_new_rt,
255};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
new file mode 100644
index 0000000000..c63c38f6fa
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,235 @@
1
2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 14
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies);
15 for_each_online_cpu(cpu) {
16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP
18 struct sched_domain *sd;
19 int dcnt = 0;
20#endif
21
22 /* runqueue-specific stats */
23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
31
32 seq_printf(seq, "\n");
33
34#ifdef CONFIG_SMP
35 /* domain-specific stats */
36 preempt_disable();
37 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu",
47 sd->lb_cnt[itype],
48 sd->lb_balanced[itype],
49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype],
51 sd->lb_gained[itype],
52 sd->lb_hot_gained[itype],
53 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]);
55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance);
63 }
64 preempt_enable();
65#endif
66 }
67 return 0;
68}
69
70static int schedstat_open(struct inode *inode, struct file *file)
71{
72 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
73 char *buf = kmalloc(size, GFP_KERNEL);
74 struct seq_file *m;
75 int res;
76
77 if (!buf)
78 return -ENOMEM;
79 res = single_open(file, show_schedstat, NULL);
80 if (!res) {
81 m = file->private_data;
82 m->buf = buf;
83 m->size = size;
84 } else
85 kfree(buf);
86 return res;
87}
88
89const struct file_operations proc_schedstat_operations = {
90 .open = schedstat_open,
91 .read = seq_read,
92 .llseek = seq_lseek,
93 .release = single_release,
94};
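
schedstat_open() sizes a buffer by CPU count and attaches show_schedstat() to the schedstat proc entry as a single-shot seq_file, so the whole report is plain text: a version line, a timestamp, one "cpuN ..." line per online CPU and, on SMP, "domainN ..." lines beneath each. A short user-space reader, assuming a kernel built with CONFIG_SCHEDSTATS so /proc/schedstat exists, that keeps only the header and per-runqueue lines:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	char line[1024];

	if (!f) {
		perror("open /proc/schedstat");   /* CONFIG_SCHEDSTATS disabled? */
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* keep "version", "timestamp" and "cpuN" lines, skip domain lines */
		if (!strncmp(line, "version", 7) ||
		    !strncmp(line, "timestamp", 9) ||
		    !strncmp(line, "cpu", 3))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
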
95
96/*
97 * Expects runqueue lock to be held for atomicity of update
98 */
99static inline void
100rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{
102 if (rq) {
103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++;
105 }
106}
107
108/*
109 * Expects runqueue lock to be held for atomicity of update
110 */
111static inline void
112rq_sched_info_depart(struct rq *rq, unsigned long long delta)
113{
114 if (rq)
115 rq->rq_sched_info.cpu_time += delta;
116}
117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
119#else /* !CONFIG_SCHEDSTATS */
120static inline void
121rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
122{}
123static inline void
124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{}
126# define schedstat_inc(rq, field) do { } while (0)
127# define schedstat_add(rq, field, amt) do { } while (0)
128#endif
129
130#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
131/*
132 * Called when a process is dequeued from the active array and given
133 * the cpu. We should note that with the exception of interactive
134 * tasks, the expired queue will become the active queue after the active
135 * queue is empty, without explicitly dequeuing and requeuing tasks in the
136 * expired queue. (Interactive tasks may be requeued directly to the
137 * active queue, thus delaying tasks in the expired queue from running;
138 * see scheduler_tick()).
139 *
140 * This function is only called from sched_info_arrive(), rather than
141 * dequeue_task(). Even though a task may be queued and dequeued multiple
142 * times as it is shuffled about, we're really interested in knowing how
143 * long it was from the *first* time it was queued to the time that it
144 * finally hit a cpu.
145 */
146static inline void sched_info_dequeued(struct task_struct *t)
147{
148 t->sched_info.last_queued = 0;
149}
150
151/*
152 * Called when a task finally hits the cpu. We can now calculate how
153 * long it was waiting to run. We also note when it began so that we
154 * can keep stats on how long its timeslice is.
155 */
156static void sched_info_arrive(struct task_struct *t)
157{
158 unsigned long long now = sched_clock(), delta = 0;
159
160 if (t->sched_info.last_queued)
161 delta = now - t->sched_info.last_queued;
162 sched_info_dequeued(t);
163 t->sched_info.run_delay += delta;
164 t->sched_info.last_arrival = now;
165 t->sched_info.pcnt++;
166
167 rq_sched_info_arrive(task_rq(t), delta);
168}
169
170/*
171 * Called when a process is queued into either the active or expired
172 * array. The time is noted and later used to determine how long the
173 * task had to wait to reach the cpu. Since the expired queue will
174 * become the active queue after the active queue is empty, without dequeuing
175 * and requeuing any tasks, we are interested in queuing to either. It
176 * is unusual but not impossible for tasks to be dequeued and immediately
177 * requeued in the same or another array: this can happen in sched_yield(),
178 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
179 * to runqueue.
180 *
181 * This function is only called from enqueue_task(), but it only updates
182 * the timestamp if it is not already set. It's assumed that
183 * sched_info_dequeued() will clear that stamp when appropriate.
184 */
185static inline void sched_info_queued(struct task_struct *t)
186{
187 if (unlikely(sched_info_on()))
188 if (!t->sched_info.last_queued)
189 t->sched_info.last_queued = sched_clock();
190}
191
192/*
193 * Called when a process ceases being the active-running process, either
194 * voluntarily or involuntarily. Now we can calculate how long we ran.
195 */
196static inline void sched_info_depart(struct task_struct *t)
197{
198 unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
199
200 t->sched_info.cpu_time += delta;
201 rq_sched_info_depart(task_rq(t), delta);
202}
203
204/*
205 * Called when tasks are switched involuntarily, typically due to expiring
206 * their time slice. (This may also be called when switching to or from
207 * the idle task.) We are only called when prev != next.
208 */
209static inline void
210__sched_info_switch(struct task_struct *prev, struct task_struct *next)
211{
212 struct rq *rq = task_rq(prev);
213
214 /*
215 * prev now departs the cpu. It's not interesting to record
216 * stats about how efficient we were at scheduling the idle
217 * process, however.
218 */
219 if (prev != rq->idle)
220 sched_info_depart(prev);
221
222 if (next != rq->idle)
223 sched_info_arrive(next);
224}
225static inline void
226sched_info_switch(struct task_struct *prev, struct task_struct *next)
227{
228 if (unlikely(sched_info_on()))
229 __sched_info_switch(prev, next);
230}
231#else
232#define sched_info_queued(t) do { } while (0)
233#define sched_info_switch(t, next) do { } while (0)
234#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
235
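
Taken together, the sched_info_* helpers keep three timestamps per task: last_queued (set once, when the task first becomes runnable), last_arrival (set when it actually gets the CPU), and running totals for run_delay and cpu_time derived from their differences at arrive/depart time. A minimal sketch of that bookkeeping, with a fake clock standing in for sched_clock():

#include <stdio.h>

static unsigned long long fake_clock;   /* stand-in for sched_clock(), in ns */

struct sched_info {
	unsigned long long last_queued;     /* 0 means "not currently waiting" */
	unsigned long long last_arrival;
	unsigned long long run_delay;       /* total time runnable but waiting */
	unsigned long long cpu_time;        /* total time on the cpu */
	unsigned long pcnt;                 /* number of times it got the cpu */
};

/* task becomes runnable: note the time only if not already noted */
static void info_queued(struct sched_info *si)
{
	if (!si->last_queued)
		si->last_queued = fake_clock;
}

/* task gets the cpu: wait time = now - last_queued */
static void info_arrive(struct sched_info *si)
{
	if (si->last_queued)
		si->run_delay += fake_clock - si->last_queued;
	si->last_queued = 0;
	si->last_arrival = fake_clock;
	si->pcnt++;
}

/* task leaves the cpu: run time = now - last_arrival */
static void info_depart(struct sched_info *si)
{
	si->cpu_time += fake_clock - si->last_arrival;
}

int main(void)
{
	struct sched_info si = { 0 };

	fake_clock = 100; info_queued(&si);   /* runnable at t=100 */
	fake_clock = 250; info_arrive(&si);   /* waited 150 */
	fake_clock = 900; info_depart(&si);   /* ran 650 */
	printf("run_delay=%llu cpu_time=%llu pcnt=%lu\n",
	       si.run_delay, si.cpu_time, si.pcnt);
	return 0;
}
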
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0b9886a00e..73217a9e28 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -488,7 +488,6 @@ void __init softirq_init(void)
488 488
489static int ksoftirqd(void * __bind_cpu) 489static int ksoftirqd(void * __bind_cpu)
490{ 490{
491 set_user_nice(current, 19);
492 current->flags |= PF_NOFREEZE; 491 current->flags |= PF_NOFREEZE;
493 492
494 set_current_state(TASK_INTERRUPTIBLE); 493 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 30ee462ee7..51f5dac42a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -206,7 +206,87 @@ static ctl_table root_table[] = {
206 { .ctl_name = 0 } 206 { .ctl_name = 0 }
207}; 207};
208 208
209#ifdef CONFIG_SCHED_DEBUG
210static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */
211static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */
212static unsigned long min_wakeup_granularity_ns; /* 0 usecs */
213static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */
214#endif
215
209static ctl_table kern_table[] = { 216static ctl_table kern_table[] = {
217#ifdef CONFIG_SCHED_DEBUG
218 {
219 .ctl_name = CTL_UNNUMBERED,
220 .procname = "sched_granularity_ns",
221 .data = &sysctl_sched_granularity,
222 .maxlen = sizeof(unsigned int),
223 .mode = 0644,
224 .proc_handler = &proc_dointvec_minmax,
225 .strategy = &sysctl_intvec,
226 .extra1 = &min_sched_granularity_ns,
227 .extra2 = &max_sched_granularity_ns,
228 },
229 {
230 .ctl_name = CTL_UNNUMBERED,
231 .procname = "sched_wakeup_granularity_ns",
232 .data = &sysctl_sched_wakeup_granularity,
233 .maxlen = sizeof(unsigned int),
234 .mode = 0644,
235 .proc_handler = &proc_dointvec_minmax,
236 .strategy = &sysctl_intvec,
237 .extra1 = &min_wakeup_granularity_ns,
238 .extra2 = &max_wakeup_granularity_ns,
239 },
240 {
241 .ctl_name = CTL_UNNUMBERED,
242 .procname = "sched_batch_wakeup_granularity_ns",
243 .data = &sysctl_sched_batch_wakeup_granularity,
244 .maxlen = sizeof(unsigned int),
245 .mode = 0644,
246 .proc_handler = &proc_dointvec_minmax,
247 .strategy = &sysctl_intvec,
248 .extra1 = &min_wakeup_granularity_ns,
249 .extra2 = &max_wakeup_granularity_ns,
250 },
251 {
252 .ctl_name = CTL_UNNUMBERED,
253 .procname = "sched_stat_granularity_ns",
254 .data = &sysctl_sched_stat_granularity,
255 .maxlen = sizeof(unsigned int),
256 .mode = 0644,
257 .proc_handler = &proc_dointvec_minmax,
258 .strategy = &sysctl_intvec,
259 .extra1 = &min_wakeup_granularity_ns,
260 .extra2 = &max_wakeup_granularity_ns,
261 },
262 {
263 .ctl_name = CTL_UNNUMBERED,
264 .procname = "sched_runtime_limit_ns",
265 .data = &sysctl_sched_runtime_limit,
266 .maxlen = sizeof(unsigned int),
267 .mode = 0644,
268 .proc_handler = &proc_dointvec_minmax,
269 .strategy = &sysctl_intvec,
270 .extra1 = &min_sched_granularity_ns,
271 .extra2 = &max_sched_granularity_ns,
272 },
273 {
274 .ctl_name = CTL_UNNUMBERED,
275 .procname = "sched_child_runs_first",
276 .data = &sysctl_sched_child_runs_first,
277 .maxlen = sizeof(unsigned int),
278 .mode = 0644,
279 .proc_handler = &proc_dointvec,
280 },
281 {
282 .ctl_name = CTL_UNNUMBERED,
283 .procname = "sched_features",
284 .data = &sysctl_sched_features,
285 .maxlen = sizeof(unsigned int),
286 .mode = 0644,
287 .proc_handler = &proc_dointvec,
288 },
289#endif
210 { 290 {
211 .ctl_name = KERN_PANIC, 291 .ctl_name = KERN_PANIC,
212 .procname = "panic", 292 .procname = "panic",
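
The CONFIG_SCHED_DEBUG entries added above surface the scheduler tunables as ordinary files under /proc/sys/kernel, most of them with writes clamped to the min/max limits defined just before kern_table. A small user-space check for one of them, treating a missing file as the option having been compiled out:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_wakeup_granularity_ns";
	FILE *f = fopen(path, "r");
	unsigned int ns;

	if (!f) {
		fprintf(stderr, "%s not available (CONFIG_SCHED_DEBUG off?)\n", path);
		return 1;
	}
	if (fscanf(f, "%u", &ns) == 1)
		printf("wakeup granularity: %u ns\n", ns);
	fclose(f);
	return 0;
}
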