Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                 16
-rw-r--r--  kernel/delayacct.c              2
-rw-r--r--  kernel/exit.c                   6
-rw-r--r--  kernel/fork.c                   6
-rw-r--r--  kernel/hrtimer.c               24
-rw-r--r--  kernel/ksysfs.c                 8
-rw-r--r--  kernel/lockdep.c               26
-rw-r--r--  kernel/lockdep_proc.c          61
-rw-r--r--  kernel/mutex.c                 35
-rw-r--r--  kernel/nsproxy.c               15
-rw-r--r--  kernel/posix-timers.c           6
-rw-r--r--  kernel/rcupdate.c               8
-rw-r--r--  kernel/sched.c               1445
-rw-r--r--  kernel/sched_debug.c          282
-rw-r--r--  kernel/sched_fair.c           811
-rw-r--r--  kernel/sched_idletask.c         8
-rw-r--r--  kernel/sched_rt.c              19
-rw-r--r--  kernel/sched_stats.h           28
-rw-r--r--  kernel/softirq.c                4
-rw-r--r--  kernel/sysctl.c                41
-rw-r--r--  kernel/time/Kconfig             5
-rw-r--r--  kernel/time/Makefile            2
-rw-r--r--  kernel/time/clockevents.c       3
-rw-r--r--  kernel/time/tick-broadcast.c   44
-rw-r--r--  kernel/time/tick-common.c       5
-rw-r--r--  kernel/user.c                 249
26 files changed, 1834 insertions, 1325 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index eb0f9165b401..2924251a6547 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb)
847} 847}
848 848
849/* Receive messages from netlink socket. */ 849/* Receive messages from netlink socket. */
850static void audit_receive(struct sock *sk, int length) 850static void audit_receive(struct sk_buff *skb)
851{ 851{
852 struct sk_buff *skb;
853 unsigned int qlen;
854
855 mutex_lock(&audit_cmd_mutex); 852 mutex_lock(&audit_cmd_mutex);
856 853 audit_receive_skb(skb);
857 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
858 skb = skb_dequeue(&sk->sk_receive_queue);
859 audit_receive_skb(skb);
860 kfree_skb(skb);
861 }
862 mutex_unlock(&audit_cmd_mutex); 854 mutex_unlock(&audit_cmd_mutex);
863} 855}
864 856
@@ -876,8 +868,8 @@ static int __init audit_init(void)
876 868
877 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 869 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
878 audit_default ? "enabled" : "disabled"); 870 audit_default ? "enabled" : "disabled");
879 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, 871 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
880 NULL, THIS_MODULE); 872 audit_receive, NULL, THIS_MODULE);
881 if (!audit_sock) 873 if (!audit_sock)
882 audit_panic("cannot initialize netlink socket"); 874 audit_panic("cannot initialize netlink socket");
883 else 875 else
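The audit_receive() rework above follows the netlink API change in this release: the input callback is now invoked once per skb by the netlink core (which also frees the skb afterwards, hence the dropped kfree_skb()), and netlink_kernel_create() takes the namespace (&init_net) plus that callback directly, so the hand-rolled drain of sk->sk_receive_queue disappears. A minimal sketch of the new shape, not from this patch; my_receive, my_cmd_mutex and my_handle_skb are made-up names:

    /* per-skb input callback: the netlink core dequeues and frees the skb */
    static void my_receive(struct sk_buff *skb)
    {
            mutex_lock(&my_cmd_mutex);
            my_handle_skb(skb);             /* hypothetical dispatcher */
            mutex_unlock(&my_cmd_mutex);
    }

    nl_sock = netlink_kernel_create(&init_net, NETLINK_USERSOCK, 0,
                                    my_receive, NULL, THIS_MODULE);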
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e697829633..09e9574eeb26 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
119 * No locking available for sched_info (and too expensive to add one) 119 * No locking available for sched_info (and too expensive to add one)
120 * Mitigate by taking snapshot of values 120 * Mitigate by taking snapshot of values
121 */ 121 */
122 t1 = tsk->sched_info.pcnt; 122 t1 = tsk->sched_info.pcount;
123 t2 = tsk->sched_info.run_delay; 123 t2 = tsk->sched_info.run_delay;
124 t3 = tsk->sched_info.cpu_time; 124 t3 = tsk->sched_info.cpu_time;
125 125
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d1..7f7959de4a87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
111 */ 111 */
112 sig->utime = cputime_add(sig->utime, tsk->utime); 112 sig->utime = cputime_add(sig->utime, tsk->utime);
113 sig->stime = cputime_add(sig->stime, tsk->stime); 113 sig->stime = cputime_add(sig->stime, tsk->stime);
114 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
114 sig->min_flt += tsk->min_flt; 115 sig->min_flt += tsk->min_flt;
115 sig->maj_flt += tsk->maj_flt; 116 sig->maj_flt += tsk->maj_flt;
116 sig->nvcsw += tsk->nvcsw; 117 sig->nvcsw += tsk->nvcsw;
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1242 cputime_add(p->stime, 1243 cputime_add(p->stime,
1243 cputime_add(sig->stime, 1244 cputime_add(sig->stime,
1244 sig->cstime))); 1245 sig->cstime)));
1246 psig->cgtime =
1247 cputime_add(psig->cgtime,
1248 cputime_add(p->gtime,
1249 cputime_add(sig->gtime,
1250 sig->cgtime)));
1245 psig->cmin_flt += 1251 psig->cmin_flt +=
1246 p->min_flt + sig->min_flt + sig->cmin_flt; 1252 p->min_flt + sig->min_flt + sig->cmin_flt;
1247 psig->cmaj_flt += 1253 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 33f12f48684a..3fc3c1383912 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->tty_old_pgrp = NULL; 877 sig->tty_old_pgrp = NULL;
878 878
879 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 879 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
880 sig->gtime = cputime_zero;
881 sig->cgtime = cputime_zero;
880 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 882 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
881 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 883 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
882 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 884 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 1047
1046 p->utime = cputime_zero; 1048 p->utime = cputime_zero;
1047 p->stime = cputime_zero; 1049 p->stime = cputime_zero;
1050 p->gtime = cputime_zero;
1048 1051
1049#ifdef CONFIG_TASK_XACCT 1052#ifdef CONFIG_TASK_XACCT
1050 p->rchar = 0; /* I/O counter: bytes read */ 1053 p->rchar = 0; /* I/O counter: bytes read */
@@ -1608,7 +1611,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1608 err = -EINVAL; 1611 err = -EINVAL;
1609 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1612 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1610 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1613 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1611 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) 1614 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
1615 CLONE_NEWNET))
1612 goto bad_unshare_out; 1616 goto bad_unshare_out;
1613 1617
1614 if ((err = unshare_thread(unshare_flags))) 1618 if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c21ca6bfaa66..dc8a4451d79b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -277,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
277} 277}
278 278
279EXPORT_SYMBOL_GPL(ktime_add_ns); 279EXPORT_SYMBOL_GPL(ktime_add_ns);
280
281/**
282 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
283 * @kt: minuend
284 * @nsec: the scalar nsec value to subtract
285 *
286 * Returns the subtraction of @nsec from @kt in ktime_t format
287 */
288ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
289{
290 ktime_t tmp;
291
292 if (likely(nsec < NSEC_PER_SEC)) {
293 tmp.tv64 = nsec;
294 } else {
295 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
296
297 tmp = ktime_set((long)nsec, rem);
298 }
299
300 return ktime_sub(kt, tmp);
301}
302
303EXPORT_SYMBOL_GPL(ktime_sub_ns);
280# endif /* !CONFIG_KTIME_SCALAR */ 304# endif /* !CONFIG_KTIME_SCALAR */
281 305
282/* 306/*
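The new ktime_sub_ns() mirrors ktime_add_ns(): on !CONFIG_KTIME_SCALAR builds a 64-bit nanosecond count cannot be subtracted from the sec/nsec union directly, so values of one second or more are first split into seconds plus a remainder with do_div() and packed via ktime_set(). A plain userspace mirror of that split, just to show the arithmetic (not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t nsec = 3500000000ULL;          /* 3.5 s */
            uint64_t sec  = nsec / 1000000000ULL;   /* quotient: what do_div() leaves in nsec */
            uint64_t rem  = nsec % 1000000000ULL;   /* remainder: what do_div() returns */

            printf("%llu ns = %llu s + %llu ns\n",
                   3500000000ULL, (unsigned long long)sec, (unsigned long long)rem);
            return 0;
    }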
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c7..6046939d0804 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/sched.h>
17 18
18#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
116 &notes_attr); 117 &notes_attr);
117 } 118 }
118 119
120 /*
121 * Create "/sys/kernel/uids" directory and corresponding root user's
122 * directory under it.
123 */
124 if (!error)
125 error = uids_kobject_init();
126
119 return error; 127 return error;
120} 128}
121 129
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 734da579ad13..a6f1ee9c92d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1521,7 +1521,7 @@ cache_hit:
1521} 1521}
1522 1522
1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1524 struct held_lock *hlock, int chain_head) 1524 struct held_lock *hlock, int chain_head, u64 chain_key)
1525{ 1525{
1526 /* 1526 /*
1527 * Trylock needs to maintain the stack of held locks, but it 1527 * Trylock needs to maintain the stack of held locks, but it
@@ -1534,7 +1534,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1534 * graph_lock for us) 1534 * graph_lock for us)
1535 */ 1535 */
1536 if (!hlock->trylock && (hlock->check == 2) && 1536 if (!hlock->trylock && (hlock->check == 2) &&
1537 lookup_chain_cache(curr->curr_chain_key, hlock->class)) { 1537 lookup_chain_cache(chain_key, hlock->class)) {
1538 /* 1538 /*
1539 * Check whether last held lock: 1539 * Check whether last held lock:
1540 * 1540 *
@@ -1576,7 +1576,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1576#else 1576#else
1577static inline int validate_chain(struct task_struct *curr, 1577static inline int validate_chain(struct task_struct *curr,
1578 struct lockdep_map *lock, struct held_lock *hlock, 1578 struct lockdep_map *lock, struct held_lock *hlock,
1579 int chain_head) 1579 int chain_head, u64 chain_key)
1580{ 1580{
1581 return 1; 1581 return 1;
1582} 1582}
@@ -2450,11 +2450,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2450 chain_head = 1; 2450 chain_head = 1;
2451 } 2451 }
2452 chain_key = iterate_chain_key(chain_key, id); 2452 chain_key = iterate_chain_key(chain_key, id);
2453 curr->curr_chain_key = chain_key;
2454 2453
2455 if (!validate_chain(curr, lock, hlock, chain_head)) 2454 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
2456 return 0; 2455 return 0;
2457 2456
2457 curr->curr_chain_key = chain_key;
2458 curr->lockdep_depth++; 2458 curr->lockdep_depth++;
2459 check_chain_key(curr); 2459 check_chain_key(curr);
2460#ifdef CONFIG_DEBUG_LOCKDEP 2460#ifdef CONFIG_DEBUG_LOCKDEP
@@ -3199,3 +3199,19 @@ void debug_show_held_locks(struct task_struct *task)
3199} 3199}
3200 3200
3201EXPORT_SYMBOL_GPL(debug_show_held_locks); 3201EXPORT_SYMBOL_GPL(debug_show_held_locks);
3202
3203void lockdep_sys_exit(void)
3204{
3205 struct task_struct *curr = current;
3206
3207 if (unlikely(curr->lockdep_depth)) {
3208 if (!debug_locks_off())
3209 return;
3210 printk("\n================================================\n");
3211 printk( "[ BUG: lock held when returning to user space! ]\n");
3212 printk( "------------------------------------------------\n");
3213 printk("%s/%d is leaving the kernel with locks still held!\n",
3214 curr->comm, curr->pid);
3215 lockdep_print_held_locks(curr);
3216 }
3217}
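Two things happen in this file: validate_chain() now receives the computed chain_key explicitly, so curr->curr_chain_key is only committed once validation succeeds, and the new lockdep_sys_exit() hook checks at the system-call return boundary that the task's lockdep_depth is zero, printing the held locks otherwise. The class of bug that check reports looks like the sketch below (hypothetical my_lock/my_ioctl, an early return that skips the unlock):

    static DEFINE_MUTEX(my_lock);

    long my_ioctl(unsigned int cmd)
    {
            mutex_lock(&my_lock);
            if (cmd == 0)
                    return -EINVAL;         /* returns to user space with my_lock held */
            mutex_unlock(&my_lock);
            return 0;
    }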
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index c851b2dcc685..8a135bd163c2 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,28 +25,38 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class = v; 28 struct lock_class *class;
29 29
30 (*pos)++; 30 (*pos)++;
31 31
32 if (class->lock_entry.next != &all_lock_classes) 32 if (v == SEQ_START_TOKEN)
33 class = list_entry(class->lock_entry.next, struct lock_class, 33 class = m->private;
34 lock_entry); 34 else {
35 else 35 class = v;
36 class = NULL; 36
37 m->private = class; 37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
38 43
39 return class; 44 return class;
40} 45}
41 46
42static void *l_start(struct seq_file *m, loff_t *pos) 47static void *l_start(struct seq_file *m, loff_t *pos)
43{ 48{
44 struct lock_class *class = m->private; 49 struct lock_class *class;
50 loff_t i = 0;
45 51
46 if (&class->lock_entry == all_lock_classes.next) 52 if (*pos == 0)
47 seq_printf(m, "all lock classes:\n"); 53 return SEQ_START_TOKEN;
48 54
49 return class; 55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
50} 60}
51 61
52static void l_stop(struct seq_file *m, void *v) 62static void l_stop(struct seq_file *m, void *v)
@@ -101,10 +111,15 @@ static void print_name(struct seq_file *m, struct lock_class *class)
101static int l_show(struct seq_file *m, void *v) 111static int l_show(struct seq_file *m, void *v)
102{ 112{
103 unsigned long nr_forward_deps, nr_backward_deps; 113 unsigned long nr_forward_deps, nr_backward_deps;
104 struct lock_class *class = m->private; 114 struct lock_class *class = v;
105 struct lock_list *entry; 115 struct lock_list *entry;
106 char c1, c2, c3, c4; 116 char c1, c2, c3, c4;
107 117
118 if (v == SEQ_START_TOKEN) {
119 seq_printf(m, "all lock classes:\n");
120 return 0;
121 }
122
108 seq_printf(m, "%p", class->key); 123 seq_printf(m, "%p", class->key);
109#ifdef CONFIG_DEBUG_LOCKDEP 124#ifdef CONFIG_DEBUG_LOCKDEP
110 seq_printf(m, " OPS:%8ld", class->ops); 125 seq_printf(m, " OPS:%8ld", class->ops);
@@ -523,10 +538,11 @@ static void *ls_start(struct seq_file *m, loff_t *pos)
523{ 538{
524 struct lock_stat_seq *data = m->private; 539 struct lock_stat_seq *data = m->private;
525 540
526 if (data->iter == data->stats) 541 if (*pos == 0)
527 seq_header(m); 542 return SEQ_START_TOKEN;
528 543
529 if (data->iter == data->iter_end) 544 data->iter = data->stats + *pos;
545 if (data->iter >= data->iter_end)
530 data->iter = NULL; 546 data->iter = NULL;
531 547
532 return data->iter; 548 return data->iter;
@@ -538,8 +554,13 @@ static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
538 554
539 (*pos)++; 555 (*pos)++;
540 556
541 data->iter = v; 557 if (v == SEQ_START_TOKEN)
542 data->iter++; 558 data->iter = data->stats;
559 else {
560 data->iter = v;
561 data->iter++;
562 }
563
543 if (data->iter == data->iter_end) 564 if (data->iter == data->iter_end)
544 data->iter = NULL; 565 data->iter = NULL;
545 566
@@ -552,9 +573,11 @@ static void ls_stop(struct seq_file *m, void *v)
552 573
553static int ls_show(struct seq_file *m, void *v) 574static int ls_show(struct seq_file *m, void *v)
554{ 575{
555 struct lock_stat_seq *data = m->private; 576 if (v == SEQ_START_TOKEN)
577 seq_header(m);
578 else
579 seq_stats(m, v);
556 580
557 seq_stats(m, data->iter);
558 return 0; 581 return 0;
559} 582}
560 583
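Both the /proc/lockdep and /proc/lock_stat iterators above are converted to the standard seq_file SEQ_START_TOKEN convention: ->start() returns the token at *pos == 0 and ->show() prints the header when it sees it, which keeps the header correct across partial reads and restarts. A minimal sketch of the pattern; my_nth_entry, my_entry_after and my_show_entry are hypothetical helpers:

    static void *my_start(struct seq_file *m, loff_t *pos)
    {
            if (*pos == 0)
                    return SEQ_START_TOKEN;         /* header printed by ->show() */
            return my_nth_entry(*pos - 1);          /* NULL when past the end */
    }

    static void *my_next(struct seq_file *m, void *v, loff_t *pos)
    {
            (*pos)++;
            return (v == SEQ_START_TOKEN) ? my_nth_entry(0) : my_entry_after(v);
    }

    static int my_show(struct seq_file *m, void *v)
    {
            if (v == SEQ_START_TOKEN)
                    seq_puts(m, "my header\n");
            else
                    my_show_entry(m, v);
            return 0;
    }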
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 691b86564dd9..d7fe50cc556f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -51,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
51 51
52EXPORT_SYMBOL(__mutex_init); 52EXPORT_SYMBOL(__mutex_init);
53 53
54#ifndef CONFIG_DEBUG_LOCK_ALLOC
54/* 55/*
55 * We split the mutex lock/unlock logic into separate fastpath and 56 * We split the mutex lock/unlock logic into separate fastpath and
56 * slowpath functions, to reduce the register pressure on the fastpath. 57 * slowpath functions, to reduce the register pressure on the fastpath.
@@ -92,6 +93,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock)
92} 93}
93 94
94EXPORT_SYMBOL(mutex_lock); 95EXPORT_SYMBOL(mutex_lock);
96#endif
95 97
96static void fastcall noinline __sched 98static void fastcall noinline __sched
97__mutex_unlock_slowpath(atomic_t *lock_count); 99__mutex_unlock_slowpath(atomic_t *lock_count);
@@ -122,7 +124,8 @@ EXPORT_SYMBOL(mutex_unlock);
122 * Lock a mutex (possibly interruptible), slowpath: 124 * Lock a mutex (possibly interruptible), slowpath:
123 */ 125 */
124static inline int __sched 126static inline int __sched
125__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) 127__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
128 unsigned long ip)
126{ 129{
127 struct task_struct *task = current; 130 struct task_struct *task = current;
128 struct mutex_waiter waiter; 131 struct mutex_waiter waiter;
@@ -132,7 +135,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
132 spin_lock_mutex(&lock->wait_lock, flags); 135 spin_lock_mutex(&lock->wait_lock, flags);
133 136
134 debug_mutex_lock_common(lock, &waiter); 137 debug_mutex_lock_common(lock, &waiter);
135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 138 mutex_acquire(&lock->dep_map, subclass, 0, ip);
136 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 139 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
137 140
138 /* add waiting tasks to the end of the waitqueue (FIFO): */ 141 /* add waiting tasks to the end of the waitqueue (FIFO): */
@@ -143,7 +146,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
143 if (old_val == 1) 146 if (old_val == 1)
144 goto done; 147 goto done;
145 148
146 lock_contended(&lock->dep_map, _RET_IP_); 149 lock_contended(&lock->dep_map, ip);
147 150
148 for (;;) { 151 for (;;) {
149 /* 152 /*
@@ -166,7 +169,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
166 if (unlikely(state == TASK_INTERRUPTIBLE && 169 if (unlikely(state == TASK_INTERRUPTIBLE &&
167 signal_pending(task))) { 170 signal_pending(task))) {
168 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 171 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
169 mutex_release(&lock->dep_map, 1, _RET_IP_); 172 mutex_release(&lock->dep_map, 1, ip);
170 spin_unlock_mutex(&lock->wait_lock, flags); 173 spin_unlock_mutex(&lock->wait_lock, flags);
171 174
172 debug_mutex_free_waiter(&waiter); 175 debug_mutex_free_waiter(&waiter);
@@ -197,20 +200,12 @@ done:
197 return 0; 200 return 0;
198} 201}
199 202
200static void fastcall noinline __sched
201__mutex_lock_slowpath(atomic_t *lock_count)
202{
203 struct mutex *lock = container_of(lock_count, struct mutex, count);
204
205 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
206}
207
208#ifdef CONFIG_DEBUG_LOCK_ALLOC 203#ifdef CONFIG_DEBUG_LOCK_ALLOC
209void __sched 204void __sched
210mutex_lock_nested(struct mutex *lock, unsigned int subclass) 205mutex_lock_nested(struct mutex *lock, unsigned int subclass)
211{ 206{
212 might_sleep(); 207 might_sleep();
213 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); 208 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
214} 209}
215 210
216EXPORT_SYMBOL_GPL(mutex_lock_nested); 211EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -219,7 +214,7 @@ int __sched
219mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) 214mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
220{ 215{
221 might_sleep(); 216 might_sleep();
222 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); 217 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
223} 218}
224 219
225EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 220EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -271,6 +266,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
271 __mutex_unlock_common_slowpath(lock_count, 1); 266 __mutex_unlock_common_slowpath(lock_count, 1);
272} 267}
273 268
269#ifndef CONFIG_DEBUG_LOCK_ALLOC
274/* 270/*
275 * Here come the less common (and hence less performance-critical) APIs: 271 * Here come the less common (and hence less performance-critical) APIs:
276 * mutex_lock_interruptible() and mutex_trylock(). 272 * mutex_lock_interruptible() and mutex_trylock().
@@ -298,13 +294,22 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
298 294
299EXPORT_SYMBOL(mutex_lock_interruptible); 295EXPORT_SYMBOL(mutex_lock_interruptible);
300 296
297static void fastcall noinline __sched
298__mutex_lock_slowpath(atomic_t *lock_count)
299{
300 struct mutex *lock = container_of(lock_count, struct mutex, count);
301
302 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
303}
304
301static int fastcall noinline __sched 305static int fastcall noinline __sched
302__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 306__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
303{ 307{
304 struct mutex *lock = container_of(lock_count, struct mutex, count); 308 struct mutex *lock = container_of(lock_count, struct mutex, count);
305 309
306 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); 310 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
307} 311}
312#endif
308 313
309/* 314/*
310 * Spinlock based trylock, we take the spinlock and check whether we 315 * Spinlock based trylock, we take the spinlock and check whether we
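The mutex slowpaths now take the caller's return address as an explicit ip argument instead of sampling _RET_IP_ inside __mutex_lock_common(), so lockdep/lockstat events are attributed to the real caller rather than to the locking internals. The new #ifndef CONFIG_DEBUG_LOCK_ALLOC blocks drop the plain fastpath wrappers in that configuration, because mutex_lock() then maps to mutex_lock_nested(lock, 0). For reference, nested locking of two mutexes sharing a lockdep class uses the subclass API touched above (sketch; parent/child are hypothetical):

    mutex_lock(&parent->lock);
    mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);
    /* ... */
    mutex_unlock(&child->lock);
    mutex_unlock(&parent->lock);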
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a4fb7d46971f..f1decd21a534 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -20,6 +20,7 @@
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h>
23 24
24static struct kmem_cache *nsproxy_cachep; 25static struct kmem_cache *nsproxy_cachep;
25 26
@@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
98 goto out_user; 99 goto out_user;
99 } 100 }
100 101
102 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
103 if (IS_ERR(new_nsp->net_ns)) {
104 err = PTR_ERR(new_nsp->net_ns);
105 goto out_net;
106 }
107
101 return new_nsp; 108 return new_nsp;
102 109
110out_net:
111 if (new_nsp->user_ns)
112 put_user_ns(new_nsp->user_ns);
103out_user: 113out_user:
104 if (new_nsp->pid_ns) 114 if (new_nsp->pid_ns)
105 put_pid_ns(new_nsp->pid_ns); 115 put_pid_ns(new_nsp->pid_ns);
@@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
132 142
133 get_nsproxy(old_ns); 143 get_nsproxy(old_ns);
134 144
135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) 145 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
136 return 0; 146 return 0;
137 147
138 if (!capable(CAP_SYS_ADMIN)) { 148 if (!capable(CAP_SYS_ADMIN)) {
@@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns)
164 put_pid_ns(ns->pid_ns); 174 put_pid_ns(ns->pid_ns);
165 if (ns->user_ns) 175 if (ns->user_ns)
166 put_user_ns(ns->user_ns); 176 put_user_ns(ns->user_ns);
177 put_net(ns->net_ns);
167 kmem_cache_free(nsproxy_cachep, ns); 178 kmem_cache_free(nsproxy_cachep, ns);
168} 179}
169 180
@@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
177 int err = 0; 188 int err = 0;
178 189
179 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
180 CLONE_NEWUSER))) 191 CLONE_NEWUSER | CLONE_NEWNET)))
181 return 0; 192 return 0;
182 193
183 if (!capable(CAP_SYS_ADMIN)) 194 if (!capable(CAP_SYS_ADMIN))
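With network namespace support wired into nsproxy, CLONE_NEWNET becomes an accepted flag for clone()/unshare() (see the matching kernel/fork.c hunk above). A userspace sketch of what this enables, assuming CAP_SYS_ADMIN, a kernel built with net namespaces, and a libc new enough to define CLONE_NEWNET (0x40000000):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            if (unshare(CLONE_NEWNET) != 0) {
                    perror("unshare(CLONE_NEWNET)");
                    return 1;
            }
            /* from here on this process sees its own, initially empty,
             * set of network devices, routes and sockets */
            printf("entered a new network namespace\n");
            return 0;
    }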
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7a15afb73ed0..57efe0400bc2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -712,7 +712,7 @@ sys_timer_getoverrun(timer_t timer_id)
712{ 712{
713 struct k_itimer *timr; 713 struct k_itimer *timr;
714 int overrun; 714 int overrun;
715 long flags; 715 unsigned long flags;
716 716
717 timr = lock_timer(timer_id, &flags); 717 timr = lock_timer(timer_id, &flags);
718 if (!timr) 718 if (!timr)
@@ -784,7 +784,7 @@ sys_timer_settime(timer_t timer_id, int flags,
784 struct k_itimer *timr; 784 struct k_itimer *timr;
785 struct itimerspec new_spec, old_spec; 785 struct itimerspec new_spec, old_spec;
786 int error = 0; 786 int error = 0;
787 long flag; 787 unsigned long flag;
788 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 788 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
789 789
790 if (!new_setting) 790 if (!new_setting)
@@ -836,7 +836,7 @@ asmlinkage long
836sys_timer_delete(timer_t timer_id) 836sys_timer_delete(timer_t timer_id)
837{ 837{
838 struct k_itimer *timer; 838 struct k_itimer *timer;
839 long flags; 839 unsigned long flags;
840 840
841retry_delete: 841retry_delete:
842 timer = lock_timer(timer_id, &flags); 842 timer = lock_timer(timer_id, &flags);
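The flags variables switch from long to unsigned long because lock_timer() takes the timer's it_lock with the IRQ-saving spinlock helpers, whose flags argument is defined as unsigned long; a signed long happened to work on common ABIs but is the wrong type. The underlying idiom, for reference:

    unsigned long flags;

    spin_lock_irqsave(&timr->it_lock, flags);
    /* ... access the timer ... */
    spin_unlock_irqrestore(&timr->it_lock, flags);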
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2c2dd8410dc4..130214f3d229 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -49,6 +49,14 @@
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/mutex.h> 50#include <linux/mutex.h>
51 51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56
57EXPORT_SYMBOL_GPL(rcu_lock_map);
58#endif
59
52/* Definition for rcupdate control block. */ 60/* Definition for rcupdate control block. */
53static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
54 .cur = -300, 62 .cur = -300,
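The new rcu_lock_map gives lockdep a pseudo-lock standing in for RCU read-side critical sections, so rcu_read_lock()/rcu_read_unlock() pairs can participate in lock tracking when CONFIG_DEBUG_LOCK_ALLOC is enabled. The section being modelled is the usual reader pattern (sketch; my_global_ptr and use() are placeholders):

    rcu_read_lock();
    p = rcu_dereference(my_global_ptr);
    if (p)
            use(p);
    rcu_read_unlock();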
diff --git a/kernel/sched.c b/kernel/sched.c
index 6107a0cd6325..bba57adb9504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -61,6 +61,7 @@
61#include <linux/delayacct.h> 61#include <linux/delayacct.h>
62#include <linux/reciprocal_div.h> 62#include <linux/reciprocal_div.h>
63#include <linux/unistd.h> 63#include <linux/unistd.h>
64#include <linux/pagemap.h>
64 65
65#include <asm/tlb.h> 66#include <asm/tlb.h>
66 67
@@ -95,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
95/* 96/*
96 * Some helpers for converting nanosecond timing to jiffy resolution 97 * Some helpers for converting nanosecond timing to jiffy resolution
97 */ 98 */
98#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 99#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ))
99#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 100#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
100 101
101#define NICE_0_LOAD SCHED_LOAD_SCALE 102#define NICE_0_LOAD SCHED_LOAD_SCALE
@@ -104,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
104/* 105/*
105 * These are the 'tuning knobs' of the scheduler: 106 * These are the 'tuning knobs' of the scheduler:
106 * 107 *
107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 108 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
109 * Timeslices get refilled after they expire. 109 * Timeslices get refilled after they expire.
110 */ 110 */
111#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
112#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
113 112
114#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
@@ -132,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
132} 131}
133#endif 132#endif
134 133
135#define SCALE_PRIO(x, prio) \
136 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
137
138/*
139 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
140 * to time slice values: [800ms ... 100ms ... 5ms]
141 */
142static unsigned int static_prio_timeslice(int static_prio)
143{
144 if (static_prio == NICE_TO_PRIO(19))
145 return 1;
146
147 if (static_prio < NICE_TO_PRIO(0))
148 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
149 else
150 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
151}
152
153static inline int rt_policy(int policy) 134static inline int rt_policy(int policy)
154{ 135{
155 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 136 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -170,31 +151,91 @@ struct rt_prio_array {
170 struct list_head queue[MAX_RT_PRIO]; 151 struct list_head queue[MAX_RT_PRIO];
171}; 152};
172 153
173struct load_stat { 154#ifdef CONFIG_FAIR_GROUP_SCHED
174 struct load_weight load; 155
175 u64 load_update_start, load_update_last; 156struct cfs_rq;
176 unsigned long delta_fair, delta_exec, delta_stat; 157
158/* task group related information */
159struct task_group {
160 /* schedulable entities of this group on each cpu */
161 struct sched_entity **se;
162 /* runqueue "owned" by this group on each cpu */
163 struct cfs_rq **cfs_rq;
164 unsigned long shares;
165 /* spinlock to serialize modification to shares */
166 spinlock_t lock;
167};
168
169/* Default task group's sched entity on each cpu */
170static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
171/* Default task group's cfs_rq on each cpu */
172static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
173
174static struct sched_entity *init_sched_entity_p[NR_CPUS];
175static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
176
177/* Default task group.
178 * Every task in system belong to this group at bootup.
179 */
180struct task_group init_task_group = {
181 .se = init_sched_entity_p,
182 .cfs_rq = init_cfs_rq_p,
177}; 183};
178 184
185#ifdef CONFIG_FAIR_USER_SCHED
186# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
187#else
188# define INIT_TASK_GRP_LOAD NICE_0_LOAD
189#endif
190
191static int init_task_group_load = INIT_TASK_GRP_LOAD;
192
193/* return group to which a task belongs */
194static inline struct task_group *task_group(struct task_struct *p)
195{
196 struct task_group *tg;
197
198#ifdef CONFIG_FAIR_USER_SCHED
199 tg = p->user->tg;
200#else
201 tg = &init_task_group;
202#endif
203
204 return tg;
205}
206
207/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
208static inline void set_task_cfs_rq(struct task_struct *p)
209{
210 p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
211 p->se.parent = task_group(p)->se[task_cpu(p)];
212}
213
214#else
215
216static inline void set_task_cfs_rq(struct task_struct *p) { }
217
218#endif /* CONFIG_FAIR_GROUP_SCHED */
219
179/* CFS-related fields in a runqueue */ 220/* CFS-related fields in a runqueue */
180struct cfs_rq { 221struct cfs_rq {
181 struct load_weight load; 222 struct load_weight load;
182 unsigned long nr_running; 223 unsigned long nr_running;
183 224
184 s64 fair_clock;
185 u64 exec_clock; 225 u64 exec_clock;
186 s64 wait_runtime; 226 u64 min_vruntime;
187 u64 sleeper_bonus;
188 unsigned long wait_runtime_overruns, wait_runtime_underruns;
189 227
190 struct rb_root tasks_timeline; 228 struct rb_root tasks_timeline;
191 struct rb_node *rb_leftmost; 229 struct rb_node *rb_leftmost;
192 struct rb_node *rb_load_balance_curr; 230 struct rb_node *rb_load_balance_curr;
193#ifdef CONFIG_FAIR_GROUP_SCHED
194 /* 'curr' points to currently running entity on this cfs_rq. 231 /* 'curr' points to currently running entity on this cfs_rq.
195 * It is set to NULL otherwise (i.e when none are currently running). 232 * It is set to NULL otherwise (i.e when none are currently running).
196 */ 233 */
197 struct sched_entity *curr; 234 struct sched_entity *curr;
235
236 unsigned long nr_spread_over;
237
238#ifdef CONFIG_FAIR_GROUP_SCHED
198 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 239 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
199 240
200 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 241 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
@@ -205,6 +246,8 @@ struct cfs_rq {
205 * list is used during load balance. 246 * list is used during load balance.
206 */ 247 */
207 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ 248 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
249 struct task_group *tg; /* group that "owns" this runqueue */
250 struct rcu_head rcu;
208#endif 251#endif
209}; 252};
210 253
@@ -236,7 +279,7 @@ struct rq {
236#ifdef CONFIG_NO_HZ 279#ifdef CONFIG_NO_HZ
237 unsigned char in_nohz_recently; 280 unsigned char in_nohz_recently;
238#endif 281#endif
239 struct load_stat ls; /* capture load from *all* tasks on this cpu */ 282 struct load_weight load; /* capture load from *all* tasks on this cpu */
240 unsigned long nr_load_updates; 283 unsigned long nr_load_updates;
241 u64 nr_switches; 284 u64 nr_switches;
242 285
@@ -288,16 +331,19 @@ struct rq {
288 unsigned long yld_exp_empty; 331 unsigned long yld_exp_empty;
289 unsigned long yld_act_empty; 332 unsigned long yld_act_empty;
290 unsigned long yld_both_empty; 333 unsigned long yld_both_empty;
291 unsigned long yld_cnt; 334 unsigned long yld_count;
292 335
293 /* schedule() stats */ 336 /* schedule() stats */
294 unsigned long sched_switch; 337 unsigned long sched_switch;
295 unsigned long sched_cnt; 338 unsigned long sched_count;
296 unsigned long sched_goidle; 339 unsigned long sched_goidle;
297 340
298 /* try_to_wake_up() stats */ 341 /* try_to_wake_up() stats */
299 unsigned long ttwu_cnt; 342 unsigned long ttwu_count;
300 unsigned long ttwu_local; 343 unsigned long ttwu_local;
344
345 /* BKL stats */
346 unsigned long bkl_count;
301#endif 347#endif
302 struct lock_class_key rq_lock_key; 348 struct lock_class_key rq_lock_key;
303}; 349};
@@ -382,6 +428,37 @@ static void update_rq_clock(struct rq *rq)
382#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 428#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
383 429
384/* 430/*
431 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
432 */
433#ifdef CONFIG_SCHED_DEBUG
434# define const_debug __read_mostly
435#else
436# define const_debug static const
437#endif
438
439/*
440 * Debugging: various feature bits
441 */
442enum {
443 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
444 SCHED_FEAT_START_DEBIT = 2,
445 SCHED_FEAT_TREE_AVG = 4,
446 SCHED_FEAT_APPROX_AVG = 8,
447 SCHED_FEAT_WAKEUP_PREEMPT = 16,
448 SCHED_FEAT_PREEMPT_RESTRICT = 32,
449};
450
451const_debug unsigned int sysctl_sched_features =
452 SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
453 SCHED_FEAT_START_DEBIT *1 |
454 SCHED_FEAT_TREE_AVG *0 |
455 SCHED_FEAT_APPROX_AVG *0 |
456 SCHED_FEAT_WAKEUP_PREEMPT *1 |
457 SCHED_FEAT_PREEMPT_RESTRICT *1;
458
459#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
460
461/*
385 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 462 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
386 * clock constructed from sched_clock(): 463 * clock constructed from sched_clock():
387 */ 464 */
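The sysctl_sched_features word above composes its default from per-feature *1 / *0 terms, so a feature is toggled at build time by flipping a single digit, and sched_feat(x) tests the corresponding bit; with CONFIG_SCHED_DEBUG the word is __read_mostly and stays tweakable, otherwise it becomes a constant the compiler can fold. A runnable userspace mirror of the idiom:

    #include <stdio.h>

    enum {
            FEAT_NEW_FAIR_SLEEPERS = 1,
            FEAT_START_DEBIT       = 2,
            FEAT_TREE_AVG          = 4,
    };

    static const unsigned int features =
            FEAT_NEW_FAIR_SLEEPERS *1 |     /* enabled  */
            FEAT_START_DEBIT       *1 |     /* enabled  */
            FEAT_TREE_AVG          *0;      /* disabled */

    #define feat(x) (features & FEAT_##x)

    int main(void)
    {
            printf("START_DEBIT=%d TREE_AVG=%d\n",
                   !!feat(START_DEBIT), !!feat(TREE_AVG));
            return 0;
    }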
@@ -399,18 +476,7 @@ unsigned long long cpu_clock(int cpu)
399 476
400 return now; 477 return now;
401} 478}
402 479EXPORT_SYMBOL_GPL(cpu_clock);
403#ifdef CONFIG_FAIR_GROUP_SCHED
404/* Change a task's ->cfs_rq if it moves across CPUs */
405static inline void set_task_cfs_rq(struct task_struct *p)
406{
407 p->se.cfs_rq = &task_rq(p)->cfs;
408}
409#else
410static inline void set_task_cfs_rq(struct task_struct *p)
411{
412}
413#endif
414 480
415#ifndef prepare_arch_switch 481#ifndef prepare_arch_switch
416# define prepare_arch_switch(next) do { } while (0) 482# define prepare_arch_switch(next) do { } while (0)
@@ -496,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
496static inline struct rq *__task_rq_lock(struct task_struct *p) 562static inline struct rq *__task_rq_lock(struct task_struct *p)
497 __acquires(rq->lock) 563 __acquires(rq->lock)
498{ 564{
499 struct rq *rq; 565 for (;;) {
500 566 struct rq *rq = task_rq(p);
501repeat_lock_task: 567 spin_lock(&rq->lock);
502 rq = task_rq(p); 568 if (likely(rq == task_rq(p)))
503 spin_lock(&rq->lock); 569 return rq;
504 if (unlikely(rq != task_rq(p))) {
505 spin_unlock(&rq->lock); 570 spin_unlock(&rq->lock);
506 goto repeat_lock_task;
507 } 571 }
508 return rq;
509} 572}
510 573
511/* 574/*
@@ -518,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
518{ 581{
519 struct rq *rq; 582 struct rq *rq;
520 583
521repeat_lock_task: 584 for (;;) {
522 local_irq_save(*flags); 585 local_irq_save(*flags);
523 rq = task_rq(p); 586 rq = task_rq(p);
524 spin_lock(&rq->lock); 587 spin_lock(&rq->lock);
525 if (unlikely(rq != task_rq(p))) { 588 if (likely(rq == task_rq(p)))
589 return rq;
526 spin_unlock_irqrestore(&rq->lock, *flags); 590 spin_unlock_irqrestore(&rq->lock, *flags);
527 goto repeat_lock_task;
528 } 591 }
529 return rq;
530} 592}
531 593
532static inline void __task_rq_unlock(struct rq *rq) 594static void __task_rq_unlock(struct rq *rq)
533 __releases(rq->lock) 595 __releases(rq->lock)
534{ 596{
535 spin_unlock(&rq->lock); 597 spin_unlock(&rq->lock);
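__task_rq_lock()/task_rq_lock() are restructured from goto-based retries into for(;;) loops, but the logic is unchanged: read the runqueue the task currently belongs to, take its lock, re-check task_rq(p) under the lock, and retry if the task migrated in between. A userspace sketch of the same lock-and-revalidate pattern (pthread stand-ins; the unlocked read of t->home mirrors the kernel code and would want an atomic in strict C11):

    #include <pthread.h>

    struct queue { pthread_mutex_t lock; };
    struct task  { struct queue *home; };   /* may be moved by other threads */

    static struct queue *task_queue_lock(struct task *t)
    {
            for (;;) {
                    struct queue *q = t->home;      /* snapshot without the lock */
                    pthread_mutex_lock(&q->lock);
                    if (q == t->home)               /* still the task's queue? */
                            return q;
                    pthread_mutex_unlock(&q->lock); /* moved meanwhile: retry */
            }
    }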
@@ -544,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
544/* 606/*
545 * this_rq_lock - lock this runqueue and disable interrupts. 607 * this_rq_lock - lock this runqueue and disable interrupts.
546 */ 608 */
547static inline struct rq *this_rq_lock(void) 609static struct rq *this_rq_lock(void)
548 __acquires(rq->lock) 610 __acquires(rq->lock)
549{ 611{
550 struct rq *rq; 612 struct rq *rq;
@@ -644,19 +706,6 @@ static inline void resched_task(struct task_struct *p)
644} 706}
645#endif 707#endif
646 708
647static u64 div64_likely32(u64 divident, unsigned long divisor)
648{
649#if BITS_PER_LONG == 32
650 if (likely(divident <= 0xffffffffULL))
651 return (u32)divident / divisor;
652 do_div(divident, divisor);
653
654 return divident;
655#else
656 return divident / divisor;
657#endif
658}
659
660#if BITS_PER_LONG == 32 709#if BITS_PER_LONG == 32
661# define WMULT_CONST (~0UL) 710# define WMULT_CONST (~0UL)
662#else 711#else
@@ -698,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
698 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); 747 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
699} 748}
700 749
701static void update_load_add(struct load_weight *lw, unsigned long inc) 750static inline void update_load_add(struct load_weight *lw, unsigned long inc)
702{ 751{
703 lw->weight += inc; 752 lw->weight += inc;
704 lw->inv_weight = 0;
705} 753}
706 754
707static void update_load_sub(struct load_weight *lw, unsigned long dec) 755static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
708{ 756{
709 lw->weight -= dec; 757 lw->weight -= dec;
710 lw->inv_weight = 0;
711} 758}
712 759
713/* 760/*
@@ -783,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
783 int *this_best_prio, struct rq_iterator *iterator); 830 int *this_best_prio, struct rq_iterator *iterator);
784 831
785#include "sched_stats.h" 832#include "sched_stats.h"
786#include "sched_rt.c"
787#include "sched_fair.c"
788#include "sched_idletask.c" 833#include "sched_idletask.c"
834#include "sched_fair.c"
835#include "sched_rt.c"
789#ifdef CONFIG_SCHED_DEBUG 836#ifdef CONFIG_SCHED_DEBUG
790# include "sched_debug.c" 837# include "sched_debug.c"
791#endif 838#endif
792 839
793#define sched_class_highest (&rt_sched_class) 840#define sched_class_highest (&rt_sched_class)
794 841
795static void __update_curr_load(struct rq *rq, struct load_stat *ls)
796{
797 if (rq->curr != rq->idle && ls->load.weight) {
798 ls->delta_exec += ls->delta_stat;
799 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
800 ls->delta_stat = 0;
801 }
802}
803
804/* 842/*
805 * Update delta_exec, delta_fair fields for rq. 843 * Update delta_exec, delta_fair fields for rq.
806 * 844 *
807 * delta_fair clock advances at a rate inversely proportional to 845 * delta_fair clock advances at a rate inversely proportional to
808 * total load (rq->ls.load.weight) on the runqueue, while 846 * total load (rq->load.weight) on the runqueue, while
809 * delta_exec advances at the same rate as wall-clock (provided 847 * delta_exec advances at the same rate as wall-clock (provided
810 * cpu is not idle). 848 * cpu is not idle).
811 * 849 *
@@ -813,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls)
813 * runqueue over any given interval. This (smoothened) load is used 851 * runqueue over any given interval. This (smoothened) load is used
814 * during load balance. 852 * during load balance.
815 * 853 *
816 * This function is called /before/ updating rq->ls.load 854 * This function is called /before/ updating rq->load
817 * and when switching tasks. 855 * and when switching tasks.
818 */ 856 */
819static void update_curr_load(struct rq *rq)
820{
821 struct load_stat *ls = &rq->ls;
822 u64 start;
823
824 start = ls->load_update_start;
825 ls->load_update_start = rq->clock;
826 ls->delta_stat += rq->clock - start;
827 /*
828 * Stagger updates to ls->delta_fair. Very frequent updates
829 * can be expensive.
830 */
831 if (ls->delta_stat >= sysctl_sched_stat_granularity)
832 __update_curr_load(rq, ls);
833}
834
835static inline void inc_load(struct rq *rq, const struct task_struct *p) 857static inline void inc_load(struct rq *rq, const struct task_struct *p)
836{ 858{
837 update_curr_load(rq); 859 update_load_add(&rq->load, p->se.load.weight);
838 update_load_add(&rq->ls.load, p->se.load.weight);
839} 860}
840 861
841static inline void dec_load(struct rq *rq, const struct task_struct *p) 862static inline void dec_load(struct rq *rq, const struct task_struct *p)
842{ 863{
843 update_curr_load(rq); 864 update_load_sub(&rq->load, p->se.load.weight);
844 update_load_sub(&rq->ls.load, p->se.load.weight);
845} 865}
846 866
847static void inc_nr_running(struct task_struct *p, struct rq *rq) 867static void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -858,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
858 878
859static void set_load_weight(struct task_struct *p) 879static void set_load_weight(struct task_struct *p)
860{ 880{
861 p->se.wait_runtime = 0;
862
863 if (task_has_rt_policy(p)) { 881 if (task_has_rt_policy(p)) {
864 p->se.load.weight = prio_to_weight[0] * 2; 882 p->se.load.weight = prio_to_weight[0] * 2;
865 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 883 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
@@ -951,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
951} 969}
952 970
953/* 971/*
954 * activate_idle_task - move idle task to the _front_ of runqueue.
955 */
956static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
957{
958 update_rq_clock(rq);
959
960 if (p->state == TASK_UNINTERRUPTIBLE)
961 rq->nr_uninterruptible--;
962
963 enqueue_task(rq, p, 0);
964 inc_nr_running(p, rq);
965}
966
967/*
968 * deactivate_task - remove a task from the runqueue. 972 * deactivate_task - remove a task from the runqueue.
969 */ 973 */
970static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 974static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
@@ -988,32 +992,50 @@ inline int task_curr(const struct task_struct *p)
988/* Used instead of source_load when we know the type == 0 */ 992/* Used instead of source_load when we know the type == 0 */
989unsigned long weighted_cpuload(const int cpu) 993unsigned long weighted_cpuload(const int cpu)
990{ 994{
991 return cpu_rq(cpu)->ls.load.weight; 995 return cpu_rq(cpu)->load.weight;
992} 996}
993 997
994static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 998static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
995{ 999{
996#ifdef CONFIG_SMP 1000#ifdef CONFIG_SMP
997 task_thread_info(p)->cpu = cpu; 1001 task_thread_info(p)->cpu = cpu;
998 set_task_cfs_rq(p);
999#endif 1002#endif
1003 set_task_cfs_rq(p);
1000} 1004}
1001 1005
1002#ifdef CONFIG_SMP 1006#ifdef CONFIG_SMP
1003 1007
1008/*
1009 * Is this task likely cache-hot:
1010 */
1011static inline int
1012task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1013{
1014 s64 delta;
1015
1016 if (p->sched_class != &fair_sched_class)
1017 return 0;
1018
1019 if (sysctl_sched_migration_cost == -1)
1020 return 1;
1021 if (sysctl_sched_migration_cost == 0)
1022 return 0;
1023
1024 delta = now - p->se.exec_start;
1025
1026 return delta < (s64)sysctl_sched_migration_cost;
1027}
1028
1029
1004void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1030void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1005{ 1031{
1006 int old_cpu = task_cpu(p); 1032 int old_cpu = task_cpu(p);
1007 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 1033 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1008 u64 clock_offset, fair_clock_offset; 1034 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1035 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1036 u64 clock_offset;
1009 1037
1010 clock_offset = old_rq->clock - new_rq->clock; 1038 clock_offset = old_rq->clock - new_rq->clock;
1011 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
1012
1013 if (p->se.wait_start_fair)
1014 p->se.wait_start_fair -= fair_clock_offset;
1015 if (p->se.sleep_start_fair)
1016 p->se.sleep_start_fair -= fair_clock_offset;
1017 1039
1018#ifdef CONFIG_SCHEDSTATS 1040#ifdef CONFIG_SCHEDSTATS
1019 if (p->se.wait_start) 1041 if (p->se.wait_start)
@@ -1022,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1022 p->se.sleep_start -= clock_offset; 1044 p->se.sleep_start -= clock_offset;
1023 if (p->se.block_start) 1045 if (p->se.block_start)
1024 p->se.block_start -= clock_offset; 1046 p->se.block_start -= clock_offset;
1047 if (old_cpu != new_cpu) {
1048 schedstat_inc(p, se.nr_migrations);
1049 if (task_hot(p, old_rq->clock, NULL))
1050 schedstat_inc(p, se.nr_forced2_migrations);
1051 }
1025#endif 1052#endif
1053 p->se.vruntime -= old_cfsrq->min_vruntime -
1054 new_cfsrq->min_vruntime;
1026 1055
1027 __set_task_cpu(p, new_cpu); 1056 __set_task_cpu(p, new_cpu);
1028} 1057}
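task_hot() above classifies a task as cache-hot when it last ran within sysctl_sched_migration_cost nanoseconds, with -1 meaning always hot and 0 meaning never; set_task_cpu() uses it to count forced migrations of cache-hot tasks, and try_to_wake_up() (further down) uses it to pull only cache-cold tasks on sync wakeups. A trivial userspace mirror of the decision, with an illustrative 0.5 ms cutoff:

    #include <stdio.h>

    static long long migration_cost_ns = 500000;    /* 0.5 ms, illustrative */

    static int is_cache_hot(long long now_ns, long long last_ran_ns)
    {
            if (migration_cost_ns == -1)
                    return 1;                       /* always treat as hot */
            if (migration_cost_ns == 0)
                    return 0;                       /* never treat as hot */
            return (now_ns - last_ran_ns) < migration_cost_ns;
    }

    int main(void)
    {
            printf("%d %d\n", is_cache_hot(1000000, 900000),        /* ran 0.1 ms ago */
                              is_cache_hot(1000000, 0));            /* ran 1 ms ago   */
            return 0;
    }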
@@ -1077,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p)
1077 int running, on_rq; 1106 int running, on_rq;
1078 struct rq *rq; 1107 struct rq *rq;
1079 1108
1080repeat: 1109 for (;;) {
1081 /* 1110 /*
1082 * We do the initial early heuristics without holding 1111 * We do the initial early heuristics without holding
1083 * any task-queue locks at all. We'll only try to get 1112 * any task-queue locks at all. We'll only try to get
1084 * the runqueue lock when things look like they will 1113 * the runqueue lock when things look like they will
1085 * work out! 1114 * work out!
1086 */ 1115 */
1087 rq = task_rq(p); 1116 rq = task_rq(p);
1088 1117
1089 /* 1118 /*
1090 * If the task is actively running on another CPU 1119 * If the task is actively running on another CPU
1091 * still, just relax and busy-wait without holding 1120 * still, just relax and busy-wait without holding
1092 * any locks. 1121 * any locks.
1093 * 1122 *
1094 * NOTE! Since we don't hold any locks, it's not 1123 * NOTE! Since we don't hold any locks, it's not
1095 * even sure that "rq" stays as the right runqueue! 1124 * even sure that "rq" stays as the right runqueue!
1096 * But we don't care, since "task_running()" will 1125 * But we don't care, since "task_running()" will
1097 * return false if the runqueue has changed and p 1126 * return false if the runqueue has changed and p
1098 * is actually now running somewhere else! 1127 * is actually now running somewhere else!
1099 */ 1128 */
1100 while (task_running(rq, p)) 1129 while (task_running(rq, p))
1101 cpu_relax(); 1130 cpu_relax();
1102 1131
1103 /* 1132 /*
1104 * Ok, time to look more closely! We need the rq 1133 * Ok, time to look more closely! We need the rq
1105 * lock now, to be *sure*. If we're wrong, we'll 1134 * lock now, to be *sure*. If we're wrong, we'll
1106 * just go back and repeat. 1135 * just go back and repeat.
1107 */ 1136 */
1108 rq = task_rq_lock(p, &flags); 1137 rq = task_rq_lock(p, &flags);
1109 running = task_running(rq, p); 1138 running = task_running(rq, p);
1110 on_rq = p->se.on_rq; 1139 on_rq = p->se.on_rq;
1111 task_rq_unlock(rq, &flags); 1140 task_rq_unlock(rq, &flags);
1112 1141
1113 /* 1142 /*
1114 * Was it really running after all now that we 1143 * Was it really running after all now that we
1115 * checked with the proper locks actually held? 1144 * checked with the proper locks actually held?
1116 * 1145 *
1117 * Oops. Go back and try again.. 1146 * Oops. Go back and try again..
1118 */ 1147 */
1119 if (unlikely(running)) { 1148 if (unlikely(running)) {
1120 cpu_relax(); 1149 cpu_relax();
1121 goto repeat; 1150 continue;
1122 } 1151 }
1123 1152
1124 /* 1153 /*
1125 * It's not enough that it's not actively running, 1154 * It's not enough that it's not actively running,
1126 * it must be off the runqueue _entirely_, and not 1155 * it must be off the runqueue _entirely_, and not
1127 * preempted! 1156 * preempted!
1128 * 1157 *
1129 * So if it wa still runnable (but just not actively 1158 * So if it wa still runnable (but just not actively
1130 * running right now), it's preempted, and we should 1159 * running right now), it's preempted, and we should
1131 * yield - it could be a while. 1160 * yield - it could be a while.
1132 */ 1161 */
1133 if (unlikely(on_rq)) { 1162 if (unlikely(on_rq)) {
1134 yield(); 1163 schedule_timeout_uninterruptible(1);
1135 goto repeat; 1164 continue;
1136 } 1165 }
1137 1166
1138 /* 1167 /*
1139 * Ahh, all good. It wasn't running, and it wasn't 1168 * Ahh, all good. It wasn't running, and it wasn't
1140 * runnable, which means that it will never become 1169 * runnable, which means that it will never become
1141 * running in the future either. We're all done! 1170 * running in the future either. We're all done!
1142 */ 1171 */
1172 break;
1173 }
1143} 1174}
1144 1175
1145/*** 1176/***
@@ -1173,7 +1204,7 @@ void kick_process(struct task_struct *p)
1173 * We want to under-estimate the load of migration sources, to 1204 * We want to under-estimate the load of migration sources, to
1174 * balance conservatively. 1205 * balance conservatively.
1175 */ 1206 */
1176static inline unsigned long source_load(int cpu, int type) 1207static unsigned long source_load(int cpu, int type)
1177{ 1208{
1178 struct rq *rq = cpu_rq(cpu); 1209 struct rq *rq = cpu_rq(cpu);
1179 unsigned long total = weighted_cpuload(cpu); 1210 unsigned long total = weighted_cpuload(cpu);
@@ -1188,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type)
1188 * Return a high guess at the load of a migration-target cpu weighted 1219 * Return a high guess at the load of a migration-target cpu weighted
1189 * according to the scheduling class and "nice" value. 1220 * according to the scheduling class and "nice" value.
1190 */ 1221 */
1191static inline unsigned long target_load(int cpu, int type) 1222static unsigned long target_load(int cpu, int type)
1192{ 1223{
1193 struct rq *rq = cpu_rq(cpu); 1224 struct rq *rq = cpu_rq(cpu);
1194 unsigned long total = weighted_cpuload(cpu); 1225 unsigned long total = weighted_cpuload(cpu);
@@ -1230,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1230 1261
1231 /* Skip over this group if it has no CPUs allowed */ 1262 /* Skip over this group if it has no CPUs allowed */
1232 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1263 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1233 goto nextgroup; 1264 continue;
1234 1265
1235 local_group = cpu_isset(this_cpu, group->cpumask); 1266 local_group = cpu_isset(this_cpu, group->cpumask);
1236 1267
@@ -1258,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1258 min_load = avg_load; 1289 min_load = avg_load;
1259 idlest = group; 1290 idlest = group;
1260 } 1291 }
1261nextgroup: 1292 } while (group = group->next, group != sd->groups);
1262 group = group->next;
1263 } while (group != sd->groups);
1264 1293
1265 if (!idlest || 100*this_load < imbalance*min_load) 1294 if (!idlest || 100*this_load < imbalance*min_load)
1266 return NULL; 1295 return NULL;
@@ -1392,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p)
1392 if (sd->flags & SD_WAKE_IDLE) { 1421 if (sd->flags & SD_WAKE_IDLE) {
1393 cpus_and(tmp, sd->span, p->cpus_allowed); 1422 cpus_and(tmp, sd->span, p->cpus_allowed);
1394 for_each_cpu_mask(i, tmp) { 1423 for_each_cpu_mask(i, tmp) {
1395 if (idle_cpu(i)) 1424 if (idle_cpu(i)) {
1425 if (i != task_cpu(p)) {
1426 schedstat_inc(p,
1427 se.nr_wakeups_idle);
1428 }
1396 return i; 1429 return i;
1430 }
1397 } 1431 }
1398 } else { 1432 } else {
1399 break; 1433 break;
@@ -1424,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1424 */ 1458 */
1425static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1459static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1426{ 1460{
1427 int cpu, this_cpu, success = 0; 1461 int cpu, orig_cpu, this_cpu, success = 0;
1428 unsigned long flags; 1462 unsigned long flags;
1429 long old_state; 1463 long old_state;
1430 struct rq *rq; 1464 struct rq *rq;
@@ -1443,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1443 goto out_running; 1477 goto out_running;
1444 1478
1445 cpu = task_cpu(p); 1479 cpu = task_cpu(p);
1480 orig_cpu = cpu;
1446 this_cpu = smp_processor_id(); 1481 this_cpu = smp_processor_id();
1447 1482
1448#ifdef CONFIG_SMP 1483#ifdef CONFIG_SMP
@@ -1451,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1451 1486
1452 new_cpu = cpu; 1487 new_cpu = cpu;
1453 1488
1454 schedstat_inc(rq, ttwu_cnt); 1489 schedstat_inc(rq, ttwu_count);
1455 if (cpu == this_cpu) { 1490 if (cpu == this_cpu) {
1456 schedstat_inc(rq, ttwu_local); 1491 schedstat_inc(rq, ttwu_local);
1457 goto out_set_cpu; 1492 goto out_set_cpu;
@@ -1486,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1486 unsigned long tl = this_load; 1521 unsigned long tl = this_load;
1487 unsigned long tl_per_task; 1522 unsigned long tl_per_task;
1488 1523
1524 /*
1525 * Attract cache-cold tasks on sync wakeups:
1526 */
1527 if (sync && !task_hot(p, rq->clock, this_sd))
1528 goto out_set_cpu;
1529
1530 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1489 tl_per_task = cpu_avg_load_per_task(this_cpu); 1531 tl_per_task = cpu_avg_load_per_task(this_cpu);
1490 1532
1491 /* 1533 /*
@@ -1505,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1505 * there is no bad imbalance. 1547 * there is no bad imbalance.
1506 */ 1548 */
1507 schedstat_inc(this_sd, ttwu_move_affine); 1549 schedstat_inc(this_sd, ttwu_move_affine);
1550 schedstat_inc(p, se.nr_wakeups_affine);
1508 goto out_set_cpu; 1551 goto out_set_cpu;
1509 } 1552 }
1510 } 1553 }
@@ -1516,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1516 if (this_sd->flags & SD_WAKE_BALANCE) { 1559 if (this_sd->flags & SD_WAKE_BALANCE) {
1517 if (imbalance*this_load <= 100*load) { 1560 if (imbalance*this_load <= 100*load) {
1518 schedstat_inc(this_sd, ttwu_move_balance); 1561 schedstat_inc(this_sd, ttwu_move_balance);
1562 schedstat_inc(p, se.nr_wakeups_passive);
1519 goto out_set_cpu; 1563 goto out_set_cpu;
1520 } 1564 }
1521 } 1565 }
@@ -1541,18 +1585,18 @@ out_set_cpu:
1541 1585
1542out_activate: 1586out_activate:
1543#endif /* CONFIG_SMP */ 1587#endif /* CONFIG_SMP */
1588 schedstat_inc(p, se.nr_wakeups);
1589 if (sync)
1590 schedstat_inc(p, se.nr_wakeups_sync);
1591 if (orig_cpu != cpu)
1592 schedstat_inc(p, se.nr_wakeups_migrate);
1593 if (cpu == this_cpu)
1594 schedstat_inc(p, se.nr_wakeups_local);
1595 else
1596 schedstat_inc(p, se.nr_wakeups_remote);
1544 update_rq_clock(rq); 1597 update_rq_clock(rq);
1545 activate_task(rq, p, 1); 1598 activate_task(rq, p, 1);
1546 /* 1599 check_preempt_curr(rq, p);
1547 * Sync wakeups (i.e. those types of wakeups where the waker
1548 * has indicated that it will leave the CPU in short order)
1549 * don't trigger a preemption, if the woken up task will run on
1550 * this cpu. (in this case the 'I will reschedule' promise of
1551 * the waker guarantees that the freshly woken up task is going
1552 * to be considered on this CPU.)
1553 */
1554 if (!sync || cpu != this_cpu)
1555 check_preempt_curr(rq, p);
1556 success = 1; 1600 success = 1;
1557 1601
1558out_running: 1602out_running:
@@ -1583,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1583 */ 1627 */
1584static void __sched_fork(struct task_struct *p) 1628static void __sched_fork(struct task_struct *p)
1585{ 1629{
1586 p->se.wait_start_fair = 0;
1587 p->se.exec_start = 0; 1630 p->se.exec_start = 0;
1588 p->se.sum_exec_runtime = 0; 1631 p->se.sum_exec_runtime = 0;
1589 p->se.prev_sum_exec_runtime = 0; 1632 p->se.prev_sum_exec_runtime = 0;
1590 p->se.delta_exec = 0;
1591 p->se.delta_fair_run = 0;
1592 p->se.delta_fair_sleep = 0;
1593 p->se.wait_runtime = 0;
1594 p->se.sleep_start_fair = 0;
1595 1633
1596#ifdef CONFIG_SCHEDSTATS 1634#ifdef CONFIG_SCHEDSTATS
1597 p->se.wait_start = 0; 1635 p->se.wait_start = 0;
1598 p->se.sum_wait_runtime = 0;
1599 p->se.sum_sleep_runtime = 0; 1636 p->se.sum_sleep_runtime = 0;
1600 p->se.sleep_start = 0; 1637 p->se.sleep_start = 0;
1601 p->se.block_start = 0; 1638 p->se.block_start = 0;
1602 p->se.sleep_max = 0; 1639 p->se.sleep_max = 0;
1603 p->se.block_max = 0; 1640 p->se.block_max = 0;
1604 p->se.exec_max = 0; 1641 p->se.exec_max = 0;
1642 p->se.slice_max = 0;
1605 p->se.wait_max = 0; 1643 p->se.wait_max = 0;
1606 p->se.wait_runtime_overruns = 0;
1607 p->se.wait_runtime_underruns = 0;
1608#endif 1644#endif
1609 1645
1610 INIT_LIST_HEAD(&p->run_list); 1646 INIT_LIST_HEAD(&p->run_list);
@@ -1635,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags)
1635#ifdef CONFIG_SMP 1671#ifdef CONFIG_SMP
1636 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1672 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1637#endif 1673#endif
1638 __set_task_cpu(p, cpu); 1674 set_task_cpu(p, cpu);
1639 1675
1640 /* 1676 /*
1641 * Make sure we do not leak PI boosting priority to the child: 1677 * Make sure we do not leak PI boosting priority to the child:
1642 */ 1678 */
1643 p->prio = current->normal_prio; 1679 p->prio = current->normal_prio;
1680 if (!rt_prio(p->prio))
1681 p->sched_class = &fair_sched_class;
1644 1682
1645#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1683#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1646 if (likely(sched_info_on())) 1684 if (likely(sched_info_on()))
@@ -1657,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
1657} 1695}
1658 1696
1659/* 1697/*
1660 * After fork, child runs first. (default) If set to 0 then
1661 * parent will (try to) run first.
1662 */
1663unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1664
1665/*
1666 * wake_up_new_task - wake up a newly created task for the first time. 1698 * wake_up_new_task - wake up a newly created task for the first time.
1667 * 1699 *
1668 * This function will do some initial scheduler statistics housekeeping 1700 * This function will do some initial scheduler statistics housekeeping
@@ -1673,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1673{ 1705{
1674 unsigned long flags; 1706 unsigned long flags;
1675 struct rq *rq; 1707 struct rq *rq;
1676 int this_cpu;
1677 1708
1678 rq = task_rq_lock(p, &flags); 1709 rq = task_rq_lock(p, &flags);
1679 BUG_ON(p->state != TASK_RUNNING); 1710 BUG_ON(p->state != TASK_RUNNING);
1680 this_cpu = smp_processor_id(); /* parent's CPU */
1681 update_rq_clock(rq); 1711 update_rq_clock(rq);
1682 1712
1683 p->prio = effective_prio(p); 1713 p->prio = effective_prio(p);
1684 1714
1685 if (rt_prio(p->prio)) 1715 if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) {
1686 p->sched_class = &rt_sched_class;
1687 else
1688 p->sched_class = &fair_sched_class;
1689
1690 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1691 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1692 !current->se.on_rq) {
1693
1694 activate_task(rq, p, 0); 1716 activate_task(rq, p, 0);
1695 } else { 1717 } else {
1696 /* 1718 /*
@@ -1799,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1799 * with the lock held can cause deadlocks; see schedule() for 1821 * with the lock held can cause deadlocks; see schedule() for
1800 * details.) 1822 * details.)
1801 */ 1823 */
1802static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1824static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1803 __releases(rq->lock) 1825 __releases(rq->lock)
1804{ 1826{
1805 struct mm_struct *mm = rq->prev_mm; 1827 struct mm_struct *mm = rq->prev_mm;
@@ -1981,42 +2003,10 @@ unsigned long nr_active(void)
1981 */ 2003 */
1982static void update_cpu_load(struct rq *this_rq) 2004static void update_cpu_load(struct rq *this_rq)
1983{ 2005{
1984 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; 2006 unsigned long this_load = this_rq->load.weight;
1985 unsigned long total_load = this_rq->ls.load.weight;
1986 unsigned long this_load = total_load;
1987 struct load_stat *ls = &this_rq->ls;
1988 int i, scale; 2007 int i, scale;
1989 2008
1990 this_rq->nr_load_updates++; 2009 this_rq->nr_load_updates++;
1991 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1992 goto do_avg;
1993
1994 /* Update delta_fair/delta_exec fields first */
1995 update_curr_load(this_rq);
1996
1997 fair_delta64 = ls->delta_fair + 1;
1998 ls->delta_fair = 0;
1999
2000 exec_delta64 = ls->delta_exec + 1;
2001 ls->delta_exec = 0;
2002
2003 sample_interval64 = this_rq->clock - ls->load_update_last;
2004 ls->load_update_last = this_rq->clock;
2005
2006 if ((s64)sample_interval64 < (s64)TICK_NSEC)
2007 sample_interval64 = TICK_NSEC;
2008
2009 if (exec_delta64 > sample_interval64)
2010 exec_delta64 = sample_interval64;
2011
2012 idle_delta64 = sample_interval64 - exec_delta64;
2013
2014 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
2015 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
2016
2017 this_load = (unsigned long)tmp64;
2018
2019do_avg:
2020 2010
2021 /* Update our load: */ 2011 /* Update our load: */
2022 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2012 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2026,7 +2016,13 @@ do_avg:
2026 2016
2027 old_load = this_rq->cpu_load[i]; 2017 old_load = this_rq->cpu_load[i];
2028 new_load = this_load; 2018 new_load = this_load;
2029 2019 /*
2020 * Round up the averaging division if load is increasing. This
2021 * prevents us from getting stuck on 9 if the load is 10, for
2022 * example.
2023 */
2024 if (new_load > old_load)
2025 new_load += scale-1;
2030 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2026 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2031 } 2027 }
2032} 2028}
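
The rewritten update_cpu_load() above keeps one exponentially decayed average per index, cpu_load[i] = (old*(2^i - 1) + new) >> i, and the new comment explains the round-up: without it, integer truncation leaves the average stuck one unit below a higher steady load. A small standalone program (illustration only, not kernel code) makes that concrete:

#include <stdio.h>

static unsigned long decay(unsigned long old, unsigned long new,
                           int i, int round_up)
{
        unsigned long scale = 1UL << i;

        if (round_up && new > old)
                new += scale - 1;
        return (old * (scale - 1) + new) >> i;
}

int main(void)
{
        unsigned long plain = 9, rounded = 9;
        int t;

        for (t = 0; t < 10; t++) {
                plain = decay(plain, 10, 1, 0);     /* old behaviour */
                rounded = decay(rounded, 10, 1, 1); /* with scale-1 added */
        }
        printf("plain=%lu rounded=%lu\n", plain, rounded); /* 9 vs 10 */
        return 0;
}
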
@@ -2178,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2178 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2174 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2179 * 3) are cache-hot on their current CPU. 2175 * 3) are cache-hot on their current CPU.
2180 */ 2176 */
2181 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2177 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2178 schedstat_inc(p, se.nr_failed_migrations_affine);
2182 return 0; 2179 return 0;
2180 }
2183 *all_pinned = 0; 2181 *all_pinned = 0;
2184 2182
2185 if (task_running(rq, p)) 2183 if (task_running(rq, p)) {
2184 schedstat_inc(p, se.nr_failed_migrations_running);
2186 return 0; 2185 return 0;
2186 }
2187
2188 /*
2189 * Aggressive migration if:
2190 * 1) task is cache cold, or
2191 * 2) too many balance attempts have failed.
2192 */
2193
2194 if (!task_hot(p, rq->clock, sd) ||
2195 sd->nr_balance_failed > sd->cache_nice_tries) {
2196#ifdef CONFIG_SCHEDSTATS
2197 if (task_hot(p, rq->clock, sd)) {
2198 schedstat_inc(sd, lb_hot_gained[idle]);
2199 schedstat_inc(p, se.nr_forced_migrations);
2200 }
2201#endif
2202 return 1;
2203 }
2187 2204
2205 if (task_hot(p, rq->clock, sd)) {
2206 schedstat_inc(p, se.nr_failed_migrations_hot);
2207 return 0;
2208 }
2188 return 1; 2209 return 1;
2189} 2210}
2190 2211
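
The reworked can_migrate_task() above now records why a task was not migrated (affinity, still running, cache-hot) and keeps the aggressive-migration escape hatch for repeated balance failures. The decision order, condensed into a standalone sketch with hypothetical names, not kernel code:

enum migrate_verdict { MIG_OK, MIG_AFFINE, MIG_RUNNING, MIG_HOT };

/* Same check order as the hunk above; illustration only. */
static enum migrate_verdict
classify_migration(int allowed_on_dst, int running, int cache_hot,
                   unsigned int balance_failures, unsigned int nice_tries)
{
        if (!allowed_on_dst)
                return MIG_AFFINE;
        if (running)
                return MIG_RUNNING;
        if (!cache_hot || balance_failures > nice_tries)
                return MIG_OK;          /* cold, or balancing kept failing */
        return MIG_HOT;
}
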
@@ -2263,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2263 struct sched_domain *sd, enum cpu_idle_type idle, 2284 struct sched_domain *sd, enum cpu_idle_type idle,
2264 int *all_pinned) 2285 int *all_pinned)
2265{ 2286{
2266 struct sched_class *class = sched_class_highest; 2287 const struct sched_class *class = sched_class_highest;
2267 unsigned long total_load_moved = 0; 2288 unsigned long total_load_moved = 0;
2268 int this_best_prio = this_rq->curr->prio; 2289 int this_best_prio = this_rq->curr->prio;
2269 2290
@@ -2288,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2288static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 2309static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2289 struct sched_domain *sd, enum cpu_idle_type idle) 2310 struct sched_domain *sd, enum cpu_idle_type idle)
2290{ 2311{
2291 struct sched_class *class; 2312 const struct sched_class *class;
2292 int this_best_prio = MAX_PRIO; 2313 int this_best_prio = MAX_PRIO;
2293 2314
2294 for (class = sched_class_highest; class; class = class->next) 2315 for (class = sched_class_highest; class; class = class->next)
@@ -2652,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2652 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2673 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2653 sd_idle = 1; 2674 sd_idle = 1;
2654 2675
2655 schedstat_inc(sd, lb_cnt[idle]); 2676 schedstat_inc(sd, lb_count[idle]);
2656 2677
2657redo: 2678redo:
2658 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2679 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
@@ -2805,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2805 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2826 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2806 sd_idle = 1; 2827 sd_idle = 1;
2807 2828
2808 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); 2829 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
2809redo: 2830redo:
2810 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 2831 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2811 &sd_idle, &cpus, NULL); 2832 &sd_idle, &cpus, NULL);
@@ -2939,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2939 } 2960 }
2940 2961
2941 if (likely(sd)) { 2962 if (likely(sd)) {
2942 schedstat_inc(sd, alb_cnt); 2963 schedstat_inc(sd, alb_count);
2943 2964
2944 if (move_one_task(target_rq, target_cpu, busiest_rq, 2965 if (move_one_task(target_rq, target_cpu, busiest_rq,
2945 sd, CPU_IDLE)) 2966 sd, CPU_IDLE))
@@ -3032,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing);
3032 * 3053 *
3033 * Balancing parameters are set up in arch_init_sched_domains. 3054 * Balancing parameters are set up in arch_init_sched_domains.
3034 */ 3055 */
3035static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) 3056static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3036{ 3057{
3037 int balance = 1; 3058 int balance = 1;
3038 struct rq *rq = cpu_rq(cpu); 3059 struct rq *rq = cpu_rq(cpu);
@@ -3279,6 +3300,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3279} 3300}
3280 3301
3281/* 3302/*
3303 * Account guest cpu time to a process.
3304 * @p: the process that the cpu time gets accounted to
3305 * @cputime: the cpu time spent in virtual machine since the last update
3306 */
3307void account_guest_time(struct task_struct *p, cputime_t cputime)
3308{
3309 cputime64_t tmp;
3310 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3311
3312 tmp = cputime_to_cputime64(cputime);
3313
3314 p->utime = cputime_add(p->utime, cputime);
3315 p->gtime = cputime_add(p->gtime, cputime);
3316
3317 cpustat->user = cputime64_add(cpustat->user, tmp);
3318 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3319}
3320
3321/*
3282 * Account system cpu time to a process. 3322 * Account system cpu time to a process.
3283 * @p: the process that the cpu time gets accounted to 3323 * @p: the process that the cpu time gets accounted to
3284 * @hardirq_offset: the offset to subtract from hardirq_count() 3324 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3291,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3291 struct rq *rq = this_rq(); 3331 struct rq *rq = this_rq();
3292 cputime64_t tmp; 3332 cputime64_t tmp;
3293 3333
3334 if (p->flags & PF_VCPU) {
3335 account_guest_time(p, cputime);
3336 p->flags &= ~PF_VCPU;
3337 return;
3338 }
3339
3294 p->stime = cputime_add(p->stime, cputime); 3340 p->stime = cputime_add(p->stime, cputime);
3295 3341
3296 /* Add system time to cpustat. */ 3342 /* Add system time to cpustat. */
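
With the PF_VCPU check above, ticks spent running guest code are routed to the new account_guest_time(), which charges the time both to the task (utime and gtime) and to the per-cpu user and guest buckets, so existing "user time" consumers keep working while guest time remains separately visible. A simplified standalone sketch of that split; field and function names are made up, and the PF_VCPU handling is reduced to a boolean:

struct cpu_buckets { unsigned long long user, guest, system; };
struct task_times  { unsigned long long utime, stime, gtime; };

/* Illustration only, not kernel code. */
static void account_tick(struct task_times *t, struct cpu_buckets *cs,
                         unsigned long long delta, int in_guest, int in_kernel)
{
        if (in_guest) {                 /* analogue of PF_VCPU being set */
                t->utime += delta;
                t->gtime += delta;
                cs->user += delta;
                cs->guest += delta;
        } else if (in_kernel) {
                t->stime += delta;
                cs->system += delta;
        } else {
                t->utime += delta;
                cs->user += delta;
        }
}
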
@@ -3429,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev)
3429 3475
3430 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3476 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3431 3477
3432 schedstat_inc(this_rq(), sched_cnt); 3478 schedstat_inc(this_rq(), sched_count);
3479#ifdef CONFIG_SCHEDSTATS
3480 if (unlikely(prev->lock_depth >= 0)) {
3481 schedstat_inc(this_rq(), bkl_count);
3482 schedstat_inc(prev, sched_info.bkl_count);
3483 }
3484#endif
3433} 3485}
3434 3486
3435/* 3487/*
@@ -3438,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev)
3438static inline struct task_struct * 3490static inline struct task_struct *
3439pick_next_task(struct rq *rq, struct task_struct *prev) 3491pick_next_task(struct rq *rq, struct task_struct *prev)
3440{ 3492{
3441 struct sched_class *class; 3493 const struct sched_class *class;
3442 struct task_struct *p; 3494 struct task_struct *p;
3443 3495
3444 /* 3496 /*
@@ -3487,9 +3539,13 @@ need_resched_nonpreemptible:
3487 3539
3488 schedule_debug(prev); 3540 schedule_debug(prev);
3489 3541
3490 spin_lock_irq(&rq->lock); 3542 /*
3491 clear_tsk_need_resched(prev); 3543 * Do the rq-clock update outside the rq lock:
3544 */
3545 local_irq_disable();
3492 __update_rq_clock(rq); 3546 __update_rq_clock(rq);
3547 spin_lock(&rq->lock);
3548 clear_tsk_need_resched(prev);
3493 3549
3494 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3550 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3495 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3551 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3549,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void)
3549 if (likely(ti->preempt_count || irqs_disabled())) 3605 if (likely(ti->preempt_count || irqs_disabled()))
3550 return; 3606 return;
3551 3607
3552need_resched: 3608 do {
3553 add_preempt_count(PREEMPT_ACTIVE); 3609 add_preempt_count(PREEMPT_ACTIVE);
3554 /* 3610
3555 * We keep the big kernel semaphore locked, but we 3611 /*
3556 * clear ->lock_depth so that schedule() doesnt 3612 * We keep the big kernel semaphore locked, but we
3557 * auto-release the semaphore: 3613 * clear ->lock_depth so that schedule() doesnt
3558 */ 3614 * auto-release the semaphore:
3615 */
3559#ifdef CONFIG_PREEMPT_BKL 3616#ifdef CONFIG_PREEMPT_BKL
3560 saved_lock_depth = task->lock_depth; 3617 saved_lock_depth = task->lock_depth;
3561 task->lock_depth = -1; 3618 task->lock_depth = -1;
3562#endif 3619#endif
3563 schedule(); 3620 schedule();
3564#ifdef CONFIG_PREEMPT_BKL 3621#ifdef CONFIG_PREEMPT_BKL
3565 task->lock_depth = saved_lock_depth; 3622 task->lock_depth = saved_lock_depth;
3566#endif 3623#endif
3567 sub_preempt_count(PREEMPT_ACTIVE); 3624 sub_preempt_count(PREEMPT_ACTIVE);
3568 3625
3569 /* we could miss a preemption opportunity between schedule and now */ 3626 /*
3570 barrier(); 3627 * Check again in case we missed a preemption opportunity
3571 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3628 * between schedule and now.
3572 goto need_resched; 3629 */
3630 barrier();
3631 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3573} 3632}
3574EXPORT_SYMBOL(preempt_schedule); 3633EXPORT_SYMBOL(preempt_schedule);
3575 3634
@@ -3589,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void)
3589 /* Catch callers which need to be fixed */ 3648 /* Catch callers which need to be fixed */
3590 BUG_ON(ti->preempt_count || !irqs_disabled()); 3649 BUG_ON(ti->preempt_count || !irqs_disabled());
3591 3650
3592need_resched: 3651 do {
3593 add_preempt_count(PREEMPT_ACTIVE); 3652 add_preempt_count(PREEMPT_ACTIVE);
3594 /* 3653
3595 * We keep the big kernel semaphore locked, but we 3654 /*
3596 * clear ->lock_depth so that schedule() doesnt 3655 * We keep the big kernel semaphore locked, but we
3597 * auto-release the semaphore: 3656 * clear ->lock_depth so that schedule() doesnt
3598 */ 3657 * auto-release the semaphore:
3658 */
3599#ifdef CONFIG_PREEMPT_BKL 3659#ifdef CONFIG_PREEMPT_BKL
3600 saved_lock_depth = task->lock_depth; 3660 saved_lock_depth = task->lock_depth;
3601 task->lock_depth = -1; 3661 task->lock_depth = -1;
3602#endif 3662#endif
3603 local_irq_enable(); 3663 local_irq_enable();
3604 schedule(); 3664 schedule();
3605 local_irq_disable(); 3665 local_irq_disable();
3606#ifdef CONFIG_PREEMPT_BKL 3666#ifdef CONFIG_PREEMPT_BKL
3607 task->lock_depth = saved_lock_depth; 3667 task->lock_depth = saved_lock_depth;
3608#endif 3668#endif
3609 sub_preempt_count(PREEMPT_ACTIVE); 3669 sub_preempt_count(PREEMPT_ACTIVE);
3610 3670
3611 /* we could miss a preemption opportunity between schedule and now */ 3671 /*
3612 barrier(); 3672 * Check again in case we missed a preemption opportunity
3613 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3673 * between schedule and now.
3614 goto need_resched; 3674 */
3675 barrier();
3676 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3615} 3677}
3616 3678
3617#endif /* CONFIG_PREEMPT */ 3679#endif /* CONFIG_PREEMPT */
@@ -3635,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function);
3635static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3697static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3636 int nr_exclusive, int sync, void *key) 3698 int nr_exclusive, int sync, void *key)
3637{ 3699{
3638 struct list_head *tmp, *next; 3700 wait_queue_t *curr, *next;
3639 3701
3640 list_for_each_safe(tmp, next, &q->task_list) { 3702 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3641 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3642 unsigned flags = curr->flags; 3703 unsigned flags = curr->flags;
3643 3704
3644 if (curr->func(curr, mode, sync, key) && 3705 if (curr->func(curr, mode, sync, key) &&
@@ -3728,206 +3789,116 @@ void fastcall complete_all(struct completion *x)
3728} 3789}
3729EXPORT_SYMBOL(complete_all); 3790EXPORT_SYMBOL(complete_all);
3730 3791
3731void fastcall __sched wait_for_completion(struct completion *x) 3792static inline long __sched
3732{ 3793do_wait_for_common(struct completion *x, long timeout, int state)
3733 might_sleep();
3734
3735 spin_lock_irq(&x->wait.lock);
3736 if (!x->done) {
3737 DECLARE_WAITQUEUE(wait, current);
3738
3739 wait.flags |= WQ_FLAG_EXCLUSIVE;
3740 __add_wait_queue_tail(&x->wait, &wait);
3741 do {
3742 __set_current_state(TASK_UNINTERRUPTIBLE);
3743 spin_unlock_irq(&x->wait.lock);
3744 schedule();
3745 spin_lock_irq(&x->wait.lock);
3746 } while (!x->done);
3747 __remove_wait_queue(&x->wait, &wait);
3748 }
3749 x->done--;
3750 spin_unlock_irq(&x->wait.lock);
3751}
3752EXPORT_SYMBOL(wait_for_completion);
3753
3754unsigned long fastcall __sched
3755wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3756{ 3794{
3757 might_sleep();
3758
3759 spin_lock_irq(&x->wait.lock);
3760 if (!x->done) { 3795 if (!x->done) {
3761 DECLARE_WAITQUEUE(wait, current); 3796 DECLARE_WAITQUEUE(wait, current);
3762 3797
3763 wait.flags |= WQ_FLAG_EXCLUSIVE; 3798 wait.flags |= WQ_FLAG_EXCLUSIVE;
3764 __add_wait_queue_tail(&x->wait, &wait); 3799 __add_wait_queue_tail(&x->wait, &wait);
3765 do { 3800 do {
3766 __set_current_state(TASK_UNINTERRUPTIBLE); 3801 if (state == TASK_INTERRUPTIBLE &&
3802 signal_pending(current)) {
3803 __remove_wait_queue(&x->wait, &wait);
3804 return -ERESTARTSYS;
3805 }
3806 __set_current_state(state);
3767 spin_unlock_irq(&x->wait.lock); 3807 spin_unlock_irq(&x->wait.lock);
3768 timeout = schedule_timeout(timeout); 3808 timeout = schedule_timeout(timeout);
3769 spin_lock_irq(&x->wait.lock); 3809 spin_lock_irq(&x->wait.lock);
3770 if (!timeout) { 3810 if (!timeout) {
3771 __remove_wait_queue(&x->wait, &wait); 3811 __remove_wait_queue(&x->wait, &wait);
3772 goto out; 3812 return timeout;
3773 } 3813 }
3774 } while (!x->done); 3814 } while (!x->done);
3775 __remove_wait_queue(&x->wait, &wait); 3815 __remove_wait_queue(&x->wait, &wait);
3776 } 3816 }
3777 x->done--; 3817 x->done--;
3778out:
3779 spin_unlock_irq(&x->wait.lock);
3780 return timeout; 3818 return timeout;
3781} 3819}
3782EXPORT_SYMBOL(wait_for_completion_timeout);
3783 3820
3784int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3821static long __sched
3822wait_for_common(struct completion *x, long timeout, int state)
3785{ 3823{
3786 int ret = 0;
3787
3788 might_sleep(); 3824 might_sleep();
3789 3825
3790 spin_lock_irq(&x->wait.lock); 3826 spin_lock_irq(&x->wait.lock);
3791 if (!x->done) { 3827 timeout = do_wait_for_common(x, timeout, state);
3792 DECLARE_WAITQUEUE(wait, current);
3793
3794 wait.flags |= WQ_FLAG_EXCLUSIVE;
3795 __add_wait_queue_tail(&x->wait, &wait);
3796 do {
3797 if (signal_pending(current)) {
3798 ret = -ERESTARTSYS;
3799 __remove_wait_queue(&x->wait, &wait);
3800 goto out;
3801 }
3802 __set_current_state(TASK_INTERRUPTIBLE);
3803 spin_unlock_irq(&x->wait.lock);
3804 schedule();
3805 spin_lock_irq(&x->wait.lock);
3806 } while (!x->done);
3807 __remove_wait_queue(&x->wait, &wait);
3808 }
3809 x->done--;
3810out:
3811 spin_unlock_irq(&x->wait.lock); 3828 spin_unlock_irq(&x->wait.lock);
3829 return timeout;
3830}
3812 3831
3813 return ret; 3832void fastcall __sched wait_for_completion(struct completion *x)
3833{
3834 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3814} 3835}
3815EXPORT_SYMBOL(wait_for_completion_interruptible); 3836EXPORT_SYMBOL(wait_for_completion);
3816 3837
3817unsigned long fastcall __sched 3838unsigned long fastcall __sched
3818wait_for_completion_interruptible_timeout(struct completion *x, 3839wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3819 unsigned long timeout)
3820{ 3840{
3821 might_sleep(); 3841 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3822
3823 spin_lock_irq(&x->wait.lock);
3824 if (!x->done) {
3825 DECLARE_WAITQUEUE(wait, current);
3826
3827 wait.flags |= WQ_FLAG_EXCLUSIVE;
3828 __add_wait_queue_tail(&x->wait, &wait);
3829 do {
3830 if (signal_pending(current)) {
3831 timeout = -ERESTARTSYS;
3832 __remove_wait_queue(&x->wait, &wait);
3833 goto out;
3834 }
3835 __set_current_state(TASK_INTERRUPTIBLE);
3836 spin_unlock_irq(&x->wait.lock);
3837 timeout = schedule_timeout(timeout);
3838 spin_lock_irq(&x->wait.lock);
3839 if (!timeout) {
3840 __remove_wait_queue(&x->wait, &wait);
3841 goto out;
3842 }
3843 } while (!x->done);
3844 __remove_wait_queue(&x->wait, &wait);
3845 }
3846 x->done--;
3847out:
3848 spin_unlock_irq(&x->wait.lock);
3849 return timeout;
3850} 3842}
3851EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3843EXPORT_SYMBOL(wait_for_completion_timeout);
3852 3844
3853static inline void 3845int __sched wait_for_completion_interruptible(struct completion *x)
3854sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3855{ 3846{
3856 spin_lock_irqsave(&q->lock, *flags); 3847 return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3857 __add_wait_queue(q, wait);
3858 spin_unlock(&q->lock);
3859} 3848}
3849EXPORT_SYMBOL(wait_for_completion_interruptible);
3860 3850
3861static inline void 3851unsigned long fastcall __sched
3862sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3852wait_for_completion_interruptible_timeout(struct completion *x,
3853 unsigned long timeout)
3863{ 3854{
3864 spin_lock_irq(&q->lock); 3855 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3865 __remove_wait_queue(q, wait);
3866 spin_unlock_irqrestore(&q->lock, *flags);
3867} 3856}
3857EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3868 3858
3869void __sched interruptible_sleep_on(wait_queue_head_t *q) 3859static long __sched
3860sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3870{ 3861{
3871 unsigned long flags; 3862 unsigned long flags;
3872 wait_queue_t wait; 3863 wait_queue_t wait;
3873 3864
3874 init_waitqueue_entry(&wait, current); 3865 init_waitqueue_entry(&wait, current);
3875 3866
3876 current->state = TASK_INTERRUPTIBLE; 3867 __set_current_state(state);
3877 3868
3878 sleep_on_head(q, &wait, &flags); 3869 spin_lock_irqsave(&q->lock, flags);
3879 schedule(); 3870 __add_wait_queue(q, &wait);
3880 sleep_on_tail(q, &wait, &flags); 3871 spin_unlock(&q->lock);
3872 timeout = schedule_timeout(timeout);
3873 spin_lock_irq(&q->lock);
3874 __remove_wait_queue(q, &wait);
3875 spin_unlock_irqrestore(&q->lock, flags);
3876
3877 return timeout;
3878}
3879
3880void __sched interruptible_sleep_on(wait_queue_head_t *q)
3881{
3882 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3881} 3883}
3882EXPORT_SYMBOL(interruptible_sleep_on); 3884EXPORT_SYMBOL(interruptible_sleep_on);
3883 3885
3884long __sched 3886long __sched
3885interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3887interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3886{ 3888{
3887 unsigned long flags; 3889 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3888 wait_queue_t wait;
3889
3890 init_waitqueue_entry(&wait, current);
3891
3892 current->state = TASK_INTERRUPTIBLE;
3893
3894 sleep_on_head(q, &wait, &flags);
3895 timeout = schedule_timeout(timeout);
3896 sleep_on_tail(q, &wait, &flags);
3897
3898 return timeout;
3899} 3890}
3900EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3891EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3901 3892
3902void __sched sleep_on(wait_queue_head_t *q) 3893void __sched sleep_on(wait_queue_head_t *q)
3903{ 3894{
3904 unsigned long flags; 3895 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3905 wait_queue_t wait;
3906
3907 init_waitqueue_entry(&wait, current);
3908
3909 current->state = TASK_UNINTERRUPTIBLE;
3910
3911 sleep_on_head(q, &wait, &flags);
3912 schedule();
3913 sleep_on_tail(q, &wait, &flags);
3914} 3896}
3915EXPORT_SYMBOL(sleep_on); 3897EXPORT_SYMBOL(sleep_on);
3916 3898
3917long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3899long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3918{ 3900{
3919 unsigned long flags; 3901 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3920 wait_queue_t wait;
3921
3922 init_waitqueue_entry(&wait, current);
3923
3924 current->state = TASK_UNINTERRUPTIBLE;
3925
3926 sleep_on_head(q, &wait, &flags);
3927 timeout = schedule_timeout(timeout);
3928 sleep_on_tail(q, &wait, &flags);
3929
3930 return timeout;
3931} 3902}
3932EXPORT_SYMBOL(sleep_on_timeout); 3903EXPORT_SYMBOL(sleep_on_timeout);
3933 3904
@@ -3946,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
3946void rt_mutex_setprio(struct task_struct *p, int prio) 3917void rt_mutex_setprio(struct task_struct *p, int prio)
3947{ 3918{
3948 unsigned long flags; 3919 unsigned long flags;
3949 int oldprio, on_rq; 3920 int oldprio, on_rq, running;
3950 struct rq *rq; 3921 struct rq *rq;
3951 3922
3952 BUG_ON(prio < 0 || prio > MAX_PRIO); 3923 BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3956,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3956 3927
3957 oldprio = p->prio; 3928 oldprio = p->prio;
3958 on_rq = p->se.on_rq; 3929 on_rq = p->se.on_rq;
3959 if (on_rq) 3930 running = task_running(rq, p);
3931 if (on_rq) {
3960 dequeue_task(rq, p, 0); 3932 dequeue_task(rq, p, 0);
3933 if (running)
3934 p->sched_class->put_prev_task(rq, p);
3935 }
3961 3936
3962 if (rt_prio(prio)) 3937 if (rt_prio(prio))
3963 p->sched_class = &rt_sched_class; 3938 p->sched_class = &rt_sched_class;
@@ -3967,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3967 p->prio = prio; 3942 p->prio = prio;
3968 3943
3969 if (on_rq) { 3944 if (on_rq) {
3945 if (running)
3946 p->sched_class->set_curr_task(rq);
3970 enqueue_task(rq, p, 0); 3947 enqueue_task(rq, p, 0);
3971 /* 3948 /*
3972 * Reschedule if we are currently running on this runqueue and 3949 * Reschedule if we are currently running on this runqueue and
3973 * our priority decreased, or if we are not currently running on 3950 * our priority decreased, or if we are not currently running on
3974 * this runqueue and our priority is higher than the current's 3951 * this runqueue and our priority is higher than the current's
3975 */ 3952 */
3976 if (task_running(rq, p)) { 3953 if (running) {
3977 if (p->prio > oldprio) 3954 if (p->prio > oldprio)
3978 resched_task(rq->curr); 3955 resched_task(rq->curr);
3979 } else { 3956 } else {
@@ -4137,7 +4114,7 @@ struct task_struct *idle_task(int cpu)
4137 * find_process_by_pid - find a process with a matching PID value. 4114 * find_process_by_pid - find a process with a matching PID value.
4138 * @pid: the pid in question. 4115 * @pid: the pid in question.
4139 */ 4116 */
4140static inline struct task_struct *find_process_by_pid(pid_t pid) 4117static struct task_struct *find_process_by_pid(pid_t pid)
4141{ 4118{
4142 return pid ? find_task_by_pid(pid) : current; 4119 return pid ? find_task_by_pid(pid) : current;
4143} 4120}
@@ -4179,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4179int sched_setscheduler(struct task_struct *p, int policy, 4156int sched_setscheduler(struct task_struct *p, int policy,
4180 struct sched_param *param) 4157 struct sched_param *param)
4181{ 4158{
4182 int retval, oldprio, oldpolicy = -1, on_rq; 4159 int retval, oldprio, oldpolicy = -1, on_rq, running;
4183 unsigned long flags; 4160 unsigned long flags;
4184 struct rq *rq; 4161 struct rq *rq;
4185 4162
@@ -4261,18 +4238,26 @@ recheck:
4261 } 4238 }
4262 update_rq_clock(rq); 4239 update_rq_clock(rq);
4263 on_rq = p->se.on_rq; 4240 on_rq = p->se.on_rq;
4264 if (on_rq) 4241 running = task_running(rq, p);
4242 if (on_rq) {
4265 deactivate_task(rq, p, 0); 4243 deactivate_task(rq, p, 0);
4244 if (running)
4245 p->sched_class->put_prev_task(rq, p);
4246 }
4247
4266 oldprio = p->prio; 4248 oldprio = p->prio;
4267 __setscheduler(rq, p, policy, param->sched_priority); 4249 __setscheduler(rq, p, policy, param->sched_priority);
4250
4268 if (on_rq) { 4251 if (on_rq) {
4252 if (running)
4253 p->sched_class->set_curr_task(rq);
4269 activate_task(rq, p, 0); 4254 activate_task(rq, p, 0);
4270 /* 4255 /*
4271 * Reschedule if we are currently running on this runqueue and 4256 * Reschedule if we are currently running on this runqueue and
4272 * our priority decreased, or if we are not currently running on 4257 * our priority decreased, or if we are not currently running on
4273 * this runqueue and our priority is higher than the current's 4258 * this runqueue and our priority is higher than the current's
4274 */ 4259 */
4275 if (task_running(rq, p)) { 4260 if (running) {
4276 if (p->prio > oldprio) 4261 if (p->prio > oldprio)
4277 resched_task(rq->curr); 4262 resched_task(rq->curr);
4278 } else { 4263 } else {
@@ -4343,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4343asmlinkage long sys_sched_getscheduler(pid_t pid) 4328asmlinkage long sys_sched_getscheduler(pid_t pid)
4344{ 4329{
4345 struct task_struct *p; 4330 struct task_struct *p;
4346 int retval = -EINVAL; 4331 int retval;
4347 4332
4348 if (pid < 0) 4333 if (pid < 0)
4349 goto out_nounlock; 4334 return -EINVAL;
4350 4335
4351 retval = -ESRCH; 4336 retval = -ESRCH;
4352 read_lock(&tasklist_lock); 4337 read_lock(&tasklist_lock);
@@ -4357,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
4357 retval = p->policy; 4342 retval = p->policy;
4358 } 4343 }
4359 read_unlock(&tasklist_lock); 4344 read_unlock(&tasklist_lock);
4360
4361out_nounlock:
4362 return retval; 4345 return retval;
4363} 4346}
4364 4347
@@ -4371,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4371{ 4354{
4372 struct sched_param lp; 4355 struct sched_param lp;
4373 struct task_struct *p; 4356 struct task_struct *p;
4374 int retval = -EINVAL; 4357 int retval;
4375 4358
4376 if (!param || pid < 0) 4359 if (!param || pid < 0)
4377 goto out_nounlock; 4360 return -EINVAL;
4378 4361
4379 read_lock(&tasklist_lock); 4362 read_lock(&tasklist_lock);
4380 p = find_process_by_pid(pid); 4363 p = find_process_by_pid(pid);
@@ -4394,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4394 */ 4377 */
4395 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4378 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4396 4379
4397out_nounlock:
4398 return retval; 4380 return retval;
4399 4381
4400out_unlock: 4382out_unlock:
@@ -4554,8 +4536,8 @@ asmlinkage long sys_sched_yield(void)
4554{ 4536{
4555 struct rq *rq = this_rq_lock(); 4537 struct rq *rq = this_rq_lock();
4556 4538
4557 schedstat_inc(rq, yld_cnt); 4539 schedstat_inc(rq, yld_count);
4558 current->sched_class->yield_task(rq, current); 4540 current->sched_class->yield_task(rq);
4559 4541
4560 /* 4542 /*
4561 * Since we are going to call schedule() anyway, there's 4543 * Since we are going to call schedule() anyway, there's
@@ -4749,11 +4731,12 @@ asmlinkage
4749long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4731long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4750{ 4732{
4751 struct task_struct *p; 4733 struct task_struct *p;
4752 int retval = -EINVAL; 4734 unsigned int time_slice;
4735 int retval;
4753 struct timespec t; 4736 struct timespec t;
4754 4737
4755 if (pid < 0) 4738 if (pid < 0)
4756 goto out_nounlock; 4739 return -EINVAL;
4757 4740
4758 retval = -ESRCH; 4741 retval = -ESRCH;
4759 read_lock(&tasklist_lock); 4742 read_lock(&tasklist_lock);
@@ -4765,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4765 if (retval) 4748 if (retval)
4766 goto out_unlock; 4749 goto out_unlock;
4767 4750
4768 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4751 if (p->policy == SCHED_FIFO)
4769 0 : static_prio_timeslice(p->static_prio), &t); 4752 time_slice = 0;
4753 else if (p->policy == SCHED_RR)
4754 time_slice = DEF_TIMESLICE;
4755 else {
4756 struct sched_entity *se = &p->se;
4757 unsigned long flags;
4758 struct rq *rq;
4759
4760 rq = task_rq_lock(p, &flags);
4761 time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
4762 task_rq_unlock(rq, &flags);
4763 }
4770 read_unlock(&tasklist_lock); 4764 read_unlock(&tasklist_lock);
4765 jiffies_to_timespec(time_slice, &t);
4771 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4766 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4772out_nounlock:
4773 return retval; 4767 return retval;
4768
4774out_unlock: 4769out_unlock:
4775 read_unlock(&tasklist_lock); 4770 read_unlock(&tasklist_lock);
4776 return retval; 4771 return retval;
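
sys_sched_rr_get_interval() now reports 0 for SCHED_FIFO, DEF_TIMESLICE for SCHED_RR, and, for CFS tasks, the nanosecond result of sched_slice() converted to jiffies and then to a timespec. The CFS slice is, roughly, the task's weighted share of a latency period; the arithmetic below is only an illustration of that idea (values and names are made up), not the kernel's sched_slice():

static unsigned long long
weighted_slice_ns(unsigned long long period_ns,
                  unsigned long my_weight, unsigned long queue_weight)
{
        return period_ns * my_weight / queue_weight;
}

/* Two equally weighted tasks sharing a 20ms period:
 * weighted_slice_ns(20000000ULL, 1024, 2048) == 10000000, i.e. 10ms,
 * which the syscall would then round into jiffies and a timespec. */
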
@@ -4899,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4899 */ 4894 */
4900cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4895cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4901 4896
4902/*
4903 * Increase the granularity value when there are more CPUs,
4904 * because with more CPUs the 'effective latency' as visible
4905 * to users decreases. But the relationship is not linear,
4906 * so pick a second-best guess by going with the log2 of the
4907 * number of CPUs.
4908 *
4909 * This idea comes from the SD scheduler of Con Kolivas:
4910 */
4911static inline void sched_init_granularity(void)
4912{
4913 unsigned int factor = 1 + ilog2(num_online_cpus());
4914 const unsigned long limit = 100000000;
4915
4916 sysctl_sched_min_granularity *= factor;
4917 if (sysctl_sched_min_granularity > limit)
4918 sysctl_sched_min_granularity = limit;
4919
4920 sysctl_sched_latency *= factor;
4921 if (sysctl_sched_latency > limit)
4922 sysctl_sched_latency = limit;
4923
4924 sysctl_sched_runtime_limit = sysctl_sched_latency;
4925 sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
4926}
4927
4928#ifdef CONFIG_SMP 4897#ifdef CONFIG_SMP
4929/* 4898/*
4930 * This is how migration works: 4899 * This is how migration works:
@@ -5102,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5102 struct rq *rq; 5071 struct rq *rq;
5103 int dest_cpu; 5072 int dest_cpu;
5104 5073
5105restart: 5074 do {
5106 /* On same node? */ 5075 /* On same node? */
5107 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5076 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5108 cpus_and(mask, mask, p->cpus_allowed); 5077 cpus_and(mask, mask, p->cpus_allowed);
5109 dest_cpu = any_online_cpu(mask); 5078 dest_cpu = any_online_cpu(mask);
5110 5079
5111 /* On any allowed CPU? */ 5080 /* On any allowed CPU? */
5112 if (dest_cpu == NR_CPUS) 5081 if (dest_cpu == NR_CPUS)
5113 dest_cpu = any_online_cpu(p->cpus_allowed); 5082 dest_cpu = any_online_cpu(p->cpus_allowed);
5114 5083
5115 /* No more Mr. Nice Guy. */ 5084 /* No more Mr. Nice Guy. */
5116 if (dest_cpu == NR_CPUS) { 5085 if (dest_cpu == NR_CPUS) {
5117 rq = task_rq_lock(p, &flags); 5086 rq = task_rq_lock(p, &flags);
5118 cpus_setall(p->cpus_allowed); 5087 cpus_setall(p->cpus_allowed);
5119 dest_cpu = any_online_cpu(p->cpus_allowed); 5088 dest_cpu = any_online_cpu(p->cpus_allowed);
5120 task_rq_unlock(rq, &flags); 5089 task_rq_unlock(rq, &flags);
5121 5090
5122 /* 5091 /*
5123 * Don't tell them about moving exiting tasks or 5092 * Don't tell them about moving exiting tasks or
5124 * kernel threads (both mm NULL), since they never 5093 * kernel threads (both mm NULL), since they never
5125 * leave kernel. 5094 * leave kernel.
5126 */ 5095 */
5127 if (p->mm && printk_ratelimit()) 5096 if (p->mm && printk_ratelimit())
5128 printk(KERN_INFO "process %d (%s) no " 5097 printk(KERN_INFO "process %d (%s) no "
5129 "longer affine to cpu%d\n", 5098 "longer affine to cpu%d\n",
5130 p->pid, p->comm, dead_cpu); 5099 p->pid, p->comm, dead_cpu);
5131 } 5100 }
5132 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5101 } while (!__migrate_task(p, dead_cpu, dest_cpu));
5133 goto restart;
5134} 5102}
5135 5103
5136/* 5104/*
@@ -5172,6 +5140,20 @@ static void migrate_live_tasks(int src_cpu)
5172} 5140}
5173 5141
5174/* 5142/*
5143 * activate_idle_task - move idle task to the _front_ of runqueue.
5144 */
5145static void activate_idle_task(struct task_struct *p, struct rq *rq)
5146{
5147 update_rq_clock(rq);
5148
5149 if (p->state == TASK_UNINTERRUPTIBLE)
5150 rq->nr_uninterruptible--;
5151
5152 enqueue_task(rq, p, 0);
5153 inc_nr_running(p, rq);
5154}
5155
5156/*
5175 * Schedules idle task to be the next runnable task on current CPU. 5157 * Schedules idle task to be the next runnable task on current CPU.
5176 * It does so by boosting its priority to highest possible and adding it to 5158 * It does so by boosting its priority to highest possible and adding it to
5177 * the _front_ of the runqueue. Used by CPU offline code. 5159 * the _front_ of the runqueue. Used by CPU offline code.
@@ -5283,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = {
5283static struct ctl_table *sd_alloc_ctl_entry(int n) 5265static struct ctl_table *sd_alloc_ctl_entry(int n)
5284{ 5266{
5285 struct ctl_table *entry = 5267 struct ctl_table *entry =
5286 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); 5268 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5287
5288 BUG_ON(!entry);
5289 memset(entry, 0, n * sizeof(struct ctl_table));
5290 5269
5291 return entry; 5270 return entry;
5292} 5271}
5293 5272
5273static void sd_free_ctl_entry(struct ctl_table **tablep)
5274{
5275 struct ctl_table *entry = *tablep;
5276
5277 for (entry = *tablep; entry->procname; entry++)
5278 if (entry->child)
5279 sd_free_ctl_entry(&entry->child);
5280
5281 kfree(*tablep);
5282 *tablep = NULL;
5283}
5284
5294static void 5285static void
5295set_table_entry(struct ctl_table *entry, 5286set_table_entry(struct ctl_table *entry,
5296 const char *procname, void *data, int maxlen, 5287 const char *procname, void *data, int maxlen,
@@ -5306,7 +5297,10 @@ set_table_entry(struct ctl_table *entry,
5306static struct ctl_table * 5297static struct ctl_table *
5307sd_alloc_ctl_domain_table(struct sched_domain *sd) 5298sd_alloc_ctl_domain_table(struct sched_domain *sd)
5308{ 5299{
5309 struct ctl_table *table = sd_alloc_ctl_entry(14); 5300 struct ctl_table *table = sd_alloc_ctl_entry(12);
5301
5302 if (table == NULL)
5303 return NULL;
5310 5304
5311 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5305 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5312 sizeof(long), 0644, proc_doulongvec_minmax); 5306 sizeof(long), 0644, proc_doulongvec_minmax);
@@ -5326,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5326 sizeof(int), 0644, proc_dointvec_minmax); 5320 sizeof(int), 0644, proc_dointvec_minmax);
5327 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5321 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5328 sizeof(int), 0644, proc_dointvec_minmax); 5322 sizeof(int), 0644, proc_dointvec_minmax);
5329 set_table_entry(&table[10], "cache_nice_tries", 5323 set_table_entry(&table[9], "cache_nice_tries",
5330 &sd->cache_nice_tries, 5324 &sd->cache_nice_tries,
5331 sizeof(int), 0644, proc_dointvec_minmax); 5325 sizeof(int), 0644, proc_dointvec_minmax);
5332 set_table_entry(&table[12], "flags", &sd->flags, 5326 set_table_entry(&table[10], "flags", &sd->flags,
5333 sizeof(int), 0644, proc_dointvec_minmax); 5327 sizeof(int), 0644, proc_dointvec_minmax);
5328 /* &table[11] is terminator */
5334 5329
5335 return table; 5330 return table;
5336} 5331}
@@ -5345,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5345 for_each_domain(cpu, sd) 5340 for_each_domain(cpu, sd)
5346 domain_num++; 5341 domain_num++;
5347 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5342 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5343 if (table == NULL)
5344 return NULL;
5348 5345
5349 i = 0; 5346 i = 0;
5350 for_each_domain(cpu, sd) { 5347 for_each_domain(cpu, sd) {
@@ -5359,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5359} 5356}
5360 5357
5361static struct ctl_table_header *sd_sysctl_header; 5358static struct ctl_table_header *sd_sysctl_header;
5362static void init_sched_domain_sysctl(void) 5359static void register_sched_domain_sysctl(void)
5363{ 5360{
5364 int i, cpu_num = num_online_cpus(); 5361 int i, cpu_num = num_online_cpus();
5365 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5362 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5366 char buf[32]; 5363 char buf[32];
5367 5364
5365 if (entry == NULL)
5366 return;
5367
5368 sd_ctl_dir[0].child = entry; 5368 sd_ctl_dir[0].child = entry;
5369 5369
5370 for (i = 0; i < cpu_num; i++, entry++) { 5370 for_each_online_cpu(i) {
5371 snprintf(buf, 32, "cpu%d", i); 5371 snprintf(buf, 32, "cpu%d", i);
5372 entry->procname = kstrdup(buf, GFP_KERNEL); 5372 entry->procname = kstrdup(buf, GFP_KERNEL);
5373 entry->mode = 0555; 5373 entry->mode = 0555;
5374 entry->child = sd_alloc_ctl_cpu_table(i); 5374 entry->child = sd_alloc_ctl_cpu_table(i);
5375 entry++;
5375 } 5376 }
5376 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5377 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5377} 5378}
5379
5380static void unregister_sched_domain_sysctl(void)
5381{
5382 unregister_sysctl_table(sd_sysctl_header);
5383 sd_sysctl_header = NULL;
5384 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5385}
5378#else 5386#else
5379static void init_sched_domain_sysctl(void) 5387static void register_sched_domain_sysctl(void)
5388{
5389}
5390static void unregister_sched_domain_sysctl(void)
5380{ 5391{
5381} 5392}
5382#endif 5393#endif
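
The sysctl changes above switch the table allocator to kcalloc() (zeroed, so the final entry is already a valid terminator, and NULL on failure, which callers now check) and add sd_free_ctl_entry(), which walks each table up to its empty terminator and recursively frees child tables before the table itself. A user-space sketch of that alloc/free pairing, with simplified types, not kernel code:

#include <stdlib.h>

struct tbl_entry {
        const char *name;               /* NULL name terminates a table */
        struct tbl_entry *child;
};

/* calloc() zeroes the block and reports failure as NULL. */
static struct tbl_entry *alloc_table(size_t n)
{
        return calloc(n, sizeof(struct tbl_entry));
}

static void free_table(struct tbl_entry **tablep)
{
        struct tbl_entry *entry;

        for (entry = *tablep; entry->name; entry++)
                if (entry->child)
                        free_table(&entry->child);

        free(*tablep);
        *tablep = NULL;
}
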
@@ -5498,8 +5509,7 @@ int __init migration_init(void)
5498int nr_cpu_ids __read_mostly = NR_CPUS; 5509int nr_cpu_ids __read_mostly = NR_CPUS;
5499EXPORT_SYMBOL(nr_cpu_ids); 5510EXPORT_SYMBOL(nr_cpu_ids);
5500 5511
5501#undef SCHED_DOMAIN_DEBUG 5512#ifdef CONFIG_SCHED_DEBUG
5502#ifdef SCHED_DOMAIN_DEBUG
5503static void sched_domain_debug(struct sched_domain *sd, int cpu) 5513static void sched_domain_debug(struct sched_domain *sd, int cpu)
5504{ 5514{
5505 int level = 0; 5515 int level = 0;
@@ -5557,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5557 printk("\n"); 5567 printk("\n");
5558 printk(KERN_ERR "ERROR: domain->cpu_power not " 5568 printk(KERN_ERR "ERROR: domain->cpu_power not "
5559 "set\n"); 5569 "set\n");
5570 break;
5560 } 5571 }
5561 5572
5562 if (!cpus_weight(group->cpumask)) { 5573 if (!cpus_weight(group->cpumask)) {
5563 printk("\n"); 5574 printk("\n");
5564 printk(KERN_ERR "ERROR: empty group\n"); 5575 printk(KERN_ERR "ERROR: empty group\n");
5576 break;
5565 } 5577 }
5566 5578
5567 if (cpus_intersects(groupmask, group->cpumask)) { 5579 if (cpus_intersects(groupmask, group->cpumask)) {
5568 printk("\n"); 5580 printk("\n");
5569 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5581 printk(KERN_ERR "ERROR: repeated CPUs\n");
5582 break;
5570 } 5583 }
5571 5584
5572 cpus_or(groupmask, groupmask, group->cpumask); 5585 cpus_or(groupmask, groupmask, group->cpumask);
@@ -5700,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str)
5700 return 1; 5713 return 1;
5701} 5714}
5702 5715
5703__setup ("isolcpus=", isolated_cpu_setup); 5716__setup("isolcpus=", isolated_cpu_setup);
5704 5717
5705/* 5718/*
5706 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5719 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
@@ -5929,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
5929 5942
5930 if (!sg) 5943 if (!sg)
5931 return; 5944 return;
5932next_sg: 5945 do {
5933 for_each_cpu_mask(j, sg->cpumask) { 5946 for_each_cpu_mask(j, sg->cpumask) {
5934 struct sched_domain *sd; 5947 struct sched_domain *sd;
5935 5948
5936 sd = &per_cpu(phys_domains, j); 5949 sd = &per_cpu(phys_domains, j);
5937 if (j != first_cpu(sd->groups->cpumask)) { 5950 if (j != first_cpu(sd->groups->cpumask)) {
5938 /* 5951 /*
5939 * Only add "power" once for each 5952 * Only add "power" once for each
5940 * physical package. 5953 * physical package.
5941 */ 5954 */
5942 continue; 5955 continue;
5943 } 5956 }
5944 5957
5945 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 5958 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5946 } 5959 }
5947 sg = sg->next; 5960 sg = sg->next;
5948 if (sg != group_head) 5961 } while (sg != group_head);
5949 goto next_sg;
5950} 5962}
5951#endif 5963#endif
5952 5964
@@ -6057,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6057 /* 6069 /*
6058 * Allocate the per-node list of sched groups 6070 * Allocate the per-node list of sched groups
6059 */ 6071 */
6060 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, 6072 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6061 GFP_KERNEL); 6073 GFP_KERNEL);
6062 if (!sched_group_nodes) { 6074 if (!sched_group_nodes) {
6063 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6075 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6310,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
6310 6322
6311 err = build_sched_domains(&cpu_default_map); 6323 err = build_sched_domains(&cpu_default_map);
6312 6324
6325 register_sched_domain_sysctl();
6326
6313 return err; 6327 return err;
6314} 6328}
6315 6329
@@ -6326,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6326{ 6340{
6327 int i; 6341 int i;
6328 6342
6343 unregister_sched_domain_sysctl();
6344
6329 for_each_cpu_mask(i, *cpu_map) 6345 for_each_cpu_mask(i, *cpu_map)
6330 cpu_attach_domain(NULL, i); 6346 cpu_attach_domain(NULL, i);
6331 synchronize_sched(); 6347 synchronize_sched();
@@ -6356,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6356 if (!err && !cpus_empty(*partition2)) 6372 if (!err && !cpus_empty(*partition2))
6357 err = build_sched_domains(partition2); 6373 err = build_sched_domains(partition2);
6358 6374
6375 register_sched_domain_sysctl();
6376
6359 return err; 6377 return err;
6360} 6378}
6361 6379
@@ -6487,17 +6505,13 @@ void __init sched_init_smp(void)
6487 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6505 /* XXX: Theoretical race here - CPU may be hotplugged now */
6488 hotcpu_notifier(update_sched_domains, 0); 6506 hotcpu_notifier(update_sched_domains, 0);
6489 6507
6490 init_sched_domain_sysctl();
6491
6492 /* Move init over to a non-isolated CPU */ 6508 /* Move init over to a non-isolated CPU */
6493 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6509 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6494 BUG(); 6510 BUG();
6495 sched_init_granularity();
6496} 6511}
6497#else 6512#else
6498void __init sched_init_smp(void) 6513void __init sched_init_smp(void)
6499{ 6514{
6500 sched_init_granularity();
6501} 6515}
6502#endif /* CONFIG_SMP */ 6516#endif /* CONFIG_SMP */
6503 6517
@@ -6511,28 +6525,20 @@ int in_sched_functions(unsigned long addr)
6511 && addr < (unsigned long)__sched_text_end); 6525 && addr < (unsigned long)__sched_text_end);
6512} 6526}
6513 6527
6514static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 6528static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6515{ 6529{
6516 cfs_rq->tasks_timeline = RB_ROOT; 6530 cfs_rq->tasks_timeline = RB_ROOT;
6517 cfs_rq->fair_clock = 1;
6518#ifdef CONFIG_FAIR_GROUP_SCHED 6531#ifdef CONFIG_FAIR_GROUP_SCHED
6519 cfs_rq->rq = rq; 6532 cfs_rq->rq = rq;
6520#endif 6533#endif
6534 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6521} 6535}
6522 6536
6523void __init sched_init(void) 6537void __init sched_init(void)
6524{ 6538{
6525 u64 now = sched_clock();
6526 int highest_cpu = 0; 6539 int highest_cpu = 0;
6527 int i, j; 6540 int i, j;
6528 6541
6529 /*
6530 * Link up the scheduling class hierarchy:
6531 */
6532 rt_sched_class.next = &fair_sched_class;
6533 fair_sched_class.next = &idle_sched_class;
6534 idle_sched_class.next = NULL;
6535
6536 for_each_possible_cpu(i) { 6542 for_each_possible_cpu(i) {
6537 struct rt_prio_array *array; 6543 struct rt_prio_array *array;
6538 struct rq *rq; 6544 struct rq *rq;
@@ -6545,10 +6551,28 @@ void __init sched_init(void)
6545 init_cfs_rq(&rq->cfs, rq); 6551 init_cfs_rq(&rq->cfs, rq);
6546#ifdef CONFIG_FAIR_GROUP_SCHED 6552#ifdef CONFIG_FAIR_GROUP_SCHED
6547 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6553 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6548 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 6554 {
6555 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6556 struct sched_entity *se =
6557 &per_cpu(init_sched_entity, i);
6558
6559 init_cfs_rq_p[i] = cfs_rq;
6560 init_cfs_rq(cfs_rq, rq);
6561 cfs_rq->tg = &init_task_group;
6562 list_add(&cfs_rq->leaf_cfs_rq_list,
6563 &rq->leaf_cfs_rq_list);
6564
6565 init_sched_entity_p[i] = se;
6566 se->cfs_rq = &rq->cfs;
6567 se->my_q = cfs_rq;
6568 se->load.weight = init_task_group_load;
6569 se->load.inv_weight =
6570 div64_64(1ULL<<32, init_task_group_load);
6571 se->parent = NULL;
6572 }
6573 init_task_group.shares = init_task_group_load;
6574 spin_lock_init(&init_task_group.lock);
6549#endif 6575#endif
6550 rq->ls.load_update_last = now;
6551 rq->ls.load_update_start = now;
6552 6576
6553 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6577 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6554 rq->cpu_load[j] = 0; 6578 rq->cpu_load[j] = 0;
@@ -6633,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep);
6633#endif 6657#endif
6634 6658
6635#ifdef CONFIG_MAGIC_SYSRQ 6659#ifdef CONFIG_MAGIC_SYSRQ
6660static void normalize_task(struct rq *rq, struct task_struct *p)
6661{
6662 int on_rq;
6663 update_rq_clock(rq);
6664 on_rq = p->se.on_rq;
6665 if (on_rq)
6666 deactivate_task(rq, p, 0);
6667 __setscheduler(rq, p, SCHED_NORMAL, 0);
6668 if (on_rq) {
6669 activate_task(rq, p, 0);
6670 resched_task(rq->curr);
6671 }
6672}
6673
6636void normalize_rt_tasks(void) 6674void normalize_rt_tasks(void)
6637{ 6675{
6638 struct task_struct *g, *p; 6676 struct task_struct *g, *p;
6639 unsigned long flags; 6677 unsigned long flags;
6640 struct rq *rq; 6678 struct rq *rq;
6641 int on_rq;
6642 6679
6643 read_lock_irq(&tasklist_lock); 6680 read_lock_irq(&tasklist_lock);
6644 do_each_thread(g, p) { 6681 do_each_thread(g, p) {
6645 p->se.fair_key = 0; 6682 /*
6646 p->se.wait_runtime = 0; 6683 * Only normalize user tasks:
6684 */
6685 if (!p->mm)
6686 continue;
6687
6647 p->se.exec_start = 0; 6688 p->se.exec_start = 0;
6648 p->se.wait_start_fair = 0;
6649 p->se.sleep_start_fair = 0;
6650#ifdef CONFIG_SCHEDSTATS 6689#ifdef CONFIG_SCHEDSTATS
6651 p->se.wait_start = 0; 6690 p->se.wait_start = 0;
6652 p->se.sleep_start = 0; 6691 p->se.sleep_start = 0;
6653 p->se.block_start = 0; 6692 p->se.block_start = 0;
6654#endif 6693#endif
6655 task_rq(p)->cfs.fair_clock = 0;
6656 task_rq(p)->clock = 0; 6694 task_rq(p)->clock = 0;
6657 6695
6658 if (!rt_task(p)) { 6696 if (!rt_task(p)) {
@@ -6667,26 +6705,9 @@ void normalize_rt_tasks(void)
6667 6705
6668 spin_lock_irqsave(&p->pi_lock, flags); 6706 spin_lock_irqsave(&p->pi_lock, flags);
6669 rq = __task_rq_lock(p); 6707 rq = __task_rq_lock(p);
6670#ifdef CONFIG_SMP
6671 /*
6672 * Do not touch the migration thread:
6673 */
6674 if (p == rq->migration_thread)
6675 goto out_unlock;
6676#endif
6677 6708
6678 update_rq_clock(rq); 6709 normalize_task(rq, p);
6679 on_rq = p->se.on_rq; 6710
6680 if (on_rq)
6681 deactivate_task(rq, p, 0);
6682 __setscheduler(rq, p, SCHED_NORMAL, 0);
6683 if (on_rq) {
6684 activate_task(rq, p, 0);
6685 resched_task(rq->curr);
6686 }
6687#ifdef CONFIG_SMP
6688 out_unlock:
6689#endif
6690 __task_rq_unlock(rq); 6711 __task_rq_unlock(rq);
6691 spin_unlock_irqrestore(&p->pi_lock, flags); 6712 spin_unlock_irqrestore(&p->pi_lock, flags);
6692 } while_each_thread(g, p); 6713 } while_each_thread(g, p);
@@ -6739,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p)
6739} 6760}
6740 6761
6741#endif 6762#endif
6763
6764#ifdef CONFIG_FAIR_GROUP_SCHED
6765
6766/* allocate runqueue etc for a new task group */
6767struct task_group *sched_create_group(void)
6768{
6769 struct task_group *tg;
6770 struct cfs_rq *cfs_rq;
6771 struct sched_entity *se;
6772 struct rq *rq;
6773 int i;
6774
6775 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6776 if (!tg)
6777 return ERR_PTR(-ENOMEM);
6778
6779 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
6780 if (!tg->cfs_rq)
6781 goto err;
6782 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6783 if (!tg->se)
6784 goto err;
6785
6786 for_each_possible_cpu(i) {
6787 rq = cpu_rq(i);
6788
6789 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
6790 cpu_to_node(i));
6791 if (!cfs_rq)
6792 goto err;
6793
6794 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
6795 cpu_to_node(i));
6796 if (!se)
6797 goto err;
6798
6799 memset(cfs_rq, 0, sizeof(struct cfs_rq));
6800 memset(se, 0, sizeof(struct sched_entity));
6801
6802 tg->cfs_rq[i] = cfs_rq;
6803 init_cfs_rq(cfs_rq, rq);
6804 cfs_rq->tg = tg;
6805
6806 tg->se[i] = se;
6807 se->cfs_rq = &rq->cfs;
6808 se->my_q = cfs_rq;
6809 se->load.weight = NICE_0_LOAD;
6810 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
6811 se->parent = NULL;
6812 }
6813
6814 for_each_possible_cpu(i) {
6815 rq = cpu_rq(i);
6816 cfs_rq = tg->cfs_rq[i];
6817 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6818 }
6819
6820 tg->shares = NICE_0_LOAD;
6821 spin_lock_init(&tg->lock);
6822
6823 return tg;
6824
6825err:
6826 for_each_possible_cpu(i) {
6827 if (tg->cfs_rq)
6828 kfree(tg->cfs_rq[i]);
6829 if (tg->se)
6830 kfree(tg->se[i]);
6831 }
6832 kfree(tg->cfs_rq);
6833 kfree(tg->se);
6834 kfree(tg);
6835
6836 return ERR_PTR(-ENOMEM);
6837}
6838
6839/* rcu callback to free various structures associated with a task group */
6840static void free_sched_group(struct rcu_head *rhp)
6841{
6842 struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu);
6843 struct task_group *tg = cfs_rq->tg;
6844 struct sched_entity *se;
6845 int i;
6846
6847 /* now it should be safe to free those cfs_rqs */
6848 for_each_possible_cpu(i) {
6849 cfs_rq = tg->cfs_rq[i];
6850 kfree(cfs_rq);
6851
6852 se = tg->se[i];
6853 kfree(se);
6854 }
6855
6856 kfree(tg->cfs_rq);
6857 kfree(tg->se);
6858 kfree(tg);
6859}
6860
6861/* Destroy runqueue etc associated with a task group */
6862void sched_destroy_group(struct task_group *tg)
6863{
6864 struct cfs_rq *cfs_rq;
6865 int i;
6866
6867 for_each_possible_cpu(i) {
6868 cfs_rq = tg->cfs_rq[i];
6869 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
6870 }
6871
6872 cfs_rq = tg->cfs_rq[0];
6873
6874 /* wait for possible concurrent references to cfs_rqs complete */
6875 call_rcu(&cfs_rq->rcu, free_sched_group);
6876}
6877
6878/* change task's runqueue when it moves between groups.
6879 * The caller of this function should have put the task in its new group
6880 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
6881 * reflect its new group.
6882 */
6883void sched_move_task(struct task_struct *tsk)
6884{
6885 int on_rq, running;
6886 unsigned long flags;
6887 struct rq *rq;
6888
6889 rq = task_rq_lock(tsk, &flags);
6890
6891 if (tsk->sched_class != &fair_sched_class)
6892 goto done;
6893
6894 update_rq_clock(rq);
6895
6896 running = task_running(rq, tsk);
6897 on_rq = tsk->se.on_rq;
6898
6899 if (on_rq) {
6900 dequeue_task(rq, tsk, 0);
6901 if (unlikely(running))
6902 tsk->sched_class->put_prev_task(rq, tsk);
6903 }
6904
6905 set_task_cfs_rq(tsk);
6906
6907 if (on_rq) {
6908 if (unlikely(running))
6909 tsk->sched_class->set_curr_task(rq);
6910 enqueue_task(rq, tsk, 0);
6911 }
6912
6913done:
6914 task_rq_unlock(rq, &flags);
6915}
6916
6917static void set_se_shares(struct sched_entity *se, unsigned long shares)
6918{
6919 struct cfs_rq *cfs_rq = se->cfs_rq;
6920 struct rq *rq = cfs_rq->rq;
6921 int on_rq;
6922
6923 spin_lock_irq(&rq->lock);
6924
6925 on_rq = se->on_rq;
6926 if (on_rq)
6927 dequeue_entity(cfs_rq, se, 0);
6928
6929 se->load.weight = shares;
6930 se->load.inv_weight = div64_64((1ULL<<32), shares);
6931
6932 if (on_rq)
6933 enqueue_entity(cfs_rq, se, 0);
6934
6935 spin_unlock_irq(&rq->lock);
6936}
6937
6938int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6939{
6940 int i;
6941
6942 spin_lock(&tg->lock);
6943 if (tg->shares == shares)
6944 goto done;
6945
6946 tg->shares = shares;
6947 for_each_possible_cpu(i)
6948 set_se_shares(tg->se[i], shares);
6949
6950done:
6951 spin_unlock(&tg->lock);
6952 return 0;
6953}
6954
6955unsigned long sched_group_shares(struct task_group *tg)
6956{
6957 return tg->shares;
6958}
6959
6960#endif /* CONFIG_FAIR_GROUP_SCHED */
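
The shares path above stores each group entity's weight together with a precomputed 2^32-based inverse (div64_64(1ULL<<32, weight)), so later scheduler math can multiply-and-shift instead of dividing. A minimal userspace sketch of that weight/inverse relationship; load_weight_sketch and set_shares_sketch are hypothetical stand-ins, not the kernel structures, and NICE_0_LOAD is assumed to be 1024 as in this series:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024UL

/* Mirrors the two se->load fields touched by set_se_shares(). */
struct load_weight_sketch {
        unsigned long weight;
        unsigned long inv_weight;       /* 2^32 / weight */
};

static void set_shares_sketch(struct load_weight_sketch *lw, unsigned long shares)
{
        lw->weight = shares;
        lw->inv_weight = (unsigned long)((1ULL << 32) / shares);
}

int main(void)
{
        struct load_weight_sketch lw;

        set_shares_sketch(&lw, NICE_0_LOAD);            /* default group weight */
        printf("weight=%lu inv_weight=%lu\n", lw.weight, lw.inv_weight);

        set_shares_sketch(&lw, 2 * NICE_0_LOAD);        /* doubled shares */
        printf("weight=%lu inv_weight=%lu\n", lw.weight, lw.inv_weight);
        return 0;
}

Doubling the shares halves the stored inverse, which is exactly what makes a group's entities consume virtual time at half the rate in the weighted-delta math further down.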
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index c3ee38bd3426..a5e517ec07c3 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -28,6 +28,31 @@
28 printk(x); \ 28 printk(x); \
29 } while (0) 29 } while (0)
30 30
31/*
32 * Ease the printing of nsec fields:
33 */
34static long long nsec_high(long long nsec)
35{
36 if (nsec < 0) {
37 nsec = -nsec;
38 do_div(nsec, 1000000);
39 return -nsec;
40 }
41 do_div(nsec, 1000000);
42
43 return nsec;
44}
45
46static unsigned long nsec_low(long long nsec)
47{
48 if (nsec < 0)
49 nsec = -nsec;
50
51 return do_div(nsec, 1000000);
52}
53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55
31static void 56static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
33{ 58{
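
The nsec_high()/nsec_low() pair added above exists so one nanosecond value can be printed as "msec.usec" with a single format string, including for negative values. A standalone sketch of the same split; ns_high/ns_low are renamed stand-ins and plain division replaces do_div():

#include <stdio.h>

/* Millisecond part of a (possibly negative) nanosecond count. */
static long long ns_high(long long nsec)
{
        long long sign = nsec < 0 ? -1 : 1;

        return sign * ((sign * nsec) / 1000000LL);
}

/* Sub-millisecond remainder, always non-negative. */
static long ns_low(long long nsec)
{
        if (nsec < 0)
                nsec = -nsec;
        return (long)(nsec % 1000000LL);
}

int main(void)
{
        long long t = 123456789LL;      /* 123.456789 ms expressed in ns */

        printf("%lld.%06ld ms\n", ns_high(t), ns_low(t));       /* 123.456789 */
        printf("%lld.%06ld ms\n", ns_high(-t), ns_low(-t));     /* -123.456789 */
        return 0;
}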
@@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
36 else 61 else
37 SEQ_printf(m, " "); 62 SEQ_printf(m, " ");
38 63
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", 64 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
40 p->comm, p->pid, 65 p->comm, p->pid,
41 (long long)p->se.fair_key, 66 SPLIT_NS(p->se.vruntime),
42 (long long)(p->se.fair_key - rq->cfs.fair_clock),
43 (long long)p->se.wait_runtime,
44 (long long)(p->nvcsw + p->nivcsw), 67 (long long)(p->nvcsw + p->nivcsw),
45 p->prio); 68 p->prio);
46#ifdef CONFIG_SCHEDSTATS 69#ifdef CONFIG_SCHEDSTATS
47 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n",
48 (long long)p->se.sum_exec_runtime, 71 SPLIT_NS(p->se.vruntime),
49 (long long)p->se.sum_wait_runtime, 72 SPLIT_NS(p->se.sum_exec_runtime),
50 (long long)p->se.sum_sleep_runtime, 73 SPLIT_NS(p->se.sum_sleep_runtime));
51 (long long)p->se.wait_runtime_overruns,
52 (long long)p->se.wait_runtime_underruns);
53#else 74#else
54 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n",
55 0LL, 0LL, 0LL, 0LL, 0LL); 76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
56#endif 77#endif
57} 78}
58 79
@@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
62 83
63 SEQ_printf(m, 84 SEQ_printf(m,
64 "\nrunnable tasks:\n" 85 "\nrunnable tasks:\n"
65 " task PID tree-key delta waiting" 86 " task PID tree-key switches prio"
66 " switches prio" 87 " exec-runtime sum-exec sum-sleep\n"
67 " sum-exec sum-wait sum-sleep" 88 "------------------------------------------------------"
68 " wait-overrun wait-underrun\n" 89 "----------------------------------------------------\n");
69 "------------------------------------------------------------------"
70 "----------------"
71 "------------------------------------------------"
72 "--------------------------------\n");
73 90
74 read_lock_irq(&tasklist_lock); 91 read_lock_irq(&tasklist_lock);
75 92
@@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
83 read_unlock_irq(&tasklist_lock); 100 read_unlock_irq(&tasklist_lock);
84} 101}
85 102
86static void 103void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
87print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
88{ 104{
89 s64 wait_runtime_rq_sum = 0; 105 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
90 struct task_struct *p; 106 spread, rq0_min_vruntime, spread0;
91 struct rb_node *curr;
92 unsigned long flags;
93 struct rq *rq = &per_cpu(runqueues, cpu); 107 struct rq *rq = &per_cpu(runqueues, cpu);
108 struct sched_entity *last;
109 unsigned long flags;
94 110
95 spin_lock_irqsave(&rq->lock, flags);
96 curr = first_fair(cfs_rq);
97 while (curr) {
98 p = rb_entry(curr, struct task_struct, se.run_node);
99 wait_runtime_rq_sum += p->se.wait_runtime;
100
101 curr = rb_next(curr);
102 }
103 spin_unlock_irqrestore(&rq->lock, flags);
104
105 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
106 (long long)wait_runtime_rq_sum);
107}
108
109void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
110{
111 SEQ_printf(m, "\ncfs_rq\n"); 111 SEQ_printf(m, "\ncfs_rq\n");
112 112
113#define P(x) \ 113 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
114 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) 114 SPLIT_NS(cfs_rq->exec_clock));
115
116 P(fair_clock);
117 P(exec_clock);
118 P(wait_runtime);
119 P(wait_runtime_overruns);
120 P(wait_runtime_underruns);
121 P(sleeper_bonus);
122#undef P
123 115
124 print_cfs_rq_runtime_sum(m, cpu, cfs_rq); 116 spin_lock_irqsave(&rq->lock, flags);
117 if (cfs_rq->rb_leftmost)
118 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
119 last = __pick_last_entity(cfs_rq);
120 if (last)
121 max_vruntime = last->vruntime;
122 min_vruntime = rq->cfs.min_vruntime;
123 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
124 spin_unlock_irqrestore(&rq->lock, flags);
125 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
126 SPLIT_NS(MIN_vruntime));
127 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
128 SPLIT_NS(min_vruntime));
129 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
130 SPLIT_NS(max_vruntime));
131 spread = max_vruntime - MIN_vruntime;
132 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
133 SPLIT_NS(spread));
134 spread0 = min_vruntime - rq0_min_vruntime;
135 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
136 SPLIT_NS(spread0));
137 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
138 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
139#ifdef CONFIG_SCHEDSTATS
140 SEQ_printf(m, " .%-30s: %ld\n", "bkl_count",
141 rq->bkl_count);
142#endif
143 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
144 cfs_rq->nr_spread_over);
125} 145}
126 146
127static void print_cpu(struct seq_file *m, int cpu) 147static void print_cpu(struct seq_file *m, int cpu)
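
The spread and spread0 fields printed above are plain vruntime differences: the gap between the leftmost and rightmost queued entity on this CPU, and this CPU's min_vruntime measured against CPU 0's. A tiny sketch with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical snapshot of one CPU's cfs_rq, in nanoseconds. */
        int64_t MIN_vruntime = 100000000;       /* leftmost queued entity */
        int64_t max_vruntime = 104000000;       /* rightmost queued entity */
        int64_t min_vruntime = 100000000;       /* this CPU's cfs.min_vruntime */
        int64_t rq0_min_vruntime = 98000000;    /* CPU 0's cfs.min_vruntime */

        printf("spread  = %lld ns\n", (long long)(max_vruntime - MIN_vruntime));
        printf("spread0 = %lld ns\n", (long long)(min_vruntime - rq0_min_vruntime));
        return 0;
}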
@@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu)
141 161
142#define P(x) \ 162#define P(x) \
143 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 163 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
164#define PN(x) \
165 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
144 166
145 P(nr_running); 167 P(nr_running);
146 SEQ_printf(m, " .%-30s: %lu\n", "load", 168 SEQ_printf(m, " .%-30s: %lu\n", "load",
147 rq->ls.load.weight); 169 rq->load.weight);
148 P(ls.delta_fair);
149 P(ls.delta_exec);
150 P(nr_switches); 170 P(nr_switches);
151 P(nr_load_updates); 171 P(nr_load_updates);
152 P(nr_uninterruptible); 172 P(nr_uninterruptible);
153 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); 173 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
154 P(next_balance); 174 PN(next_balance);
155 P(curr->pid); 175 P(curr->pid);
156 P(clock); 176 PN(clock);
157 P(idle_clock); 177 PN(idle_clock);
158 P(prev_clock_raw); 178 PN(prev_clock_raw);
159 P(clock_warps); 179 P(clock_warps);
160 P(clock_overflows); 180 P(clock_overflows);
161 P(clock_deep_idle_events); 181 P(clock_deep_idle_events);
162 P(clock_max_delta); 182 PN(clock_max_delta);
163 P(cpu_load[0]); 183 P(cpu_load[0]);
164 P(cpu_load[1]); 184 P(cpu_load[1]);
165 P(cpu_load[2]); 185 P(cpu_load[2]);
166 P(cpu_load[3]); 186 P(cpu_load[3]);
167 P(cpu_load[4]); 187 P(cpu_load[4]);
168#undef P 188#undef P
189#undef PN
169 190
170 print_cfs_stats(m, cpu); 191 print_cfs_stats(m, cpu);
171 192
@@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v)
177 u64 now = ktime_to_ns(ktime_get()); 198 u64 now = ktime_to_ns(ktime_get());
178 int cpu; 199 int cpu;
179 200
180 SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", 201 SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n",
181 init_utsname()->release, 202 init_utsname()->release,
182 (int)strcspn(init_utsname()->version, " "), 203 (int)strcspn(init_utsname()->version, " "),
183 init_utsname()->version); 204 init_utsname()->version);
184 205
185 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); 206 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
207
208#define P(x) \
209 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
210#define PN(x) \
211 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
212 PN(sysctl_sched_latency);
213 PN(sysctl_sched_nr_latency);
214 PN(sysctl_sched_wakeup_granularity);
215 PN(sysctl_sched_batch_wakeup_granularity);
216 PN(sysctl_sched_child_runs_first);
217 P(sysctl_sched_features);
218#undef PN
219#undef P
186 220
187 for_each_online_cpu(cpu) 221 for_each_online_cpu(cpu)
188 print_cpu(m, cpu); 222 print_cpu(m, cpu);
@@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp)
202 return single_open(filp, sched_debug_show, NULL); 236 return single_open(filp, sched_debug_show, NULL);
203} 237}
204 238
205static struct file_operations sched_debug_fops = { 239static const struct file_operations sched_debug_fops = {
206 .open = sched_debug_open, 240 .open = sched_debug_open,
207 .read = seq_read, 241 .read = seq_read,
208 .llseek = seq_lseek, 242 .llseek = seq_lseek,
@@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs);
226 260
227void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 261void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
228{ 262{
263 unsigned long nr_switches;
229 unsigned long flags; 264 unsigned long flags;
230 int num_threads = 1; 265 int num_threads = 1;
231 266
@@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
237 rcu_read_unlock(); 272 rcu_read_unlock();
238 273
239 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 274 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
240 SEQ_printf(m, "----------------------------------------------\n"); 275 SEQ_printf(m,
276 "---------------------------------------------------------\n");
277#define __P(F) \
278 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
241#define P(F) \ 279#define P(F) \
242 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) 280 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
281#define __PN(F) \
282 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
283#define PN(F) \
284 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
243 285
244 P(se.wait_runtime); 286 PN(se.exec_start);
245 P(se.wait_start_fair); 287 PN(se.vruntime);
246 P(se.exec_start); 288 PN(se.sum_exec_runtime);
247 P(se.sleep_start_fair); 289
248 P(se.sum_exec_runtime); 290 nr_switches = p->nvcsw + p->nivcsw;
249 291
250#ifdef CONFIG_SCHEDSTATS 292#ifdef CONFIG_SCHEDSTATS
251 P(se.wait_start); 293 PN(se.wait_start);
252 P(se.sleep_start); 294 PN(se.sleep_start);
253 P(se.block_start); 295 PN(se.block_start);
254 P(se.sleep_max); 296 PN(se.sleep_max);
255 P(se.block_max); 297 PN(se.block_max);
256 P(se.exec_max); 298 PN(se.exec_max);
257 P(se.wait_max); 299 PN(se.slice_max);
258 P(se.wait_runtime_overruns); 300 PN(se.wait_max);
259 P(se.wait_runtime_underruns); 301 P(sched_info.bkl_count);
260 P(se.sum_wait_runtime); 302 P(se.nr_migrations);
303 P(se.nr_migrations_cold);
304 P(se.nr_failed_migrations_affine);
305 P(se.nr_failed_migrations_running);
306 P(se.nr_failed_migrations_hot);
307 P(se.nr_forced_migrations);
308 P(se.nr_forced2_migrations);
309 P(se.nr_wakeups);
310 P(se.nr_wakeups_sync);
311 P(se.nr_wakeups_migrate);
312 P(se.nr_wakeups_local);
313 P(se.nr_wakeups_remote);
314 P(se.nr_wakeups_affine);
315 P(se.nr_wakeups_affine_attempts);
316 P(se.nr_wakeups_passive);
317 P(se.nr_wakeups_idle);
318
319 {
320 u64 avg_atom, avg_per_cpu;
321
322 avg_atom = p->se.sum_exec_runtime;
323 if (nr_switches)
324 do_div(avg_atom, nr_switches);
325 else
326 avg_atom = -1LL;
327
328 avg_per_cpu = p->se.sum_exec_runtime;
329 if (p->se.nr_migrations)
330 avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
331 else
332 avg_per_cpu = -1LL;
333
334 __PN(avg_atom);
335 __PN(avg_per_cpu);
336 }
261#endif 337#endif
262 SEQ_printf(m, "%-25s:%20Ld\n", 338 __P(nr_switches);
263 "nr_switches", (long long)(p->nvcsw + p->nivcsw)); 339 SEQ_printf(m, "%-35s:%21Ld\n",
340 "nr_voluntary_switches", (long long)p->nvcsw);
341 SEQ_printf(m, "%-35s:%21Ld\n",
342 "nr_involuntary_switches", (long long)p->nivcsw);
343
264 P(se.load.weight); 344 P(se.load.weight);
265 P(policy); 345 P(policy);
266 P(prio); 346 P(prio);
347#undef PN
348#undef __PN
267#undef P 349#undef P
350#undef __P
268 351
269 { 352 {
270 u64 t0, t1; 353 u64 t0, t1;
271 354
272 t0 = sched_clock(); 355 t0 = sched_clock();
273 t1 = sched_clock(); 356 t1 = sched_clock();
274 SEQ_printf(m, "%-25s:%20Ld\n", 357 SEQ_printf(m, "%-35s:%21Ld\n",
275 "clock-delta", (long long)(t1-t0)); 358 "clock-delta", (long long)(t1-t0));
276 } 359 }
277} 360}
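
The avg_atom and avg_per_cpu values computed above are simple ratios: total CPU time divided by the number of context switches, and by the number of migrations. A quick sketch with hypothetical counters (the kernel falls back to -1 when a divisor is zero):

#include <stdio.h>

int main(void)
{
        unsigned long long sum_exec_runtime = 600000000ULL;    /* 600 ms, hypothetical */
        unsigned long long nr_switches = 300;
        unsigned long long nr_migrations = 4;

        /* Average CPU burst between two context switches. */
        printf("avg_atom    = %llu ns\n", sum_exec_runtime / nr_switches);
        /* Average runtime accumulated per CPU the task has visited. */
        printf("avg_per_cpu = %llu ns\n", sum_exec_runtime / nr_migrations);
        return 0;
}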
@@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
279void proc_sched_set_task(struct task_struct *p) 362void proc_sched_set_task(struct task_struct *p)
280{ 363{
281#ifdef CONFIG_SCHEDSTATS 364#ifdef CONFIG_SCHEDSTATS
282 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; 365 p->se.wait_max = 0;
283 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; 366 p->se.sleep_max = 0;
367 p->se.sum_sleep_runtime = 0;
368 p->se.block_max = 0;
369 p->se.exec_max = 0;
370 p->se.slice_max = 0;
371 p->se.nr_migrations = 0;
372 p->se.nr_migrations_cold = 0;
373 p->se.nr_failed_migrations_affine = 0;
374 p->se.nr_failed_migrations_running = 0;
375 p->se.nr_failed_migrations_hot = 0;
376 p->se.nr_forced_migrations = 0;
377 p->se.nr_forced2_migrations = 0;
378 p->se.nr_wakeups = 0;
379 p->se.nr_wakeups_sync = 0;
380 p->se.nr_wakeups_migrate = 0;
381 p->se.nr_wakeups_local = 0;
382 p->se.nr_wakeups_remote = 0;
383 p->se.nr_wakeups_affine = 0;
384 p->se.nr_wakeups_affine_attempts = 0;
385 p->se.nr_wakeups_passive = 0;
386 p->se.nr_wakeups_idle = 0;
387 p->sched_info.bkl_count = 0;
284#endif 388#endif
285 p->se.sum_exec_runtime = 0; 389 p->se.sum_exec_runtime = 0;
286 p->se.prev_sum_exec_runtime = 0; 390 p->se.prev_sum_exec_runtime = 0;
391 p->nvcsw = 0;
392 p->nivcsw = 0;
287} 393}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 67c67a87146e..a17b785d7000 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,22 +25,26 @@
25 * (default: 20ms, units: nanoseconds) 25 * (default: 20ms, units: nanoseconds)
26 * 26 *
27 * NOTE: this latency value is not the same as the concept of 27 * NOTE: this latency value is not the same as the concept of
28 * 'timeslice length' - timeslices in CFS are of variable length. 28 * 'timeslice length' - timeslices in CFS are of variable length
29 * (to see the precise effective timeslice length of your workload, 29 * and have no persistent notion like in traditional, time-slice
30 * run vmstat and monitor the context-switches field) 30 * based scheduling concepts.
31 * 31 *
32 * On SMP systems the value of this is multiplied by the log2 of the 32 * (to see the precise effective timeslice length of your workload,
33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 33 * run vmstat and monitor the context-switches (cs) field)
34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
35 * Targeted preemption latency for CPU-bound tasks:
36 */ 34 */
37unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; 35const_debug unsigned int sysctl_sched_latency = 20000000ULL;
36
37/*
38 * After fork, child runs first. (default) If set to 0 then
39 * parent will (try to) run first.
40 */
41const_debug unsigned int sysctl_sched_child_runs_first = 1;
38 42
39/* 43/*
40 * Minimal preemption granularity for CPU-bound tasks: 44 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 2 msec, units: nanoseconds) 45 * (default: 2 msec, units: nanoseconds)
42 */ 46 */
43unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; 47const_debug unsigned int sysctl_sched_nr_latency = 20;
44 48
45/* 49/*
46 * sys_sched_yield() compat mode 50 * sys_sched_yield() compat mode
@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
52 56
53/* 57/*
54 * SCHED_BATCH wake-up granularity. 58 * SCHED_BATCH wake-up granularity.
55 * (default: 25 msec, units: nanoseconds) 59 * (default: 10 msec, units: nanoseconds)
56 * 60 *
57 * This option delays the preemption effects of decoupled workloads 61 * This option delays the preemption effects of decoupled workloads
58 * and reduces their over-scheduling. Synchronous workloads will still 62 * and reduces their over-scheduling. Synchronous workloads will still
59 * have immediate wakeup/sleep latencies. 63 * have immediate wakeup/sleep latencies.
60 */ 64 */
61unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; 65const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
62 66
63/* 67/*
64 * SCHED_OTHER wake-up granularity. 68 * SCHED_OTHER wake-up granularity.
65 * (default: 1 msec, units: nanoseconds) 69 * (default: 10 msec, units: nanoseconds)
66 * 70 *
67 * This option delays the preemption effects of decoupled workloads 71 * This option delays the preemption effects of decoupled workloads
68 * and reduces their over-scheduling. Synchronous workloads will still 72 * and reduces their over-scheduling. Synchronous workloads will still
69 * have immediate wakeup/sleep latencies. 73 * have immediate wakeup/sleep latencies.
70 */ 74 */
71unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; 75const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
72
73unsigned int sysctl_sched_stat_granularity __read_mostly;
74
75/*
76 * Initialized in sched_init_granularity() [to 5 times the base granularity]:
77 */
78unsigned int sysctl_sched_runtime_limit __read_mostly;
79
80/*
81 * Debugging: various feature bits
82 */
83enum {
84 SCHED_FEAT_FAIR_SLEEPERS = 1,
85 SCHED_FEAT_SLEEPER_AVG = 2,
86 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
87 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
88 SCHED_FEAT_START_DEBIT = 16,
89 SCHED_FEAT_SKIP_INITIAL = 32,
90};
91 76
92unsigned int sysctl_sched_features __read_mostly = 77const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
93 SCHED_FEAT_FAIR_SLEEPERS *1 |
94 SCHED_FEAT_SLEEPER_AVG *0 |
95 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
96 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
97 SCHED_FEAT_START_DEBIT *1 |
98 SCHED_FEAT_SKIP_INITIAL *0;
99
100extern struct sched_class fair_sched_class;
101 78
102/************************************************************** 79/**************************************************************
103 * CFS operations on generic schedulable entities: 80 * CFS operations on generic schedulable entities:
@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
111 return cfs_rq->rq; 88 return cfs_rq->rq;
112} 89}
113 90
114/* currently running entity (if any) on this cfs_rq */
115static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
116{
117 return cfs_rq->curr;
118}
119
120/* An entity is a task if it doesn't "own" a runqueue */ 91/* An entity is a task if it doesn't "own" a runqueue */
121#define entity_is_task(se) (!se->my_q) 92#define entity_is_task(se) (!se->my_q)
122 93
123static inline void
124set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
125{
126 cfs_rq->curr = se;
127}
128
129#else /* CONFIG_FAIR_GROUP_SCHED */ 94#else /* CONFIG_FAIR_GROUP_SCHED */
130 95
131static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 96static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
133 return container_of(cfs_rq, struct rq, cfs); 98 return container_of(cfs_rq, struct rq, cfs);
134} 99}
135 100
136static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
137{
138 struct rq *rq = rq_of(cfs_rq);
139
140 if (unlikely(rq->curr->sched_class != &fair_sched_class))
141 return NULL;
142
143 return &rq->curr->se;
144}
145
146#define entity_is_task(se) 1 101#define entity_is_task(se) 1
147 102
148static inline void
149set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
150
151#endif /* CONFIG_FAIR_GROUP_SCHED */ 103#endif /* CONFIG_FAIR_GROUP_SCHED */
152 104
153static inline struct task_struct *task_of(struct sched_entity *se) 105static inline struct task_struct *task_of(struct sched_entity *se)
@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se)
160 * Scheduling class tree data structure manipulation methods: 112 * Scheduling class tree data structure manipulation methods:
161 */ 113 */
162 114
115static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
116{
117 s64 delta = (s64)(vruntime - min_vruntime);
118 if (delta > 0)
119 min_vruntime = vruntime;
120
121 return min_vruntime;
122}
123
124static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
125{
126 s64 delta = (s64)(vruntime - min_vruntime);
127 if (delta < 0)
128 min_vruntime = vruntime;
129
130 return min_vruntime;
131}
132
133static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
134{
135 return se->vruntime - cfs_rq->min_vruntime;
136}
137
163/* 138/*
164 * Enqueue an entity into the rb-tree: 139 * Enqueue an entity into the rb-tree:
165 */ 140 */
166static inline void 141static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
167__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
168{ 142{
169 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 143 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
170 struct rb_node *parent = NULL; 144 struct rb_node *parent = NULL;
171 struct sched_entity *entry; 145 struct sched_entity *entry;
172 s64 key = se->fair_key; 146 s64 key = entity_key(cfs_rq, se);
173 int leftmost = 1; 147 int leftmost = 1;
174 148
175 /* 149 /*
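
max_vruntime()/min_vruntime() above compare two u64 clocks through a signed delta, so the comparison stays correct even after the 64-bit counter wraps. A hedged userspace illustration of why that matters; max_vruntime_sketch and the sample values are hypothetical:

#include <stdio.h>
#include <stdint.h>

/* Pick the later of two u64 timestamps, treating their difference as
 * signed so a wrapped value still compares as "later". */
static uint64_t max_vruntime_sketch(uint64_t min_vruntime, uint64_t vruntime)
{
        int64_t delta = (int64_t)(vruntime - min_vruntime);

        if (delta > 0)
                min_vruntime = vruntime;
        return min_vruntime;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 10;   /* just before wraparound */
        uint64_t wrapped = 5;                   /* just after wraparound */

        /* A plain '>' would keep near_wrap; the signed delta picks wrapped. */
        printf("%llu\n", (unsigned long long)max_vruntime_sketch(near_wrap, wrapped));
        return 0;
}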
@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
182 * We dont care about collisions. Nodes with 156 * We dont care about collisions. Nodes with
183 * the same key stay together. 157 * the same key stay together.
184 */ 158 */
185 if (key - entry->fair_key < 0) { 159 if (key < entity_key(cfs_rq, entry)) {
186 link = &parent->rb_left; 160 link = &parent->rb_left;
187 } else { 161 } else {
188 link = &parent->rb_right; 162 link = &parent->rb_right;
@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
199 173
200 rb_link_node(&se->run_node, parent, link); 174 rb_link_node(&se->run_node, parent, link);
201 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 175 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
202 update_load_add(&cfs_rq->load, se->load.weight);
203 cfs_rq->nr_running++;
204 se->on_rq = 1;
205
206 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
207} 176}
208 177
209static inline void 178static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
210__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
211{ 179{
212 if (cfs_rq->rb_leftmost == &se->run_node) 180 if (cfs_rq->rb_leftmost == &se->run_node)
213 cfs_rq->rb_leftmost = rb_next(&se->run_node); 181 cfs_rq->rb_leftmost = rb_next(&se->run_node);
214 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
215 update_load_sub(&cfs_rq->load, se->load.weight);
216 cfs_rq->nr_running--;
217 se->on_rq = 0;
218 182
219 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 183 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
220} 184}
221 185
222static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 186static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
229 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 193 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
230} 194}
231 195
196static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
197{
198 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
199 struct sched_entity *se = NULL;
200 struct rb_node *parent;
201
202 while (*link) {
203 parent = *link;
204 se = rb_entry(parent, struct sched_entity, run_node);
205 link = &parent->rb_right;
206 }
207
208 return se;
209}
210
232/************************************************************** 211/**************************************************************
233 * Scheduling class statistics methods: 212 * Scheduling class statistics methods:
234 */ 213 */
235 214
215
236/* 216/*
237 * Calculate the preemption granularity needed to schedule every 217 * The idea is to set a period in which each task runs once.
238 * runnable task once per sysctl_sched_latency amount of time.
239 * (down to a sensible low limit on granularity)
240 *
241 * For example, if there are 2 tasks running and latency is 10 msecs,
242 * we switch tasks every 5 msecs. If we have 3 tasks running, we have
243 * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
244 * for each task. We do finer and finer scheduling up to until we
245 * reach the minimum granularity value.
246 *
247 * To achieve this we use the following dynamic-granularity rule:
248 * 218 *
249 * gran = lat/nr - lat/nr/nr 219 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
220 * this period because otherwise the slices get too small.
250 * 221 *
251 * This comes out of the following equations: 222 * p = (nr <= nl) ? l : l*nr/nl
252 *
253 * kA1 + gran = kB1
254 * kB2 + gran = kA2
255 * kA2 = kA1
256 * kB2 = kB1 - d + d/nr
257 * lat = d * nr
258 *
259 * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
260 * '1' is start of time, '2' is end of time, 'd' is delay between
261 * 1 and 2 (during which task B was running), 'nr' is number of tasks
 262 * running, 'lat' is the period of each task. ('lat' is the
263 * sched_latency that we aim for.)
264 */ 223 */
265static long 224static u64 __sched_period(unsigned long nr_running)
266sched_granularity(struct cfs_rq *cfs_rq)
267{ 225{
268 unsigned int gran = sysctl_sched_latency; 226 u64 period = sysctl_sched_latency;
269 unsigned int nr = cfs_rq->nr_running; 227 unsigned long nr_latency = sysctl_sched_nr_latency;
270 228
271 if (nr > 1) { 229 if (unlikely(nr_running > nr_latency)) {
272 gran = gran/nr - gran/nr/nr; 230 period *= nr_running;
273 gran = max(gran, sysctl_sched_min_granularity); 231 do_div(period, nr_latency);
274 } 232 }
275 233
276 return gran; 234 return period;
277} 235}
278 236
279/* 237/*
280 * We rescale the rescheduling granularity of tasks according to their 238 * We calculate the wall-time slice from the period by taking a part
281 * nice level, but only linearly, not exponentially: 239 * proportional to the weight.
240 *
241 * s = p*w/rw
282 */ 242 */
283static long 243static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
284niced_granularity(struct sched_entity *curr, unsigned long granularity)
285{ 244{
286 u64 tmp; 245 u64 slice = __sched_period(cfs_rq->nr_running);
287 246
288 if (likely(curr->load.weight == NICE_0_LOAD)) 247 slice *= se->load.weight;
289 return granularity; 248 do_div(slice, cfs_rq->load.weight);
290 /*
291 * Positive nice levels get the same granularity as nice-0:
292 */
293 if (likely(curr->load.weight < NICE_0_LOAD)) {
294 tmp = curr->load.weight * (u64)granularity;
295 return (long) (tmp >> NICE_0_SHIFT);
296 }
297 /*
298 * Negative nice level tasks get linearly finer
299 * granularity:
300 */
301 tmp = curr->load.inv_weight * (u64)granularity;
302 249
303 /* 250 return slice;
304 * It will always fit into 'long':
305 */
306 return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
307} 251}
308 252
309static inline void 253/*
310limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) 254 * We calculate the vruntime slice.
255 *
256 * vs = s/w = p/rw
257 */
258static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
311{ 259{
312 long limit = sysctl_sched_runtime_limit; 260 u64 vslice = __sched_period(nr_running);
313 261
314 /* 262 do_div(vslice, rq_weight);
315 * Niced tasks have the same history dynamic range as 263
316 * non-niced tasks: 264 return vslice;
317 */
318 if (unlikely(se->wait_runtime > limit)) {
319 se->wait_runtime = limit;
320 schedstat_inc(se, wait_runtime_overruns);
321 schedstat_inc(cfs_rq, wait_runtime_overruns);
322 }
323 if (unlikely(se->wait_runtime < -limit)) {
324 se->wait_runtime = -limit;
325 schedstat_inc(se, wait_runtime_underruns);
326 schedstat_inc(cfs_rq, wait_runtime_underruns);
327 }
328} 265}
329 266
330static inline void 267static u64 sched_vslice(struct cfs_rq *cfs_rq)
331__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
332{ 268{
333 se->wait_runtime += delta; 269 return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
334 schedstat_add(se, sum_wait_runtime, delta);
335 limit_wait_runtime(cfs_rq, se);
336} 270}
337 271
338static void 272static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
339add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
340{ 273{
341 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 274 return __sched_vslice(cfs_rq->load.weight + se->load.weight,
342 __add_wait_runtime(cfs_rq, se, delta); 275 cfs_rq->nr_running + 1);
343 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
344} 276}
345 277
346/* 278/*
@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
348 * are not in our scheduling class. 280 * are not in our scheduling class.
349 */ 281 */
350static inline void 282static inline void
351__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) 283__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
284 unsigned long delta_exec)
352{ 285{
353 unsigned long delta, delta_exec, delta_fair, delta_mine; 286 unsigned long delta_exec_weighted;
354 struct load_weight *lw = &cfs_rq->load; 287 u64 vruntime;
355 unsigned long load = lw->weight;
356 288
357 delta_exec = curr->delta_exec;
358 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 289 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
359 290
360 curr->sum_exec_runtime += delta_exec; 291 curr->sum_exec_runtime += delta_exec;
361 cfs_rq->exec_clock += delta_exec; 292 schedstat_add(cfs_rq, exec_clock, delta_exec);
362 293 delta_exec_weighted = delta_exec;
363 if (unlikely(!load)) 294 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
364 return; 295 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
365 296 &curr->load);
366 delta_fair = calc_delta_fair(delta_exec, lw);
367 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
368
369 if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
370 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
371 delta = min(delta, (unsigned long)(
372 (long)sysctl_sched_runtime_limit - curr->wait_runtime));
373 cfs_rq->sleeper_bonus -= delta;
374 delta_mine -= delta;
375 } 297 }
298 curr->vruntime += delta_exec_weighted;
376 299
377 cfs_rq->fair_clock += delta_fair;
378 /* 300 /*
379 * We executed delta_exec amount of time on the CPU, 301 * maintain cfs_rq->min_vruntime to be a monotonic increasing
380 * but we were only entitled to delta_mine amount of 302 * value tracking the leftmost vruntime in the tree.
381 * time during that period (if nr_running == 1 then
382 * the two values are equal)
383 * [Note: delta_mine - delta_exec is negative]:
384 */ 303 */
385 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); 304 if (first_fair(cfs_rq)) {
305 vruntime = min_vruntime(curr->vruntime,
306 __pick_next_entity(cfs_rq)->vruntime);
307 } else
308 vruntime = curr->vruntime;
309
310 cfs_rq->min_vruntime =
311 max_vruntime(cfs_rq->min_vruntime, vruntime);
386} 312}
387 313
388static void update_curr(struct cfs_rq *cfs_rq) 314static void update_curr(struct cfs_rq *cfs_rq)
389{ 315{
390 struct sched_entity *curr = cfs_rq_curr(cfs_rq); 316 struct sched_entity *curr = cfs_rq->curr;
317 u64 now = rq_of(cfs_rq)->clock;
391 unsigned long delta_exec; 318 unsigned long delta_exec;
392 319
393 if (unlikely(!curr)) 320 if (unlikely(!curr))
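
For a non-nice-0 entity, __update_curr() above scales the executed wall time by NICE_0_LOAD/weight before adding it to vruntime, which is what makes heavier tasks advance their virtual clock more slowly. A rough sketch of that scaling, assuming the same 2^32 inverse-weight trick (the real calc_delta_fair() lives in sched.c); weight_delta and the weights are illustrative only:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL

/* delta_weighted = delta * NICE_0_LOAD / weight, via a precomputed inverse. */
static uint64_t weight_delta(uint64_t delta_ns, unsigned long weight)
{
        uint64_t inv = (1ULL << 32) / weight;

        return (delta_ns * NICE_0_LOAD * inv) >> 32;
}

int main(void)
{
        uint64_t ran = 1000000ULL;      /* the task ran 1 ms of wall time */

        printf("nice 0    -> +%llu ns vruntime\n", (unsigned long long)weight_delta(ran, 1024));
        printf("half load -> +%llu ns vruntime\n", (unsigned long long)weight_delta(ran, 512));
        printf("2x load   -> +%llu ns vruntime\n", (unsigned long long)weight_delta(ran, 2048));
        return 0;
}

The half-weight task is charged 2 ms of virtual time for 1 ms of CPU, the double-weight task only 0.5 ms, so the rbtree naturally gives the heavier task a proportionally larger share.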
@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq)
398 * since the last time we changed load (this cannot 325 * since the last time we changed load (this cannot
399 * overflow on 32 bits): 326 * overflow on 32 bits):
400 */ 327 */
401 delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); 328 delta_exec = (unsigned long)(now - curr->exec_start);
402 329
403 curr->delta_exec += delta_exec; 330 __update_curr(cfs_rq, curr, delta_exec);
404 331 curr->exec_start = now;
405 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
406 __update_curr(cfs_rq, curr);
407 curr->delta_exec = 0;
408 }
409 curr->exec_start = rq_of(cfs_rq)->clock;
410} 332}
411 333
412static inline void 334static inline void
413update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 335update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
414{ 336{
415 se->wait_start_fair = cfs_rq->fair_clock;
416 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 337 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
417} 338}
418 339
419/* 340/*
420 * We calculate fair deltas here, so protect against the random effects
421 * of a multiplication overflow by capping it to the runtime limit:
422 */
423#if BITS_PER_LONG == 32
424static inline unsigned long
425calc_weighted(unsigned long delta, unsigned long weight, int shift)
426{
427 u64 tmp = (u64)delta * weight >> shift;
428
429 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
430 return sysctl_sched_runtime_limit*2;
431 return tmp;
432}
433#else
434static inline unsigned long
435calc_weighted(unsigned long delta, unsigned long weight, int shift)
436{
437 return delta * weight >> shift;
438}
439#endif
440
441/*
442 * Task is being enqueued - update stats: 341 * Task is being enqueued - update stats:
443 */ 342 */
444static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 343static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
445{ 344{
446 s64 key;
447
448 /* 345 /*
449 * Are we enqueueing a waiting task? (for current tasks 346 * Are we enqueueing a waiting task? (for current tasks
450 * a dequeue/enqueue event is a NOP) 347 * a dequeue/enqueue event is a NOP)
451 */ 348 */
452 if (se != cfs_rq_curr(cfs_rq)) 349 if (se != cfs_rq->curr)
453 update_stats_wait_start(cfs_rq, se); 350 update_stats_wait_start(cfs_rq, se);
454 /*
455 * Update the key:
456 */
457 key = cfs_rq->fair_clock;
458
459 /*
460 * Optimize the common nice 0 case:
461 */
462 if (likely(se->load.weight == NICE_0_LOAD)) {
463 key -= se->wait_runtime;
464 } else {
465 u64 tmp;
466
467 if (se->wait_runtime < 0) {
468 tmp = -se->wait_runtime;
469 key += (tmp * se->load.inv_weight) >>
470 (WMULT_SHIFT - NICE_0_SHIFT);
471 } else {
472 tmp = se->wait_runtime;
473 key -= (tmp * se->load.inv_weight) >>
474 (WMULT_SHIFT - NICE_0_SHIFT);
475 }
476 }
477
478 se->fair_key = key;
479}
480
481/*
482 * Note: must be called with a freshly updated rq->fair_clock.
483 */
484static inline void
485__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
486{
487 unsigned long delta_fair = se->delta_fair_run;
488
489 schedstat_set(se->wait_max, max(se->wait_max,
490 rq_of(cfs_rq)->clock - se->wait_start));
491
492 if (unlikely(se->load.weight != NICE_0_LOAD))
493 delta_fair = calc_weighted(delta_fair, se->load.weight,
494 NICE_0_SHIFT);
495
496 add_wait_runtime(cfs_rq, se, delta_fair);
497} 351}
498 352
499static void 353static void
500update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 354update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
501{ 355{
502 unsigned long delta_fair; 356 schedstat_set(se->wait_max, max(se->wait_max,
503 357 rq_of(cfs_rq)->clock - se->wait_start));
504 if (unlikely(!se->wait_start_fair))
505 return;
506
507 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
508 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
509
510 se->delta_fair_run += delta_fair;
511 if (unlikely(abs(se->delta_fair_run) >=
512 sysctl_sched_stat_granularity)) {
513 __update_stats_wait_end(cfs_rq, se);
514 se->delta_fair_run = 0;
515 }
516
517 se->wait_start_fair = 0;
518 schedstat_set(se->wait_start, 0); 358 schedstat_set(se->wait_start, 0);
519} 359}
520 360
521static inline void 361static inline void
522update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 362update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{ 363{
524 update_curr(cfs_rq);
525 /* 364 /*
526 * Mark the end of the wait period if dequeueing a 365 * Mark the end of the wait period if dequeueing a
527 * waiting task: 366 * waiting task:
528 */ 367 */
529 if (se != cfs_rq_curr(cfs_rq)) 368 if (se != cfs_rq->curr)
530 update_stats_wait_end(cfs_rq, se); 369 update_stats_wait_end(cfs_rq, se);
531} 370}
532 371
@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 se->exec_start = rq_of(cfs_rq)->clock; 381 se->exec_start = rq_of(cfs_rq)->clock;
543} 382}
544 383
545/*
546 * We are descheduling a task - update its stats:
547 */
548static inline void
549update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{
551 se->exec_start = 0;
552}
553
554/************************************************** 384/**************************************************
555 * Scheduling class queueing methods: 385 * Scheduling class queueing methods:
556 */ 386 */
557 387
558static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 388static void
389account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
559{ 390{
560 unsigned long load = cfs_rq->load.weight, delta_fair; 391 update_load_add(&cfs_rq->load, se->load.weight);
561 long prev_runtime; 392 cfs_rq->nr_running++;
562 393 se->on_rq = 1;
563 /* 394}
564 * Do not boost sleepers if there's too much bonus 'in flight'
565 * already:
566 */
567 if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
568 return;
569
570 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
571 load = rq_of(cfs_rq)->cpu_load[2];
572
573 delta_fair = se->delta_fair_sleep;
574
575 /*
576 * Fix up delta_fair with the effect of us running
577 * during the whole sleep period:
578 */
579 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
580 delta_fair = div64_likely32((u64)delta_fair * load,
581 load + se->load.weight);
582
583 if (unlikely(se->load.weight != NICE_0_LOAD))
584 delta_fair = calc_weighted(delta_fair, se->load.weight,
585 NICE_0_SHIFT);
586
587 prev_runtime = se->wait_runtime;
588 __add_wait_runtime(cfs_rq, se, delta_fair);
589 delta_fair = se->wait_runtime - prev_runtime;
590 395
591 /* 396static void
592 * Track the amount of bonus we've given to sleepers: 397account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
593 */ 398{
594 cfs_rq->sleeper_bonus += delta_fair; 399 update_load_sub(&cfs_rq->load, se->load.weight);
400 cfs_rq->nr_running--;
401 se->on_rq = 0;
595} 402}
596 403
597static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 404static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
598{ 405{
599 struct task_struct *tsk = task_of(se);
600 unsigned long delta_fair;
601
602 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
603 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
604 return;
605
606 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
607 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
608
609 se->delta_fair_sleep += delta_fair;
610 if (unlikely(abs(se->delta_fair_sleep) >=
611 sysctl_sched_stat_granularity)) {
612 __enqueue_sleeper(cfs_rq, se);
613 se->delta_fair_sleep = 0;
614 }
615
616 se->sleep_start_fair = 0;
617
618#ifdef CONFIG_SCHEDSTATS 406#ifdef CONFIG_SCHEDSTATS
619 if (se->sleep_start) { 407 if (se->sleep_start) {
620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 408 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
646 * time that the task spent sleeping: 434 * time that the task spent sleeping:
647 */ 435 */
648 if (unlikely(prof_on == SLEEP_PROFILING)) { 436 if (unlikely(prof_on == SLEEP_PROFILING)) {
437 struct task_struct *tsk = task_of(se);
438
649 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 439 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
650 delta >> 20); 440 delta >> 20);
651 } 441 }
@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
653#endif 443#endif
654} 444}
655 445
446static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
447{
448#ifdef CONFIG_SCHED_DEBUG
449 s64 d = se->vruntime - cfs_rq->min_vruntime;
450
451 if (d < 0)
452 d = -d;
453
454 if (d > 3*sysctl_sched_latency)
455 schedstat_inc(cfs_rq, nr_spread_over);
456#endif
457}
458
459static void
460place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
461{
462 u64 vruntime;
463
464 vruntime = cfs_rq->min_vruntime;
465
466 if (sched_feat(TREE_AVG)) {
467 struct sched_entity *last = __pick_last_entity(cfs_rq);
468 if (last) {
469 vruntime += last->vruntime;
470 vruntime >>= 1;
471 }
472 } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
473 vruntime += sched_vslice(cfs_rq)/2;
474
475 if (initial && sched_feat(START_DEBIT))
476 vruntime += sched_vslice_add(cfs_rq, se);
477
478 if (!initial) {
479 if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
480 task_of(se)->policy != SCHED_BATCH)
481 vruntime -= sysctl_sched_latency;
482
483 vruntime = max_t(s64, vruntime, se->vruntime);
484 }
485
486 se->vruntime = vruntime;
487
488}
489
656static void 490static void
657enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 491enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
658{ 492{
659 /* 493 /*
660 * Update the fair clock. 494 * Update run-time statistics of the 'current'.
661 */ 495 */
662 update_curr(cfs_rq); 496 update_curr(cfs_rq);
663 497
664 if (wakeup) 498 if (wakeup) {
499 place_entity(cfs_rq, se, 0);
665 enqueue_sleeper(cfs_rq, se); 500 enqueue_sleeper(cfs_rq, se);
501 }
666 502
667 update_stats_enqueue(cfs_rq, se); 503 update_stats_enqueue(cfs_rq, se);
668 __enqueue_entity(cfs_rq, se); 504 check_spread(cfs_rq, se);
505 if (se != cfs_rq->curr)
506 __enqueue_entity(cfs_rq, se);
507 account_entity_enqueue(cfs_rq, se);
669} 508}
670 509
671static void 510static void
672dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 511dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
673{ 512{
513 /*
514 * Update run-time statistics of the 'current'.
515 */
516 update_curr(cfs_rq);
517
674 update_stats_dequeue(cfs_rq, se); 518 update_stats_dequeue(cfs_rq, se);
675 if (sleep) { 519 if (sleep) {
676 se->sleep_start_fair = cfs_rq->fair_clock; 520 se->peer_preempt = 0;
677#ifdef CONFIG_SCHEDSTATS 521#ifdef CONFIG_SCHEDSTATS
678 if (entity_is_task(se)) { 522 if (entity_is_task(se)) {
679 struct task_struct *tsk = task_of(se); 523 struct task_struct *tsk = task_of(se);
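
place_entity() above gives a newly woken sleeper a vruntime of roughly min_vruntime minus one latency period, but never less than the vruntime it already had, while START_DEBIT pushes a forked child one vslice into the future. A compressed sketch of just the sleeper branch, assuming NEW_FAIR_SLEEPERS is enabled and using the 20 ms default; place_sleeper is a hypothetical stand-in:

#include <stdio.h>
#include <stdint.h>

#define SCHED_LATENCY_NS 20000000LL     /* 20 ms default in this patch */

/* Sleeper placement: credit at most one latency period, never move backwards. */
static int64_t place_sleeper(int64_t cfs_min_vruntime, int64_t old_vruntime)
{
        int64_t vruntime = cfs_min_vruntime - SCHED_LATENCY_NS;

        return vruntime > old_vruntime ? vruntime : old_vruntime;
}

int main(void)
{
        /* Long sleeper: its old vruntime is far in the past, so it is clamped
         * to min_vruntime - latency and gets at most ~20 ms of wakeup bonus. */
        printf("%lld\n", (long long)place_sleeper(100000000LL, 1000000LL));

        /* Short sleeper: keeps its own, already larger, vruntime. */
        printf("%lld\n", (long long)place_sleeper(100000000LL, 95000000LL));
        return 0;
}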
@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
685 } 529 }
686#endif 530#endif
687 } 531 }
688 __dequeue_entity(cfs_rq, se); 532
533 if (se != cfs_rq->curr)
534 __dequeue_entity(cfs_rq, se);
535 account_entity_dequeue(cfs_rq, se);
689} 536}
690 537
691/* 538/*
692 * Preempt the current task with a newly woken task if needed: 539 * Preempt the current task with a newly woken task if needed:
693 */ 540 */
694static void 541static void
695__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, 542check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
696 struct sched_entity *curr, unsigned long granularity)
697{ 543{
698 s64 __delta = curr->fair_key - se->fair_key;
699 unsigned long ideal_runtime, delta_exec; 544 unsigned long ideal_runtime, delta_exec;
700 545
701 /* 546 ideal_runtime = sched_slice(cfs_rq, curr);
702 * ideal_runtime is compared against sum_exec_runtime, which is
703 * walltime, hence do not scale.
704 */
705 ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
706 (unsigned long)sysctl_sched_min_granularity);
707
708 /*
709 * If we executed more than what the latency constraint suggests,
710 * reduce the rescheduling granularity. This way the total latency
711 * of how much a task is not scheduled converges to
712 * sysctl_sched_latency:
713 */
714 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 547 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
715 if (delta_exec > ideal_runtime) 548 if (delta_exec > ideal_runtime ||
716 granularity = 0; 549 (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
717
718 /*
719 * Take scheduling granularity into account - do not
720 * preempt the current task unless the best task has
721 * a larger than sched_granularity fairness advantage:
722 *
723 * scale granularity as key space is in fair_clock.
724 */
725 if (__delta > niced_granularity(curr, granularity))
726 resched_task(rq_of(cfs_rq)->curr); 550 resched_task(rq_of(cfs_rq)->curr);
551 curr->peer_preempt = 0;
727} 552}
728 553
729static inline void 554static void
730set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 555set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
731{ 556{
557 /* 'current' is not kept within the tree. */
558 if (se->on_rq) {
559 /*
 560 * Any task has to be enqueued before it gets to execute on
561 * a CPU. So account for the time it spent waiting on the
562 * runqueue.
563 */
564 update_stats_wait_end(cfs_rq, se);
565 __dequeue_entity(cfs_rq, se);
566 }
567
568 update_stats_curr_start(cfs_rq, se);
569 cfs_rq->curr = se;
570#ifdef CONFIG_SCHEDSTATS
732 /* 571 /*
733 * Any task has to be enqueued before it get to execute on 572 * Track our maximum slice length, if the CPU's load is at
734 * a CPU. So account for the time it spent waiting on the 573 * least twice that of our own weight (i.e. dont track it
735 * runqueue. (note, here we rely on pick_next_task() having 574 * when there are only lesser-weight tasks around):
736 * done a put_prev_task_fair() shortly before this, which
737 * updated rq->fair_clock - used by update_stats_wait_end())
738 */ 575 */
739 update_stats_wait_end(cfs_rq, se); 576 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
740 update_stats_curr_start(cfs_rq, se); 577 se->slice_max = max(se->slice_max,
741 set_cfs_rq_curr(cfs_rq, se); 578 se->sum_exec_runtime - se->prev_sum_exec_runtime);
579 }
580#endif
742 se->prev_sum_exec_runtime = se->sum_exec_runtime; 581 se->prev_sum_exec_runtime = se->sum_exec_runtime;
743} 582}
744 583
745static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 584static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
746{ 585{
747 struct sched_entity *se = __pick_next_entity(cfs_rq); 586 struct sched_entity *se = NULL;
748 587
749 set_next_entity(cfs_rq, se); 588 if (first_fair(cfs_rq)) {
589 se = __pick_next_entity(cfs_rq);
590 set_next_entity(cfs_rq, se);
591 }
750 592
751 return se; 593 return se;
752} 594}
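
check_preempt_tick() above compares how long 'current' has run in its present slot (sum_exec_runtime - prev_sum_exec_runtime) against its sched_slice() entitlement and reschedules once that is exceeded. A toy version of the test with hypothetical numbers and no runqueue:

#include <stdio.h>
#include <stdint.h>

/* Nonzero when the current entity has used up its wall-time slice. */
static int should_resched(uint64_t sum_exec, uint64_t prev_sum_exec,
                          uint64_t ideal_runtime)
{
        return (sum_exec - prev_sum_exec) > ideal_runtime;
}

int main(void)
{
        uint64_t ideal = 4000000ULL;    /* e.g. a 4 ms slice from sched_slice() */

        printf("%d\n", should_resched(103000000ULL, 100000000ULL, ideal)); /* 3 ms used: keep running */
        printf("%d\n", should_resched(105000000ULL, 100000000ULL, ideal)); /* 5 ms used: reschedule */
        return 0;
}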
@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
760 if (prev->on_rq) 602 if (prev->on_rq)
761 update_curr(cfs_rq); 603 update_curr(cfs_rq);
762 604
763 update_stats_curr_end(cfs_rq, prev); 605 check_spread(cfs_rq, prev);
764 606 if (prev->on_rq) {
765 if (prev->on_rq)
766 update_stats_wait_start(cfs_rq, prev); 607 update_stats_wait_start(cfs_rq, prev);
767 set_cfs_rq_curr(cfs_rq, NULL); 608 /* Put 'current' back into the tree. */
609 __enqueue_entity(cfs_rq, prev);
610 }
611 cfs_rq->curr = NULL;
768} 612}
769 613
770static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 614static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
771{ 615{
772 struct sched_entity *next;
773
774 /* 616 /*
775 * Dequeue and enqueue the task to update its 617 * Update run-time statistics of the 'current'.
776 * position within the tree:
777 */ 618 */
778 dequeue_entity(cfs_rq, curr, 0); 619 update_curr(cfs_rq);
779 enqueue_entity(cfs_rq, curr, 0);
780
781 /*
782 * Reschedule if another task tops the current one.
783 */
784 next = __pick_next_entity(cfs_rq);
785 if (next == curr)
786 return;
787 620
788 __check_preempt_curr_fair(cfs_rq, next, curr, 621 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
789 sched_granularity(cfs_rq)); 622 check_preempt_tick(cfs_rq, curr);
790} 623}
791 624
792/************************************************** 625/**************************************************
@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
821 */ 654 */
822static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 655static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
823{ 656{
824 /* A later patch will take group into account */ 657 return cfs_rq->tg->cfs_rq[this_cpu];
825 return &cpu_rq(this_cpu)->cfs;
826} 658}
827 659
828/* Iterate thr' all leaf cfs_rq's on a runqueue */ 660/* Iterate thr' all leaf cfs_rq's on a runqueue */
829#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 661#define for_each_leaf_cfs_rq(rq, cfs_rq) \
830 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 662 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
831 663
832/* Do the two (enqueued) tasks belong to the same group ? */ 664/* Do the two (enqueued) entities belong to the same group ? */
833static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 665static inline int
666is_same_group(struct sched_entity *se, struct sched_entity *pse)
834{ 667{
835 if (curr->se.cfs_rq == p->se.cfs_rq) 668 if (se->cfs_rq == pse->cfs_rq)
836 return 1; 669 return 1;
837 670
838 return 0; 671 return 0;
839} 672}
840 673
674static inline struct sched_entity *parent_entity(struct sched_entity *se)
675{
676 return se->parent;
677}
678
841#else /* CONFIG_FAIR_GROUP_SCHED */ 679#else /* CONFIG_FAIR_GROUP_SCHED */
842 680
843#define for_each_sched_entity(se) \ 681#define for_each_sched_entity(se) \
@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
870#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 708#define for_each_leaf_cfs_rq(rq, cfs_rq) \
871 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 709 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
872 710
873static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 711static inline int
712is_same_group(struct sched_entity *se, struct sched_entity *pse)
874{ 713{
875 return 1; 714 return 1;
876} 715}
877 716
717static inline struct sched_entity *parent_entity(struct sched_entity *se)
718{
719 return NULL;
720}
721
878#endif /* CONFIG_FAIR_GROUP_SCHED */ 722#endif /* CONFIG_FAIR_GROUP_SCHED */
879 723
880/* 724/*
@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
892 break; 736 break;
893 cfs_rq = cfs_rq_of(se); 737 cfs_rq = cfs_rq_of(se);
894 enqueue_entity(cfs_rq, se, wakeup); 738 enqueue_entity(cfs_rq, se, wakeup);
739 wakeup = 1;
895 } 740 }
896} 741}
897 742
@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
911 /* Don't dequeue parent if it has other entities besides us */ 756 /* Don't dequeue parent if it has other entities besides us */
912 if (cfs_rq->load.weight) 757 if (cfs_rq->load.weight)
913 break; 758 break;
759 sleep = 1;
914 } 760 }
915} 761}
916 762
@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
919 * 765 *
920 * If compat_yield is turned on then we requeue to the end of the tree. 766 * If compat_yield is turned on then we requeue to the end of the tree.
921 */ 767 */
922static void yield_task_fair(struct rq *rq, struct task_struct *p) 768static void yield_task_fair(struct rq *rq)
923{ 769{
924 struct cfs_rq *cfs_rq = task_cfs_rq(p); 770 struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
925 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 771 struct sched_entity *rightmost, *se = &rq->curr->se;
926 struct sched_entity *rightmost, *se = &p->se;
927 struct rb_node *parent;
928 772
929 /* 773 /*
930 * Are we the only task in the tree? 774 * Are we the only task in the tree?
@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p)
935 if (likely(!sysctl_sched_compat_yield)) { 779 if (likely(!sysctl_sched_compat_yield)) {
936 __update_rq_clock(rq); 780 __update_rq_clock(rq);
937 /* 781 /*
938 * Dequeue and enqueue the task to update its 782 * Update run-time statistics of the 'current'.
939 * position within the tree:
940 */ 783 */
941 dequeue_entity(cfs_rq, &p->se, 0); 784 update_curr(cfs_rq);
942 enqueue_entity(cfs_rq, &p->se, 0);
943 785
944 return; 786 return;
945 } 787 }
946 /* 788 /*
947 * Find the rightmost entry in the rbtree: 789 * Find the rightmost entry in the rbtree:
948 */ 790 */
949 do { 791 rightmost = __pick_last_entity(cfs_rq);
950 parent = *link;
951 link = &parent->rb_right;
952 } while (*link);
953
954 rightmost = rb_entry(parent, struct sched_entity, run_node);
955 /* 792 /*
956 * Already in the rightmost position? 793 * Already in the rightmost position?
957 */ 794 */
958 if (unlikely(rightmost == se)) 795 if (unlikely(rightmost->vruntime < se->vruntime))
959 return; 796 return;
960 797
961 /* 798 /*
962 * Minimally necessary key value to be last in the tree: 799 * Minimally necessary key value to be last in the tree:
800 * Upon rescheduling, sched_class::put_prev_task() will place
801 * 'current' within the tree based on its new key value.
963 */ 802 */
964 se->fair_key = rightmost->fair_key + 1; 803 se->vruntime = rightmost->vruntime + 1;
965
966 if (cfs_rq->rb_leftmost == &se->run_node)
967 cfs_rq->rb_leftmost = rb_next(&se->run_node);
968 /*
969 * Relink the task to the rightmost position:
970 */
971 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
972 rb_link_node(&se->run_node, parent, link);
973 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
974} 804}
975 805
976/* 806/*
977 * Preempt the current task with a newly woken task if needed: 807 * Preempt the current task with a newly woken task if needed:
978 */ 808 */
979static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) 809static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
980{ 810{
981 struct task_struct *curr = rq->curr; 811 struct task_struct *curr = rq->curr;
982 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 812 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
983 unsigned long gran; 813 struct sched_entity *se = &curr->se, *pse = &p->se;
814 s64 delta, gran;
984 815
985 if (unlikely(rt_prio(p->prio))) { 816 if (unlikely(rt_prio(p->prio))) {
986 update_rq_clock(rq); 817 update_rq_clock(rq);
@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
988 resched_task(curr); 819 resched_task(curr);
989 return; 820 return;
990 } 821 }
991
992 gran = sysctl_sched_wakeup_granularity;
993 /* 822 /*
994 * Batch tasks prefer throughput over latency: 823 * Batch tasks do not preempt (their preemption is driven by
824 * the tick):
995 */ 825 */
996 if (unlikely(p->policy == SCHED_BATCH)) 826 if (unlikely(p->policy == SCHED_BATCH))
997 gran = sysctl_sched_batch_wakeup_granularity; 827 return;
828
829 if (sched_feat(WAKEUP_PREEMPT)) {
830 while (!is_same_group(se, pse)) {
831 se = parent_entity(se);
832 pse = parent_entity(pse);
833 }
998 834
999 if (is_same_group(curr, p)) 835 delta = se->vruntime - pse->vruntime;
1000 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); 836 gran = sysctl_sched_wakeup_granularity;
837 if (unlikely(se->load.weight != NICE_0_LOAD))
838 gran = calc_delta_fair(gran, &se->load);
839
840 if (delta > gran) {
841 int now = !sched_feat(PREEMPT_RESTRICT);
842
843 if (now || p->prio < curr->prio || !se->peer_preempt++)
844 resched_task(curr);
845 }
846 }
1001} 847}
1002 848
1003static struct task_struct *pick_next_task_fair(struct rq *rq) 849static struct task_struct *pick_next_task_fair(struct rq *rq)
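
A hedged, userspace-only sketch of the core test in the new check_preempt_wakeup() above: the freshly woken task preempts only when the running task's vruntime leads the wakee's by more than a wakeup granularity rescaled for the running task's load weight (the calc_delta_fair() step in the hunk). The constants (NICE_0_LOAD, the 10ms granularity) are illustrative assumptions, and the PREEMPT_RESTRICT/peer_preempt throttling is omitted.

/*
 * Simplified model of the wakeup-preemption test in check_preempt_wakeup().
 * Standalone userspace sketch, not kernel code; constants are assumed.
 */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL    /* assumed weight of a nice-0 task */

/* Scale the granularity by NICE_0_LOAD/weight, as calc_delta_fair() would. */
static uint64_t scaled_gran(uint64_t gran_ns, uint64_t weight)
{
	if (weight == NICE_0_LOAD)
		return gran_ns;
	return gran_ns * NICE_0_LOAD / weight;
}

/* Return 1 if the freshly woken task should preempt the running one. */
static int should_preempt(int64_t curr_vruntime, int64_t wakee_vruntime,
			  uint64_t curr_weight, uint64_t gran_ns)
{
	int64_t delta = curr_vruntime - wakee_vruntime;

	return delta > (int64_t)scaled_gran(gran_ns, curr_weight);
}

int main(void)
{
	uint64_t gran = 10000000ULL;   /* 10ms, an assumed example value */

	printf("nice-0 curr, wakee 15ms behind: preempt=%d\n",
	       should_preempt(30000000, 15000000, 1024, gran));
	printf("heavy curr (weight 2048), wakee 8ms behind: preempt=%d\n",
	       should_preempt(30000000, 22000000, 2048, gran));
	return 0;
}

A heavier current task shrinks the effective granularity, so it is preempted more readily, which is the point of the calc_delta_fair() rescaling.
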
@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1041 * achieve that by always pre-iterating before returning 887 * achieve that by always pre-iterating before returning
1042 * the current task: 888 * the current task:
1043 */ 889 */
1044static inline struct task_struct * 890static struct task_struct *
1045__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 891__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
1046{ 892{
1047 struct task_struct *p; 893 struct task_struct *p;
@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1078 if (!cfs_rq->nr_running) 924 if (!cfs_rq->nr_running)
1079 return MAX_PRIO; 925 return MAX_PRIO;
1080 926
1081 curr = __pick_next_entity(cfs_rq); 927 curr = cfs_rq->curr;
928 if (!curr)
929 curr = __pick_next_entity(cfs_rq);
930
1082 p = task_of(curr); 931 p = task_of(curr);
1083 932
1084 return p->prio; 933 return p->prio;
@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1153 } 1002 }
1154} 1003}
1155 1004
1005#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1006
1156/* 1007/*
1157 * Share the fairness runtime between parent and child, thus the 1008 * Share the fairness runtime between parent and child, thus the
1158 * total amount of pressure for CPU stays equal - new tasks 1009 * total amount of pressure for CPU stays equal - new tasks
@@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1163static void task_new_fair(struct rq *rq, struct task_struct *p) 1014static void task_new_fair(struct rq *rq, struct task_struct *p)
1164{ 1015{
1165 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1016 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1166 struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); 1017 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1018 int this_cpu = smp_processor_id();
1167 1019
1168 sched_info_queued(p); 1020 sched_info_queued(p);
1169 1021
1170 update_curr(cfs_rq); 1022 update_curr(cfs_rq);
1171 update_stats_enqueue(cfs_rq, se); 1023 place_entity(cfs_rq, se, 1);
1172 /*
1173 * Child runs first: we let it run before the parent
1174 * until it reschedules once. We set up the key so that
1175 * it will preempt the parent:
1176 */
1177 se->fair_key = curr->fair_key -
1178 niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
1179 /*
1180 * The first wait is dominated by the child-runs-first logic,
1181 * so do not credit it with that waiting time yet:
1182 */
1183 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1184 se->wait_start_fair = 0;
1185 1024
1186 /* 1025 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1187 * The statistical average of wait_runtime is about 1026 curr->vruntime < se->vruntime) {
1188 * -granularity/2, so initialize the task with that: 1027 /*
1189 */ 1028 * Upon rescheduling, sched_class::put_prev_task() will place
1190 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1029 * 'current' within the tree based on its new key value.
1191 se->wait_runtime = -(sched_granularity(cfs_rq) / 2); 1030 */
1031 swap(curr->vruntime, se->vruntime);
1032 }
1192 1033
1034 update_stats_enqueue(cfs_rq, se);
1035 check_spread(cfs_rq, se);
1036 check_spread(cfs_rq, curr);
1193 __enqueue_entity(cfs_rq, se); 1037 __enqueue_entity(cfs_rq, se);
1038 account_entity_enqueue(cfs_rq, se);
1039 se->peer_preempt = 0;
1040 resched_task(rq->curr);
1194} 1041}
1195 1042
1196#ifdef CONFIG_FAIR_GROUP_SCHED
1197/* Account for a task changing its policy or group. 1043/* Account for a task changing its policy or group.
1198 * 1044 *
1199 * This routine is mostly called to set cfs_rq->curr field when a task 1045 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq)
1206 for_each_sched_entity(se) 1052 for_each_sched_entity(se)
1207 set_next_entity(cfs_rq_of(se), se); 1053 set_next_entity(cfs_rq_of(se), se);
1208} 1054}
1209#else
1210static void set_curr_task_fair(struct rq *rq)
1211{
1212}
1213#endif
1214 1055
1215/* 1056/*
1216 * All the scheduling class methods: 1057 * All the scheduling class methods:
1217 */ 1058 */
1218struct sched_class fair_sched_class __read_mostly = { 1059static const struct sched_class fair_sched_class = {
1060 .next = &idle_sched_class,
1219 .enqueue_task = enqueue_task_fair, 1061 .enqueue_task = enqueue_task_fair,
1220 .dequeue_task = dequeue_task_fair, 1062 .dequeue_task = dequeue_task_fair,
1221 .yield_task = yield_task_fair, 1063 .yield_task = yield_task_fair,
1222 1064
1223 .check_preempt_curr = check_preempt_curr_fair, 1065 .check_preempt_curr = check_preempt_wakeup,
1224 1066
1225 .pick_next_task = pick_next_task_fair, 1067 .pick_next_task = pick_next_task_fair,
1226 .put_prev_task = put_prev_task_fair, 1068 .put_prev_task = put_prev_task_fair,
@@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1237{ 1079{
1238 struct cfs_rq *cfs_rq; 1080 struct cfs_rq *cfs_rq;
1239 1081
1082#ifdef CONFIG_FAIR_GROUP_SCHED
1083 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1084#endif
1240 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1085 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1241 print_cfs_rq(m, cpu, cfs_rq); 1086 print_cfs_rq(m, cpu, cfs_rq);
1242} 1087}
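
To illustrate the child-runs-first path of task_new_fair() in the hunks above: after the child is placed, parent and child vruntimes are swapped whenever the parent would otherwise sort first, so the child ends up leftmost in the timeline. The swap() macro is the one added above; the entity struct and the array scan standing in for the rbtree are illustrative assumptions.

/*
 * Userspace sketch of the child-runs-first handling in task_new_fair().
 * Only the vruntime ordering is modelled; placement, weights and the
 * rbtree are simplified to a plain array scan.
 */
#include <stdio.h>
#include <stdint.h>

#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)

struct entity {
	const char *name;
	uint64_t vruntime;
};

/* Pick the entity with the smallest vruntime (the rbtree's leftmost node). */
static struct entity *pick_next(struct entity *e, int nr)
{
	struct entity *best = &e[0];
	int i;

	for (i = 1; i < nr; i++)
		if (e[i].vruntime < best->vruntime)
			best = &e[i];
	return best;
}

int main(void)
{
	struct entity parent = { "parent", 1000 };
	/* assume place_entity() handed the child a later vruntime */
	struct entity child  = { "child",  1500 };
	int child_runs_first = 1;    /* sysctl_sched_child_runs_first */

	if (child_runs_first && parent.vruntime < child.vruntime)
		swap(parent.vruntime, child.vruntime);

	struct entity rq[2] = { parent, child };
	printf("next to run: %s\n", pick_next(rq, 2)->name);
	return 0;
}
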
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3503fb2d9f96..6e2ead41516e 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{ 50{
51} 51}
52 52
53static void set_curr_task_idle(struct rq *rq)
54{
55}
56
53/* 57/*
54 * Simple, special scheduling class for the per-CPU idle tasks: 58 * Simple, special scheduling class for the per-CPU idle tasks:
55 */ 59 */
56static struct sched_class idle_sched_class __read_mostly = { 60const struct sched_class idle_sched_class = {
61 /* .next is NULL */
57 /* no enqueue/yield_task for idle tasks */ 62 /* no enqueue/yield_task for idle tasks */
58 63
59 /* dequeue is not valid, we print a debug message there: */ 64 /* dequeue is not valid, we print a debug message there: */
@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = {
66 71
67 .load_balance = load_balance_idle, 72 .load_balance = load_balance_idle,
68 73
74 .set_curr_task = set_curr_task_idle,
69 .task_tick = task_tick_idle, 75 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */ 76 /* no .task_new for idle tasks */
71}; 77};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4b87476a02d0..d0097a0634e5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -7,7 +7,7 @@
7 * Update the current task's runtime statistics. Skip current tasks that 7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 8 * are not in our scheduling class.
9 */ 9 */
10static inline void update_curr_rt(struct rq *rq) 10static void update_curr_rt(struct rq *rq)
11{ 11{
12 struct task_struct *curr = rq->curr; 12 struct task_struct *curr = rq->curr;
13 u64 delta_exec; 13 u64 delta_exec;
@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p)
59} 59}
60 60
61static void 61static void
62yield_task_rt(struct rq *rq, struct task_struct *p) 62yield_task_rt(struct rq *rq)
63{ 63{
64 requeue_task_rt(rq, p); 64 requeue_task_rt(rq, rq->curr);
65} 65}
66 66
67/* 67/*
@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
206 if (--p->time_slice) 206 if (--p->time_slice)
207 return; 207 return;
208 208
209 p->time_slice = static_prio_timeslice(p->static_prio); 209 p->time_slice = DEF_TIMESLICE;
210 210
211 /* 211 /*
212 * Requeue to the end of queue if we are not the only element 212 * Requeue to the end of queue if we are not the only element
@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
218 } 218 }
219} 219}
220 220
221static struct sched_class rt_sched_class __read_mostly = { 221static void set_curr_task_rt(struct rq *rq)
222{
223 struct task_struct *p = rq->curr;
224
225 p->se.exec_start = rq->clock;
226}
227
228const struct sched_class rt_sched_class = {
229 .next = &fair_sched_class,
222 .enqueue_task = enqueue_task_rt, 230 .enqueue_task = enqueue_task_rt,
223 .dequeue_task = dequeue_task_rt, 231 .dequeue_task = dequeue_task_rt,
224 .yield_task = yield_task_rt, 232 .yield_task = yield_task_rt,
@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = {
230 238
231 .load_balance = load_balance_rt, 239 .load_balance = load_balance_rt,
232 240
241 .set_curr_task = set_curr_task_rt,
233 .task_tick = task_tick_rt, 242 .task_tick = task_tick_rt,
234}; 243};
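
The .next pointers added above (rt_sched_class to fair_sched_class to idle_sched_class) turn the scheduling classes into a fixed, priority-ordered list, and every class now supplies a set_curr_task hook, even if it is an empty stub. A hedged model of how a core loop could walk that chain; the struct layout, the pick functions and the no-op hook are assumptions for illustration, not the kernel's definitions.

/*
 * Model of the class chain set up by the new .next pointers
 * (rt -> fair -> idle): the core asks each class in turn for a task.
 */
#include <stdio.h>
#include <stddef.h>

struct task { const char *comm; };

struct sched_class {
	const char *name;
	const struct sched_class *next;
	struct task *(*pick_next_task)(void);
	void (*set_curr_task)(void);          /* every class provides one now */
};

static struct task idle_task = { "swapper" };

static struct task *pick_rt(void)   { return NULL; }        /* no RT tasks     */
static struct task *pick_fair(void) { return NULL; }        /* CFS queue empty */
static struct task *pick_idle(void) { return &idle_task; }  /* always succeeds */
static void set_curr_noop(void)     { }                     /* empty stub      */

static const struct sched_class idle_sched_class = {
	"idle", NULL,              pick_idle, set_curr_noop
};
static const struct sched_class fair_sched_class = {
	"fair", &idle_sched_class, pick_fair, set_curr_noop
};
static const struct sched_class rt_sched_class = {
	"rt",   &fair_sched_class, pick_rt,   set_curr_noop
};

static struct task *pick_next_task(void)
{
	const struct sched_class *class;
	struct task *p;

	for (class = &rt_sched_class; class; class = class->next) {
		p = class->pick_next_task();
		if (p) {
			printf("picked by the %s class\n", class->name);
			return p;
		}
	}
	return &idle_task;        /* not reached: the idle class never fails */
}

int main(void)
{
	printf("running: %s\n", pick_next_task()->comm);
	return 0;
}
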
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c20a94dda61e..1c084842c3e7 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v)
16 struct rq *rq = cpu_rq(cpu); 16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 struct sched_domain *sd; 18 struct sched_domain *sd;
19 int dcnt = 0; 19 int dcount = 0;
20#endif 20#endif
21 21
22 /* runqueue-specific stats */ 22 /* runqueue-specific stats */
23 seq_printf(seq, 23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", 24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty, 25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 27 rq->sched_switch, rq->sched_count, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local, 28 rq->ttwu_count, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time, 29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
31 31
32 seq_printf(seq, "\n"); 32 seq_printf(seq, "\n");
33 33
@@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
39 char mask_str[NR_CPUS]; 39 char mask_str[NR_CPUS];
40 40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 42 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " 45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu", 46 "%lu",
47 sd->lb_cnt[itype], 47 sd->lb_count[itype],
48 sd->lb_balanced[itype], 48 sd->lb_balanced[itype],
49 sd->lb_failed[itype], 49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype], 50 sd->lb_imbalance[itype],
@@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
55 } 55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" 56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n", 57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 58 sd->alb_count, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 59 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 60 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine, 61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance); 62 sd->ttwu_move_balance);
63 } 63 }
@@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{ 101{
102 if (rq) { 102 if (rq) {
103 rq->rq_sched_info.run_delay += delta; 103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++; 104 rq->rq_sched_info.pcount++;
105 } 105 }
106} 106}
107 107
@@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
129# define schedstat_set(var, val) do { } while (0) 129# define schedstat_set(var, val) do { } while (0)
130#endif 130#endif
131 131
132#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 132#ifdef CONFIG_SCHEDSTATS
133/* 133/*
134 * Called when a process is dequeued from the active array and given 134 * Called when a process is dequeued from the active array and given
135 * the cpu. We should note that with the exception of interactive 135 * the cpu. We should note that with the exception of interactive
@@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t)
164 sched_info_dequeued(t); 164 sched_info_dequeued(t);
165 t->sched_info.run_delay += delta; 165 t->sched_info.run_delay += delta;
166 t->sched_info.last_arrival = now; 166 t->sched_info.last_arrival = now;
167 t->sched_info.pcnt++; 167 t->sched_info.pcount++;
168 168
169 rq_sched_info_arrive(task_rq(t), delta); 169 rq_sched_info_arrive(task_rq(t), delta);
170} 170}
@@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
233#else 233#else
234#define sched_info_queued(t) do { } while (0) 234#define sched_info_queued(t) do { } while (0)
235#define sched_info_switch(t, next) do { } while (0) 235#define sched_info_switch(t, next) do { } while (0)
236#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 236#endif /* CONFIG_SCHEDSTATS */
237 237
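
The pcnt-to-pcount renames above touch the sched_info delay-accounting fields. As a hedged toy model of what those counters track: each time a task goes from runnable to running, pcount is bumped and run_delay accumulates the time the task spent waiting on a runqueue. The timestamps below are plain numbers standing in for the kernel clock; the struct and helpers are illustrative, not the kernel implementation.

/*
 * Toy model of the sched_info fields (pcount, run_delay) renamed above.
 */
#include <stdio.h>
#include <stdint.h>

struct sched_info {
	unsigned long pcount;     /* times the task was scheduled in          */
	uint64_t run_delay;       /* total time spent runnable but waiting    */
	uint64_t last_queued;     /* when the task last became runnable       */
};

static void sched_info_queued(struct sched_info *si, uint64_t now)
{
	si->last_queued = now;
}

static void sched_info_arrive(struct sched_info *si, uint64_t now)
{
	si->run_delay += now - si->last_queued;
	si->pcount++;
}

int main(void)
{
	struct sched_info si = { 0, 0, 0 };

	sched_info_queued(&si, 100);    /* became runnable at t=100 */
	sched_info_arrive(&si, 130);    /* got the CPU at t=130     */
	sched_info_queued(&si, 200);
	sched_info_arrive(&si, 205);

	printf("pcount=%lu run_delay=%llu\n",
	       si.pcount, (unsigned long long)si.run_delay);
	return 0;
}
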
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0f546ddea43d..bd89bc4eb0b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -271,8 +271,6 @@ asmlinkage void do_softirq(void)
271 local_irq_restore(flags); 271 local_irq_restore(flags);
272} 272}
273 273
274EXPORT_SYMBOL(do_softirq);
275
276#endif 274#endif
277 275
278/* 276/*
@@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
332 wakeup_softirqd(); 330 wakeup_softirqd();
333} 331}
334 332
335EXPORT_SYMBOL(raise_softirq_irqoff);
336
337void fastcall raise_softirq(unsigned int nr) 333void fastcall raise_softirq(unsigned int nr)
338{ 334{
339 unsigned long flags; 335 unsigned long flags;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 53a456ebf6d5..ec14aa8ac51f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -222,14 +222,11 @@ static ctl_table kern_table[] = {
222#ifdef CONFIG_SCHED_DEBUG 222#ifdef CONFIG_SCHED_DEBUG
223 { 223 {
224 .ctl_name = CTL_UNNUMBERED, 224 .ctl_name = CTL_UNNUMBERED,
225 .procname = "sched_min_granularity_ns", 225 .procname = "sched_nr_latency",
226 .data = &sysctl_sched_min_granularity, 226 .data = &sysctl_sched_nr_latency,
227 .maxlen = sizeof(unsigned int), 227 .maxlen = sizeof(unsigned int),
228 .mode = 0644, 228 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax, 229 .proc_handler = &proc_dointvec,
230 .strategy = &sysctl_intvec,
231 .extra1 = &min_sched_granularity_ns,
232 .extra2 = &max_sched_granularity_ns,
233 }, 230 },
234 { 231 {
235 .ctl_name = CTL_UNNUMBERED, 232 .ctl_name = CTL_UNNUMBERED,
@@ -266,38 +263,24 @@ static ctl_table kern_table[] = {
266 }, 263 },
267 { 264 {
268 .ctl_name = CTL_UNNUMBERED, 265 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_stat_granularity_ns", 266 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_stat_granularity, 267 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int),
272 .mode = 0644,
273 .proc_handler = &proc_dointvec_minmax,
274 .strategy = &sysctl_intvec,
275 .extra1 = &min_wakeup_granularity_ns,
276 .extra2 = &max_wakeup_granularity_ns,
277 },
278 {
279 .ctl_name = CTL_UNNUMBERED,
280 .procname = "sched_runtime_limit_ns",
281 .data = &sysctl_sched_runtime_limit,
282 .maxlen = sizeof(unsigned int), 268 .maxlen = sizeof(unsigned int),
283 .mode = 0644, 269 .mode = 0644,
284 .proc_handler = &proc_dointvec_minmax, 270 .proc_handler = &proc_dointvec,
285 .strategy = &sysctl_intvec,
286 .extra1 = &min_sched_granularity_ns,
287 .extra2 = &max_sched_granularity_ns,
288 }, 271 },
289 { 272 {
290 .ctl_name = CTL_UNNUMBERED, 273 .ctl_name = CTL_UNNUMBERED,
291 .procname = "sched_child_runs_first", 274 .procname = "sched_features",
292 .data = &sysctl_sched_child_runs_first, 275 .data = &sysctl_sched_features,
293 .maxlen = sizeof(unsigned int), 276 .maxlen = sizeof(unsigned int),
294 .mode = 0644, 277 .mode = 0644,
295 .proc_handler = &proc_dointvec, 278 .proc_handler = &proc_dointvec,
296 }, 279 },
297 { 280 {
298 .ctl_name = CTL_UNNUMBERED, 281 .ctl_name = CTL_UNNUMBERED,
299 .procname = "sched_features", 282 .procname = "sched_migration_cost",
300 .data = &sysctl_sched_features, 283 .data = &sysctl_sched_migration_cost,
301 .maxlen = sizeof(unsigned int), 284 .maxlen = sizeof(unsigned int),
302 .mode = 0644, 285 .mode = 0644,
303 .proc_handler = &proc_dointvec, 286 .proc_handler = &proc_dointvec,
@@ -1053,7 +1036,7 @@ static ctl_table vm_table[] = {
1053 .strategy = &sysctl_string, 1036 .strategy = &sysctl_string,
1054 }, 1037 },
1055#endif 1038#endif
1056#if defined(CONFIG_X86_32) || \ 1039#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1057 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1040 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1058 { 1041 {
1059 .ctl_name = VM_VDSO_ENABLED, 1042 .ctl_name = VM_VDSO_ENABLED,
@@ -1221,7 +1204,7 @@ static ctl_table fs_table[] = {
1221}; 1204};
1222 1205
1223static ctl_table debug_table[] = { 1206static ctl_table debug_table[] = {
1224#ifdef CONFIG_X86 1207#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1225 { 1208 {
1226 .ctl_name = CTL_UNNUMBERED, 1209 .ctl_name = CTL_UNNUMBERED,
1227 .procname = "exception-trace", 1210 .procname = "exception-trace",
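
The sysctl table changes above drop the old granularity limits and expose entries such as sched_nr_latency, sched_child_runs_first, sched_features and sched_migration_cost through plain proc_dointvec handlers. A hedged userspace example of inspecting those files under /proc/sys/kernel; which entries exist depends on the kernel configuration (several are CONFIG_SCHED_DEBUG only), so missing files are simply reported.

/*
 * Print a few of the scheduler sysctls exposed by the table above.
 * Missing entries are reported, not treated as errors.
 */
#include <stdio.h>

static void show_sysctl(const char *name)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (!f) {
		printf("%-28s <not present on this kernel>\n", name);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-28s %s", name, buf);
	fclose(f);
}

int main(void)
{
	show_sysctl("sched_nr_latency");
	show_sysctl("sched_child_runs_first");
	show_sysctl("sched_features");
	show_sysctl("sched_migration_cost");
	return 0;
}
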
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f66351126544..8d53106a0a92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -23,3 +23,8 @@ config HIGH_RES_TIMERS
23 hardware is not capable then this option only increases 23 hardware is not capable then this option only increases
24 the size of the kernel image. 24 the size of the kernel image.
25 25
26config GENERIC_CLOCKEVENTS_BUILD
27 bool
28 default y
29 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
30
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 99b6034fc86b..905b0b50792d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 41dd3105ce7f..822beebe664a 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -194,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
194 local_irq_restore(flags); 194 local_irq_restore(flags);
195} 195}
196 196
197#ifdef CONFIG_GENERIC_CLOCKEVENTS
197/** 198/**
198 * clockevents_notify - notification about relevant events 199 * clockevents_notify - notification about relevant events
199 */ 200 */
@@ -222,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg)
222 spin_unlock(&clockevents_lock); 223 spin_unlock(&clockevents_lock);
223} 224}
224EXPORT_SYMBOL_GPL(clockevents_notify); 225EXPORT_SYMBOL_GPL(clockevents_notify);
225 226#endif
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 0962e0577660..fc3fc79b3d59 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -64,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64 */ 64 */
65int tick_check_broadcast_device(struct clock_event_device *dev) 65int tick_check_broadcast_device(struct clock_event_device *dev)
66{ 66{
67 if (tick_broadcast_device.evtdev || 67 if ((tick_broadcast_device.evtdev &&
68 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 68 tick_broadcast_device.evtdev->rating >= dev->rating) ||
69 (dev->features & CLOCK_EVT_FEAT_C3STOP))
69 return 0; 70 return 0;
70 71
71 clockevents_exchange_device(NULL, dev); 72 clockevents_exchange_device(NULL, dev);
@@ -176,8 +177,6 @@ static void tick_do_periodic_broadcast(void)
176 */ 177 */
177static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 178static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
178{ 179{
179 dev->next_event.tv64 = KTIME_MAX;
180
181 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
182 181
183 /* 182 /*
@@ -218,26 +217,43 @@ static void tick_do_broadcast_on_off(void *why)
218 bc = tick_broadcast_device.evtdev; 217 bc = tick_broadcast_device.evtdev;
219 218
220 /* 219 /*
221 * Is the device in broadcast mode forever or is it not 220 * Is the device not affected by the powerstate ?
222 * affected by the powerstate ?
223 */ 221 */
224 if (!dev || !tick_device_is_functional(dev) || 222 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
225 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
226 goto out; 223 goto out;
227 224
228 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { 225 /*
226 * Defect device ?
227 */
228 if (!tick_device_is_functional(dev)) {
229 /*
230 * AMD C1E wreckage fixup:
231 *
232 * Device was registered functional in the first
233 * place. Now the secondary CPU detected the C1E
234 * misfeature and notifies us to fix it up
235 */
236 if (*reason != CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
237 goto out;
238 }
239
240 switch (*reason) {
241 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
242 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 243 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 244 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 245 if (td->mode == TICKDEV_MODE_PERIODIC)
232 clockevents_set_mode(dev, 246 clockevents_set_mode(dev,
233 CLOCK_EVT_MODE_SHUTDOWN); 247 CLOCK_EVT_MODE_SHUTDOWN);
234 } 248 }
235 } else { 249 break;
250 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
236 if (cpu_isset(cpu, tick_broadcast_mask)) { 251 if (cpu_isset(cpu, tick_broadcast_mask)) {
237 cpu_clear(cpu, tick_broadcast_mask); 252 cpu_clear(cpu, tick_broadcast_mask);
238 if (td->mode == TICKDEV_MODE_PERIODIC) 253 if (td->mode == TICKDEV_MODE_PERIODIC)
239 tick_setup_periodic(dev, 0); 254 tick_setup_periodic(dev, 0);
240 } 255 }
256 break;
241 } 257 }
242 258
243 if (cpus_empty(tick_broadcast_mask)) 259 if (cpus_empty(tick_broadcast_mask))
@@ -515,11 +531,9 @@ static void tick_broadcast_clear_oneshot(int cpu)
515 */ 531 */
516void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 532void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
517{ 533{
518 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { 534 bc->event_handler = tick_handle_oneshot_broadcast;
519 bc->event_handler = tick_handle_oneshot_broadcast; 535 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
520 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 536 bc->next_event.tv64 = KTIME_MAX;
521 bc->next_event.tv64 = KTIME_MAX;
522 }
523} 537}
524 538
525/* 539/*
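
The rewritten tick_do_broadcast_on_off() above becomes a switch over the notify reason, with the new BROADCAST_FORCE case covering the AMD C1E fixup. A hedged sketch of just the mask bookkeeping: CPUs entering broadcast mode are set in a mask and their local periodic tick device is shut down, while leaving clears the bit and re-arms it. The bitmask and printf calls are simplifications standing in for cpumasks and clockevents mode switches.

/*
 * Simplified model of the broadcast-mask handling in
 * tick_do_broadcast_on_off().
 */
#include <stdio.h>
#include <stdint.h>

enum reason { BROADCAST_ON, BROADCAST_FORCE, BROADCAST_OFF };

static uint64_t broadcast_mask;           /* one bit per CPU */

static void broadcast_on_off(enum reason why, int cpu, int periodic)
{
	switch (why) {
	case BROADCAST_ON:
	case BROADCAST_FORCE:
		if (!(broadcast_mask & (1ULL << cpu))) {
			broadcast_mask |= 1ULL << cpu;
			if (periodic)
				printf("cpu%d: local tick -> SHUTDOWN\n", cpu);
		}
		break;
	case BROADCAST_OFF:
		if (broadcast_mask & (1ULL << cpu)) {
			broadcast_mask &= ~(1ULL << cpu);
			if (periodic)
				printf("cpu%d: local tick re-armed\n", cpu);
		}
		break;
	}
	if (!broadcast_mask)
		printf("broadcast device can be shut down\n");
}

int main(void)
{
	broadcast_on_off(BROADCAST_ON, 1, 1);     /* cpu1 enters a C3-like state */
	broadcast_on_off(BROADCAST_FORCE, 2, 1);  /* C1E fixup path              */
	broadcast_on_off(BROADCAST_OFF, 1, 1);
	broadcast_on_off(BROADCAST_OFF, 2, 1);
	return 0;
}
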
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 77a21abc8716..1bea399a9ef0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
200 200
201 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
202 if (!cpu_isset(cpu, newdev->cpumask)) 202 if (!cpu_isset(cpu, newdev->cpumask))
203 goto out; 203 goto out_bc;
204 204
205 td = &per_cpu(tick_cpu_device, cpu); 205 td = &per_cpu(tick_cpu_device, cpu);
206 curdev = td->evtdev; 206 curdev = td->evtdev;
@@ -265,7 +265,7 @@ out_bc:
265 */ 265 */
266 if (tick_check_broadcast_device(newdev)) 266 if (tick_check_broadcast_device(newdev))
267 ret = NOTIFY_STOP; 267 ret = NOTIFY_STOP;
268out: 268
269 spin_unlock_irqrestore(&tick_device_lock, flags); 269 spin_unlock_irqrestore(&tick_device_lock, flags);
270 270
271 return ret; 271 return ret;
@@ -345,6 +345,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
345 345
346 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 346 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
347 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 347 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
348 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
348 tick_broadcast_on_off(reason, dev); 349 tick_broadcast_on_off(reason, dev);
349 break; 350 break;
350 351
diff --git a/kernel/user.c b/kernel/user.c
index 9ca2848fc356..f0e561e6d085 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,12 +50,16 @@ struct user_struct root_user = {
50 .uid_keyring = &root_user_keyring, 50 .uid_keyring = &root_user_keyring,
51 .session_keyring = &root_session_keyring, 51 .session_keyring = &root_session_keyring,
52#endif 52#endif
53#ifdef CONFIG_FAIR_USER_SCHED
54 .tg = &init_task_group,
55#endif
53}; 56};
54 57
55/* 58/*
56 * These routines must be called with the uidhash spinlock held! 59 * These routines must be called with the uidhash spinlock held!
57 */ 60 */
58static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) 61static inline void uid_hash_insert(struct user_struct *up,
62 struct hlist_head *hashent)
59{ 63{
60 hlist_add_head(&up->uidhash_node, hashent); 64 hlist_add_head(&up->uidhash_node, hashent);
61} 65}
@@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up)
65 hlist_del_init(&up->uidhash_node); 69 hlist_del_init(&up->uidhash_node);
66} 70}
67 71
68static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 72static inline struct user_struct *uid_hash_find(uid_t uid,
73 struct hlist_head *hashent)
69{ 74{
70 struct user_struct *user; 75 struct user_struct *user;
71 struct hlist_node *h; 76 struct hlist_node *h;
72 77
73 hlist_for_each_entry(user, h, hashent, uidhash_node) { 78 hlist_for_each_entry(user, h, hashent, uidhash_node) {
74 if(user->uid == uid) { 79 if (user->uid == uid) {
75 atomic_inc(&user->__count); 80 atomic_inc(&user->__count);
76 return user; 81 return user;
77 } 82 }
@@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha
80 return NULL; 85 return NULL;
81} 86}
82 87
88#ifdef CONFIG_FAIR_USER_SCHED
89
90static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
91static DEFINE_MUTEX(uids_mutex);
92
93static void sched_destroy_user(struct user_struct *up)
94{
95 sched_destroy_group(up->tg);
96}
97
98static int sched_create_user(struct user_struct *up)
99{
100 int rc = 0;
101
102 up->tg = sched_create_group();
103 if (IS_ERR(up->tg))
104 rc = -ENOMEM;
105
106 return rc;
107}
108
109static void sched_switch_user(struct task_struct *p)
110{
111 sched_move_task(p);
112}
113
114static inline void uids_mutex_lock(void)
115{
116 mutex_lock(&uids_mutex);
117}
118
119static inline void uids_mutex_unlock(void)
120{
121 mutex_unlock(&uids_mutex);
122}
123
124/* return cpu shares held by the user */
125ssize_t cpu_shares_show(struct kset *kset, char *buffer)
126{
127 struct user_struct *up = container_of(kset, struct user_struct, kset);
128
129 return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
130}
131
132/* modify cpu shares held by the user */
133ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
134{
135 struct user_struct *up = container_of(kset, struct user_struct, kset);
136 unsigned long shares;
137 int rc;
138
139 sscanf(buffer, "%lu", &shares);
140
141 rc = sched_group_set_shares(up->tg, shares);
142
143 return (rc ? rc : size);
144}
145
146static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
147{
148 sa->attr.name = name;
149 sa->attr.mode = mode;
150 sa->show = cpu_shares_show;
151 sa->store = cpu_shares_store;
152}
153
154/* Create "/sys/kernel/uids/<uid>" directory and
155 * "/sys/kernel/uids/<uid>/cpu_share" file for this user.
156 */
157static int user_kobject_create(struct user_struct *up)
158{
159 struct kset *kset = &up->kset;
160 struct kobject *kobj = &kset->kobj;
161 int error;
162
163 memset(kset, 0, sizeof(struct kset));
164 kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
165 kobject_set_name(kobj, "%d", up->uid);
166 kset_init(kset);
167 user_attr_init(&up->user_attr, "cpu_share", 0644);
168
169 error = kobject_add(kobj);
170 if (error)
171 goto done;
172
173 error = sysfs_create_file(kobj, &up->user_attr.attr);
174 if (error)
175 kobject_del(kobj);
176
177 kobject_uevent(kobj, KOBJ_ADD);
178
179done:
180 return error;
181}
182
183/* create these in sysfs filesystem:
184 * "/sys/kernel/uids" directory
185 * "/sys/kernel/uids/0" directory (for root user)
186 * "/sys/kernel/uids/0/cpu_share" file (for root user)
187 */
188int __init uids_kobject_init(void)
189{
190 int error;
191
192 /* create under /sys/kernel dir */
193 uids_kobject.parent = &kernel_subsys.kobj;
194 uids_kobject.kset = &kernel_subsys;
195 kobject_set_name(&uids_kobject, "uids");
196 kobject_init(&uids_kobject);
197
198 error = kobject_add(&uids_kobject);
199 if (!error)
200 error = user_kobject_create(&root_user);
201
202 return error;
203}
204
205/* work function to remove sysfs directory for a user and free up
206 * corresponding structures.
207 */
208static void remove_user_sysfs_dir(struct work_struct *w)
209{
210 struct user_struct *up = container_of(w, struct user_struct, work);
211 struct kobject *kobj = &up->kset.kobj;
212 unsigned long flags;
213 int remove_user = 0;
214
215 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
216 * atomic.
217 */
218 uids_mutex_lock();
219
220 local_irq_save(flags);
221
222 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
223 uid_hash_remove(up);
224 remove_user = 1;
225 spin_unlock_irqrestore(&uidhash_lock, flags);
226 } else {
227 local_irq_restore(flags);
228 }
229
230 if (!remove_user)
231 goto done;
232
233 sysfs_remove_file(kobj, &up->user_attr.attr);
234 kobject_uevent(kobj, KOBJ_REMOVE);
235 kobject_del(kobj);
236
237 sched_destroy_user(up);
238 key_put(up->uid_keyring);
239 key_put(up->session_keyring);
240 kmem_cache_free(uid_cachep, up);
241
242done:
243 uids_mutex_unlock();
244}
245
246/* IRQs are disabled and uidhash_lock is held upon function entry.
247 * IRQ state (as stored in flags) is restored and uidhash_lock released
248 * upon function exit.
249 */
250static inline void free_user(struct user_struct *up, unsigned long flags)
251{
252 /* restore back the count */
253 atomic_inc(&up->__count);
254 spin_unlock_irqrestore(&uidhash_lock, flags);
255
256 INIT_WORK(&up->work, remove_user_sysfs_dir);
257 schedule_work(&up->work);
258}
259
260#else /* CONFIG_FAIR_USER_SCHED */
261
262static void sched_destroy_user(struct user_struct *up) { }
263static int sched_create_user(struct user_struct *up) { return 0; }
264static void sched_switch_user(struct task_struct *p) { }
265static inline int user_kobject_create(struct user_struct *up) { return 0; }
266static inline void uids_mutex_lock(void) { }
267static inline void uids_mutex_unlock(void) { }
268
269/* IRQs are disabled and uidhash_lock is held upon function entry.
270 * IRQ state (as stored in flags) is restored and uidhash_lock released
271 * upon function exit.
272 */
273static inline void free_user(struct user_struct *up, unsigned long flags)
274{
275 uid_hash_remove(up);
276 spin_unlock_irqrestore(&uidhash_lock, flags);
277 sched_destroy_user(up);
278 key_put(up->uid_keyring);
279 key_put(up->session_keyring);
280 kmem_cache_free(uid_cachep, up);
281}
282
283#endif /* CONFIG_FAIR_USER_SCHED */
284
83/* 285/*
84 * Locate the user_struct for the passed UID. If found, take a ref on it. The 286 * Locate the user_struct for the passed UID. If found, take a ref on it. The
85 * caller must undo that ref with free_uid(). 287 * caller must undo that ref with free_uid().
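
The CONFIG_FAIR_USER_SCHED block above defers the actual teardown (sysfs removal, sched_destroy_user(), kmem_cache_free()) to a work item because the last reference is dropped with uidhash_lock held and interrupts off, where sleeping sysfs calls are not allowed. A hedged sketch of that shape only; the "work queue" is just an array drained at the end of main(), and the names are illustrative.

/*
 * Shape of the deferred-free pattern used by free_user() above: drop the
 * last reference in a context that must not sleep, queue the heavyweight
 * cleanup, and run it later from process context.
 */
#include <stdio.h>

struct user {
	int uid;
	int count;                         /* stands in for the atomic refcount */
};

typedef void (*work_fn)(struct user *);

static struct { work_fn fn; struct user *arg; } queue[8];
static int queued;

static void remove_user_dir(struct user *up)
{
	/* may sleep: sysfs file removal, kobject_del(), freeing ... */
	printf("deferred teardown of uid %d\n", up->uid);
}

static void put_user_ref(struct user *up)
{
	/* imagine uidhash_lock held here, IRQs disabled */
	if (--up->count == 0) {
		/* unhash now, but defer anything that can sleep */
		queue[queued].fn = remove_user_dir;
		queue[queued++].arg = up;
	}
	/* lock dropped */
}

int main(void)
{
	struct user u = { 1000, 2 };

	put_user_ref(&u);                  /* still referenced, nothing queued */
	put_user_ref(&u);                  /* last reference: cleanup deferred */

	while (queued--)                   /* the "workqueue" running later    */
		queue[queued].fn(queue[queued].arg);
	return 0;
}
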
@@ -106,15 +308,10 @@ void free_uid(struct user_struct *up)
106 return; 308 return;
107 309
108 local_irq_save(flags); 310 local_irq_save(flags);
109 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { 311 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
110 uid_hash_remove(up); 312 free_user(up, flags);
111 spin_unlock_irqrestore(&uidhash_lock, flags); 313 else
112 key_put(up->uid_keyring);
113 key_put(up->session_keyring);
114 kmem_cache_free(uid_cachep, up);
115 } else {
116 local_irq_restore(flags); 314 local_irq_restore(flags);
117 }
118} 315}
119 316
120struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 317struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
122 struct hlist_head *hashent = uidhashentry(ns, uid); 319 struct hlist_head *hashent = uidhashentry(ns, uid);
123 struct user_struct *up; 320 struct user_struct *up;
124 321
322 /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
323 * atomic.
324 */
325 uids_mutex_lock();
326
125 spin_lock_irq(&uidhash_lock); 327 spin_lock_irq(&uidhash_lock);
126 up = uid_hash_find(uid, hashent); 328 up = uid_hash_find(uid, hashent);
127 spin_unlock_irq(&uidhash_lock); 329 spin_unlock_irq(&uidhash_lock);
@@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
150 return NULL; 352 return NULL;
151 } 353 }
152 354
355 if (sched_create_user(new) < 0) {
356 key_put(new->uid_keyring);
357 key_put(new->session_keyring);
358 kmem_cache_free(uid_cachep, new);
359 return NULL;
360 }
361
362 if (user_kobject_create(new)) {
363 sched_destroy_user(new);
364 key_put(new->uid_keyring);
365 key_put(new->session_keyring);
366 kmem_cache_free(uid_cachep, new);
367 uids_mutex_unlock();
368 return NULL;
369 }
370
153 /* 371 /*
154 * Before adding this, check whether we raced 372 * Before adding this, check whether we raced
155 * on adding the same user already.. 373 * on adding the same user already..
@@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
157 spin_lock_irq(&uidhash_lock); 375 spin_lock_irq(&uidhash_lock);
158 up = uid_hash_find(uid, hashent); 376 up = uid_hash_find(uid, hashent);
159 if (up) { 377 if (up) {
378 /* This case is not possible when CONFIG_FAIR_USER_SCHED
379 * is defined, since we serialize alloc_uid() using
380 * uids_mutex. Hence no need to call
381 * sched_destroy_user() or remove_user_sysfs_dir().
382 */
160 key_put(new->uid_keyring); 383 key_put(new->uid_keyring);
161 key_put(new->session_keyring); 384 key_put(new->session_keyring);
162 kmem_cache_free(uid_cachep, new); 385 kmem_cache_free(uid_cachep, new);
@@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
167 spin_unlock_irq(&uidhash_lock); 390 spin_unlock_irq(&uidhash_lock);
168 391
169 } 392 }
393
394 uids_mutex_unlock();
395
170 return up; 396 return up;
171} 397}
172 398
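
alloc_uid() above keeps its optimistic pattern: look up under the lock, drop it, allocate, retake the lock and re-check before inserting, discarding the new object if another caller raced in. The new comment notes the re-check cannot trigger once uids_mutex serializes callers under CONFIG_FAIR_USER_SCHED. A hedged sketch of the bare pattern; a toy linked list stands in for the uid hash and locking is reduced to comments.

/*
 * The "allocate, then re-check under the lock" pattern kept by alloc_uid().
 */
#include <stdio.h>
#include <stdlib.h>

struct user {
	int uid;
	struct user *next;
};

static struct user *uid_list;

static struct user *uid_find(int uid)
{
	struct user *u;

	for (u = uid_list; u; u = u->next)
		if (u->uid == uid)
			return u;
	return NULL;
}

static struct user *alloc_uid(int uid)
{
	struct user *up, *new;

	/* lock */ up = uid_find(uid); /* unlock */
	if (up)
		return up;

	new = calloc(1, sizeof(*new));     /* may sleep, done unlocked */
	if (!new)
		return NULL;
	new->uid = uid;

	/* lock */
	up = uid_find(uid);                /* did someone beat us to it? */
	if (up) {
		free(new);                 /* yes: discard our copy       */
	} else {
		new->next = uid_list;      /* no: publish the new entry   */
		uid_list = new;
		up = new;
	}
	/* unlock */
	return up;
}

int main(void)
{
	struct user *a = alloc_uid(1000);
	struct user *b = alloc_uid(1000);  /* second lookup reuses the entry */

	printf("same object: %s\n", a == b ? "yes" : "no");
	return 0;
}
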
@@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user)
184 atomic_dec(&old_user->processes); 410 atomic_dec(&old_user->processes);
185 switch_uid_keyring(new_user); 411 switch_uid_keyring(new_user);
186 current->user = new_user; 412 current->user = new_user;
413 sched_switch_user(current);
187 414
188 /* 415 /*
189 * We need to synchronize with __sigqueue_alloc() 416 * We need to synchronize with __sigqueue_alloc()