Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                 |   16
-rw-r--r--  kernel/delayacct.c             |    2
-rw-r--r--  kernel/exit.c                  |    6
-rw-r--r--  kernel/fork.c                  |    6
-rw-r--r--  kernel/hrtimer.c               |   24
-rw-r--r--  kernel/ksysfs.c                |    8
-rw-r--r--  kernel/lockdep.c               |   26
-rw-r--r--  kernel/lockdep_proc.c          |   61
-rw-r--r--  kernel/mutex.c                 |   35
-rw-r--r--  kernel/nsproxy.c               |   15
-rw-r--r--  kernel/posix-timers.c          |    6
-rw-r--r--  kernel/rcupdate.c              |    8
-rw-r--r--  kernel/sched.c                 | 1445
-rw-r--r--  kernel/sched_debug.c           |  282
-rw-r--r--  kernel/sched_fair.c            |  811
-rw-r--r--  kernel/sched_idletask.c        |    8
-rw-r--r--  kernel/sched_rt.c              |   19
-rw-r--r--  kernel/sched_stats.h           |   28
-rw-r--r--  kernel/softirq.c               |    4
-rw-r--r--  kernel/sysctl.c                |   41
-rw-r--r--  kernel/time/Kconfig            |    5
-rw-r--r--  kernel/time/Makefile           |    2
-rw-r--r--  kernel/time/clockevents.c      |    3
-rw-r--r--  kernel/time/tick-broadcast.c   |   44
-rw-r--r--  kernel/time/tick-common.c      |    5
-rw-r--r--  kernel/user.c                  |  249
26 files changed, 1834 insertions, 1325 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index eb0f9165b401..2924251a6547 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb) | |||
847 | } | 847 | } |
848 | 848 | ||
849 | /* Receive messages from netlink socket. */ | 849 | /* Receive messages from netlink socket. */ |
850 | static void audit_receive(struct sock *sk, int length) | 850 | static void audit_receive(struct sk_buff *skb) |
851 | { | 851 | { |
852 | struct sk_buff *skb; | ||
853 | unsigned int qlen; | ||
854 | |||
855 | mutex_lock(&audit_cmd_mutex); | 852 | mutex_lock(&audit_cmd_mutex); |
856 | 853 | audit_receive_skb(skb); | |
857 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { | ||
858 | skb = skb_dequeue(&sk->sk_receive_queue); | ||
859 | audit_receive_skb(skb); | ||
860 | kfree_skb(skb); | ||
861 | } | ||
862 | mutex_unlock(&audit_cmd_mutex); | 854 | mutex_unlock(&audit_cmd_mutex); |
863 | } | 855 | } |
864 | 856 | ||
@@ -876,8 +868,8 @@ static int __init audit_init(void) | |||
876 | 868 | ||
877 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 869 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
878 | audit_default ? "enabled" : "disabled"); | 870 | audit_default ? "enabled" : "disabled"); |
879 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, | 871 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, |
880 | NULL, THIS_MODULE); | 872 | audit_receive, NULL, THIS_MODULE); |
881 | if (!audit_sock) | 873 | if (!audit_sock) |
882 | audit_panic("cannot initialize netlink socket"); | 874 | audit_panic("cannot initialize netlink socket"); |
883 | else | 875 | else |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e697829633..09e9574eeb26 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
119 | * No locking available for sched_info (and too expensive to add one) | 119 | * No locking available for sched_info (and too expensive to add one) |
120 | * Mitigate by taking snapshot of values | 120 | * Mitigate by taking snapshot of values |
121 | */ | 121 | */ |
122 | t1 = tsk->sched_info.pcnt; | 122 | t1 = tsk->sched_info.pcount; |
123 | t2 = tsk->sched_info.run_delay; | 123 | t2 = tsk->sched_info.run_delay; |
124 | t3 = tsk->sched_info.cpu_time; | 124 | t3 = tsk->sched_info.cpu_time; |
125 | 125 | ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d1..7f7959de4a87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
111 | */ | 111 | */ |
112 | sig->utime = cputime_add(sig->utime, tsk->utime); | 112 | sig->utime = cputime_add(sig->utime, tsk->utime); |
113 | sig->stime = cputime_add(sig->stime, tsk->stime); | 113 | sig->stime = cputime_add(sig->stime, tsk->stime); |
114 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | ||
114 | sig->min_flt += tsk->min_flt; | 115 | sig->min_flt += tsk->min_flt; |
115 | sig->maj_flt += tsk->maj_flt; | 116 | sig->maj_flt += tsk->maj_flt; |
116 | sig->nvcsw += tsk->nvcsw; | 117 | sig->nvcsw += tsk->nvcsw; |
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, | |||
1242 | cputime_add(p->stime, | 1243 | cputime_add(p->stime, |
1243 | cputime_add(sig->stime, | 1244 | cputime_add(sig->stime, |
1244 | sig->cstime))); | 1245 | sig->cstime))); |
1246 | psig->cgtime = | ||
1247 | cputime_add(psig->cgtime, | ||
1248 | cputime_add(p->gtime, | ||
1249 | cputime_add(sig->gtime, | ||
1250 | sig->cgtime))); | ||
1245 | psig->cmin_flt += | 1251 | psig->cmin_flt += |
1246 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1252 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1247 | psig->cmaj_flt += | 1253 | psig->cmaj_flt += |
diff --git a/kernel/fork.c b/kernel/fork.c
index 33f12f48684a..3fc3c1383912 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
877 | sig->tty_old_pgrp = NULL; | 877 | sig->tty_old_pgrp = NULL; |
878 | 878 | ||
879 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | 879 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; |
880 | sig->gtime = cputime_zero; | ||
881 | sig->cgtime = cputime_zero; | ||
880 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | 882 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; |
881 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | 883 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; |
882 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | 884 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; |
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1045 | 1047 | ||
1046 | p->utime = cputime_zero; | 1048 | p->utime = cputime_zero; |
1047 | p->stime = cputime_zero; | 1049 | p->stime = cputime_zero; |
1050 | p->gtime = cputime_zero; | ||
1048 | 1051 | ||
1049 | #ifdef CONFIG_TASK_XACCT | 1052 | #ifdef CONFIG_TASK_XACCT |
1050 | p->rchar = 0; /* I/O counter: bytes read */ | 1053 | p->rchar = 0; /* I/O counter: bytes read */ |
@@ -1608,7 +1611,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1608 | err = -EINVAL; | 1611 | err = -EINVAL; |
1609 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | 1612 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
1610 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | 1613 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
1611 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) | 1614 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| |
1615 | CLONE_NEWNET)) | ||
1612 | goto bad_unshare_out; | 1616 | goto bad_unshare_out; |
1613 | 1617 | ||
1614 | if ((err = unshare_thread(unshare_flags))) | 1618 | if ((err = unshare_thread(unshare_flags))) |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c21ca6bfaa66..dc8a4451d79b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -277,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | |||
277 | } | 277 | } |
278 | 278 | ||
279 | EXPORT_SYMBOL_GPL(ktime_add_ns); | 279 | EXPORT_SYMBOL_GPL(ktime_add_ns); |
280 | |||
281 | /** | ||
282 | * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable | ||
283 | * @kt: minuend | ||
284 | * @nsec: the scalar nsec value to subtract | ||
285 | * | ||
286 | * Returns the subtraction of @nsec from @kt in ktime_t format | ||
287 | */ | ||
288 | ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) | ||
289 | { | ||
290 | ktime_t tmp; | ||
291 | |||
292 | if (likely(nsec < NSEC_PER_SEC)) { | ||
293 | tmp.tv64 = nsec; | ||
294 | } else { | ||
295 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | ||
296 | |||
297 | tmp = ktime_set((long)nsec, rem); | ||
298 | } | ||
299 | |||
300 | return ktime_sub(kt, tmp); | ||
301 | } | ||
302 | |||
303 | EXPORT_SYMBOL_GPL(ktime_sub_ns); | ||
280 | # endif /* !CONFIG_KTIME_SCALAR */ | 304 | # endif /* !CONFIG_KTIME_SCALAR */ |
281 | 305 | ||
282 | /* | 306 | /* |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c7..6046939d0804 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
17 | #include <linux/sched.h> | ||
17 | 18 | ||
18 | #define KERNEL_ATTR_RO(_name) \ | 19 | #define KERNEL_ATTR_RO(_name) \ |
19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 20 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void) | |||
116 | &notes_attr); | 117 | &notes_attr); |
117 | } | 118 | } |
118 | 119 | ||
120 | /* | ||
121 | * Create "/sys/kernel/uids" directory and corresponding root user's | ||
122 | * directory under it. | ||
123 | */ | ||
124 | if (!error) | ||
125 | error = uids_kobject_init(); | ||
126 | |||
119 | return error; | 127 | return error; |
120 | } | 128 | } |
121 | 129 | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 734da579ad13..a6f1ee9c92d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1521,7 +1521,7 @@ cache_hit: | |||
1521 | } | 1521 | } |
1522 | 1522 | ||
1523 | static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | 1523 | static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, |
1524 | struct held_lock *hlock, int chain_head) | 1524 | struct held_lock *hlock, int chain_head, u64 chain_key) |
1525 | { | 1525 | { |
1526 | /* | 1526 | /* |
1527 | * Trylock needs to maintain the stack of held locks, but it | 1527 | * Trylock needs to maintain the stack of held locks, but it |
@@ -1534,7 +1534,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | |||
1534 | * graph_lock for us) | 1534 | * graph_lock for us) |
1535 | */ | 1535 | */ |
1536 | if (!hlock->trylock && (hlock->check == 2) && | 1536 | if (!hlock->trylock && (hlock->check == 2) && |
1537 | lookup_chain_cache(curr->curr_chain_key, hlock->class)) { | 1537 | lookup_chain_cache(chain_key, hlock->class)) { |
1538 | /* | 1538 | /* |
1539 | * Check whether last held lock: | 1539 | * Check whether last held lock: |
1540 | * | 1540 | * |
@@ -1576,7 +1576,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | |||
1576 | #else | 1576 | #else |
1577 | static inline int validate_chain(struct task_struct *curr, | 1577 | static inline int validate_chain(struct task_struct *curr, |
1578 | struct lockdep_map *lock, struct held_lock *hlock, | 1578 | struct lockdep_map *lock, struct held_lock *hlock, |
1579 | int chain_head) | 1579 | int chain_head, u64 chain_key) |
1580 | { | 1580 | { |
1581 | return 1; | 1581 | return 1; |
1582 | } | 1582 | } |
@@ -2450,11 +2450,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2450 | chain_head = 1; | 2450 | chain_head = 1; |
2451 | } | 2451 | } |
2452 | chain_key = iterate_chain_key(chain_key, id); | 2452 | chain_key = iterate_chain_key(chain_key, id); |
2453 | curr->curr_chain_key = chain_key; | ||
2454 | 2453 | ||
2455 | if (!validate_chain(curr, lock, hlock, chain_head)) | 2454 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) |
2456 | return 0; | 2455 | return 0; |
2457 | 2456 | ||
2457 | curr->curr_chain_key = chain_key; | ||
2458 | curr->lockdep_depth++; | 2458 | curr->lockdep_depth++; |
2459 | check_chain_key(curr); | 2459 | check_chain_key(curr); |
2460 | #ifdef CONFIG_DEBUG_LOCKDEP | 2460 | #ifdef CONFIG_DEBUG_LOCKDEP |
@@ -3199,3 +3199,19 @@ void debug_show_held_locks(struct task_struct *task) | |||
3199 | } | 3199 | } |
3200 | 3200 | ||
3201 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3201 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3202 | |||
3203 | void lockdep_sys_exit(void) | ||
3204 | { | ||
3205 | struct task_struct *curr = current; | ||
3206 | |||
3207 | if (unlikely(curr->lockdep_depth)) { | ||
3208 | if (!debug_locks_off()) | ||
3209 | return; | ||
3210 | printk("\n================================================\n"); | ||
3211 | printk( "[ BUG: lock held when returning to user space! ]\n"); | ||
3212 | printk( "------------------------------------------------\n"); | ||
3213 | printk("%s/%d is leaving the kernel with locks still held!\n", | ||
3214 | curr->comm, curr->pid); | ||
3215 | lockdep_print_held_locks(curr); | ||
3216 | } | ||
3217 | } | ||
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index c851b2dcc685..8a135bd163c2 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,28 +25,38 @@ | |||
25 | 25 | ||
26 | static void *l_next(struct seq_file *m, void *v, loff_t *pos) | 26 | static void *l_next(struct seq_file *m, void *v, loff_t *pos) |
27 | { | 27 | { |
28 | struct lock_class *class = v; | 28 | struct lock_class *class; |
29 | 29 | ||
30 | (*pos)++; | 30 | (*pos)++; |
31 | 31 | ||
32 | if (class->lock_entry.next != &all_lock_classes) | 32 | if (v == SEQ_START_TOKEN) |
33 | class = list_entry(class->lock_entry.next, struct lock_class, | 33 | class = m->private; |
34 | lock_entry); | 34 | else { |
35 | else | 35 | class = v; |
36 | class = NULL; | 36 | |
37 | m->private = class; | 37 | if (class->lock_entry.next != &all_lock_classes) |
38 | class = list_entry(class->lock_entry.next, | ||
39 | struct lock_class, lock_entry); | ||
40 | else | ||
41 | class = NULL; | ||
42 | } | ||
38 | 43 | ||
39 | return class; | 44 | return class; |
40 | } | 45 | } |
41 | 46 | ||
42 | static void *l_start(struct seq_file *m, loff_t *pos) | 47 | static void *l_start(struct seq_file *m, loff_t *pos) |
43 | { | 48 | { |
44 | struct lock_class *class = m->private; | 49 | struct lock_class *class; |
50 | loff_t i = 0; | ||
45 | 51 | ||
46 | if (&class->lock_entry == all_lock_classes.next) | 52 | if (*pos == 0) |
47 | seq_printf(m, "all lock classes:\n"); | 53 | return SEQ_START_TOKEN; |
48 | 54 | ||
49 | return class; | 55 | list_for_each_entry(class, &all_lock_classes, lock_entry) { |
56 | if (++i == *pos) | ||
57 | return class; | ||
58 | } | ||
59 | return NULL; | ||
50 | } | 60 | } |
51 | 61 | ||
52 | static void l_stop(struct seq_file *m, void *v) | 62 | static void l_stop(struct seq_file *m, void *v) |
@@ -101,10 +111,15 @@ static void print_name(struct seq_file *m, struct lock_class *class) | |||
101 | static int l_show(struct seq_file *m, void *v) | 111 | static int l_show(struct seq_file *m, void *v) |
102 | { | 112 | { |
103 | unsigned long nr_forward_deps, nr_backward_deps; | 113 | unsigned long nr_forward_deps, nr_backward_deps; |
104 | struct lock_class *class = m->private; | 114 | struct lock_class *class = v; |
105 | struct lock_list *entry; | 115 | struct lock_list *entry; |
106 | char c1, c2, c3, c4; | 116 | char c1, c2, c3, c4; |
107 | 117 | ||
118 | if (v == SEQ_START_TOKEN) { | ||
119 | seq_printf(m, "all lock classes:\n"); | ||
120 | return 0; | ||
121 | } | ||
122 | |||
108 | seq_printf(m, "%p", class->key); | 123 | seq_printf(m, "%p", class->key); |
109 | #ifdef CONFIG_DEBUG_LOCKDEP | 124 | #ifdef CONFIG_DEBUG_LOCKDEP |
110 | seq_printf(m, " OPS:%8ld", class->ops); | 125 | seq_printf(m, " OPS:%8ld", class->ops); |
@@ -523,10 +538,11 @@ static void *ls_start(struct seq_file *m, loff_t *pos) | |||
523 | { | 538 | { |
524 | struct lock_stat_seq *data = m->private; | 539 | struct lock_stat_seq *data = m->private; |
525 | 540 | ||
526 | if (data->iter == data->stats) | 541 | if (*pos == 0) |
527 | seq_header(m); | 542 | return SEQ_START_TOKEN; |
528 | 543 | ||
529 | if (data->iter == data->iter_end) | 544 | data->iter = data->stats + *pos; |
545 | if (data->iter >= data->iter_end) | ||
530 | data->iter = NULL; | 546 | data->iter = NULL; |
531 | 547 | ||
532 | return data->iter; | 548 | return data->iter; |
@@ -538,8 +554,13 @@ static void *ls_next(struct seq_file *m, void *v, loff_t *pos) | |||
538 | 554 | ||
539 | (*pos)++; | 555 | (*pos)++; |
540 | 556 | ||
541 | data->iter = v; | 557 | if (v == SEQ_START_TOKEN) |
542 | data->iter++; | 558 | data->iter = data->stats; |
559 | else { | ||
560 | data->iter = v; | ||
561 | data->iter++; | ||
562 | } | ||
563 | |||
543 | if (data->iter == data->iter_end) | 564 | if (data->iter == data->iter_end) |
544 | data->iter = NULL; | 565 | data->iter = NULL; |
545 | 566 | ||
@@ -552,9 +573,11 @@ static void ls_stop(struct seq_file *m, void *v) | |||
552 | 573 | ||
553 | static int ls_show(struct seq_file *m, void *v) | 574 | static int ls_show(struct seq_file *m, void *v) |
554 | { | 575 | { |
555 | struct lock_stat_seq *data = m->private; | 576 | if (v == SEQ_START_TOKEN) |
577 | seq_header(m); | ||
578 | else | ||
579 | seq_stats(m, v); | ||
556 | 580 | ||
557 | seq_stats(m, data->iter); | ||
558 | return 0; | 581 | return 0; |
559 | } | 582 | } |
560 | 583 | ||
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 691b86564dd9..d7fe50cc556f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -51,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | |||
51 | 51 | ||
52 | EXPORT_SYMBOL(__mutex_init); | 52 | EXPORT_SYMBOL(__mutex_init); |
53 | 53 | ||
54 | #ifndef CONFIG_DEBUG_LOCK_ALLOC | ||
54 | /* | 55 | /* |
55 | * We split the mutex lock/unlock logic into separate fastpath and | 56 | * We split the mutex lock/unlock logic into separate fastpath and |
56 | * slowpath functions, to reduce the register pressure on the fastpath. | 57 | * slowpath functions, to reduce the register pressure on the fastpath. |
@@ -92,6 +93,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock) | |||
92 | } | 93 | } |
93 | 94 | ||
94 | EXPORT_SYMBOL(mutex_lock); | 95 | EXPORT_SYMBOL(mutex_lock); |
96 | #endif | ||
95 | 97 | ||
96 | static void fastcall noinline __sched | 98 | static void fastcall noinline __sched |
97 | __mutex_unlock_slowpath(atomic_t *lock_count); | 99 | __mutex_unlock_slowpath(atomic_t *lock_count); |
@@ -122,7 +124,8 @@ EXPORT_SYMBOL(mutex_unlock); | |||
122 | * Lock a mutex (possibly interruptible), slowpath: | 124 | * Lock a mutex (possibly interruptible), slowpath: |
123 | */ | 125 | */ |
124 | static inline int __sched | 126 | static inline int __sched |
125 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | 127 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
128 | unsigned long ip) | ||
126 | { | 129 | { |
127 | struct task_struct *task = current; | 130 | struct task_struct *task = current; |
128 | struct mutex_waiter waiter; | 131 | struct mutex_waiter waiter; |
@@ -132,7 +135,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
132 | spin_lock_mutex(&lock->wait_lock, flags); | 135 | spin_lock_mutex(&lock->wait_lock, flags); |
133 | 136 | ||
134 | debug_mutex_lock_common(lock, &waiter); | 137 | debug_mutex_lock_common(lock, &waiter); |
135 | mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | 138 | mutex_acquire(&lock->dep_map, subclass, 0, ip); |
136 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 139 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); |
137 | 140 | ||
138 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | 141 | /* add waiting tasks to the end of the waitqueue (FIFO): */ |
@@ -143,7 +146,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
143 | if (old_val == 1) | 146 | if (old_val == 1) |
144 | goto done; | 147 | goto done; |
145 | 148 | ||
146 | lock_contended(&lock->dep_map, _RET_IP_); | 149 | lock_contended(&lock->dep_map, ip); |
147 | 150 | ||
148 | for (;;) { | 151 | for (;;) { |
149 | /* | 152 | /* |
@@ -166,7 +169,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
166 | if (unlikely(state == TASK_INTERRUPTIBLE && | 169 | if (unlikely(state == TASK_INTERRUPTIBLE && |
167 | signal_pending(task))) { | 170 | signal_pending(task))) { |
168 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | 171 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); |
169 | mutex_release(&lock->dep_map, 1, _RET_IP_); | 172 | mutex_release(&lock->dep_map, 1, ip); |
170 | spin_unlock_mutex(&lock->wait_lock, flags); | 173 | spin_unlock_mutex(&lock->wait_lock, flags); |
171 | 174 | ||
172 | debug_mutex_free_waiter(&waiter); | 175 | debug_mutex_free_waiter(&waiter); |
@@ -197,20 +200,12 @@ done: | |||
197 | return 0; | 200 | return 0; |
198 | } | 201 | } |
199 | 202 | ||
200 | static void fastcall noinline __sched | ||
201 | __mutex_lock_slowpath(atomic_t *lock_count) | ||
202 | { | ||
203 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
204 | |||
205 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); | ||
206 | } | ||
207 | |||
208 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 203 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
209 | void __sched | 204 | void __sched |
210 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | 205 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) |
211 | { | 206 | { |
212 | might_sleep(); | 207 | might_sleep(); |
213 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); | 208 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); |
214 | } | 209 | } |
215 | 210 | ||
216 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 211 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
@@ -219,7 +214,7 @@ int __sched | |||
219 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | 214 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) |
220 | { | 215 | { |
221 | might_sleep(); | 216 | might_sleep(); |
222 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); | 217 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); |
223 | } | 218 | } |
224 | 219 | ||
225 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 220 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
@@ -271,6 +266,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count) | |||
271 | __mutex_unlock_common_slowpath(lock_count, 1); | 266 | __mutex_unlock_common_slowpath(lock_count, 1); |
272 | } | 267 | } |
273 | 268 | ||
269 | #ifndef CONFIG_DEBUG_LOCK_ALLOC | ||
274 | /* | 270 | /* |
275 | * Here come the less common (and hence less performance-critical) APIs: | 271 | * Here come the less common (and hence less performance-critical) APIs: |
276 | * mutex_lock_interruptible() and mutex_trylock(). | 272 | * mutex_lock_interruptible() and mutex_trylock(). |
@@ -298,13 +294,22 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) | |||
298 | 294 | ||
299 | EXPORT_SYMBOL(mutex_lock_interruptible); | 295 | EXPORT_SYMBOL(mutex_lock_interruptible); |
300 | 296 | ||
297 | static void fastcall noinline __sched | ||
298 | __mutex_lock_slowpath(atomic_t *lock_count) | ||
299 | { | ||
300 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
301 | |||
302 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); | ||
303 | } | ||
304 | |||
301 | static int fastcall noinline __sched | 305 | static int fastcall noinline __sched |
302 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) | 306 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) |
303 | { | 307 | { |
304 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 308 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
305 | 309 | ||
306 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); | 310 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); |
307 | } | 311 | } |
312 | #endif | ||
308 | 313 | ||
309 | /* | 314 | /* |
310 | * Spinlock based trylock, we take the spinlock and check whether we | 315 | * Spinlock based trylock, we take the spinlock and check whether we |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a4fb7d46971f..f1decd21a534 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/mnt_namespace.h> | 20 | #include <linux/mnt_namespace.h> |
21 | #include <linux/utsname.h> | 21 | #include <linux/utsname.h> |
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <net/net_namespace.h> | ||
23 | 24 | ||
24 | static struct kmem_cache *nsproxy_cachep; | 25 | static struct kmem_cache *nsproxy_cachep; |
25 | 26 | ||
@@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
98 | goto out_user; | 99 | goto out_user; |
99 | } | 100 | } |
100 | 101 | ||
102 | new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); | ||
103 | if (IS_ERR(new_nsp->net_ns)) { | ||
104 | err = PTR_ERR(new_nsp->net_ns); | ||
105 | goto out_net; | ||
106 | } | ||
107 | |||
101 | return new_nsp; | 108 | return new_nsp; |
102 | 109 | ||
110 | out_net: | ||
111 | if (new_nsp->user_ns) | ||
112 | put_user_ns(new_nsp->user_ns); | ||
103 | out_user: | 113 | out_user: |
104 | if (new_nsp->pid_ns) | 114 | if (new_nsp->pid_ns) |
105 | put_pid_ns(new_nsp->pid_ns); | 115 | put_pid_ns(new_nsp->pid_ns); |
@@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
132 | 142 | ||
133 | get_nsproxy(old_ns); | 143 | get_nsproxy(old_ns); |
134 | 144 | ||
135 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) | 145 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) |
136 | return 0; | 146 | return 0; |
137 | 147 | ||
138 | if (!capable(CAP_SYS_ADMIN)) { | 148 | if (!capable(CAP_SYS_ADMIN)) { |
@@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns) | |||
164 | put_pid_ns(ns->pid_ns); | 174 | put_pid_ns(ns->pid_ns); |
165 | if (ns->user_ns) | 175 | if (ns->user_ns) |
166 | put_user_ns(ns->user_ns); | 176 | put_user_ns(ns->user_ns); |
177 | put_net(ns->net_ns); | ||
167 | kmem_cache_free(nsproxy_cachep, ns); | 178 | kmem_cache_free(nsproxy_cachep, ns); |
168 | } | 179 | } |
169 | 180 | ||
@@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
177 | int err = 0; | 188 | int err = 0; |
178 | 189 | ||
179 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | | 190 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
180 | CLONE_NEWUSER))) | 191 | CLONE_NEWUSER | CLONE_NEWNET))) |
181 | return 0; | 192 | return 0; |
182 | 193 | ||
183 | if (!capable(CAP_SYS_ADMIN)) | 194 | if (!capable(CAP_SYS_ADMIN)) |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7a15afb73ed0..57efe0400bc2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -712,7 +712,7 @@ sys_timer_getoverrun(timer_t timer_id) | |||
712 | { | 712 | { |
713 | struct k_itimer *timr; | 713 | struct k_itimer *timr; |
714 | int overrun; | 714 | int overrun; |
715 | long flags; | 715 | unsigned long flags; |
716 | 716 | ||
717 | timr = lock_timer(timer_id, &flags); | 717 | timr = lock_timer(timer_id, &flags); |
718 | if (!timr) | 718 | if (!timr) |
@@ -784,7 +784,7 @@ sys_timer_settime(timer_t timer_id, int flags, | |||
784 | struct k_itimer *timr; | 784 | struct k_itimer *timr; |
785 | struct itimerspec new_spec, old_spec; | 785 | struct itimerspec new_spec, old_spec; |
786 | int error = 0; | 786 | int error = 0; |
787 | long flag; | 787 | unsigned long flag; |
788 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; | 788 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; |
789 | 789 | ||
790 | if (!new_setting) | 790 | if (!new_setting) |
@@ -836,7 +836,7 @@ asmlinkage long | |||
836 | sys_timer_delete(timer_t timer_id) | 836 | sys_timer_delete(timer_t timer_id) |
837 | { | 837 | { |
838 | struct k_itimer *timer; | 838 | struct k_itimer *timer; |
839 | long flags; | 839 | unsigned long flags; |
840 | 840 | ||
841 | retry_delete: | 841 | retry_delete: |
842 | timer = lock_timer(timer_id, &flags); | 842 | timer = lock_timer(timer_id, &flags); |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2c2dd8410dc4..130214f3d229 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -49,6 +49,14 @@ | |||
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/mutex.h> | 50 | #include <linux/mutex.h> |
51 | 51 | ||
52 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
53 | static struct lock_class_key rcu_lock_key; | ||
54 | struct lockdep_map rcu_lock_map = | ||
55 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
56 | |||
57 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
58 | #endif | ||
59 | |||
52 | /* Definition for rcupdate control block. */ | 60 | /* Definition for rcupdate control block. */ |
53 | static struct rcu_ctrlblk rcu_ctrlblk = { | 61 | static struct rcu_ctrlblk rcu_ctrlblk = { |
54 | .cur = -300, | 62 | .cur = -300, |
diff --git a/kernel/sched.c b/kernel/sched.c
index 6107a0cd6325..bba57adb9504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -61,6 +61,7 @@ | |||
61 | #include <linux/delayacct.h> | 61 | #include <linux/delayacct.h> |
62 | #include <linux/reciprocal_div.h> | 62 | #include <linux/reciprocal_div.h> |
63 | #include <linux/unistd.h> | 63 | #include <linux/unistd.h> |
64 | #include <linux/pagemap.h> | ||
64 | 65 | ||
65 | #include <asm/tlb.h> | 66 | #include <asm/tlb.h> |
66 | 67 | ||
@@ -95,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
95 | /* | 96 | /* |
96 | * Some helpers for converting nanosecond timing to jiffy resolution | 97 | * Some helpers for converting nanosecond timing to jiffy resolution |
97 | */ | 98 | */ |
98 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 99 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) |
99 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
100 | 101 | ||
101 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 102 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
@@ -104,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
104 | /* | 105 | /* |
105 | * These are the 'tuning knobs' of the scheduler: | 106 | * These are the 'tuning knobs' of the scheduler: |
106 | * | 107 | * |
107 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 108 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
108 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
109 | * Timeslices get refilled after they expire. | 109 | * Timeslices get refilled after they expire. |
110 | */ | 110 | */ |
111 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
112 | #define DEF_TIMESLICE (100 * HZ / 1000) | 111 | #define DEF_TIMESLICE (100 * HZ / 1000) |
113 | 112 | ||
114 | #ifdef CONFIG_SMP | 113 | #ifdef CONFIG_SMP |
@@ -132,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | |||
132 | } | 131 | } |
133 | #endif | 132 | #endif |
134 | 133 | ||
135 | #define SCALE_PRIO(x, prio) \ | ||
136 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
137 | |||
138 | /* | ||
139 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
140 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
141 | */ | ||
142 | static unsigned int static_prio_timeslice(int static_prio) | ||
143 | { | ||
144 | if (static_prio == NICE_TO_PRIO(19)) | ||
145 | return 1; | ||
146 | |||
147 | if (static_prio < NICE_TO_PRIO(0)) | ||
148 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
149 | else | ||
150 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
151 | } | ||
152 | |||
153 | static inline int rt_policy(int policy) | 134 | static inline int rt_policy(int policy) |
154 | { | 135 | { |
155 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 136 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) |
@@ -170,31 +151,91 @@ struct rt_prio_array { | |||
170 | struct list_head queue[MAX_RT_PRIO]; | 151 | struct list_head queue[MAX_RT_PRIO]; |
171 | }; | 152 | }; |
172 | 153 | ||
173 | struct load_stat { | 154 | #ifdef CONFIG_FAIR_GROUP_SCHED |
174 | struct load_weight load; | 155 | |
175 | u64 load_update_start, load_update_last; | 156 | struct cfs_rq; |
176 | unsigned long delta_fair, delta_exec, delta_stat; | 157 | |
158 | /* task group related information */ | ||
159 | struct task_group { | ||
160 | /* schedulable entities of this group on each cpu */ | ||
161 | struct sched_entity **se; | ||
162 | /* runqueue "owned" by this group on each cpu */ | ||
163 | struct cfs_rq **cfs_rq; | ||
164 | unsigned long shares; | ||
165 | /* spinlock to serialize modification to shares */ | ||
166 | spinlock_t lock; | ||
167 | }; | ||
168 | |||
169 | /* Default task group's sched entity on each cpu */ | ||
170 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
171 | /* Default task group's cfs_rq on each cpu */ | ||
172 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | ||
173 | |||
174 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
175 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
176 | |||
177 | /* Default task group. | ||
178 | * Every task in system belong to this group at bootup. | ||
179 | */ | ||
180 | struct task_group init_task_group = { | ||
181 | .se = init_sched_entity_p, | ||
182 | .cfs_rq = init_cfs_rq_p, | ||
177 | }; | 183 | }; |
178 | 184 | ||
185 | #ifdef CONFIG_FAIR_USER_SCHED | ||
186 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | ||
187 | #else | ||
188 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | ||
189 | #endif | ||
190 | |||
191 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | ||
192 | |||
193 | /* return group to which a task belongs */ | ||
194 | static inline struct task_group *task_group(struct task_struct *p) | ||
195 | { | ||
196 | struct task_group *tg; | ||
197 | |||
198 | #ifdef CONFIG_FAIR_USER_SCHED | ||
199 | tg = p->user->tg; | ||
200 | #else | ||
201 | tg = &init_task_group; | ||
202 | #endif | ||
203 | |||
204 | return tg; | ||
205 | } | ||
206 | |||
207 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
208 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
209 | { | ||
210 | p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; | ||
211 | p->se.parent = task_group(p)->se[task_cpu(p)]; | ||
212 | } | ||
213 | |||
214 | #else | ||
215 | |||
216 | static inline void set_task_cfs_rq(struct task_struct *p) { } | ||
217 | |||
218 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
219 | |||
179 | /* CFS-related fields in a runqueue */ | 220 | /* CFS-related fields in a runqueue */ |
180 | struct cfs_rq { | 221 | struct cfs_rq { |
181 | struct load_weight load; | 222 | struct load_weight load; |
182 | unsigned long nr_running; | 223 | unsigned long nr_running; |
183 | 224 | ||
184 | s64 fair_clock; | ||
185 | u64 exec_clock; | 225 | u64 exec_clock; |
186 | s64 wait_runtime; | 226 | u64 min_vruntime; |
187 | u64 sleeper_bonus; | ||
188 | unsigned long wait_runtime_overruns, wait_runtime_underruns; | ||
189 | 227 | ||
190 | struct rb_root tasks_timeline; | 228 | struct rb_root tasks_timeline; |
191 | struct rb_node *rb_leftmost; | 229 | struct rb_node *rb_leftmost; |
192 | struct rb_node *rb_load_balance_curr; | 230 | struct rb_node *rb_load_balance_curr; |
193 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
194 | /* 'curr' points to currently running entity on this cfs_rq. | 231 | /* 'curr' points to currently running entity on this cfs_rq. |
195 | * It is set to NULL otherwise (i.e when none are currently running). | 232 | * It is set to NULL otherwise (i.e when none are currently running). |
196 | */ | 233 | */ |
197 | struct sched_entity *curr; | 234 | struct sched_entity *curr; |
235 | |||
236 | unsigned long nr_spread_over; | ||
237 | |||
238 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
198 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 239 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
199 | 240 | ||
200 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 241 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
@@ -205,6 +246,8 @@ struct cfs_rq { | |||
205 | * list is used during load balance. | 246 | * list is used during load balance. |
206 | */ | 247 | */ |
207 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | 248 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ |
249 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
250 | struct rcu_head rcu; | ||
208 | #endif | 251 | #endif |
209 | }; | 252 | }; |
210 | 253 | ||
@@ -236,7 +279,7 @@ struct rq { | |||
236 | #ifdef CONFIG_NO_HZ | 279 | #ifdef CONFIG_NO_HZ |
237 | unsigned char in_nohz_recently; | 280 | unsigned char in_nohz_recently; |
238 | #endif | 281 | #endif |
239 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ | 282 | struct load_weight load; /* capture load from *all* tasks on this cpu */ |
240 | unsigned long nr_load_updates; | 283 | unsigned long nr_load_updates; |
241 | u64 nr_switches; | 284 | u64 nr_switches; |
242 | 285 | ||
@@ -288,16 +331,19 @@ struct rq { | |||
288 | unsigned long yld_exp_empty; | 331 | unsigned long yld_exp_empty; |
289 | unsigned long yld_act_empty; | 332 | unsigned long yld_act_empty; |
290 | unsigned long yld_both_empty; | 333 | unsigned long yld_both_empty; |
291 | unsigned long yld_cnt; | 334 | unsigned long yld_count; |
292 | 335 | ||
293 | /* schedule() stats */ | 336 | /* schedule() stats */ |
294 | unsigned long sched_switch; | 337 | unsigned long sched_switch; |
295 | unsigned long sched_cnt; | 338 | unsigned long sched_count; |
296 | unsigned long sched_goidle; | 339 | unsigned long sched_goidle; |
297 | 340 | ||
298 | /* try_to_wake_up() stats */ | 341 | /* try_to_wake_up() stats */ |
299 | unsigned long ttwu_cnt; | 342 | unsigned long ttwu_count; |
300 | unsigned long ttwu_local; | 343 | unsigned long ttwu_local; |
344 | |||
345 | /* BKL stats */ | ||
346 | unsigned long bkl_count; | ||
301 | #endif | 347 | #endif |
302 | struct lock_class_key rq_lock_key; | 348 | struct lock_class_key rq_lock_key; |
303 | }; | 349 | }; |
@@ -382,6 +428,37 @@ static void update_rq_clock(struct rq *rq) | |||
382 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 428 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
383 | 429 | ||
384 | /* | 430 | /* |
431 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
432 | */ | ||
433 | #ifdef CONFIG_SCHED_DEBUG | ||
434 | # define const_debug __read_mostly | ||
435 | #else | ||
436 | # define const_debug static const | ||
437 | #endif | ||
438 | |||
439 | /* | ||
440 | * Debugging: various feature bits | ||
441 | */ | ||
442 | enum { | ||
443 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | ||
444 | SCHED_FEAT_START_DEBIT = 2, | ||
445 | SCHED_FEAT_TREE_AVG = 4, | ||
446 | SCHED_FEAT_APPROX_AVG = 8, | ||
447 | SCHED_FEAT_WAKEUP_PREEMPT = 16, | ||
448 | SCHED_FEAT_PREEMPT_RESTRICT = 32, | ||
449 | }; | ||
450 | |||
451 | const_debug unsigned int sysctl_sched_features = | ||
452 | SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | | ||
453 | SCHED_FEAT_START_DEBIT *1 | | ||
454 | SCHED_FEAT_TREE_AVG *0 | | ||
455 | SCHED_FEAT_APPROX_AVG *0 | | ||
456 | SCHED_FEAT_WAKEUP_PREEMPT *1 | | ||
457 | SCHED_FEAT_PREEMPT_RESTRICT *1; | ||
458 | |||
459 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | ||
460 | |||
461 | /* | ||
385 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 462 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
386 | * clock constructed from sched_clock(): | 463 | * clock constructed from sched_clock(): |
387 | */ | 464 | */ |
@@ -399,18 +476,7 @@ unsigned long long cpu_clock(int cpu) | |||
399 | 476 | ||
400 | return now; | 477 | return now; |
401 | } | 478 | } |
402 | 479 | EXPORT_SYMBOL_GPL(cpu_clock); | |
403 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
404 | /* Change a task's ->cfs_rq if it moves across CPUs */ | ||
405 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
406 | { | ||
407 | p->se.cfs_rq = &task_rq(p)->cfs; | ||
408 | } | ||
409 | #else | ||
410 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
411 | { | ||
412 | } | ||
413 | #endif | ||
414 | 480 | ||
415 | #ifndef prepare_arch_switch | 481 | #ifndef prepare_arch_switch |
416 | # define prepare_arch_switch(next) do { } while (0) | 482 | # define prepare_arch_switch(next) do { } while (0) |
@@ -496,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
496 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 562 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
497 | __acquires(rq->lock) | 563 | __acquires(rq->lock) |
498 | { | 564 | { |
499 | struct rq *rq; | 565 | for (;;) { |
500 | 566 | struct rq *rq = task_rq(p); | |
501 | repeat_lock_task: | 567 | spin_lock(&rq->lock); |
502 | rq = task_rq(p); | 568 | if (likely(rq == task_rq(p))) |
503 | spin_lock(&rq->lock); | 569 | return rq; |
504 | if (unlikely(rq != task_rq(p))) { | ||
505 | spin_unlock(&rq->lock); | 570 | spin_unlock(&rq->lock); |
506 | goto repeat_lock_task; | ||
507 | } | 571 | } |
508 | return rq; | ||
509 | } | 572 | } |
510 | 573 | ||
511 | /* | 574 | /* |
@@ -518,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
518 | { | 581 | { |
519 | struct rq *rq; | 582 | struct rq *rq; |
520 | 583 | ||
521 | repeat_lock_task: | 584 | for (;;) { |
522 | local_irq_save(*flags); | 585 | local_irq_save(*flags); |
523 | rq = task_rq(p); | 586 | rq = task_rq(p); |
524 | spin_lock(&rq->lock); | 587 | spin_lock(&rq->lock); |
525 | if (unlikely(rq != task_rq(p))) { | 588 | if (likely(rq == task_rq(p))) |
589 | return rq; | ||
526 | spin_unlock_irqrestore(&rq->lock, *flags); | 590 | spin_unlock_irqrestore(&rq->lock, *flags); |
527 | goto repeat_lock_task; | ||
528 | } | 591 | } |
529 | return rq; | ||
530 | } | 592 | } |
531 | 593 | ||
532 | static inline void __task_rq_unlock(struct rq *rq) | 594 | static void __task_rq_unlock(struct rq *rq) |
533 | __releases(rq->lock) | 595 | __releases(rq->lock) |
534 | { | 596 | { |
535 | spin_unlock(&rq->lock); | 597 | spin_unlock(&rq->lock); |
@@ -544,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
544 | /* | 606 | /* |
545 | * this_rq_lock - lock this runqueue and disable interrupts. | 607 | * this_rq_lock - lock this runqueue and disable interrupts. |
546 | */ | 608 | */ |
547 | static inline struct rq *this_rq_lock(void) | 609 | static struct rq *this_rq_lock(void) |
548 | __acquires(rq->lock) | 610 | __acquires(rq->lock) |
549 | { | 611 | { |
550 | struct rq *rq; | 612 | struct rq *rq; |
@@ -644,19 +706,6 @@ static inline void resched_task(struct task_struct *p) | |||
644 | } | 706 | } |
645 | #endif | 707 | #endif |
646 | 708 | ||
647 | static u64 div64_likely32(u64 divident, unsigned long divisor) | ||
648 | { | ||
649 | #if BITS_PER_LONG == 32 | ||
650 | if (likely(divident <= 0xffffffffULL)) | ||
651 | return (u32)divident / divisor; | ||
652 | do_div(divident, divisor); | ||
653 | |||
654 | return divident; | ||
655 | #else | ||
656 | return divident / divisor; | ||
657 | #endif | ||
658 | } | ||
659 | |||
660 | #if BITS_PER_LONG == 32 | 709 | #if BITS_PER_LONG == 32 |
661 | # define WMULT_CONST (~0UL) | 710 | # define WMULT_CONST (~0UL) |
662 | #else | 711 | #else |
@@ -698,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | |||
698 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | 747 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
699 | } | 748 | } |
700 | 749 | ||
701 | static void update_load_add(struct load_weight *lw, unsigned long inc) | 750 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
702 | { | 751 | { |
703 | lw->weight += inc; | 752 | lw->weight += inc; |
704 | lw->inv_weight = 0; | ||
705 | } | 753 | } |
706 | 754 | ||
707 | static void update_load_sub(struct load_weight *lw, unsigned long dec) | 755 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
708 | { | 756 | { |
709 | lw->weight -= dec; | 757 | lw->weight -= dec; |
710 | lw->inv_weight = 0; | ||
711 | } | 758 | } |
712 | 759 | ||
713 | /* | 760 | /* |
@@ -783,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
783 | int *this_best_prio, struct rq_iterator *iterator); | 830 | int *this_best_prio, struct rq_iterator *iterator); |
784 | 831 | ||
785 | #include "sched_stats.h" | 832 | #include "sched_stats.h" |
786 | #include "sched_rt.c" | ||
787 | #include "sched_fair.c" | ||
788 | #include "sched_idletask.c" | 833 | #include "sched_idletask.c" |
834 | #include "sched_fair.c" | ||
835 | #include "sched_rt.c" | ||
789 | #ifdef CONFIG_SCHED_DEBUG | 836 | #ifdef CONFIG_SCHED_DEBUG |
790 | # include "sched_debug.c" | 837 | # include "sched_debug.c" |
791 | #endif | 838 | #endif |
792 | 839 | ||
793 | #define sched_class_highest (&rt_sched_class) | 840 | #define sched_class_highest (&rt_sched_class) |
794 | 841 | ||
795 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | ||
796 | { | ||
797 | if (rq->curr != rq->idle && ls->load.weight) { | ||
798 | ls->delta_exec += ls->delta_stat; | ||
799 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | ||
800 | ls->delta_stat = 0; | ||
801 | } | ||
802 | } | ||
803 | |||
804 | /* | 842 | /* |
805 | * Update delta_exec, delta_fair fields for rq. | 843 | * Update delta_exec, delta_fair fields for rq. |
806 | * | 844 | * |
807 | * delta_fair clock advances at a rate inversely proportional to | 845 | * delta_fair clock advances at a rate inversely proportional to |
808 | * total load (rq->ls.load.weight) on the runqueue, while | 846 | * total load (rq->load.weight) on the runqueue, while |
809 | * delta_exec advances at the same rate as wall-clock (provided | 847 | * delta_exec advances at the same rate as wall-clock (provided |
810 | * cpu is not idle). | 848 | * cpu is not idle). |
811 | * | 849 | * |
@@ -813,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) | |||
813 | * runqueue over any given interval. This (smoothened) load is used | 851 | * runqueue over any given interval. This (smoothened) load is used |
814 | * during load balance. | 852 | * during load balance. |
815 | * | 853 | * |
816 | * This function is called /before/ updating rq->ls.load | 854 | * This function is called /before/ updating rq->load |
817 | * and when switching tasks. | 855 | * and when switching tasks. |
818 | */ | 856 | */ |
819 | static void update_curr_load(struct rq *rq) | ||
820 | { | ||
821 | struct load_stat *ls = &rq->ls; | ||
822 | u64 start; | ||
823 | |||
824 | start = ls->load_update_start; | ||
825 | ls->load_update_start = rq->clock; | ||
826 | ls->delta_stat += rq->clock - start; | ||
827 | /* | ||
828 | * Stagger updates to ls->delta_fair. Very frequent updates | ||
829 | * can be expensive. | ||
830 | */ | ||
831 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
832 | __update_curr_load(rq, ls); | ||
833 | } | ||
834 | |||
835 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 857 | static inline void inc_load(struct rq *rq, const struct task_struct *p) |
836 | { | 858 | { |
837 | update_curr_load(rq); | 859 | update_load_add(&rq->load, p->se.load.weight); |
838 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
839 | } | 860 | } |
840 | 861 | ||
841 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | 862 | static inline void dec_load(struct rq *rq, const struct task_struct *p) |
842 | { | 863 | { |
843 | update_curr_load(rq); | 864 | update_load_sub(&rq->load, p->se.load.weight); |
844 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
845 | } | 865 | } |
846 | 866 | ||
847 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 867 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
@@ -858,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) | |||
858 | 878 | ||
859 | static void set_load_weight(struct task_struct *p) | 879 | static void set_load_weight(struct task_struct *p) |
860 | { | 880 | { |
861 | p->se.wait_runtime = 0; | ||
862 | |||
863 | if (task_has_rt_policy(p)) { | 881 | if (task_has_rt_policy(p)) { |
864 | p->se.load.weight = prio_to_weight[0] * 2; | 882 | p->se.load.weight = prio_to_weight[0] * 2; |
865 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | 883 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
@@ -951,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
951 | } | 969 | } |
952 | 970 | ||
953 | /* | 971 | /* |
954 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
955 | */ | ||
956 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
957 | { | ||
958 | update_rq_clock(rq); | ||
959 | |||
960 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
961 | rq->nr_uninterruptible--; | ||
962 | |||
963 | enqueue_task(rq, p, 0); | ||
964 | inc_nr_running(p, rq); | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | * deactivate_task - remove a task from the runqueue. | 972 | * deactivate_task - remove a task from the runqueue. |
969 | */ | 973 | */ |
970 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 974 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
@@ -988,32 +992,50 @@ inline int task_curr(const struct task_struct *p) | |||
988 | /* Used instead of source_load when we know the type == 0 */ | 992 | /* Used instead of source_load when we know the type == 0 */ |
989 | unsigned long weighted_cpuload(const int cpu) | 993 | unsigned long weighted_cpuload(const int cpu) |
990 | { | 994 | { |
991 | return cpu_rq(cpu)->ls.load.weight; | 995 | return cpu_rq(cpu)->load.weight; |
992 | } | 996 | } |
993 | 997 | ||
994 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 998 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
995 | { | 999 | { |
996 | #ifdef CONFIG_SMP | 1000 | #ifdef CONFIG_SMP |
997 | task_thread_info(p)->cpu = cpu; | 1001 | task_thread_info(p)->cpu = cpu; |
998 | set_task_cfs_rq(p); | ||
999 | #endif | 1002 | #endif |
1003 | set_task_cfs_rq(p); | ||
1000 | } | 1004 | } |
1001 | 1005 | ||
1002 | #ifdef CONFIG_SMP | 1006 | #ifdef CONFIG_SMP |
1003 | 1007 | ||
1008 | /* | ||
1009 | * Is this task likely cache-hot: | ||
1010 | */ | ||
1011 | static inline int | ||
1012 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
1013 | { | ||
1014 | s64 delta; | ||
1015 | |||
1016 | if (p->sched_class != &fair_sched_class) | ||
1017 | return 0; | ||
1018 | |||
1019 | if (sysctl_sched_migration_cost == -1) | ||
1020 | return 1; | ||
1021 | if (sysctl_sched_migration_cost == 0) | ||
1022 | return 0; | ||
1023 | |||
1024 | delta = now - p->se.exec_start; | ||
1025 | |||
1026 | return delta < (s64)sysctl_sched_migration_cost; | ||
1027 | } | ||
1028 | |||
1029 | |||
1004 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1030 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1005 | { | 1031 | { |
1006 | int old_cpu = task_cpu(p); | 1032 | int old_cpu = task_cpu(p); |
1007 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 1033 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); |
1008 | u64 clock_offset, fair_clock_offset; | 1034 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), |
1035 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | ||
1036 | u64 clock_offset; | ||
1009 | 1037 | ||
1010 | clock_offset = old_rq->clock - new_rq->clock; | 1038 | clock_offset = old_rq->clock - new_rq->clock; |
1011 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; | ||
1012 | |||
1013 | if (p->se.wait_start_fair) | ||
1014 | p->se.wait_start_fair -= fair_clock_offset; | ||
1015 | if (p->se.sleep_start_fair) | ||
1016 | p->se.sleep_start_fair -= fair_clock_offset; | ||
1017 | 1039 | ||
1018 | #ifdef CONFIG_SCHEDSTATS | 1040 | #ifdef CONFIG_SCHEDSTATS |
1019 | if (p->se.wait_start) | 1041 | if (p->se.wait_start) |
@@ -1022,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1022 | p->se.sleep_start -= clock_offset; | 1044 | p->se.sleep_start -= clock_offset; |
1023 | if (p->se.block_start) | 1045 | if (p->se.block_start) |
1024 | p->se.block_start -= clock_offset; | 1046 | p->se.block_start -= clock_offset; |
1047 | if (old_cpu != new_cpu) { | ||
1048 | schedstat_inc(p, se.nr_migrations); | ||
1049 | if (task_hot(p, old_rq->clock, NULL)) | ||
1050 | schedstat_inc(p, se.nr_forced2_migrations); | ||
1051 | } | ||
1025 | #endif | 1052 | #endif |
1053 | p->se.vruntime -= old_cfsrq->min_vruntime - | ||
1054 | new_cfsrq->min_vruntime; | ||
1026 | 1055 | ||
1027 | __set_task_cpu(p, new_cpu); | 1056 | __set_task_cpu(p, new_cpu); |
1028 | } | 1057 | } |
@@ -1077,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p) | |||
1077 | int running, on_rq; | 1106 | int running, on_rq; |
1078 | struct rq *rq; | 1107 | struct rq *rq; |
1079 | 1108 | ||
1080 | repeat: | 1109 | for (;;) { |
1081 | /* | 1110 | /* |
1082 | * We do the initial early heuristics without holding | 1111 | * We do the initial early heuristics without holding |
1083 | * any task-queue locks at all. We'll only try to get | 1112 | * any task-queue locks at all. We'll only try to get |
1084 | * the runqueue lock when things look like they will | 1113 | * the runqueue lock when things look like they will |
1085 | * work out! | 1114 | * work out! |
1086 | */ | 1115 | */ |
1087 | rq = task_rq(p); | 1116 | rq = task_rq(p); |
1088 | 1117 | ||
1089 | /* | 1118 | /* |
1090 | * If the task is actively running on another CPU | 1119 | * If the task is actively running on another CPU |
1091 | * still, just relax and busy-wait without holding | 1120 | * still, just relax and busy-wait without holding |
1092 | * any locks. | 1121 | * any locks. |
1093 | * | 1122 | * |
1094 | * NOTE! Since we don't hold any locks, it's not | 1123 | * NOTE! Since we don't hold any locks, it's not |
1095 | * even sure that "rq" stays as the right runqueue! | 1124 | * even sure that "rq" stays as the right runqueue! |
1096 | * But we don't care, since "task_running()" will | 1125 | * But we don't care, since "task_running()" will |
1097 | * return false if the runqueue has changed and p | 1126 | * return false if the runqueue has changed and p |
1098 | * is actually now running somewhere else! | 1127 | * is actually now running somewhere else! |
1099 | */ | 1128 | */ |
1100 | while (task_running(rq, p)) | 1129 | while (task_running(rq, p)) |
1101 | cpu_relax(); | 1130 | cpu_relax(); |
1102 | 1131 | ||
1103 | /* | 1132 | /* |
1104 | * Ok, time to look more closely! We need the rq | 1133 | * Ok, time to look more closely! We need the rq |
1105 | * lock now, to be *sure*. If we're wrong, we'll | 1134 | * lock now, to be *sure*. If we're wrong, we'll |
1106 | * just go back and repeat. | 1135 | * just go back and repeat. |
1107 | */ | 1136 | */ |
1108 | rq = task_rq_lock(p, &flags); | 1137 | rq = task_rq_lock(p, &flags); |
1109 | running = task_running(rq, p); | 1138 | running = task_running(rq, p); |
1110 | on_rq = p->se.on_rq; | 1139 | on_rq = p->se.on_rq; |
1111 | task_rq_unlock(rq, &flags); | 1140 | task_rq_unlock(rq, &flags); |
1112 | 1141 | ||
1113 | /* | 1142 | /* |
1114 | * Was it really running after all now that we | 1143 | * Was it really running after all now that we |
1115 | * checked with the proper locks actually held? | 1144 | * checked with the proper locks actually held? |
1116 | * | 1145 | * |
1117 | * Oops. Go back and try again.. | 1146 | * Oops. Go back and try again.. |
1118 | */ | 1147 | */ |
1119 | if (unlikely(running)) { | 1148 | if (unlikely(running)) { |
1120 | cpu_relax(); | 1149 | cpu_relax(); |
1121 | goto repeat; | 1150 | continue; |
1122 | } | 1151 | } |
1123 | 1152 | ||
1124 | /* | 1153 | /* |
1125 | * It's not enough that it's not actively running, | 1154 | * It's not enough that it's not actively running, |
1126 | * it must be off the runqueue _entirely_, and not | 1155 | * it must be off the runqueue _entirely_, and not |
1127 | * preempted! | 1156 | * preempted! |
1128 | * | 1157 | * |
1129 | * So if it wa still runnable (but just not actively | 1158 | * So if it wa still runnable (but just not actively |
1130 | * running right now), it's preempted, and we should | 1159 | * running right now), it's preempted, and we should |
1131 | * yield - it could be a while. | 1160 | * yield - it could be a while. |
1132 | */ | 1161 | */ |
1133 | if (unlikely(on_rq)) { | 1162 | if (unlikely(on_rq)) { |
1134 | yield(); | 1163 | schedule_timeout_uninterruptible(1); |
1135 | goto repeat; | 1164 | continue; |
1136 | } | 1165 | } |
1137 | 1166 | ||
1138 | /* | 1167 | /* |
1139 | * Ahh, all good. It wasn't running, and it wasn't | 1168 | * Ahh, all good. It wasn't running, and it wasn't |
1140 | * runnable, which means that it will never become | 1169 | * runnable, which means that it will never become |
1141 | * running in the future either. We're all done! | 1170 | * running in the future either. We're all done! |
1142 | */ | 1171 | */ |
1172 | break; | ||
1173 | } | ||
1143 | } | 1174 | } |
1144 | 1175 | ||
1145 | /*** | 1176 | /*** |
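The hunk above turns wait_task_inactive()'s goto-based retry into a single for (;;) loop and replaces yield() with schedule_timeout_uninterruptible(1) for the "runnable but preempted" case. A stand-alone user-space analogue of the same wait pattern — hypothetical names, pthreads instead of runqueue locks, shown only to illustrate the loop shape:

#include <pthread.h>
#include <sched.h>
#include <unistd.h>

struct target {
    pthread_mutex_t lock;
    int running;   /* actively on a CPU             */
    int queued;    /* runnable, possibly preempted  */
};

static void wait_until_inactive(struct target *t)
{
    for (;;) {
        /* cheap spin without the lock, like the task_running() busy-wait */
        while (__atomic_load_n(&t->running, __ATOMIC_RELAXED))
            sched_yield();

        pthread_mutex_lock(&t->lock);    /* now check for sure */
        int running = t->running;
        int queued  = t->queued;
        pthread_mutex_unlock(&t->lock);

        if (running)
            continue;                    /* raced: spin again              */
        if (queued) {
            usleep(1000);                /* preempted: back off for a tick */
            continue;
        }
        break;                           /* neither running nor queued     */
    }
}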
@@ -1173,7 +1204,7 @@ void kick_process(struct task_struct *p) | |||
1173 | * We want to under-estimate the load of migration sources, to | 1204 | * We want to under-estimate the load of migration sources, to |
1174 | * balance conservatively. | 1205 | * balance conservatively. |
1175 | */ | 1206 | */ |
1176 | static inline unsigned long source_load(int cpu, int type) | 1207 | static unsigned long source_load(int cpu, int type) |
1177 | { | 1208 | { |
1178 | struct rq *rq = cpu_rq(cpu); | 1209 | struct rq *rq = cpu_rq(cpu); |
1179 | unsigned long total = weighted_cpuload(cpu); | 1210 | unsigned long total = weighted_cpuload(cpu); |
@@ -1188,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type) | |||
1188 | * Return a high guess at the load of a migration-target cpu weighted | 1219 | * Return a high guess at the load of a migration-target cpu weighted |
1189 | * according to the scheduling class and "nice" value. | 1220 | * according to the scheduling class and "nice" value. |
1190 | */ | 1221 | */ |
1191 | static inline unsigned long target_load(int cpu, int type) | 1222 | static unsigned long target_load(int cpu, int type) |
1192 | { | 1223 | { |
1193 | struct rq *rq = cpu_rq(cpu); | 1224 | struct rq *rq = cpu_rq(cpu); |
1194 | unsigned long total = weighted_cpuload(cpu); | 1225 | unsigned long total = weighted_cpuload(cpu); |
@@ -1230,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1230 | 1261 | ||
1231 | /* Skip over this group if it has no CPUs allowed */ | 1262 | /* Skip over this group if it has no CPUs allowed */ |
1232 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1263 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
1233 | goto nextgroup; | 1264 | continue; |
1234 | 1265 | ||
1235 | local_group = cpu_isset(this_cpu, group->cpumask); | 1266 | local_group = cpu_isset(this_cpu, group->cpumask); |
1236 | 1267 | ||
@@ -1258,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1258 | min_load = avg_load; | 1289 | min_load = avg_load; |
1259 | idlest = group; | 1290 | idlest = group; |
1260 | } | 1291 | } |
1261 | nextgroup: | 1292 | } while (group = group->next, group != sd->groups); |
1262 | group = group->next; | ||
1263 | } while (group != sd->groups); | ||
1264 | 1293 | ||
1265 | if (!idlest || 100*this_load < imbalance*min_load) | 1294 | if (!idlest || 100*this_load < imbalance*min_load) |
1266 | return NULL; | 1295 | return NULL; |
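The nextgroup label can go because the group advance now lives in the do-while condition, so a plain continue still steps to the next group. A minimal sketch of that idiom on a circular singly-linked list (types and the usability test are illustrative, not the kernel's):

struct group { int usable; unsigned long load; struct group *next; };

static struct group *pick_least_loaded(struct group *start)
{
    struct group *g = start, *best = NULL;
    unsigned long best_load = ~0UL;

    do {
        if (!g->usable)
            continue;                 /* skip, but still advance below */
        if (g->load < best_load) {
            best_load = g->load;
            best = g;
        }
    } while (g = g->next, g != start);  /* advance runs on every path */

    return best;
}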
@@ -1392,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
1392 | if (sd->flags & SD_WAKE_IDLE) { | 1421 | if (sd->flags & SD_WAKE_IDLE) { |
1393 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1422 | cpus_and(tmp, sd->span, p->cpus_allowed); |
1394 | for_each_cpu_mask(i, tmp) { | 1423 | for_each_cpu_mask(i, tmp) { |
1395 | if (idle_cpu(i)) | 1424 | if (idle_cpu(i)) { |
1425 | if (i != task_cpu(p)) { | ||
1426 | schedstat_inc(p, | ||
1427 | se.nr_wakeups_idle); | ||
1428 | } | ||
1396 | return i; | 1429 | return i; |
1430 | } | ||
1397 | } | 1431 | } |
1398 | } else { | 1432 | } else { |
1399 | break; | 1433 | break; |
@@ -1424,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
1424 | */ | 1458 | */ |
1425 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1459 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1426 | { | 1460 | { |
1427 | int cpu, this_cpu, success = 0; | 1461 | int cpu, orig_cpu, this_cpu, success = 0; |
1428 | unsigned long flags; | 1462 | unsigned long flags; |
1429 | long old_state; | 1463 | long old_state; |
1430 | struct rq *rq; | 1464 | struct rq *rq; |
@@ -1443,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1443 | goto out_running; | 1477 | goto out_running; |
1444 | 1478 | ||
1445 | cpu = task_cpu(p); | 1479 | cpu = task_cpu(p); |
1480 | orig_cpu = cpu; | ||
1446 | this_cpu = smp_processor_id(); | 1481 | this_cpu = smp_processor_id(); |
1447 | 1482 | ||
1448 | #ifdef CONFIG_SMP | 1483 | #ifdef CONFIG_SMP |
@@ -1451,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1451 | 1486 | ||
1452 | new_cpu = cpu; | 1487 | new_cpu = cpu; |
1453 | 1488 | ||
1454 | schedstat_inc(rq, ttwu_cnt); | 1489 | schedstat_inc(rq, ttwu_count); |
1455 | if (cpu == this_cpu) { | 1490 | if (cpu == this_cpu) { |
1456 | schedstat_inc(rq, ttwu_local); | 1491 | schedstat_inc(rq, ttwu_local); |
1457 | goto out_set_cpu; | 1492 | goto out_set_cpu; |
@@ -1486,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1486 | unsigned long tl = this_load; | 1521 | unsigned long tl = this_load; |
1487 | unsigned long tl_per_task; | 1522 | unsigned long tl_per_task; |
1488 | 1523 | ||
1524 | /* | ||
1525 | * Attract cache-cold tasks on sync wakeups: | ||
1526 | */ | ||
1527 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1528 | goto out_set_cpu; | ||
1529 | |||
1530 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1489 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1531 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1490 | 1532 | ||
1491 | /* | 1533 | /* |
@@ -1505,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1505 | * there is no bad imbalance. | 1547 | * there is no bad imbalance. |
1506 | */ | 1548 | */ |
1507 | schedstat_inc(this_sd, ttwu_move_affine); | 1549 | schedstat_inc(this_sd, ttwu_move_affine); |
1550 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1508 | goto out_set_cpu; | 1551 | goto out_set_cpu; |
1509 | } | 1552 | } |
1510 | } | 1553 | } |
@@ -1516,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1516 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1559 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1517 | if (imbalance*this_load <= 100*load) { | 1560 | if (imbalance*this_load <= 100*load) { |
1518 | schedstat_inc(this_sd, ttwu_move_balance); | 1561 | schedstat_inc(this_sd, ttwu_move_balance); |
1562 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1519 | goto out_set_cpu; | 1563 | goto out_set_cpu; |
1520 | } | 1564 | } |
1521 | } | 1565 | } |
@@ -1541,18 +1585,18 @@ out_set_cpu: | |||
1541 | 1585 | ||
1542 | out_activate: | 1586 | out_activate: |
1543 | #endif /* CONFIG_SMP */ | 1587 | #endif /* CONFIG_SMP */ |
1588 | schedstat_inc(p, se.nr_wakeups); | ||
1589 | if (sync) | ||
1590 | schedstat_inc(p, se.nr_wakeups_sync); | ||
1591 | if (orig_cpu != cpu) | ||
1592 | schedstat_inc(p, se.nr_wakeups_migrate); | ||
1593 | if (cpu == this_cpu) | ||
1594 | schedstat_inc(p, se.nr_wakeups_local); | ||
1595 | else | ||
1596 | schedstat_inc(p, se.nr_wakeups_remote); | ||
1544 | update_rq_clock(rq); | 1597 | update_rq_clock(rq); |
1545 | activate_task(rq, p, 1); | 1598 | activate_task(rq, p, 1); |
1546 | /* | 1599 | check_preempt_curr(rq, p); |
1547 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
1548 | * has indicated that it will leave the CPU in short order) | ||
1549 | * don't trigger a preemption, if the woken up task will run on | ||
1550 | * this cpu. (in this case the 'I will reschedule' promise of | ||
1551 | * the waker guarantees that the freshly woken up task is going | ||
1552 | * to be considered on this CPU.) | ||
1553 | */ | ||
1554 | if (!sync || cpu != this_cpu) | ||
1555 | check_preempt_curr(rq, p); | ||
1556 | success = 1; | 1600 | success = 1; |
1557 | 1601 | ||
1558 | out_running: | 1602 | out_running: |
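try_to_wake_up() now records a per-task wakeup histogram and always calls check_preempt_curr(), dropping the old sync-wakeup special case. The classification the new schedstat lines implement, pulled out into a free-standing helper for clarity (the struct is illustrative; the real counters live in sched_entity):

struct wakeup_stats {
    unsigned long nr_wakeups;
    unsigned long nr_wakeups_sync;
    unsigned long nr_wakeups_migrate;
    unsigned long nr_wakeups_local;
    unsigned long nr_wakeups_remote;
};

static void account_wakeup(struct wakeup_stats *st, int sync,
                           int orig_cpu, int cpu, int this_cpu)
{
    st->nr_wakeups++;
    if (sync)
        st->nr_wakeups_sync++;
    if (orig_cpu != cpu)
        st->nr_wakeups_migrate++;   /* task changed CPU on wakeup */
    if (cpu == this_cpu)
        st->nr_wakeups_local++;     /* woken on the waker's CPU   */
    else
        st->nr_wakeups_remote++;
}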
@@ -1583,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
1583 | */ | 1627 | */ |
1584 | static void __sched_fork(struct task_struct *p) | 1628 | static void __sched_fork(struct task_struct *p) |
1585 | { | 1629 | { |
1586 | p->se.wait_start_fair = 0; | ||
1587 | p->se.exec_start = 0; | 1630 | p->se.exec_start = 0; |
1588 | p->se.sum_exec_runtime = 0; | 1631 | p->se.sum_exec_runtime = 0; |
1589 | p->se.prev_sum_exec_runtime = 0; | 1632 | p->se.prev_sum_exec_runtime = 0; |
1590 | p->se.delta_exec = 0; | ||
1591 | p->se.delta_fair_run = 0; | ||
1592 | p->se.delta_fair_sleep = 0; | ||
1593 | p->se.wait_runtime = 0; | ||
1594 | p->se.sleep_start_fair = 0; | ||
1595 | 1633 | ||
1596 | #ifdef CONFIG_SCHEDSTATS | 1634 | #ifdef CONFIG_SCHEDSTATS |
1597 | p->se.wait_start = 0; | 1635 | p->se.wait_start = 0; |
1598 | p->se.sum_wait_runtime = 0; | ||
1599 | p->se.sum_sleep_runtime = 0; | 1636 | p->se.sum_sleep_runtime = 0; |
1600 | p->se.sleep_start = 0; | 1637 | p->se.sleep_start = 0; |
1601 | p->se.block_start = 0; | 1638 | p->se.block_start = 0; |
1602 | p->se.sleep_max = 0; | 1639 | p->se.sleep_max = 0; |
1603 | p->se.block_max = 0; | 1640 | p->se.block_max = 0; |
1604 | p->se.exec_max = 0; | 1641 | p->se.exec_max = 0; |
1642 | p->se.slice_max = 0; | ||
1605 | p->se.wait_max = 0; | 1643 | p->se.wait_max = 0; |
1606 | p->se.wait_runtime_overruns = 0; | ||
1607 | p->se.wait_runtime_underruns = 0; | ||
1608 | #endif | 1644 | #endif |
1609 | 1645 | ||
1610 | INIT_LIST_HEAD(&p->run_list); | 1646 | INIT_LIST_HEAD(&p->run_list); |
@@ -1635,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
1635 | #ifdef CONFIG_SMP | 1671 | #ifdef CONFIG_SMP |
1636 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1672 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
1637 | #endif | 1673 | #endif |
1638 | __set_task_cpu(p, cpu); | 1674 | set_task_cpu(p, cpu); |
1639 | 1675 | ||
1640 | /* | 1676 | /* |
1641 | * Make sure we do not leak PI boosting priority to the child: | 1677 | * Make sure we do not leak PI boosting priority to the child: |
1642 | */ | 1678 | */ |
1643 | p->prio = current->normal_prio; | 1679 | p->prio = current->normal_prio; |
1680 | if (!rt_prio(p->prio)) | ||
1681 | p->sched_class = &fair_sched_class; | ||
1644 | 1682 | ||
1645 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1683 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1646 | if (likely(sched_info_on())) | 1684 | if (likely(sched_info_on())) |
@@ -1657,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
1657 | } | 1695 | } |
1658 | 1696 | ||
1659 | /* | 1697 | /* |
1660 | * After fork, child runs first. (default) If set to 0 then | ||
1661 | * parent will (try to) run first. | ||
1662 | */ | ||
1663 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
1664 | |||
1665 | /* | ||
1666 | * wake_up_new_task - wake up a newly created task for the first time. | 1698 | * wake_up_new_task - wake up a newly created task for the first time. |
1667 | * | 1699 | * |
1668 | * This function will do some initial scheduler statistics housekeeping | 1700 | * This function will do some initial scheduler statistics housekeeping |
@@ -1673,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1673 | { | 1705 | { |
1674 | unsigned long flags; | 1706 | unsigned long flags; |
1675 | struct rq *rq; | 1707 | struct rq *rq; |
1676 | int this_cpu; | ||
1677 | 1708 | ||
1678 | rq = task_rq_lock(p, &flags); | 1709 | rq = task_rq_lock(p, &flags); |
1679 | BUG_ON(p->state != TASK_RUNNING); | 1710 | BUG_ON(p->state != TASK_RUNNING); |
1680 | this_cpu = smp_processor_id(); /* parent's CPU */ | ||
1681 | update_rq_clock(rq); | 1711 | update_rq_clock(rq); |
1682 | 1712 | ||
1683 | p->prio = effective_prio(p); | 1713 | p->prio = effective_prio(p); |
1684 | 1714 | ||
1685 | if (rt_prio(p->prio)) | 1715 | if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) { |
1686 | p->sched_class = &rt_sched_class; | ||
1687 | else | ||
1688 | p->sched_class = &fair_sched_class; | ||
1689 | |||
1690 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || | ||
1691 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || | ||
1692 | !current->se.on_rq) { | ||
1693 | |||
1694 | activate_task(rq, p, 0); | 1716 | activate_task(rq, p, 0); |
1695 | } else { | 1717 | } else { |
1696 | /* | 1718 | /* |
@@ -1799,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1799 | * with the lock held can cause deadlocks; see schedule() for | 1821 | * with the lock held can cause deadlocks; see schedule() for |
1800 | * details.) | 1822 | * details.) |
1801 | */ | 1823 | */ |
1802 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | 1824 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1803 | __releases(rq->lock) | 1825 | __releases(rq->lock) |
1804 | { | 1826 | { |
1805 | struct mm_struct *mm = rq->prev_mm; | 1827 | struct mm_struct *mm = rq->prev_mm; |
@@ -1981,42 +2003,10 @@ unsigned long nr_active(void) | |||
1981 | */ | 2003 | */ |
1982 | static void update_cpu_load(struct rq *this_rq) | 2004 | static void update_cpu_load(struct rq *this_rq) |
1983 | { | 2005 | { |
1984 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; | 2006 | unsigned long this_load = this_rq->load.weight; |
1985 | unsigned long total_load = this_rq->ls.load.weight; | ||
1986 | unsigned long this_load = total_load; | ||
1987 | struct load_stat *ls = &this_rq->ls; | ||
1988 | int i, scale; | 2007 | int i, scale; |
1989 | 2008 | ||
1990 | this_rq->nr_load_updates++; | 2009 | this_rq->nr_load_updates++; |
1991 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
1992 | goto do_avg; | ||
1993 | |||
1994 | /* Update delta_fair/delta_exec fields first */ | ||
1995 | update_curr_load(this_rq); | ||
1996 | |||
1997 | fair_delta64 = ls->delta_fair + 1; | ||
1998 | ls->delta_fair = 0; | ||
1999 | |||
2000 | exec_delta64 = ls->delta_exec + 1; | ||
2001 | ls->delta_exec = 0; | ||
2002 | |||
2003 | sample_interval64 = this_rq->clock - ls->load_update_last; | ||
2004 | ls->load_update_last = this_rq->clock; | ||
2005 | |||
2006 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
2007 | sample_interval64 = TICK_NSEC; | ||
2008 | |||
2009 | if (exec_delta64 > sample_interval64) | ||
2010 | exec_delta64 = sample_interval64; | ||
2011 | |||
2012 | idle_delta64 = sample_interval64 - exec_delta64; | ||
2013 | |||
2014 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
2015 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
2016 | |||
2017 | this_load = (unsigned long)tmp64; | ||
2018 | |||
2019 | do_avg: | ||
2020 | 2010 | ||
2021 | /* Update our load: */ | 2011 | /* Update our load: */ |
2022 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2012 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2026,7 +2016,13 @@ do_avg: | |||
2026 | 2016 | ||
2027 | old_load = this_rq->cpu_load[i]; | 2017 | old_load = this_rq->cpu_load[i]; |
2028 | new_load = this_load; | 2018 | new_load = this_load; |
2029 | 2019 | /* | |
2020 | * Round up the averaging division if load is increasing. This | ||
2021 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2022 | * example. | ||
2023 | */ | ||
2024 | if (new_load > old_load) | ||
2025 | new_load += scale-1; | ||
2030 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2026 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2031 | } | 2027 | } |
2032 | } | 2028 | } |
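update_cpu_load() loses the PRECISE_CPU_LOAD sampling machinery and keeps only the decayed cpu_load[] averages, adding a round-up when load rises. A small stand-alone program with the same arithmetic as the hunk, showing why the "scale - 1" bump matters:

#include <stdio.h>

int main(void)
{
    unsigned long old_load = 9, new_load = 10;
    int i = 2;                          /* cpu_load index        */
    unsigned long scale = 1UL << i;     /* decay factor 2^i      */

    if (new_load > old_load)
        new_load += scale - 1;          /* round the average up  */

    unsigned long next = (old_load * (scale - 1) + new_load) >> i;
    printf("next cpu_load[%d] = %lu\n", i, next);
    /* prints 10; without the round-up the truncating division
     * yields (9*3 + 10) >> 2 = 9 and the average never reaches 10 */
    return 0;
}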
@@ -2178,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2178 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2174 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2179 | * 3) are cache-hot on their current CPU. | 2175 | * 3) are cache-hot on their current CPU. |
2180 | */ | 2176 | */ |
2181 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 2177 | if (!cpu_isset(this_cpu, p->cpus_allowed)) { |
2178 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
2182 | return 0; | 2179 | return 0; |
2180 | } | ||
2183 | *all_pinned = 0; | 2181 | *all_pinned = 0; |
2184 | 2182 | ||
2185 | if (task_running(rq, p)) | 2183 | if (task_running(rq, p)) { |
2184 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
2186 | return 0; | 2185 | return 0; |
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * Aggressive migration if: | ||
2190 | * 1) task is cache cold, or | ||
2191 | * 2) too many balance attempts have failed. | ||
2192 | */ | ||
2193 | |||
2194 | if (!task_hot(p, rq->clock, sd) || | ||
2195 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
2196 | #ifdef CONFIG_SCHEDSTATS | ||
2197 | if (task_hot(p, rq->clock, sd)) { | ||
2198 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2199 | schedstat_inc(p, se.nr_forced_migrations); | ||
2200 | } | ||
2201 | #endif | ||
2202 | return 1; | ||
2203 | } | ||
2187 | 2204 | ||
2205 | if (task_hot(p, rq->clock, sd)) { | ||
2206 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
2207 | return 0; | ||
2208 | } | ||
2188 | return 1; | 2209 | return 1; |
2189 | } | 2210 | } |
2190 | 2211 | ||
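can_migrate_task() keeps its old decision order but now attributes every refusal to a schedstat counter. The same decision tree, reduced to a free-standing classifier (the enum and flat parameters are illustrative; the real function reads them from the task, runqueue and sched_domain):

enum migrate_verdict { MIGRATE_NO, MIGRATE_YES, MIGRATE_FORCED };

static enum migrate_verdict classify(int allowed, int running,
                                     int hot, int failed, int nice_tries)
{
    if (!allowed)
        return MIGRATE_NO;          /* counts as nr_failed_migrations_affine  */
    if (running)
        return MIGRATE_NO;          /* counts as nr_failed_migrations_running */
    if (!hot || failed > nice_tries)
        return hot ? MIGRATE_FORCED /* counts as nr_forced_migrations         */
                   : MIGRATE_YES;   /* cache-cold: migrate freely             */
    return MIGRATE_NO;              /* counts as nr_failed_migrations_hot     */
}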
@@ -2263,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2263 | struct sched_domain *sd, enum cpu_idle_type idle, | 2284 | struct sched_domain *sd, enum cpu_idle_type idle, |
2264 | int *all_pinned) | 2285 | int *all_pinned) |
2265 | { | 2286 | { |
2266 | struct sched_class *class = sched_class_highest; | 2287 | const struct sched_class *class = sched_class_highest; |
2267 | unsigned long total_load_moved = 0; | 2288 | unsigned long total_load_moved = 0; |
2268 | int this_best_prio = this_rq->curr->prio; | 2289 | int this_best_prio = this_rq->curr->prio; |
2269 | 2290 | ||
@@ -2288,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2288 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2309 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2289 | struct sched_domain *sd, enum cpu_idle_type idle) | 2310 | struct sched_domain *sd, enum cpu_idle_type idle) |
2290 | { | 2311 | { |
2291 | struct sched_class *class; | 2312 | const struct sched_class *class; |
2292 | int this_best_prio = MAX_PRIO; | 2313 | int this_best_prio = MAX_PRIO; |
2293 | 2314 | ||
2294 | for (class = sched_class_highest; class; class = class->next) | 2315 | for (class = sched_class_highest; class; class = class->next) |
@@ -2652,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2652 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2673 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2653 | sd_idle = 1; | 2674 | sd_idle = 1; |
2654 | 2675 | ||
2655 | schedstat_inc(sd, lb_cnt[idle]); | 2676 | schedstat_inc(sd, lb_count[idle]); |
2656 | 2677 | ||
2657 | redo: | 2678 | redo: |
2658 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2679 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
@@ -2805,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
2805 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2826 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2806 | sd_idle = 1; | 2827 | sd_idle = 1; |
2807 | 2828 | ||
2808 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); | 2829 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
2809 | redo: | 2830 | redo: |
2810 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 2831 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
2811 | &sd_idle, &cpus, NULL); | 2832 | &sd_idle, &cpus, NULL); |
@@ -2939,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2939 | } | 2960 | } |
2940 | 2961 | ||
2941 | if (likely(sd)) { | 2962 | if (likely(sd)) { |
2942 | schedstat_inc(sd, alb_cnt); | 2963 | schedstat_inc(sd, alb_count); |
2943 | 2964 | ||
2944 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 2965 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
2945 | sd, CPU_IDLE)) | 2966 | sd, CPU_IDLE)) |
@@ -3032,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing); | |||
3032 | * | 3053 | * |
3033 | * Balancing parameters are set up in arch_init_sched_domains. | 3054 | * Balancing parameters are set up in arch_init_sched_domains. |
3034 | */ | 3055 | */ |
3035 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | 3056 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
3036 | { | 3057 | { |
3037 | int balance = 1; | 3058 | int balance = 1; |
3038 | struct rq *rq = cpu_rq(cpu); | 3059 | struct rq *rq = cpu_rq(cpu); |
@@ -3279,6 +3300,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
3279 | } | 3300 | } |
3280 | 3301 | ||
3281 | /* | 3302 | /* |
3303 | * Account guest cpu time to a process. | ||
3304 | * @p: the process that the cpu time gets accounted to | ||
3305 | * @cputime: the cpu time spent in virtual machine since the last update | ||
3306 | */ | ||
3307 | void account_guest_time(struct task_struct *p, cputime_t cputime) | ||
3308 | { | ||
3309 | cputime64_t tmp; | ||
3310 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3311 | |||
3312 | tmp = cputime_to_cputime64(cputime); | ||
3313 | |||
3314 | p->utime = cputime_add(p->utime, cputime); | ||
3315 | p->gtime = cputime_add(p->gtime, cputime); | ||
3316 | |||
3317 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
3318 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
3319 | } | ||
3320 | |||
3321 | /* | ||
3282 | * Account system cpu time to a process. | 3322 | * Account system cpu time to a process. |
3283 | * @p: the process that the cpu time gets accounted to | 3323 | * @p: the process that the cpu time gets accounted to |
3284 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3324 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3291,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3291 | struct rq *rq = this_rq(); | 3331 | struct rq *rq = this_rq(); |
3292 | cputime64_t tmp; | 3332 | cputime64_t tmp; |
3293 | 3333 | ||
3334 | if (p->flags & PF_VCPU) { | ||
3335 | account_guest_time(p, cputime); | ||
3336 | p->flags &= ~PF_VCPU; | ||
3337 | return; | ||
3338 | } | ||
3339 | |||
3294 | p->stime = cputime_add(p->stime, cputime); | 3340 | p->stime = cputime_add(p->stime, cputime); |
3295 | 3341 | ||
3296 | /* Add system time to cpustat. */ | 3342 | /* Add system time to cpustat. */ |
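The new account_guest_time() is driven by PF_VCPU: if the flag is set when the system-time tick arrives, the tick is booked as user+guest time and the flag is cleared. A minimal sketch of the producer side, i.e. how a virtualization driver could mark guest execution (the vcpu type and hardware-entry helper are hypothetical):

#include <linux/sched.h>

struct my_vcpu { int id; };                   /* hypothetical vcpu type      */

static void hardware_run_guest(struct my_vcpu *vcpu)
{
    /* stand-in for the real hardware guest-entry path */
}

static void run_guest(struct my_vcpu *vcpu)
{
    current->flags |= PF_VCPU;                /* next tick counts as guest   */
    hardware_run_guest(vcpu);
    /* account_system_time() clears PF_VCPU on the next tick it handles */
}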
@@ -3429,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3429 | 3475 | ||
3430 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3476 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3431 | 3477 | ||
3432 | schedstat_inc(this_rq(), sched_cnt); | 3478 | schedstat_inc(this_rq(), sched_count); |
3479 | #ifdef CONFIG_SCHEDSTATS | ||
3480 | if (unlikely(prev->lock_depth >= 0)) { | ||
3481 | schedstat_inc(this_rq(), bkl_count); | ||
3482 | schedstat_inc(prev, sched_info.bkl_count); | ||
3483 | } | ||
3484 | #endif | ||
3433 | } | 3485 | } |
3434 | 3486 | ||
3435 | /* | 3487 | /* |
@@ -3438,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3438 | static inline struct task_struct * | 3490 | static inline struct task_struct * |
3439 | pick_next_task(struct rq *rq, struct task_struct *prev) | 3491 | pick_next_task(struct rq *rq, struct task_struct *prev) |
3440 | { | 3492 | { |
3441 | struct sched_class *class; | 3493 | const struct sched_class *class; |
3442 | struct task_struct *p; | 3494 | struct task_struct *p; |
3443 | 3495 | ||
3444 | /* | 3496 | /* |
@@ -3487,9 +3539,13 @@ need_resched_nonpreemptible: | |||
3487 | 3539 | ||
3488 | schedule_debug(prev); | 3540 | schedule_debug(prev); |
3489 | 3541 | ||
3490 | spin_lock_irq(&rq->lock); | 3542 | /* |
3491 | clear_tsk_need_resched(prev); | 3543 | * Do the rq-clock update outside the rq lock: |
3544 | */ | ||
3545 | local_irq_disable(); | ||
3492 | __update_rq_clock(rq); | 3546 | __update_rq_clock(rq); |
3547 | spin_lock(&rq->lock); | ||
3548 | clear_tsk_need_resched(prev); | ||
3493 | 3549 | ||
3494 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3550 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3495 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3551 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
@@ -3549,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void) | |||
3549 | if (likely(ti->preempt_count || irqs_disabled())) | 3605 | if (likely(ti->preempt_count || irqs_disabled())) |
3550 | return; | 3606 | return; |
3551 | 3607 | ||
3552 | need_resched: | 3608 | do { |
3553 | add_preempt_count(PREEMPT_ACTIVE); | 3609 | add_preempt_count(PREEMPT_ACTIVE); |
3554 | /* | 3610 | |
3555 | * We keep the big kernel semaphore locked, but we | 3611 | /* |
3556 | * clear ->lock_depth so that schedule() doesn't | 3612 | * We keep the big kernel semaphore locked, but we |
3557 | * auto-release the semaphore: | 3613 | * clear ->lock_depth so that schedule() doesn't |
3558 | */ | 3614 | * auto-release the semaphore: |
3615 | */ | ||
3559 | #ifdef CONFIG_PREEMPT_BKL | 3616 | #ifdef CONFIG_PREEMPT_BKL |
3560 | saved_lock_depth = task->lock_depth; | 3617 | saved_lock_depth = task->lock_depth; |
3561 | task->lock_depth = -1; | 3618 | task->lock_depth = -1; |
3562 | #endif | 3619 | #endif |
3563 | schedule(); | 3620 | schedule(); |
3564 | #ifdef CONFIG_PREEMPT_BKL | 3621 | #ifdef CONFIG_PREEMPT_BKL |
3565 | task->lock_depth = saved_lock_depth; | 3622 | task->lock_depth = saved_lock_depth; |
3566 | #endif | 3623 | #endif |
3567 | sub_preempt_count(PREEMPT_ACTIVE); | 3624 | sub_preempt_count(PREEMPT_ACTIVE); |
3568 | 3625 | ||
3569 | /* we could miss a preemption opportunity between schedule and now */ | 3626 | /* |
3570 | barrier(); | 3627 | * Check again in case we missed a preemption opportunity |
3571 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3628 | * between schedule and now. |
3572 | goto need_resched; | 3629 | */ |
3630 | barrier(); | ||
3631 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
3573 | } | 3632 | } |
3574 | EXPORT_SYMBOL(preempt_schedule); | 3633 | EXPORT_SYMBOL(preempt_schedule); |
3575 | 3634 | ||
@@ -3589,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3589 | /* Catch callers which need to be fixed */ | 3648 | /* Catch callers which need to be fixed */ |
3590 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3649 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3591 | 3650 | ||
3592 | need_resched: | 3651 | do { |
3593 | add_preempt_count(PREEMPT_ACTIVE); | 3652 | add_preempt_count(PREEMPT_ACTIVE); |
3594 | /* | 3653 | |
3595 | * We keep the big kernel semaphore locked, but we | 3654 | /* |
3596 | * clear ->lock_depth so that schedule() doesn't | 3655 | * We keep the big kernel semaphore locked, but we |
3597 | * auto-release the semaphore: | 3656 | * clear ->lock_depth so that schedule() doesn't |
3598 | */ | 3657 | * auto-release the semaphore: |
3658 | */ | ||
3599 | #ifdef CONFIG_PREEMPT_BKL | 3659 | #ifdef CONFIG_PREEMPT_BKL |
3600 | saved_lock_depth = task->lock_depth; | 3660 | saved_lock_depth = task->lock_depth; |
3601 | task->lock_depth = -1; | 3661 | task->lock_depth = -1; |
3602 | #endif | 3662 | #endif |
3603 | local_irq_enable(); | 3663 | local_irq_enable(); |
3604 | schedule(); | 3664 | schedule(); |
3605 | local_irq_disable(); | 3665 | local_irq_disable(); |
3606 | #ifdef CONFIG_PREEMPT_BKL | 3666 | #ifdef CONFIG_PREEMPT_BKL |
3607 | task->lock_depth = saved_lock_depth; | 3667 | task->lock_depth = saved_lock_depth; |
3608 | #endif | 3668 | #endif |
3609 | sub_preempt_count(PREEMPT_ACTIVE); | 3669 | sub_preempt_count(PREEMPT_ACTIVE); |
3610 | 3670 | ||
3611 | /* we could miss a preemption opportunity between schedule and now */ | 3671 | /* |
3612 | barrier(); | 3672 | * Check again in case we missed a preemption opportunity |
3613 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3673 | * between schedule and now. |
3614 | goto need_resched; | 3674 | */ |
3675 | barrier(); | ||
3676 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
3615 | } | 3677 | } |
3616 | 3678 | ||
3617 | #endif /* CONFIG_PREEMPT */ | 3679 | #endif /* CONFIG_PREEMPT */ |
@@ -3635,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function); | |||
3635 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3697 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
3636 | int nr_exclusive, int sync, void *key) | 3698 | int nr_exclusive, int sync, void *key) |
3637 | { | 3699 | { |
3638 | struct list_head *tmp, *next; | 3700 | wait_queue_t *curr, *next; |
3639 | 3701 | ||
3640 | list_for_each_safe(tmp, next, &q->task_list) { | 3702 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
3641 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | ||
3642 | unsigned flags = curr->flags; | 3703 | unsigned flags = curr->flags; |
3643 | 3704 | ||
3644 | if (curr->func(curr, mode, sync, key) && | 3705 | if (curr->func(curr, mode, sync, key) && |
@@ -3728,206 +3789,116 @@ void fastcall complete_all(struct completion *x) | |||
3728 | } | 3789 | } |
3729 | EXPORT_SYMBOL(complete_all); | 3790 | EXPORT_SYMBOL(complete_all); |
3730 | 3791 | ||
3731 | void fastcall __sched wait_for_completion(struct completion *x) | 3792 | static inline long __sched |
3732 | { | 3793 | do_wait_for_common(struct completion *x, long timeout, int state) |
3733 | might_sleep(); | ||
3734 | |||
3735 | spin_lock_irq(&x->wait.lock); | ||
3736 | if (!x->done) { | ||
3737 | DECLARE_WAITQUEUE(wait, current); | ||
3738 | |||
3739 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3740 | __add_wait_queue_tail(&x->wait, &wait); | ||
3741 | do { | ||
3742 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
3743 | spin_unlock_irq(&x->wait.lock); | ||
3744 | schedule(); | ||
3745 | spin_lock_irq(&x->wait.lock); | ||
3746 | } while (!x->done); | ||
3747 | __remove_wait_queue(&x->wait, &wait); | ||
3748 | } | ||
3749 | x->done--; | ||
3750 | spin_unlock_irq(&x->wait.lock); | ||
3751 | } | ||
3752 | EXPORT_SYMBOL(wait_for_completion); | ||
3753 | |||
3754 | unsigned long fastcall __sched | ||
3755 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
3756 | { | 3794 | { |
3757 | might_sleep(); | ||
3758 | |||
3759 | spin_lock_irq(&x->wait.lock); | ||
3760 | if (!x->done) { | 3795 | if (!x->done) { |
3761 | DECLARE_WAITQUEUE(wait, current); | 3796 | DECLARE_WAITQUEUE(wait, current); |
3762 | 3797 | ||
3763 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3798 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3764 | __add_wait_queue_tail(&x->wait, &wait); | 3799 | __add_wait_queue_tail(&x->wait, &wait); |
3765 | do { | 3800 | do { |
3766 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3801 | if (state == TASK_INTERRUPTIBLE && |
3802 | signal_pending(current)) { | ||
3803 | __remove_wait_queue(&x->wait, &wait); | ||
3804 | return -ERESTARTSYS; | ||
3805 | } | ||
3806 | __set_current_state(state); | ||
3767 | spin_unlock_irq(&x->wait.lock); | 3807 | spin_unlock_irq(&x->wait.lock); |
3768 | timeout = schedule_timeout(timeout); | 3808 | timeout = schedule_timeout(timeout); |
3769 | spin_lock_irq(&x->wait.lock); | 3809 | spin_lock_irq(&x->wait.lock); |
3770 | if (!timeout) { | 3810 | if (!timeout) { |
3771 | __remove_wait_queue(&x->wait, &wait); | 3811 | __remove_wait_queue(&x->wait, &wait); |
3772 | goto out; | 3812 | return timeout; |
3773 | } | 3813 | } |
3774 | } while (!x->done); | 3814 | } while (!x->done); |
3775 | __remove_wait_queue(&x->wait, &wait); | 3815 | __remove_wait_queue(&x->wait, &wait); |
3776 | } | 3816 | } |
3777 | x->done--; | 3817 | x->done--; |
3778 | out: | ||
3779 | spin_unlock_irq(&x->wait.lock); | ||
3780 | return timeout; | 3818 | return timeout; |
3781 | } | 3819 | } |
3782 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
3783 | 3820 | ||
3784 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3821 | static long __sched |
3822 | wait_for_common(struct completion *x, long timeout, int state) | ||
3785 | { | 3823 | { |
3786 | int ret = 0; | ||
3787 | |||
3788 | might_sleep(); | 3824 | might_sleep(); |
3789 | 3825 | ||
3790 | spin_lock_irq(&x->wait.lock); | 3826 | spin_lock_irq(&x->wait.lock); |
3791 | if (!x->done) { | 3827 | timeout = do_wait_for_common(x, timeout, state); |
3792 | DECLARE_WAITQUEUE(wait, current); | ||
3793 | |||
3794 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3795 | __add_wait_queue_tail(&x->wait, &wait); | ||
3796 | do { | ||
3797 | if (signal_pending(current)) { | ||
3798 | ret = -ERESTARTSYS; | ||
3799 | __remove_wait_queue(&x->wait, &wait); | ||
3800 | goto out; | ||
3801 | } | ||
3802 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3803 | spin_unlock_irq(&x->wait.lock); | ||
3804 | schedule(); | ||
3805 | spin_lock_irq(&x->wait.lock); | ||
3806 | } while (!x->done); | ||
3807 | __remove_wait_queue(&x->wait, &wait); | ||
3808 | } | ||
3809 | x->done--; | ||
3810 | out: | ||
3811 | spin_unlock_irq(&x->wait.lock); | 3828 | spin_unlock_irq(&x->wait.lock); |
3829 | return timeout; | ||
3830 | } | ||
3812 | 3831 | ||
3813 | return ret; | 3832 | void fastcall __sched wait_for_completion(struct completion *x) |
3833 | { | ||
3834 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
3814 | } | 3835 | } |
3815 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3836 | EXPORT_SYMBOL(wait_for_completion); |
3816 | 3837 | ||
3817 | unsigned long fastcall __sched | 3838 | unsigned long fastcall __sched |
3818 | wait_for_completion_interruptible_timeout(struct completion *x, | 3839 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
3819 | unsigned long timeout) | ||
3820 | { | 3840 | { |
3821 | might_sleep(); | 3841 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
3822 | |||
3823 | spin_lock_irq(&x->wait.lock); | ||
3824 | if (!x->done) { | ||
3825 | DECLARE_WAITQUEUE(wait, current); | ||
3826 | |||
3827 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3828 | __add_wait_queue_tail(&x->wait, &wait); | ||
3829 | do { | ||
3830 | if (signal_pending(current)) { | ||
3831 | timeout = -ERESTARTSYS; | ||
3832 | __remove_wait_queue(&x->wait, &wait); | ||
3833 | goto out; | ||
3834 | } | ||
3835 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3836 | spin_unlock_irq(&x->wait.lock); | ||
3837 | timeout = schedule_timeout(timeout); | ||
3838 | spin_lock_irq(&x->wait.lock); | ||
3839 | if (!timeout) { | ||
3840 | __remove_wait_queue(&x->wait, &wait); | ||
3841 | goto out; | ||
3842 | } | ||
3843 | } while (!x->done); | ||
3844 | __remove_wait_queue(&x->wait, &wait); | ||
3845 | } | ||
3846 | x->done--; | ||
3847 | out: | ||
3848 | spin_unlock_irq(&x->wait.lock); | ||
3849 | return timeout; | ||
3850 | } | 3842 | } |
3851 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3843 | EXPORT_SYMBOL(wait_for_completion_timeout); |
3852 | 3844 | ||
3853 | static inline void | 3845 | int __sched wait_for_completion_interruptible(struct completion *x) |
3854 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | ||
3855 | { | 3846 | { |
3856 | spin_lock_irqsave(&q->lock, *flags); | 3847 | return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
3857 | __add_wait_queue(q, wait); | ||
3858 | spin_unlock(&q->lock); | ||
3859 | } | 3848 | } |
3849 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
3860 | 3850 | ||
3861 | static inline void | 3851 | unsigned long fastcall __sched |
3862 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | 3852 | wait_for_completion_interruptible_timeout(struct completion *x, |
3853 | unsigned long timeout) | ||
3863 | { | 3854 | { |
3864 | spin_lock_irq(&q->lock); | 3855 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
3865 | __remove_wait_queue(q, wait); | ||
3866 | spin_unlock_irqrestore(&q->lock, *flags); | ||
3867 | } | 3856 | } |
3857 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
3868 | 3858 | ||
3869 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 3859 | static long __sched |
3860 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | ||
3870 | { | 3861 | { |
3871 | unsigned long flags; | 3862 | unsigned long flags; |
3872 | wait_queue_t wait; | 3863 | wait_queue_t wait; |
3873 | 3864 | ||
3874 | init_waitqueue_entry(&wait, current); | 3865 | init_waitqueue_entry(&wait, current); |
3875 | 3866 | ||
3876 | current->state = TASK_INTERRUPTIBLE; | 3867 | __set_current_state(state); |
3877 | 3868 | ||
3878 | sleep_on_head(q, &wait, &flags); | 3869 | spin_lock_irqsave(&q->lock, flags); |
3879 | schedule(); | 3870 | __add_wait_queue(q, &wait); |
3880 | sleep_on_tail(q, &wait, &flags); | 3871 | spin_unlock(&q->lock); |
3872 | timeout = schedule_timeout(timeout); | ||
3873 | spin_lock_irq(&q->lock); | ||
3874 | __remove_wait_queue(q, &wait); | ||
3875 | spin_unlock_irqrestore(&q->lock, flags); | ||
3876 | |||
3877 | return timeout; | ||
3878 | } | ||
3879 | |||
3880 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | ||
3881 | { | ||
3882 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
3881 | } | 3883 | } |
3882 | EXPORT_SYMBOL(interruptible_sleep_on); | 3884 | EXPORT_SYMBOL(interruptible_sleep_on); |
3883 | 3885 | ||
3884 | long __sched | 3886 | long __sched |
3885 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3887 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3886 | { | 3888 | { |
3887 | unsigned long flags; | 3889 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
3888 | wait_queue_t wait; | ||
3889 | |||
3890 | init_waitqueue_entry(&wait, current); | ||
3891 | |||
3892 | current->state = TASK_INTERRUPTIBLE; | ||
3893 | |||
3894 | sleep_on_head(q, &wait, &flags); | ||
3895 | timeout = schedule_timeout(timeout); | ||
3896 | sleep_on_tail(q, &wait, &flags); | ||
3897 | |||
3898 | return timeout; | ||
3899 | } | 3890 | } |
3900 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3891 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3901 | 3892 | ||
3902 | void __sched sleep_on(wait_queue_head_t *q) | 3893 | void __sched sleep_on(wait_queue_head_t *q) |
3903 | { | 3894 | { |
3904 | unsigned long flags; | 3895 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
3905 | wait_queue_t wait; | ||
3906 | |||
3907 | init_waitqueue_entry(&wait, current); | ||
3908 | |||
3909 | current->state = TASK_UNINTERRUPTIBLE; | ||
3910 | |||
3911 | sleep_on_head(q, &wait, &flags); | ||
3912 | schedule(); | ||
3913 | sleep_on_tail(q, &wait, &flags); | ||
3914 | } | 3896 | } |
3915 | EXPORT_SYMBOL(sleep_on); | 3897 | EXPORT_SYMBOL(sleep_on); |
3916 | 3898 | ||
3917 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3899 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3918 | { | 3900 | { |
3919 | unsigned long flags; | 3901 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
3920 | wait_queue_t wait; | ||
3921 | |||
3922 | init_waitqueue_entry(&wait, current); | ||
3923 | |||
3924 | current->state = TASK_UNINTERRUPTIBLE; | ||
3925 | |||
3926 | sleep_on_head(q, &wait, &flags); | ||
3927 | timeout = schedule_timeout(timeout); | ||
3928 | sleep_on_tail(q, &wait, &flags); | ||
3929 | |||
3930 | return timeout; | ||
3931 | } | 3902 | } |
3932 | EXPORT_SYMBOL(sleep_on_timeout); | 3903 | EXPORT_SYMBOL(sleep_on_timeout); |
3933 | 3904 | ||
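All four completion waiters above now funnel through wait_for_common()/do_wait_for_common(), so they differ only in task state and timeout. From a caller's point of view nothing changes; a typical producer/consumer usage sketch (the completion and function names are illustrative):

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_COMPLETION(work_done);

static void producer(void)
{
    /* ... perform the work ... */
    complete(&work_done);                     /* wake one exclusive waiter */
}

static int consumer(void)
{
    unsigned long left;

    left = wait_for_completion_timeout(&work_done, msecs_to_jiffies(100));
    if (!left)
        return -ETIMEDOUT;                    /* timed out before complete() */
    return 0;
}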
@@ -3946,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
3946 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3917 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3947 | { | 3918 | { |
3948 | unsigned long flags; | 3919 | unsigned long flags; |
3949 | int oldprio, on_rq; | 3920 | int oldprio, on_rq, running; |
3950 | struct rq *rq; | 3921 | struct rq *rq; |
3951 | 3922 | ||
3952 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3923 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
@@ -3956,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3956 | 3927 | ||
3957 | oldprio = p->prio; | 3928 | oldprio = p->prio; |
3958 | on_rq = p->se.on_rq; | 3929 | on_rq = p->se.on_rq; |
3959 | if (on_rq) | 3930 | running = task_running(rq, p); |
3931 | if (on_rq) { | ||
3960 | dequeue_task(rq, p, 0); | 3932 | dequeue_task(rq, p, 0); |
3933 | if (running) | ||
3934 | p->sched_class->put_prev_task(rq, p); | ||
3935 | } | ||
3961 | 3936 | ||
3962 | if (rt_prio(prio)) | 3937 | if (rt_prio(prio)) |
3963 | p->sched_class = &rt_sched_class; | 3938 | p->sched_class = &rt_sched_class; |
@@ -3967,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3967 | p->prio = prio; | 3942 | p->prio = prio; |
3968 | 3943 | ||
3969 | if (on_rq) { | 3944 | if (on_rq) { |
3945 | if (running) | ||
3946 | p->sched_class->set_curr_task(rq); | ||
3970 | enqueue_task(rq, p, 0); | 3947 | enqueue_task(rq, p, 0); |
3971 | /* | 3948 | /* |
3972 | * Reschedule if we are currently running on this runqueue and | 3949 | * Reschedule if we are currently running on this runqueue and |
3973 | * our priority decreased, or if we are not currently running on | 3950 | * our priority decreased, or if we are not currently running on |
3974 | * this runqueue and our priority is higher than the current's | 3951 | * this runqueue and our priority is higher than the current's |
3975 | */ | 3952 | */ |
3976 | if (task_running(rq, p)) { | 3953 | if (running) { |
3977 | if (p->prio > oldprio) | 3954 | if (p->prio > oldprio) |
3978 | resched_task(rq->curr); | 3955 | resched_task(rq->curr); |
3979 | } else { | 3956 | } else { |
@@ -4137,7 +4114,7 @@ struct task_struct *idle_task(int cpu) | |||
4137 | * find_process_by_pid - find a process with a matching PID value. | 4114 | * find_process_by_pid - find a process with a matching PID value. |
4138 | * @pid: the pid in question. | 4115 | * @pid: the pid in question. |
4139 | */ | 4116 | */ |
4140 | static inline struct task_struct *find_process_by_pid(pid_t pid) | 4117 | static struct task_struct *find_process_by_pid(pid_t pid) |
4141 | { | 4118 | { |
4142 | return pid ? find_task_by_pid(pid) : current; | 4119 | return pid ? find_task_by_pid(pid) : current; |
4143 | } | 4120 | } |
@@ -4179,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
4179 | int sched_setscheduler(struct task_struct *p, int policy, | 4156 | int sched_setscheduler(struct task_struct *p, int policy, |
4180 | struct sched_param *param) | 4157 | struct sched_param *param) |
4181 | { | 4158 | { |
4182 | int retval, oldprio, oldpolicy = -1, on_rq; | 4159 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4183 | unsigned long flags; | 4160 | unsigned long flags; |
4184 | struct rq *rq; | 4161 | struct rq *rq; |
4185 | 4162 | ||
@@ -4261,18 +4238,26 @@ recheck: | |||
4261 | } | 4238 | } |
4262 | update_rq_clock(rq); | 4239 | update_rq_clock(rq); |
4263 | on_rq = p->se.on_rq; | 4240 | on_rq = p->se.on_rq; |
4264 | if (on_rq) | 4241 | running = task_running(rq, p); |
4242 | if (on_rq) { | ||
4265 | deactivate_task(rq, p, 0); | 4243 | deactivate_task(rq, p, 0); |
4244 | if (running) | ||
4245 | p->sched_class->put_prev_task(rq, p); | ||
4246 | } | ||
4247 | |||
4266 | oldprio = p->prio; | 4248 | oldprio = p->prio; |
4267 | __setscheduler(rq, p, policy, param->sched_priority); | 4249 | __setscheduler(rq, p, policy, param->sched_priority); |
4250 | |||
4268 | if (on_rq) { | 4251 | if (on_rq) { |
4252 | if (running) | ||
4253 | p->sched_class->set_curr_task(rq); | ||
4269 | activate_task(rq, p, 0); | 4254 | activate_task(rq, p, 0); |
4270 | /* | 4255 | /* |
4271 | * Reschedule if we are currently running on this runqueue and | 4256 | * Reschedule if we are currently running on this runqueue and |
4272 | * our priority decreased, or if we are not currently running on | 4257 | * our priority decreased, or if we are not currently running on |
4273 | * this runqueue and our priority is higher than the current's | 4258 | * this runqueue and our priority is higher than the current's |
4274 | */ | 4259 | */ |
4275 | if (task_running(rq, p)) { | 4260 | if (running) { |
4276 | if (p->prio > oldprio) | 4261 | if (p->prio > oldprio) |
4277 | resched_task(rq->curr); | 4262 | resched_task(rq->curr); |
4278 | } else { | 4263 | } else { |
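Both rt_mutex_setprio() and sched_setscheduler() now bracket a class or priority change with put_prev_task() and set_curr_task() when the task is currently running, so the old and the new class each see a consistent "current" task. The shared sequence, extracted as a skeleton that reuses the helpers visible in this file (not a new kernel function):

static void change_task_class(struct rq *rq, struct task_struct *p,
                              const struct sched_class *new_class, int prio)
{
    int on_rq = p->se.on_rq;
    int running = task_running(rq, p);

    if (on_rq) {
        dequeue_task(rq, p, 0);
        if (running)
            p->sched_class->put_prev_task(rq, p);  /* old class lets go      */
    }

    p->sched_class = new_class;
    p->prio = prio;

    if (on_rq) {
        if (running)
            p->sched_class->set_curr_task(rq);     /* new class takes over   */
        enqueue_task(rq, p, 0);
    }
}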
@@ -4343,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
4343 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4328 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
4344 | { | 4329 | { |
4345 | struct task_struct *p; | 4330 | struct task_struct *p; |
4346 | int retval = -EINVAL; | 4331 | int retval; |
4347 | 4332 | ||
4348 | if (pid < 0) | 4333 | if (pid < 0) |
4349 | goto out_nounlock; | 4334 | return -EINVAL; |
4350 | 4335 | ||
4351 | retval = -ESRCH; | 4336 | retval = -ESRCH; |
4352 | read_lock(&tasklist_lock); | 4337 | read_lock(&tasklist_lock); |
@@ -4357,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) | |||
4357 | retval = p->policy; | 4342 | retval = p->policy; |
4358 | } | 4343 | } |
4359 | read_unlock(&tasklist_lock); | 4344 | read_unlock(&tasklist_lock); |
4360 | |||
4361 | out_nounlock: | ||
4362 | return retval; | 4345 | return retval; |
4363 | } | 4346 | } |
4364 | 4347 | ||
@@ -4371,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
4371 | { | 4354 | { |
4372 | struct sched_param lp; | 4355 | struct sched_param lp; |
4373 | struct task_struct *p; | 4356 | struct task_struct *p; |
4374 | int retval = -EINVAL; | 4357 | int retval; |
4375 | 4358 | ||
4376 | if (!param || pid < 0) | 4359 | if (!param || pid < 0) |
4377 | goto out_nounlock; | 4360 | return -EINVAL; |
4378 | 4361 | ||
4379 | read_lock(&tasklist_lock); | 4362 | read_lock(&tasklist_lock); |
4380 | p = find_process_by_pid(pid); | 4363 | p = find_process_by_pid(pid); |
@@ -4394,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
4394 | */ | 4377 | */ |
4395 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4378 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
4396 | 4379 | ||
4397 | out_nounlock: | ||
4398 | return retval; | 4380 | return retval; |
4399 | 4381 | ||
4400 | out_unlock: | 4382 | out_unlock: |
@@ -4554,8 +4536,8 @@ asmlinkage long sys_sched_yield(void) | |||
4554 | { | 4536 | { |
4555 | struct rq *rq = this_rq_lock(); | 4537 | struct rq *rq = this_rq_lock(); |
4556 | 4538 | ||
4557 | schedstat_inc(rq, yld_cnt); | 4539 | schedstat_inc(rq, yld_count); |
4558 | current->sched_class->yield_task(rq, current); | 4540 | current->sched_class->yield_task(rq); |
4559 | 4541 | ||
4560 | /* | 4542 | /* |
4561 | * Since we are going to call schedule() anyway, there's | 4543 | * Since we are going to call schedule() anyway, there's |
@@ -4749,11 +4731,12 @@ asmlinkage | |||
4749 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4731 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4750 | { | 4732 | { |
4751 | struct task_struct *p; | 4733 | struct task_struct *p; |
4752 | int retval = -EINVAL; | 4734 | unsigned int time_slice; |
4735 | int retval; | ||
4753 | struct timespec t; | 4736 | struct timespec t; |
4754 | 4737 | ||
4755 | if (pid < 0) | 4738 | if (pid < 0) |
4756 | goto out_nounlock; | 4739 | return -EINVAL; |
4757 | 4740 | ||
4758 | retval = -ESRCH; | 4741 | retval = -ESRCH; |
4759 | read_lock(&tasklist_lock); | 4742 | read_lock(&tasklist_lock); |
@@ -4765,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4765 | if (retval) | 4748 | if (retval) |
4766 | goto out_unlock; | 4749 | goto out_unlock; |
4767 | 4750 | ||
4768 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4751 | if (p->policy == SCHED_FIFO) |
4769 | 0 : static_prio_timeslice(p->static_prio), &t); | 4752 | time_slice = 0; |
4753 | else if (p->policy == SCHED_RR) | ||
4754 | time_slice = DEF_TIMESLICE; | ||
4755 | else { | ||
4756 | struct sched_entity *se = &p->se; | ||
4757 | unsigned long flags; | ||
4758 | struct rq *rq; | ||
4759 | |||
4760 | rq = task_rq_lock(p, &flags); | ||
4761 | time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); | ||
4762 | task_rq_unlock(rq, &flags); | ||
4763 | } | ||
4770 | read_unlock(&tasklist_lock); | 4764 | read_unlock(&tasklist_lock); |
4765 | jiffies_to_timespec(time_slice, &t); | ||
4771 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4766 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4772 | out_nounlock: | ||
4773 | return retval; | 4767 | return retval; |
4768 | |||
4774 | out_unlock: | 4769 | out_unlock: |
4775 | read_unlock(&tasklist_lock); | 4770 | read_unlock(&tasklist_lock); |
4776 | return retval; | 4771 | return retval; |
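sys_sched_rr_get_interval() now derives the reported slice from sched_slice() for CFS tasks instead of the old static_prio timeslice table; SCHED_RR still reports DEF_TIMESLICE and SCHED_FIFO reports 0. A quick user-space probe through the standard libc wrapper:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec ts;

    if (sched_rr_get_interval(0, &ts) == 0)   /* pid 0 = calling process */
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
    else
        perror("sched_rr_get_interval");
    return 0;
}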
@@ -4899,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4899 | */ | 4894 | */ |
4900 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4895 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4901 | 4896 | ||
4902 | /* | ||
4903 | * Increase the granularity value when there are more CPUs, | ||
4904 | * because with more CPUs the 'effective latency' as visible | ||
4905 | * to users decreases. But the relationship is not linear, | ||
4906 | * so pick a second-best guess by going with the log2 of the | ||
4907 | * number of CPUs. | ||
4908 | * | ||
4909 | * This idea comes from the SD scheduler of Con Kolivas: | ||
4910 | */ | ||
4911 | static inline void sched_init_granularity(void) | ||
4912 | { | ||
4913 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
4914 | const unsigned long limit = 100000000; | ||
4915 | |||
4916 | sysctl_sched_min_granularity *= factor; | ||
4917 | if (sysctl_sched_min_granularity > limit) | ||
4918 | sysctl_sched_min_granularity = limit; | ||
4919 | |||
4920 | sysctl_sched_latency *= factor; | ||
4921 | if (sysctl_sched_latency > limit) | ||
4922 | sysctl_sched_latency = limit; | ||
4923 | |||
4924 | sysctl_sched_runtime_limit = sysctl_sched_latency; | ||
4925 | sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; | ||
4926 | } | ||
4927 | |||
4928 | #ifdef CONFIG_SMP | 4897 | #ifdef CONFIG_SMP |
4929 | /* | 4898 | /* |
4930 | * This is how migration works: | 4899 | * This is how migration works: |
@@ -5102,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5102 | struct rq *rq; | 5071 | struct rq *rq; |
5103 | int dest_cpu; | 5072 | int dest_cpu; |
5104 | 5073 | ||
5105 | restart: | 5074 | do { |
5106 | /* On same node? */ | 5075 | /* On same node? */ |
5107 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5076 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
5108 | cpus_and(mask, mask, p->cpus_allowed); | 5077 | cpus_and(mask, mask, p->cpus_allowed); |
5109 | dest_cpu = any_online_cpu(mask); | 5078 | dest_cpu = any_online_cpu(mask); |
5110 | 5079 | ||
5111 | /* On any allowed CPU? */ | 5080 | /* On any allowed CPU? */ |
5112 | if (dest_cpu == NR_CPUS) | 5081 | if (dest_cpu == NR_CPUS) |
5113 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5082 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5114 | 5083 | ||
5115 | /* No more Mr. Nice Guy. */ | 5084 | /* No more Mr. Nice Guy. */ |
5116 | if (dest_cpu == NR_CPUS) { | 5085 | if (dest_cpu == NR_CPUS) { |
5117 | rq = task_rq_lock(p, &flags); | 5086 | rq = task_rq_lock(p, &flags); |
5118 | cpus_setall(p->cpus_allowed); | 5087 | cpus_setall(p->cpus_allowed); |
5119 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5088 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5120 | task_rq_unlock(rq, &flags); | 5089 | task_rq_unlock(rq, &flags); |
5121 | 5090 | ||
5122 | /* | 5091 | /* |
5123 | * Don't tell them about moving exiting tasks or | 5092 | * Don't tell them about moving exiting tasks or |
5124 | * kernel threads (both mm NULL), since they never | 5093 | * kernel threads (both mm NULL), since they never |
5125 | * leave kernel. | 5094 | * leave kernel. |
5126 | */ | 5095 | */ |
5127 | if (p->mm && printk_ratelimit()) | 5096 | if (p->mm && printk_ratelimit()) |
5128 | printk(KERN_INFO "process %d (%s) no " | 5097 | printk(KERN_INFO "process %d (%s) no " |
5129 | "longer affine to cpu%d\n", | 5098 | "longer affine to cpu%d\n", |
5130 | p->pid, p->comm, dead_cpu); | 5099 | p->pid, p->comm, dead_cpu); |
5131 | } | 5100 | } |
5132 | if (!__migrate_task(p, dead_cpu, dest_cpu)) | 5101 | } while (!__migrate_task(p, dead_cpu, dest_cpu)); |
5133 | goto restart; | ||
5134 | } | 5102 | } |
5135 | 5103 | ||
5136 | /* | 5104 | /* |
@@ -5172,6 +5140,20 @@ static void migrate_live_tasks(int src_cpu) | |||
5172 | } | 5140 | } |
5173 | 5141 | ||
5174 | /* | 5142 | /* |
5143 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
5144 | */ | ||
5145 | static void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
5146 | { | ||
5147 | update_rq_clock(rq); | ||
5148 | |||
5149 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
5150 | rq->nr_uninterruptible--; | ||
5151 | |||
5152 | enqueue_task(rq, p, 0); | ||
5153 | inc_nr_running(p, rq); | ||
5154 | } | ||
5155 | |||
5156 | /* | ||
5175 | * Schedules idle task to be the next runnable task on current CPU. | 5157 | * Schedules idle task to be the next runnable task on current CPU. |
5176 | * It does so by boosting its priority to highest possible and adding it to | 5158 | * It does so by boosting its priority to highest possible and adding it to |
5177 | * the _front_ of the runqueue. Used by CPU offline code. | 5159 | * the _front_ of the runqueue. Used by CPU offline code. |
@@ -5283,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = { | |||
5283 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5265 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
5284 | { | 5266 | { |
5285 | struct ctl_table *entry = | 5267 | struct ctl_table *entry = |
5286 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); | 5268 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); |
5287 | |||
5288 | BUG_ON(!entry); | ||
5289 | memset(entry, 0, n * sizeof(struct ctl_table)); | ||
5290 | 5269 | ||
5291 | return entry; | 5270 | return entry; |
5292 | } | 5271 | } |
5293 | 5272 | ||
5273 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
5274 | { | ||
5275 | struct ctl_table *entry = *tablep; | ||
5276 | |||
5277 | for (entry = *tablep; entry->procname; entry++) | ||
5278 | if (entry->child) | ||
5279 | sd_free_ctl_entry(&entry->child); | ||
5280 | |||
5281 | kfree(*tablep); | ||
5282 | *tablep = NULL; | ||
5283 | } | ||
5284 | |||
5294 | static void | 5285 | static void |
5295 | set_table_entry(struct ctl_table *entry, | 5286 | set_table_entry(struct ctl_table *entry, |
5296 | const char *procname, void *data, int maxlen, | 5287 | const char *procname, void *data, int maxlen, |
@@ -5306,7 +5297,10 @@ set_table_entry(struct ctl_table *entry, | |||
5306 | static struct ctl_table * | 5297 | static struct ctl_table * |
5307 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 5298 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
5308 | { | 5299 | { |
5309 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 5300 | struct ctl_table *table = sd_alloc_ctl_entry(12); |
5301 | |||
5302 | if (table == NULL) | ||
5303 | return NULL; | ||
5310 | 5304 | ||
5311 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 5305 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
5312 | sizeof(long), 0644, proc_doulongvec_minmax); | 5306 | sizeof(long), 0644, proc_doulongvec_minmax); |
@@ -5326,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5326 | sizeof(int), 0644, proc_dointvec_minmax); | 5320 | sizeof(int), 0644, proc_dointvec_minmax); |
5327 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 5321 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
5328 | sizeof(int), 0644, proc_dointvec_minmax); | 5322 | sizeof(int), 0644, proc_dointvec_minmax); |
5329 | set_table_entry(&table[10], "cache_nice_tries", | 5323 | set_table_entry(&table[9], "cache_nice_tries", |
5330 | &sd->cache_nice_tries, | 5324 | &sd->cache_nice_tries, |
5331 | sizeof(int), 0644, proc_dointvec_minmax); | 5325 | sizeof(int), 0644, proc_dointvec_minmax); |
5332 | set_table_entry(&table[12], "flags", &sd->flags, | 5326 | set_table_entry(&table[10], "flags", &sd->flags, |
5333 | sizeof(int), 0644, proc_dointvec_minmax); | 5327 | sizeof(int), 0644, proc_dointvec_minmax); |
5328 | /* &table[11] is terminator */ | ||
5334 | 5329 | ||
5335 | return table; | 5330 | return table; |
5336 | } | 5331 | } |
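The hunk above switches the allocation to kcalloc() and shrinks the array to 12 entries, relying on the zeroed &table[11] as the terminator that sd_free_ctl_entry() walks up to (its loop stops at the first NULL ->procname). A minimal user-space sketch of that zero-terminated-array convention follows; struct toy_table and its fields are invented for illustration and are not the kernel's ctl_table:

	#include <stdio.h>
	#include <stdlib.h>

	struct toy_table {
		const char *procname;	/* NULL marks the terminator entry */
		int mode;
	};

	int main(void)
	{
		/* calloc() zeroes the array, so the last slot is already a terminator */
		struct toy_table *t = calloc(4, sizeof(*t));
		if (!t)
			return 1;

		t[0].procname = "min_interval";
		t[1].procname = "max_interval";
		t[2].procname = "flags";
		/* t[3] stays all-zero */

		for (struct toy_table *e = t; e->procname; e++)
			printf("%s\n", e->procname);

		free(t);
		return 0;
	}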
@@ -5345,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
5345 | for_each_domain(cpu, sd) | 5340 | for_each_domain(cpu, sd) |
5346 | domain_num++; | 5341 | domain_num++; |
5347 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 5342 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
5343 | if (table == NULL) | ||
5344 | return NULL; | ||
5348 | 5345 | ||
5349 | i = 0; | 5346 | i = 0; |
5350 | for_each_domain(cpu, sd) { | 5347 | for_each_domain(cpu, sd) { |
@@ -5359,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
5359 | } | 5356 | } |
5360 | 5357 | ||
5361 | static struct ctl_table_header *sd_sysctl_header; | 5358 | static struct ctl_table_header *sd_sysctl_header; |
5362 | static void init_sched_domain_sysctl(void) | 5359 | static void register_sched_domain_sysctl(void) |
5363 | { | 5360 | { |
5364 | int i, cpu_num = num_online_cpus(); | 5361 | int i, cpu_num = num_online_cpus(); |
5365 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5362 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
5366 | char buf[32]; | 5363 | char buf[32]; |
5367 | 5364 | ||
5365 | if (entry == NULL) | ||
5366 | return; | ||
5367 | |||
5368 | sd_ctl_dir[0].child = entry; | 5368 | sd_ctl_dir[0].child = entry; |
5369 | 5369 | ||
5370 | for (i = 0; i < cpu_num; i++, entry++) { | 5370 | for_each_online_cpu(i) { |
5371 | snprintf(buf, 32, "cpu%d", i); | 5371 | snprintf(buf, 32, "cpu%d", i); |
5372 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5372 | entry->procname = kstrdup(buf, GFP_KERNEL); |
5373 | entry->mode = 0555; | 5373 | entry->mode = 0555; |
5374 | entry->child = sd_alloc_ctl_cpu_table(i); | 5374 | entry->child = sd_alloc_ctl_cpu_table(i); |
5375 | entry++; | ||
5375 | } | 5376 | } |
5376 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
5377 | } | 5378 | } |
5379 | |||
5380 | static void unregister_sched_domain_sysctl(void) | ||
5381 | { | ||
5382 | unregister_sysctl_table(sd_sysctl_header); | ||
5383 | sd_sysctl_header = NULL; | ||
5384 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
5385 | } | ||
5378 | #else | 5386 | #else |
5379 | static void init_sched_domain_sysctl(void) | 5387 | static void register_sched_domain_sysctl(void) |
5388 | { | ||
5389 | } | ||
5390 | static void unregister_sched_domain_sysctl(void) | ||
5380 | { | 5391 | { |
5381 | } | 5392 | } |
5382 | #endif | 5393 | #endif |
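The rename from init_sched_domain_sysctl() to a register/unregister pair is what lets later hunks in this patch rebuild the per-CPU sysctl tree whenever the domain hierarchy changes, instead of populating it once at boot. A rough sketch of the resulting call ordering, taken from the arch_init_sched_domains(), partition_sched_domains() and detach_destroy_domains() hunks further down (comments only, for orientation):

	/*
	 *	arch_init_sched_domains() / partition_sched_domains()
	 *		build_sched_domains(...);
	 *		register_sched_domain_sysctl();   allocate cpuN/ tables, register_sysctl_table()
	 *
	 *	detach_destroy_domains()
	 *		unregister_sched_domain_sysctl(); unregister_sysctl_table(), sd_free_ctl_entry()
	 *		cpu_attach_domain(NULL, i); ...
	 */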
@@ -5498,8 +5509,7 @@ int __init migration_init(void) | |||
5498 | int nr_cpu_ids __read_mostly = NR_CPUS; | 5509 | int nr_cpu_ids __read_mostly = NR_CPUS; |
5499 | EXPORT_SYMBOL(nr_cpu_ids); | 5510 | EXPORT_SYMBOL(nr_cpu_ids); |
5500 | 5511 | ||
5501 | #undef SCHED_DOMAIN_DEBUG | 5512 | #ifdef CONFIG_SCHED_DEBUG |
5502 | #ifdef SCHED_DOMAIN_DEBUG | ||
5503 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 5513 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
5504 | { | 5514 | { |
5505 | int level = 0; | 5515 | int level = 0; |
@@ -5557,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5557 | printk("\n"); | 5567 | printk("\n"); |
5558 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5568 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5559 | "set\n"); | 5569 | "set\n"); |
5570 | break; | ||
5560 | } | 5571 | } |
5561 | 5572 | ||
5562 | if (!cpus_weight(group->cpumask)) { | 5573 | if (!cpus_weight(group->cpumask)) { |
5563 | printk("\n"); | 5574 | printk("\n"); |
5564 | printk(KERN_ERR "ERROR: empty group\n"); | 5575 | printk(KERN_ERR "ERROR: empty group\n"); |
5576 | break; | ||
5565 | } | 5577 | } |
5566 | 5578 | ||
5567 | if (cpus_intersects(groupmask, group->cpumask)) { | 5579 | if (cpus_intersects(groupmask, group->cpumask)) { |
5568 | printk("\n"); | 5580 | printk("\n"); |
5569 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5581 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5582 | break; | ||
5570 | } | 5583 | } |
5571 | 5584 | ||
5572 | cpus_or(groupmask, groupmask, group->cpumask); | 5585 | cpus_or(groupmask, groupmask, group->cpumask); |
@@ -5700,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str) | |||
5700 | return 1; | 5713 | return 1; |
5701 | } | 5714 | } |
5702 | 5715 | ||
5703 | __setup ("isolcpus=", isolated_cpu_setup); | 5716 | __setup("isolcpus=", isolated_cpu_setup); |
5704 | 5717 | ||
5705 | /* | 5718 | /* |
5706 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 5719 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
@@ -5929,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
5929 | 5942 | ||
5930 | if (!sg) | 5943 | if (!sg) |
5931 | return; | 5944 | return; |
5932 | next_sg: | 5945 | do { |
5933 | for_each_cpu_mask(j, sg->cpumask) { | 5946 | for_each_cpu_mask(j, sg->cpumask) { |
5934 | struct sched_domain *sd; | 5947 | struct sched_domain *sd; |
5935 | 5948 | ||
5936 | sd = &per_cpu(phys_domains, j); | 5949 | sd = &per_cpu(phys_domains, j); |
5937 | if (j != first_cpu(sd->groups->cpumask)) { | 5950 | if (j != first_cpu(sd->groups->cpumask)) { |
5938 | /* | 5951 | /* |
5939 | * Only add "power" once for each | 5952 | * Only add "power" once for each |
5940 | * physical package. | 5953 | * physical package. |
5941 | */ | 5954 | */ |
5942 | continue; | 5955 | continue; |
5943 | } | 5956 | } |
5944 | 5957 | ||
5945 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 5958 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
5946 | } | 5959 | } |
5947 | sg = sg->next; | 5960 | sg = sg->next; |
5948 | if (sg != group_head) | 5961 | } while (sg != group_head); |
5949 | goto next_sg; | ||
5950 | } | 5962 | } |
5951 | #endif | 5963 | #endif |
5952 | 5964 | ||
@@ -6057,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6057 | /* | 6069 | /* |
6058 | * Allocate the per-node list of sched groups | 6070 | * Allocate the per-node list of sched groups |
6059 | */ | 6071 | */ |
6060 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, | 6072 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), |
6061 | GFP_KERNEL); | 6073 | GFP_KERNEL); |
6062 | if (!sched_group_nodes) { | 6074 | if (!sched_group_nodes) { |
6063 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6075 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -6310,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
6310 | 6322 | ||
6311 | err = build_sched_domains(&cpu_default_map); | 6323 | err = build_sched_domains(&cpu_default_map); |
6312 | 6324 | ||
6325 | register_sched_domain_sysctl(); | ||
6326 | |||
6313 | return err; | 6327 | return err; |
6314 | } | 6328 | } |
6315 | 6329 | ||
@@ -6326,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6326 | { | 6340 | { |
6327 | int i; | 6341 | int i; |
6328 | 6342 | ||
6343 | unregister_sched_domain_sysctl(); | ||
6344 | |||
6329 | for_each_cpu_mask(i, *cpu_map) | 6345 | for_each_cpu_mask(i, *cpu_map) |
6330 | cpu_attach_domain(NULL, i); | 6346 | cpu_attach_domain(NULL, i); |
6331 | synchronize_sched(); | 6347 | synchronize_sched(); |
@@ -6356,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6356 | if (!err && !cpus_empty(*partition2)) | 6372 | if (!err && !cpus_empty(*partition2)) |
6357 | err = build_sched_domains(partition2); | 6373 | err = build_sched_domains(partition2); |
6358 | 6374 | ||
6375 | register_sched_domain_sysctl(); | ||
6376 | |||
6359 | return err; | 6377 | return err; |
6360 | } | 6378 | } |
6361 | 6379 | ||
@@ -6487,17 +6505,13 @@ void __init sched_init_smp(void) | |||
6487 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6505 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6488 | hotcpu_notifier(update_sched_domains, 0); | 6506 | hotcpu_notifier(update_sched_domains, 0); |
6489 | 6507 | ||
6490 | init_sched_domain_sysctl(); | ||
6491 | |||
6492 | /* Move init over to a non-isolated CPU */ | 6508 | /* Move init over to a non-isolated CPU */ |
6493 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6509 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6494 | BUG(); | 6510 | BUG(); |
6495 | sched_init_granularity(); | ||
6496 | } | 6511 | } |
6497 | #else | 6512 | #else |
6498 | void __init sched_init_smp(void) | 6513 | void __init sched_init_smp(void) |
6499 | { | 6514 | { |
6500 | sched_init_granularity(); | ||
6501 | } | 6515 | } |
6502 | #endif /* CONFIG_SMP */ | 6516 | #endif /* CONFIG_SMP */ |
6503 | 6517 | ||
@@ -6511,28 +6525,20 @@ int in_sched_functions(unsigned long addr) | |||
6511 | && addr < (unsigned long)__sched_text_end); | 6525 | && addr < (unsigned long)__sched_text_end); |
6512 | } | 6526 | } |
6513 | 6527 | ||
6514 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 6528 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
6515 | { | 6529 | { |
6516 | cfs_rq->tasks_timeline = RB_ROOT; | 6530 | cfs_rq->tasks_timeline = RB_ROOT; |
6517 | cfs_rq->fair_clock = 1; | ||
6518 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6531 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6519 | cfs_rq->rq = rq; | 6532 | cfs_rq->rq = rq; |
6520 | #endif | 6533 | #endif |
6534 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
6521 | } | 6535 | } |
6522 | 6536 | ||
6523 | void __init sched_init(void) | 6537 | void __init sched_init(void) |
6524 | { | 6538 | { |
6525 | u64 now = sched_clock(); | ||
6526 | int highest_cpu = 0; | 6539 | int highest_cpu = 0; |
6527 | int i, j; | 6540 | int i, j; |
6528 | 6541 | ||
6529 | /* | ||
6530 | * Link up the scheduling class hierarchy: | ||
6531 | */ | ||
6532 | rt_sched_class.next = &fair_sched_class; | ||
6533 | fair_sched_class.next = &idle_sched_class; | ||
6534 | idle_sched_class.next = NULL; | ||
6535 | |||
6536 | for_each_possible_cpu(i) { | 6542 | for_each_possible_cpu(i) { |
6537 | struct rt_prio_array *array; | 6543 | struct rt_prio_array *array; |
6538 | struct rq *rq; | 6544 | struct rq *rq; |
@@ -6545,10 +6551,28 @@ void __init sched_init(void) | |||
6545 | init_cfs_rq(&rq->cfs, rq); | 6551 | init_cfs_rq(&rq->cfs, rq); |
6546 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6552 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6547 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6553 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
6548 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 6554 | { |
6555 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6556 | struct sched_entity *se = | ||
6557 | &per_cpu(init_sched_entity, i); | ||
6558 | |||
6559 | init_cfs_rq_p[i] = cfs_rq; | ||
6560 | init_cfs_rq(cfs_rq, rq); | ||
6561 | cfs_rq->tg = &init_task_group; | ||
6562 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6563 | &rq->leaf_cfs_rq_list); | ||
6564 | |||
6565 | init_sched_entity_p[i] = se; | ||
6566 | se->cfs_rq = &rq->cfs; | ||
6567 | se->my_q = cfs_rq; | ||
6568 | se->load.weight = init_task_group_load; | ||
6569 | se->load.inv_weight = | ||
6570 | div64_64(1ULL<<32, init_task_group_load); | ||
6571 | se->parent = NULL; | ||
6572 | } | ||
6573 | init_task_group.shares = init_task_group_load; | ||
6574 | spin_lock_init(&init_task_group.lock); | ||
6549 | #endif | 6575 | #endif |
6550 | rq->ls.load_update_last = now; | ||
6551 | rq->ls.load_update_start = now; | ||
6552 | 6576 | ||
6553 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 6577 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6554 | rq->cpu_load[j] = 0; | 6578 | rq->cpu_load[j] = 0; |
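The per-CPU init_task_group setup in this hunk precomputes se->load.inv_weight as 2^32 divided by the weight (div64_64(1ULL<<32, init_task_group_load)). As far as this patch shows, that fixed-point reciprocal is what the calc_delta_*() helpers elsewhere in sched.c (not part of this hunk) use to turn later divisions by the weight into a multiply and a 32-bit shift. A small user-space check of the arithmetic, with the 1024 weight assumed to match NICE_0_LOAD:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t weight = 1024;				/* assumed NICE_0_LOAD-sized weight */
		uint64_t inv_weight = (1ULL << 32) / weight;	/* what div64_64(1ULL << 32, w) yields */
		uint64_t delta = 4000000;			/* 4 ms of runtime, in nanoseconds */

		/* reciprocal multiply: delta / weight without a 64-bit divide */
		uint64_t q = (delta * inv_weight) >> 32;

		printf("inv_weight=%llu approx=%llu exact=%llu\n",
		       (unsigned long long)inv_weight,
		       (unsigned long long)q,
		       (unsigned long long)(delta / weight));
		return 0;
	}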
@@ -6633,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep); | |||
6633 | #endif | 6657 | #endif |
6634 | 6658 | ||
6635 | #ifdef CONFIG_MAGIC_SYSRQ | 6659 | #ifdef CONFIG_MAGIC_SYSRQ |
6660 | static void normalize_task(struct rq *rq, struct task_struct *p) | ||
6661 | { | ||
6662 | int on_rq; | ||
6663 | update_rq_clock(rq); | ||
6664 | on_rq = p->se.on_rq; | ||
6665 | if (on_rq) | ||
6666 | deactivate_task(rq, p, 0); | ||
6667 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
6668 | if (on_rq) { | ||
6669 | activate_task(rq, p, 0); | ||
6670 | resched_task(rq->curr); | ||
6671 | } | ||
6672 | } | ||
6673 | |||
6636 | void normalize_rt_tasks(void) | 6674 | void normalize_rt_tasks(void) |
6637 | { | 6675 | { |
6638 | struct task_struct *g, *p; | 6676 | struct task_struct *g, *p; |
6639 | unsigned long flags; | 6677 | unsigned long flags; |
6640 | struct rq *rq; | 6678 | struct rq *rq; |
6641 | int on_rq; | ||
6642 | 6679 | ||
6643 | read_lock_irq(&tasklist_lock); | 6680 | read_lock_irq(&tasklist_lock); |
6644 | do_each_thread(g, p) { | 6681 | do_each_thread(g, p) { |
6645 | p->se.fair_key = 0; | 6682 | /* |
6646 | p->se.wait_runtime = 0; | 6683 | * Only normalize user tasks: |
6684 | */ | ||
6685 | if (!p->mm) | ||
6686 | continue; | ||
6687 | |||
6647 | p->se.exec_start = 0; | 6688 | p->se.exec_start = 0; |
6648 | p->se.wait_start_fair = 0; | ||
6649 | p->se.sleep_start_fair = 0; | ||
6650 | #ifdef CONFIG_SCHEDSTATS | 6689 | #ifdef CONFIG_SCHEDSTATS |
6651 | p->se.wait_start = 0; | 6690 | p->se.wait_start = 0; |
6652 | p->se.sleep_start = 0; | 6691 | p->se.sleep_start = 0; |
6653 | p->se.block_start = 0; | 6692 | p->se.block_start = 0; |
6654 | #endif | 6693 | #endif |
6655 | task_rq(p)->cfs.fair_clock = 0; | ||
6656 | task_rq(p)->clock = 0; | 6694 | task_rq(p)->clock = 0; |
6657 | 6695 | ||
6658 | if (!rt_task(p)) { | 6696 | if (!rt_task(p)) { |
@@ -6667,26 +6705,9 @@ void normalize_rt_tasks(void) | |||
6667 | 6705 | ||
6668 | spin_lock_irqsave(&p->pi_lock, flags); | 6706 | spin_lock_irqsave(&p->pi_lock, flags); |
6669 | rq = __task_rq_lock(p); | 6707 | rq = __task_rq_lock(p); |
6670 | #ifdef CONFIG_SMP | ||
6671 | /* | ||
6672 | * Do not touch the migration thread: | ||
6673 | */ | ||
6674 | if (p == rq->migration_thread) | ||
6675 | goto out_unlock; | ||
6676 | #endif | ||
6677 | 6708 | ||
6678 | update_rq_clock(rq); | 6709 | normalize_task(rq, p); |
6679 | on_rq = p->se.on_rq; | 6710 | |
6680 | if (on_rq) | ||
6681 | deactivate_task(rq, p, 0); | ||
6682 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
6683 | if (on_rq) { | ||
6684 | activate_task(rq, p, 0); | ||
6685 | resched_task(rq->curr); | ||
6686 | } | ||
6687 | #ifdef CONFIG_SMP | ||
6688 | out_unlock: | ||
6689 | #endif | ||
6690 | __task_rq_unlock(rq); | 6711 | __task_rq_unlock(rq); |
6691 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6712 | spin_unlock_irqrestore(&p->pi_lock, flags); |
6692 | } while_each_thread(g, p); | 6713 | } while_each_thread(g, p); |
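normalize_task() (factored out just above), and sched_move_task()/set_se_shares() added later in this patch, all follow the same shape: take the entity off the runqueue, change the parameter its queue position depends on, then re-enqueue and reschedule. The point is that the sort key (priority, group, weight) cannot be edited in place while the entity sits in an ordered structure. A toy user-space illustration with a sorted list standing in for the runqueue; every name here is invented:

	#include <stdio.h>

	struct toy_entity {
		int key;			/* stand-in for prio/weight/vruntime */
		struct toy_entity *next;
	};

	static struct toy_entity *queue;	/* kept sorted by key */

	static void enqueue(struct toy_entity *e)
	{
		struct toy_entity **pp = &queue;

		while (*pp && (*pp)->key < e->key)
			pp = &(*pp)->next;
		e->next = *pp;
		*pp = e;
	}

	static void dequeue(struct toy_entity *e)
	{
		struct toy_entity **pp = &queue;

		while (*pp && *pp != e)
			pp = &(*pp)->next;
		if (*pp)
			*pp = e->next;
	}

	int main(void)
	{
		struct toy_entity a = { .key = 10 }, b = { .key = 20 };

		enqueue(&a);
		enqueue(&b);

		/* the pattern: off the queue, change the key, back on the queue */
		dequeue(&a);
		a.key = 30;
		enqueue(&a);

		for (struct toy_entity *e = queue; e; e = e->next)
			printf("key=%d\n", e->key);
		return 0;
	}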
@@ -6739,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6739 | } | 6760 | } |
6740 | 6761 | ||
6741 | #endif | 6762 | #endif |
6763 | |||
6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6765 | |||
6766 | /* allocate runqueue etc for a new task group */ | ||
6767 | struct task_group *sched_create_group(void) | ||
6768 | { | ||
6769 | struct task_group *tg; | ||
6770 | struct cfs_rq *cfs_rq; | ||
6771 | struct sched_entity *se; | ||
6772 | struct rq *rq; | ||
6773 | int i; | ||
6774 | |||
6775 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | ||
6776 | if (!tg) | ||
6777 | return ERR_PTR(-ENOMEM); | ||
6778 | |||
6779 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | ||
6780 | if (!tg->cfs_rq) | ||
6781 | goto err; | ||
6782 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | ||
6783 | if (!tg->se) | ||
6784 | goto err; | ||
6785 | |||
6786 | for_each_possible_cpu(i) { | ||
6787 | rq = cpu_rq(i); | ||
6788 | |||
6789 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | ||
6790 | cpu_to_node(i)); | ||
6791 | if (!cfs_rq) | ||
6792 | goto err; | ||
6793 | |||
6794 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | ||
6795 | cpu_to_node(i)); | ||
6796 | if (!se) | ||
6797 | goto err; | ||
6798 | |||
6799 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | ||
6800 | memset(se, 0, sizeof(struct sched_entity)); | ||
6801 | |||
6802 | tg->cfs_rq[i] = cfs_rq; | ||
6803 | init_cfs_rq(cfs_rq, rq); | ||
6804 | cfs_rq->tg = tg; | ||
6805 | |||
6806 | tg->se[i] = se; | ||
6807 | se->cfs_rq = &rq->cfs; | ||
6808 | se->my_q = cfs_rq; | ||
6809 | se->load.weight = NICE_0_LOAD; | ||
6810 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
6811 | se->parent = NULL; | ||
6812 | } | ||
6813 | |||
6814 | for_each_possible_cpu(i) { | ||
6815 | rq = cpu_rq(i); | ||
6816 | cfs_rq = tg->cfs_rq[i]; | ||
6817 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
6818 | } | ||
6819 | |||
6820 | tg->shares = NICE_0_LOAD; | ||
6821 | spin_lock_init(&tg->lock); | ||
6822 | |||
6823 | return tg; | ||
6824 | |||
6825 | err: | ||
6826 | for_each_possible_cpu(i) { | ||
6827 | if (tg->cfs_rq) | ||
6828 | kfree(tg->cfs_rq[i]); | ||
6829 | if (tg->se) | ||
6830 | kfree(tg->se[i]); | ||
6831 | } | ||
6832 | kfree(tg->cfs_rq); | ||
6833 | kfree(tg->se); | ||
6834 | kfree(tg); | ||
6835 | |||
6836 | return ERR_PTR(-ENOMEM); | ||
6837 | } | ||
6838 | |||
6839 | /* rcu callback to free various structures associated with a task group */ | ||
6840 | static void free_sched_group(struct rcu_head *rhp) | ||
6841 | { | ||
6842 | struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); | ||
6843 | struct task_group *tg = cfs_rq->tg; | ||
6844 | struct sched_entity *se; | ||
6845 | int i; | ||
6846 | |||
6847 | /* now it should be safe to free those cfs_rqs */ | ||
6848 | for_each_possible_cpu(i) { | ||
6849 | cfs_rq = tg->cfs_rq[i]; | ||
6850 | kfree(cfs_rq); | ||
6851 | |||
6852 | se = tg->se[i]; | ||
6853 | kfree(se); | ||
6854 | } | ||
6855 | |||
6856 | kfree(tg->cfs_rq); | ||
6857 | kfree(tg->se); | ||
6858 | kfree(tg); | ||
6859 | } | ||
6860 | |||
6861 | /* Destroy runqueue etc associated with a task group */ | ||
6862 | void sched_destroy_group(struct task_group *tg) | ||
6863 | { | ||
6864 | struct cfs_rq *cfs_rq; | ||
6865 | int i; | ||
6866 | |||
6867 | for_each_possible_cpu(i) { | ||
6868 | cfs_rq = tg->cfs_rq[i]; | ||
6869 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
6870 | } | ||
6871 | |||
6872 | cfs_rq = tg->cfs_rq[0]; | ||
6873 | |||
6874 | 	/* wait for possible concurrent references to cfs_rqs to complete */ | ||

6875 | call_rcu(&cfs_rq->rcu, free_sched_group); | ||
6876 | } | ||
6877 | |||
6878 | /* change task's runqueue when it moves between groups. | ||
6879 | * The caller of this function should have put the task in its new group | ||
6880 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
6881 | * reflect its new group. | ||
6882 | */ | ||
6883 | void sched_move_task(struct task_struct *tsk) | ||
6884 | { | ||
6885 | int on_rq, running; | ||
6886 | unsigned long flags; | ||
6887 | struct rq *rq; | ||
6888 | |||
6889 | rq = task_rq_lock(tsk, &flags); | ||
6890 | |||
6891 | if (tsk->sched_class != &fair_sched_class) | ||
6892 | goto done; | ||
6893 | |||
6894 | update_rq_clock(rq); | ||
6895 | |||
6896 | running = task_running(rq, tsk); | ||
6897 | on_rq = tsk->se.on_rq; | ||
6898 | |||
6899 | if (on_rq) { | ||
6900 | dequeue_task(rq, tsk, 0); | ||
6901 | if (unlikely(running)) | ||
6902 | tsk->sched_class->put_prev_task(rq, tsk); | ||
6903 | } | ||
6904 | |||
6905 | set_task_cfs_rq(tsk); | ||
6906 | |||
6907 | if (on_rq) { | ||
6908 | if (unlikely(running)) | ||
6909 | tsk->sched_class->set_curr_task(rq); | ||
6910 | enqueue_task(rq, tsk, 0); | ||
6911 | } | ||
6912 | |||
6913 | done: | ||
6914 | task_rq_unlock(rq, &flags); | ||
6915 | } | ||
6916 | |||
6917 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
6918 | { | ||
6919 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
6920 | struct rq *rq = cfs_rq->rq; | ||
6921 | int on_rq; | ||
6922 | |||
6923 | spin_lock_irq(&rq->lock); | ||
6924 | |||
6925 | on_rq = se->on_rq; | ||
6926 | if (on_rq) | ||
6927 | dequeue_entity(cfs_rq, se, 0); | ||
6928 | |||
6929 | se->load.weight = shares; | ||
6930 | se->load.inv_weight = div64_64((1ULL<<32), shares); | ||
6931 | |||
6932 | if (on_rq) | ||
6933 | enqueue_entity(cfs_rq, se, 0); | ||
6934 | |||
6935 | spin_unlock_irq(&rq->lock); | ||
6936 | } | ||
6937 | |||
6938 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
6939 | { | ||
6940 | int i; | ||
6941 | |||
6942 | spin_lock(&tg->lock); | ||
6943 | if (tg->shares == shares) | ||
6944 | goto done; | ||
6945 | |||
6946 | tg->shares = shares; | ||
6947 | for_each_possible_cpu(i) | ||
6948 | set_se_shares(tg->se[i], shares); | ||
6949 | |||
6950 | done: | ||
6951 | spin_unlock(&tg->lock); | ||
6952 | return 0; | ||
6953 | } | ||
6954 | |||
6955 | unsigned long sched_group_shares(struct task_group *tg) | ||
6956 | { | ||
6957 | return tg->shares; | ||
6958 | } | ||
6959 | |||
6960 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
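This closes out the new in-kernel group-scheduling API (sched_create_group(), sched_group_set_shares(), sched_move_task(), sched_destroy_group(), sched_group_shares()). A hedged sketch of how a caller is expected to drive it; the real call sites live elsewhere in the patch set and are not shown in this section, p is a placeholder task pointer, and error handling is abbreviated:

	struct task_group *tg;

	tg = sched_create_group();
	if (IS_ERR(tg))
		return PTR_ERR(tg);

	/* twice the NICE_0_LOAD default used above => roughly double the group's CPU weight */
	sched_group_set_shares(tg, 2 * NICE_0_LOAD);

	/* caller re-points whatever task_group(p) resolves through, then: */
	sched_move_task(p);

	/* teardown: unlinks the cfs_rqs now, frees them after an RCU grace period */
	sched_destroy_group(tg);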
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index c3ee38bd3426..a5e517ec07c3 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -28,6 +28,31 @@ | |||
28 | printk(x); \ | 28 | printk(x); \ |
29 | } while (0) | 29 | } while (0) |
30 | 30 | ||
31 | /* | ||
32 | * Ease the printing of nsec fields: | ||
33 | */ | ||
34 | static long long nsec_high(long long nsec) | ||
35 | { | ||
36 | if (nsec < 0) { | ||
37 | nsec = -nsec; | ||
38 | do_div(nsec, 1000000); | ||
39 | return -nsec; | ||
40 | } | ||
41 | do_div(nsec, 1000000); | ||
42 | |||
43 | return nsec; | ||
44 | } | ||
45 | |||
46 | static unsigned long nsec_low(long long nsec) | ||
47 | { | ||
48 | if (nsec < 0) | ||
49 | nsec = -nsec; | ||
50 | |||
51 | return do_div(nsec, 1000000); | ||
52 | } | ||
53 | |||
54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | ||
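nsec_high()/nsec_low() split a signed nanosecond count into a millisecond part and a six-digit remainder, so the "%Ld.%06ld" formats used throughout this file print nanosecond values as msec.usec. For example, SPLIT_NS(3906250000) yields 3906 and 250000 and prints as "3906.250000"; a negative input such as -1500000 keeps its sign on the integer part and prints as "-1.500000". A user-space spot check of the same arithmetic (plain / and % in place of do_div(), negating first just as the helpers do):

	#include <stdio.h>

	int main(void)
	{
		long long ns = 3906250000LL;

		printf("%lld.%06lld\n", ns / 1000000, ns % 1000000);		/* 3906.250000 */

		ns = -1500000LL;
		printf("%lld.%06lld\n", -(-ns / 1000000), -ns % 1000000);	/* -1.500000 */
		return 0;
	}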
55 | |||
31 | static void | 56 | static void |
32 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 57 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
33 | { | 58 | { |
@@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
36 | else | 61 | else |
37 | SEQ_printf(m, " "); | 62 | SEQ_printf(m, " "); |
38 | 63 | ||
39 | SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", | 64 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", |
40 | p->comm, p->pid, | 65 | p->comm, p->pid, |
41 | (long long)p->se.fair_key, | 66 | SPLIT_NS(p->se.vruntime), |
42 | (long long)(p->se.fair_key - rq->cfs.fair_clock), | ||
43 | (long long)p->se.wait_runtime, | ||
44 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
45 | p->prio); | 68 | p->prio); |
46 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
47 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", |
48 | (long long)p->se.sum_exec_runtime, | 71 | SPLIT_NS(p->se.vruntime), |
49 | (long long)p->se.sum_wait_runtime, | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
50 | (long long)p->se.sum_sleep_runtime, | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
51 | (long long)p->se.wait_runtime_overruns, | ||
52 | (long long)p->se.wait_runtime_underruns); | ||
53 | #else | 74 | #else |
54 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", |
55 | 0LL, 0LL, 0LL, 0LL, 0LL); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
56 | #endif | 77 | #endif |
57 | } | 78 | } |
58 | 79 | ||
@@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
62 | 83 | ||
63 | SEQ_printf(m, | 84 | SEQ_printf(m, |
64 | "\nrunnable tasks:\n" | 85 | "\nrunnable tasks:\n" |
65 | " task PID tree-key delta waiting" | 86 | " task PID tree-key switches prio" |
66 | " switches prio" | 87 | " exec-runtime sum-exec sum-sleep\n" |
67 | " sum-exec sum-wait sum-sleep" | 88 | "------------------------------------------------------" |
68 | " wait-overrun wait-underrun\n" | 89 | "----------------------------------------------------\n"); |
69 | "------------------------------------------------------------------" | ||
70 | "----------------" | ||
71 | "------------------------------------------------" | ||
72 | "--------------------------------\n"); | ||
73 | 90 | ||
74 | read_lock_irq(&tasklist_lock); | 91 | read_lock_irq(&tasklist_lock); |
75 | 92 | ||
@@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
83 | read_unlock_irq(&tasklist_lock); | 100 | read_unlock_irq(&tasklist_lock); |
84 | } | 101 | } |
85 | 102 | ||
86 | static void | 103 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
87 | print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
88 | { | 104 | { |
89 | s64 wait_runtime_rq_sum = 0; | 105 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
90 | struct task_struct *p; | 106 | spread, rq0_min_vruntime, spread0; |
91 | struct rb_node *curr; | ||
92 | unsigned long flags; | ||
93 | struct rq *rq = &per_cpu(runqueues, cpu); | 107 | struct rq *rq = &per_cpu(runqueues, cpu); |
108 | struct sched_entity *last; | ||
109 | unsigned long flags; | ||
94 | 110 | ||
95 | spin_lock_irqsave(&rq->lock, flags); | ||
96 | curr = first_fair(cfs_rq); | ||
97 | while (curr) { | ||
98 | p = rb_entry(curr, struct task_struct, se.run_node); | ||
99 | wait_runtime_rq_sum += p->se.wait_runtime; | ||
100 | |||
101 | curr = rb_next(curr); | ||
102 | } | ||
103 | spin_unlock_irqrestore(&rq->lock, flags); | ||
104 | |||
105 | SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", | ||
106 | (long long)wait_runtime_rq_sum); | ||
107 | } | ||
108 | |||
109 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
110 | { | ||
111 | SEQ_printf(m, "\ncfs_rq\n"); | 111 | SEQ_printf(m, "\ncfs_rq\n"); |
112 | 112 | ||
113 | #define P(x) \ | 113 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
114 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) | 114 | SPLIT_NS(cfs_rq->exec_clock)); |
115 | |||
116 | P(fair_clock); | ||
117 | P(exec_clock); | ||
118 | P(wait_runtime); | ||
119 | P(wait_runtime_overruns); | ||
120 | P(wait_runtime_underruns); | ||
121 | P(sleeper_bonus); | ||
122 | #undef P | ||
123 | 115 | ||
124 | print_cfs_rq_runtime_sum(m, cpu, cfs_rq); | 116 | spin_lock_irqsave(&rq->lock, flags); |
117 | if (cfs_rq->rb_leftmost) | ||
118 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | ||
119 | last = __pick_last_entity(cfs_rq); | ||
120 | if (last) | ||
121 | max_vruntime = last->vruntime; | ||
122 | min_vruntime = rq->cfs.min_vruntime; | ||
123 | rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; | ||
124 | spin_unlock_irqrestore(&rq->lock, flags); | ||
125 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", | ||
126 | SPLIT_NS(MIN_vruntime)); | ||
127 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", | ||
128 | SPLIT_NS(min_vruntime)); | ||
129 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", | ||
130 | SPLIT_NS(max_vruntime)); | ||
131 | spread = max_vruntime - MIN_vruntime; | ||
132 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", | ||
133 | SPLIT_NS(spread)); | ||
134 | spread0 = min_vruntime - rq0_min_vruntime; | ||
135 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | ||
136 | SPLIT_NS(spread0)); | ||
137 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
138 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
139 | #ifdef CONFIG_SCHEDSTATS | ||
140 | SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", | ||
141 | rq->bkl_count); | ||
142 | #endif | ||
143 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | ||
144 | cfs_rq->nr_spread_over); | ||
125 | } | 145 | } |
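Two of the new fields are derived rather than read directly: spread is max_vruntime minus MIN_vruntime, i.e. the vruntime gap between the rightmost and leftmost queued entities on this CPU, and spread0 is this runqueue's min_vruntime minus CPU 0's, which makes it a rough indicator of cross-CPU vruntime skew. With SPLIT_NS both print as msec.usec: if cpu1 has min_vruntime 105.250000 and cpu0 has 100.000000, cpu1's spread0 line reads 5.250000, while cpu0's own spread0 is always 0.000000.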
126 | 146 | ||
127 | static void print_cpu(struct seq_file *m, int cpu) | 147 | static void print_cpu(struct seq_file *m, int cpu) |
@@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
141 | 161 | ||
142 | #define P(x) \ | 162 | #define P(x) \ |
143 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 163 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) |
164 | #define PN(x) \ | ||
165 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | ||
144 | 166 | ||
145 | P(nr_running); | 167 | P(nr_running); |
146 | SEQ_printf(m, " .%-30s: %lu\n", "load", | 168 | SEQ_printf(m, " .%-30s: %lu\n", "load", |
147 | rq->ls.load.weight); | 169 | rq->load.weight); |
148 | P(ls.delta_fair); | ||
149 | P(ls.delta_exec); | ||
150 | P(nr_switches); | 170 | P(nr_switches); |
151 | P(nr_load_updates); | 171 | P(nr_load_updates); |
152 | P(nr_uninterruptible); | 172 | P(nr_uninterruptible); |
153 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); | 173 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); |
154 | P(next_balance); | 174 | PN(next_balance); |
155 | P(curr->pid); | 175 | P(curr->pid); |
156 | P(clock); | 176 | PN(clock); |
157 | P(idle_clock); | 177 | PN(idle_clock); |
158 | P(prev_clock_raw); | 178 | PN(prev_clock_raw); |
159 | P(clock_warps); | 179 | P(clock_warps); |
160 | P(clock_overflows); | 180 | P(clock_overflows); |
161 | P(clock_deep_idle_events); | 181 | P(clock_deep_idle_events); |
162 | P(clock_max_delta); | 182 | PN(clock_max_delta); |
163 | P(cpu_load[0]); | 183 | P(cpu_load[0]); |
164 | P(cpu_load[1]); | 184 | P(cpu_load[1]); |
165 | P(cpu_load[2]); | 185 | P(cpu_load[2]); |
166 | P(cpu_load[3]); | 186 | P(cpu_load[3]); |
167 | P(cpu_load[4]); | 187 | P(cpu_load[4]); |
168 | #undef P | 188 | #undef P |
189 | #undef PN | ||
169 | 190 | ||
170 | print_cfs_stats(m, cpu); | 191 | print_cfs_stats(m, cpu); |
171 | 192 | ||
@@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
177 | u64 now = ktime_to_ns(ktime_get()); | 198 | u64 now = ktime_to_ns(ktime_get()); |
178 | int cpu; | 199 | int cpu; |
179 | 200 | ||
180 | SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", | 201 | SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", |
181 | init_utsname()->release, | 202 | init_utsname()->release, |
182 | (int)strcspn(init_utsname()->version, " "), | 203 | (int)strcspn(init_utsname()->version, " "), |
183 | init_utsname()->version); | 204 | init_utsname()->version); |
184 | 205 | ||
185 | SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); | 206 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); |
207 | |||
208 | #define P(x) \ | ||
209 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | ||
210 | #define PN(x) \ | ||
211 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
212 | PN(sysctl_sched_latency); | ||
213 | PN(sysctl_sched_nr_latency); | ||
214 | PN(sysctl_sched_wakeup_granularity); | ||
215 | PN(sysctl_sched_batch_wakeup_granularity); | ||
216 | PN(sysctl_sched_child_runs_first); | ||
217 | P(sysctl_sched_features); | ||
218 | #undef PN | ||
219 | #undef P | ||
186 | 220 | ||
187 | for_each_online_cpu(cpu) | 221 | for_each_online_cpu(cpu) |
188 | print_cpu(m, cpu); | 222 | print_cpu(m, cpu); |
@@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp) | |||
202 | return single_open(filp, sched_debug_show, NULL); | 236 | return single_open(filp, sched_debug_show, NULL); |
203 | } | 237 | } |
204 | 238 | ||
205 | static struct file_operations sched_debug_fops = { | 239 | static const struct file_operations sched_debug_fops = { |
206 | .open = sched_debug_open, | 240 | .open = sched_debug_open, |
207 | .read = seq_read, | 241 | .read = seq_read, |
208 | .llseek = seq_lseek, | 242 | .llseek = seq_lseek, |
@@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs); | |||
226 | 260 | ||
227 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 261 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
228 | { | 262 | { |
263 | unsigned long nr_switches; | ||
229 | unsigned long flags; | 264 | unsigned long flags; |
230 | int num_threads = 1; | 265 | int num_threads = 1; |
231 | 266 | ||
@@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
237 | rcu_read_unlock(); | 272 | rcu_read_unlock(); |
238 | 273 | ||
239 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); | 274 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); |
240 | SEQ_printf(m, "----------------------------------------------\n"); | 275 | SEQ_printf(m, |
276 | "---------------------------------------------------------\n"); | ||
277 | #define __P(F) \ | ||
278 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) | ||
241 | #define P(F) \ | 279 | #define P(F) \ |
242 | SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) | 280 | SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) |
281 | #define __PN(F) \ | ||
282 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
283 | #define PN(F) \ | ||
284 | SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
243 | 285 | ||
244 | P(se.wait_runtime); | 286 | PN(se.exec_start); |
245 | P(se.wait_start_fair); | 287 | PN(se.vruntime); |
246 | P(se.exec_start); | 288 | PN(se.sum_exec_runtime); |
247 | P(se.sleep_start_fair); | 289 | |
248 | P(se.sum_exec_runtime); | 290 | nr_switches = p->nvcsw + p->nivcsw; |
249 | 291 | ||
250 | #ifdef CONFIG_SCHEDSTATS | 292 | #ifdef CONFIG_SCHEDSTATS |
251 | P(se.wait_start); | 293 | PN(se.wait_start); |
252 | P(se.sleep_start); | 294 | PN(se.sleep_start); |
253 | P(se.block_start); | 295 | PN(se.block_start); |
254 | P(se.sleep_max); | 296 | PN(se.sleep_max); |
255 | P(se.block_max); | 297 | PN(se.block_max); |
256 | P(se.exec_max); | 298 | PN(se.exec_max); |
257 | P(se.wait_max); | 299 | PN(se.slice_max); |
258 | P(se.wait_runtime_overruns); | 300 | PN(se.wait_max); |
259 | P(se.wait_runtime_underruns); | 301 | P(sched_info.bkl_count); |
260 | P(se.sum_wait_runtime); | 302 | P(se.nr_migrations); |
303 | P(se.nr_migrations_cold); | ||
304 | P(se.nr_failed_migrations_affine); | ||
305 | P(se.nr_failed_migrations_running); | ||
306 | P(se.nr_failed_migrations_hot); | ||
307 | P(se.nr_forced_migrations); | ||
308 | P(se.nr_forced2_migrations); | ||
309 | P(se.nr_wakeups); | ||
310 | P(se.nr_wakeups_sync); | ||
311 | P(se.nr_wakeups_migrate); | ||
312 | P(se.nr_wakeups_local); | ||
313 | P(se.nr_wakeups_remote); | ||
314 | P(se.nr_wakeups_affine); | ||
315 | P(se.nr_wakeups_affine_attempts); | ||
316 | P(se.nr_wakeups_passive); | ||
317 | P(se.nr_wakeups_idle); | ||
318 | |||
319 | { | ||
320 | u64 avg_atom, avg_per_cpu; | ||
321 | |||
322 | avg_atom = p->se.sum_exec_runtime; | ||
323 | if (nr_switches) | ||
324 | do_div(avg_atom, nr_switches); | ||
325 | else | ||
326 | avg_atom = -1LL; | ||
327 | |||
328 | avg_per_cpu = p->se.sum_exec_runtime; | ||
329 | if (p->se.nr_migrations) | ||
330 | avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); | ||
331 | else | ||
332 | avg_per_cpu = -1LL; | ||
333 | |||
334 | __PN(avg_atom); | ||
335 | __PN(avg_per_cpu); | ||
336 | } | ||
261 | #endif | 337 | #endif |
262 | SEQ_printf(m, "%-25s:%20Ld\n", | 338 | __P(nr_switches); |
263 | "nr_switches", (long long)(p->nvcsw + p->nivcsw)); | 339 | SEQ_printf(m, "%-35s:%21Ld\n", |
340 | "nr_voluntary_switches", (long long)p->nvcsw); | ||
341 | SEQ_printf(m, "%-35s:%21Ld\n", | ||
342 | "nr_involuntary_switches", (long long)p->nivcsw); | ||
343 | |||
264 | P(se.load.weight); | 344 | P(se.load.weight); |
265 | P(policy); | 345 | P(policy); |
266 | P(prio); | 346 | P(prio); |
347 | #undef PN | ||
348 | #undef __PN | ||
267 | #undef P | 349 | #undef P |
350 | #undef __P | ||
268 | 351 | ||
269 | { | 352 | { |
270 | u64 t0, t1; | 353 | u64 t0, t1; |
271 | 354 | ||
272 | t0 = sched_clock(); | 355 | t0 = sched_clock(); |
273 | t1 = sched_clock(); | 356 | t1 = sched_clock(); |
274 | SEQ_printf(m, "%-25s:%20Ld\n", | 357 | SEQ_printf(m, "%-35s:%21Ld\n", |
275 | "clock-delta", (long long)(t1-t0)); | 358 | "clock-delta", (long long)(t1-t0)); |
276 | } | 359 | } |
277 | } | 360 | } |
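The two derived numbers added above are simple ratios: avg_atom is sum_exec_runtime divided by the task's total context switches (average CPU time per scheduling atom), and avg_per_cpu is sum_exec_runtime divided by se.nr_migrations (average CPU time accumulated between cross-CPU moves); both fall back to -1 when the divisor is zero. For example, a task with 500 ms of runtime, 1000 switches and 4 migrations shows avg_atom as 0.500000 and avg_per_cpu as 125.000000 in the msec.usec format used here.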
@@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
279 | void proc_sched_set_task(struct task_struct *p) | 362 | void proc_sched_set_task(struct task_struct *p) |
280 | { | 363 | { |
281 | #ifdef CONFIG_SCHEDSTATS | 364 | #ifdef CONFIG_SCHEDSTATS |
282 | p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; | 365 | p->se.wait_max = 0; |
283 | p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; | 366 | p->se.sleep_max = 0; |
367 | p->se.sum_sleep_runtime = 0; | ||
368 | p->se.block_max = 0; | ||
369 | p->se.exec_max = 0; | ||
370 | p->se.slice_max = 0; | ||
371 | p->se.nr_migrations = 0; | ||
372 | p->se.nr_migrations_cold = 0; | ||
373 | p->se.nr_failed_migrations_affine = 0; | ||
374 | p->se.nr_failed_migrations_running = 0; | ||
375 | p->se.nr_failed_migrations_hot = 0; | ||
376 | p->se.nr_forced_migrations = 0; | ||
377 | p->se.nr_forced2_migrations = 0; | ||
378 | p->se.nr_wakeups = 0; | ||
379 | p->se.nr_wakeups_sync = 0; | ||
380 | p->se.nr_wakeups_migrate = 0; | ||
381 | p->se.nr_wakeups_local = 0; | ||
382 | p->se.nr_wakeups_remote = 0; | ||
383 | p->se.nr_wakeups_affine = 0; | ||
384 | p->se.nr_wakeups_affine_attempts = 0; | ||
385 | p->se.nr_wakeups_passive = 0; | ||
386 | p->se.nr_wakeups_idle = 0; | ||
387 | p->sched_info.bkl_count = 0; | ||
284 | #endif | 388 | #endif |
285 | p->se.sum_exec_runtime = 0; | 389 | p->se.sum_exec_runtime = 0; |
286 | p->se.prev_sum_exec_runtime = 0; | 390 | p->se.prev_sum_exec_runtime = 0; |
391 | p->nvcsw = 0; | ||
392 | p->nivcsw = 0; | ||
287 | } | 393 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 67c67a87146e..a17b785d7000 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,22 +25,26 @@ | |||
25 | * (default: 20ms, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
26 | * | 26 | * |
27 | * NOTE: this latency value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
28 | * 'timeslice length' - timeslices in CFS are of variable length. | 28 | * 'timeslice length' - timeslices in CFS are of variable length |
29 | * (to see the precise effective timeslice length of your workload, | 29 | * and have no persistent notion like in traditional, time-slice |
30 | * run vmstat and monitor the context-switches field) | 30 | * based scheduling concepts. |
31 | * | 31 | * |
32 | * On SMP systems the value of this is multiplied by the log2 of the | 32 | * (to see the precise effective timeslice length of your workload, |
33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * run vmstat and monitor the context-switches (cs) field) |
34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | ||
35 | * Targeted preemption latency for CPU-bound tasks: | ||
36 | */ | 34 | */ |
37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; | 35 | const_debug unsigned int sysctl_sched_latency = 20000000ULL; |
36 | |||
37 | /* | ||
38 | * After fork, child runs first. (default) If set to 0 then | ||
39 | * parent will (try to) run first. | ||
40 | */ | ||
41 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | ||
38 | 42 | ||
39 | /* | 43 | /* |
40 | * Minimal preemption granularity for CPU-bound tasks: | 44 | * Minimal preemption granularity for CPU-bound tasks: |
41 | * (default: 2 msec, units: nanoseconds) | 45 | * (default: 2 msec, units: nanoseconds) |
42 | */ | 46 | */ |
43 | unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; | 47 | const_debug unsigned int sysctl_sched_nr_latency = 20; |
44 | 48 | ||
45 | /* | 49 | /* |
46 | * sys_sched_yield() compat mode | 50 | * sys_sched_yield() compat mode |
@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
52 | 56 | ||
53 | /* | 57 | /* |
54 | * SCHED_BATCH wake-up granularity. | 58 | * SCHED_BATCH wake-up granularity. |
55 | * (default: 25 msec, units: nanoseconds) | 59 | * (default: 10 msec, units: nanoseconds) |
56 | * | 60 | * |
57 | * This option delays the preemption effects of decoupled workloads | 61 | * This option delays the preemption effects of decoupled workloads |
58 | * and reduces their over-scheduling. Synchronous workloads will still | 62 | * and reduces their over-scheduling. Synchronous workloads will still |
59 | * have immediate wakeup/sleep latencies. | 63 | * have immediate wakeup/sleep latencies. |
60 | */ | 64 | */ |
61 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; | 65 | const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; |
62 | 66 | ||
63 | /* | 67 | /* |
64 | * SCHED_OTHER wake-up granularity. | 68 | * SCHED_OTHER wake-up granularity. |
65 | * (default: 1 msec, units: nanoseconds) | 69 | * (default: 10 msec, units: nanoseconds) |
66 | * | 70 | * |
67 | * This option delays the preemption effects of decoupled workloads | 71 | * This option delays the preemption effects of decoupled workloads |
68 | * and reduces their over-scheduling. Synchronous workloads will still | 72 | * and reduces their over-scheduling. Synchronous workloads will still |
69 | * have immediate wakeup/sleep latencies. | 73 | * have immediate wakeup/sleep latencies. |
70 | */ | 74 | */ |
71 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; | 75 | const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
72 | |||
73 | unsigned int sysctl_sched_stat_granularity __read_mostly; | ||
74 | |||
75 | /* | ||
76 | * Initialized in sched_init_granularity() [to 5 times the base granularity]: | ||
77 | */ | ||
78 | unsigned int sysctl_sched_runtime_limit __read_mostly; | ||
79 | |||
80 | /* | ||
81 | * Debugging: various feature bits | ||
82 | */ | ||
83 | enum { | ||
84 | SCHED_FEAT_FAIR_SLEEPERS = 1, | ||
85 | SCHED_FEAT_SLEEPER_AVG = 2, | ||
86 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, | ||
87 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, | ||
88 | SCHED_FEAT_START_DEBIT = 16, | ||
89 | SCHED_FEAT_SKIP_INITIAL = 32, | ||
90 | }; | ||
91 | 76 | ||
92 | unsigned int sysctl_sched_features __read_mostly = | 77 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
93 | SCHED_FEAT_FAIR_SLEEPERS *1 | | ||
94 | SCHED_FEAT_SLEEPER_AVG *0 | | ||
95 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | ||
96 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | ||
97 | SCHED_FEAT_START_DEBIT *1 | | ||
98 | SCHED_FEAT_SKIP_INITIAL *0; | ||
99 | |||
100 | extern struct sched_class fair_sched_class; | ||
101 | 78 | ||
102 | /************************************************************** | 79 | /************************************************************** |
103 | * CFS operations on generic schedulable entities: | 80 | * CFS operations on generic schedulable entities: |
@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
111 | return cfs_rq->rq; | 88 | return cfs_rq->rq; |
112 | } | 89 | } |
113 | 90 | ||
114 | /* currently running entity (if any) on this cfs_rq */ | ||
115 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
116 | { | ||
117 | return cfs_rq->curr; | ||
118 | } | ||
119 | |||
120 | /* An entity is a task if it doesn't "own" a runqueue */ | 91 | /* An entity is a task if it doesn't "own" a runqueue */ |
121 | #define entity_is_task(se) (!se->my_q) | 92 | #define entity_is_task(se) (!se->my_q) |
122 | 93 | ||
123 | static inline void | ||
124 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
125 | { | ||
126 | cfs_rq->curr = se; | ||
127 | } | ||
128 | |||
129 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 94 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
130 | 95 | ||
131 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 96 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
133 | return container_of(cfs_rq, struct rq, cfs); | 98 | return container_of(cfs_rq, struct rq, cfs); |
134 | } | 99 | } |
135 | 100 | ||
136 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
137 | { | ||
138 | struct rq *rq = rq_of(cfs_rq); | ||
139 | |||
140 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) | ||
141 | return NULL; | ||
142 | |||
143 | return &rq->curr->se; | ||
144 | } | ||
145 | |||
146 | #define entity_is_task(se) 1 | 101 | #define entity_is_task(se) 1 |
147 | 102 | ||
148 | static inline void | ||
149 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
150 | |||
151 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 103 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
152 | 104 | ||
153 | static inline struct task_struct *task_of(struct sched_entity *se) | 105 | static inline struct task_struct *task_of(struct sched_entity *se) |
@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se) | |||
160 | * Scheduling class tree data structure manipulation methods: | 112 | * Scheduling class tree data structure manipulation methods: |
161 | */ | 113 | */ |
162 | 114 | ||
115 | static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) | ||
116 | { | ||
117 | s64 delta = (s64)(vruntime - min_vruntime); | ||
118 | if (delta > 0) | ||
119 | min_vruntime = vruntime; | ||
120 | |||
121 | return min_vruntime; | ||
122 | } | ||
123 | |||
124 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) | ||
125 | { | ||
126 | s64 delta = (s64)(vruntime - min_vruntime); | ||
127 | if (delta < 0) | ||
128 | min_vruntime = vruntime; | ||
129 | |||
130 | return min_vruntime; | ||
131 | } | ||
132 | |||
133 | static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
134 | { | ||
135 | return se->vruntime - cfs_rq->min_vruntime; | ||
136 | } | ||
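max_vruntime()/min_vruntime() compare the unsigned 64-bit clocks through a signed difference, so the ordering stays correct even if a vruntime eventually wraps, and entity_key() expresses the rbtree key as vruntime minus cfs_rq->min_vruntime, which keeps the s64 key near zero. A user-space spot check of the wraparound case, with the sample values invented:

	#include <stdio.h>
	#include <stdint.h>

	/* same trick as max_vruntime() above: compare via a signed difference */
	static uint64_t toy_max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
	{
		int64_t delta = (int64_t)(vruntime - min_vruntime);

		return delta > 0 ? vruntime : min_vruntime;
	}

	int main(void)
	{
		uint64_t near_wrap = UINT64_MAX - 100;	/* "older" value just below the wrap */
		uint64_t wrapped = 50;			/* "newer" value after wrapping */

		/* a naive (wrapped > near_wrap) test would call near_wrap newer;
		 * the signed-delta version correctly picks the wrapped value */
		printf("%llu\n", (unsigned long long)toy_max_vruntime(near_wrap, wrapped));
		return 0;
	}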
137 | |||
163 | /* | 138 | /* |
164 | * Enqueue an entity into the rb-tree: | 139 | * Enqueue an entity into the rb-tree: |
165 | */ | 140 | */ |
166 | static inline void | 141 | static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
167 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
168 | { | 142 | { |
169 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 143 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
170 | struct rb_node *parent = NULL; | 144 | struct rb_node *parent = NULL; |
171 | struct sched_entity *entry; | 145 | struct sched_entity *entry; |
172 | s64 key = se->fair_key; | 146 | s64 key = entity_key(cfs_rq, se); |
173 | int leftmost = 1; | 147 | int leftmost = 1; |
174 | 148 | ||
175 | /* | 149 | /* |
@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
182 | * We dont care about collisions. Nodes with | 156 | * We dont care about collisions. Nodes with |
183 | * the same key stay together. | 157 | * the same key stay together. |
184 | */ | 158 | */ |
185 | if (key - entry->fair_key < 0) { | 159 | if (key < entity_key(cfs_rq, entry)) { |
186 | link = &parent->rb_left; | 160 | link = &parent->rb_left; |
187 | } else { | 161 | } else { |
188 | link = &parent->rb_right; | 162 | link = &parent->rb_right; |
@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
199 | 173 | ||
200 | rb_link_node(&se->run_node, parent, link); | 174 | rb_link_node(&se->run_node, parent, link); |
201 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | 175 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); |
202 | update_load_add(&cfs_rq->load, se->load.weight); | ||
203 | cfs_rq->nr_running++; | ||
204 | se->on_rq = 1; | ||
205 | |||
206 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
207 | } | 176 | } |
208 | 177 | ||
209 | static inline void | 178 | static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
210 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
211 | { | 179 | { |
212 | if (cfs_rq->rb_leftmost == &se->run_node) | 180 | if (cfs_rq->rb_leftmost == &se->run_node) |
213 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | 181 | cfs_rq->rb_leftmost = rb_next(&se->run_node); |
214 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
215 | update_load_sub(&cfs_rq->load, se->load.weight); | ||
216 | cfs_rq->nr_running--; | ||
217 | se->on_rq = 0; | ||
218 | 182 | ||
219 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 183 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
220 | } | 184 | } |
221 | 185 | ||
222 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | 186 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) |
@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
229 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); | 193 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); |
230 | } | 194 | } |
231 | 195 | ||
196 | static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | ||
197 | { | ||
198 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | ||
199 | struct sched_entity *se = NULL; | ||
200 | struct rb_node *parent; | ||
201 | |||
202 | while (*link) { | ||
203 | parent = *link; | ||
204 | se = rb_entry(parent, struct sched_entity, run_node); | ||
205 | link = &parent->rb_right; | ||
206 | } | ||
207 | |||
208 | return se; | ||
209 | } | ||
210 | |||
232 | /************************************************************** | 211 | /************************************************************** |
233 | * Scheduling class statistics methods: | 212 | * Scheduling class statistics methods: |
234 | */ | 213 | */ |
235 | 214 | ||
215 | |||
236 | /* | 216 | /* |
237 | * Calculate the preemption granularity needed to schedule every | 217 | * The idea is to set a period in which each task runs once. |
238 | * runnable task once per sysctl_sched_latency amount of time. | ||
239 | * (down to a sensible low limit on granularity) | ||
240 | * | ||
241 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
242 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
243 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
244 | * for each task. We do finer and finer scheduling up to until we | ||
245 | * reach the minimum granularity value. | ||
246 | * | ||
247 | * To achieve this we use the following dynamic-granularity rule: | ||
248 | * | 218 | * |
249 | * gran = lat/nr - lat/nr/nr | 219 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
220 | * this period because otherwise the slices get too small. | ||
250 | * | 221 | * |
251 | * This comes out of the following equations: | 222 | * p = (nr <= nl) ? l : l*nr/nl |
252 | * | ||
253 | * kA1 + gran = kB1 | ||
254 | * kB2 + gran = kA2 | ||
255 | * kA2 = kA1 | ||
256 | * kB2 = kB1 - d + d/nr | ||
257 | * lat = d * nr | ||
258 | * | ||
259 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
260 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
261 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
262 | * running, 'lat' is the the period of each task. ('lat' is the | ||
263 | * sched_latency that we aim for.) | ||
264 | */ | 223 | */ |
265 | static long | 224 | static u64 __sched_period(unsigned long nr_running) |
266 | sched_granularity(struct cfs_rq *cfs_rq) | ||
267 | { | 225 | { |
268 | unsigned int gran = sysctl_sched_latency; | 226 | u64 period = sysctl_sched_latency; |
269 | unsigned int nr = cfs_rq->nr_running; | 227 | unsigned long nr_latency = sysctl_sched_nr_latency; |
270 | 228 | ||
271 | if (nr > 1) { | 229 | if (unlikely(nr_running > nr_latency)) { |
272 | gran = gran/nr - gran/nr/nr; | 230 | period *= nr_running; |
273 | gran = max(gran, sysctl_sched_min_granularity); | 231 | do_div(period, nr_latency); |
274 | } | 232 | } |
275 | 233 | ||
276 | return gran; | 234 | return period; |
277 | } | 235 | } |
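Plugging in the defaults set earlier in this file (sysctl_sched_latency = 20 ms, sysctl_sched_nr_latency = 20): with 8 runnable tasks the period stays at 20 ms, while with 40 runnable tasks it stretches to 20 ms * 40 / 20 = 40 ms, so even heavily loaded runqueues keep a floor of roughly 1 ms per equal-weight task instead of shrinking slices without bound.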
278 | 236 | ||
279 | /* | 237 | /* |
280 | * We rescale the rescheduling granularity of tasks according to their | 238 | * We calculate the wall-time slice from the period by taking a part |
281 | * nice level, but only linearly, not exponentially: | 239 | * proportional to the weight. |
240 | * | ||
241 | * s = p*w/rw | ||
282 | */ | 242 | */ |
283 | static long | 243 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
284 | niced_granularity(struct sched_entity *curr, unsigned long granularity) | ||
285 | { | 244 | { |
286 | u64 tmp; | 245 | u64 slice = __sched_period(cfs_rq->nr_running); |
287 | 246 | ||
288 | if (likely(curr->load.weight == NICE_0_LOAD)) | 247 | slice *= se->load.weight; |
289 | return granularity; | 248 | do_div(slice, cfs_rq->load.weight); |
290 | /* | ||
291 | * Positive nice levels get the same granularity as nice-0: | ||
292 | */ | ||
293 | if (likely(curr->load.weight < NICE_0_LOAD)) { | ||
294 | tmp = curr->load.weight * (u64)granularity; | ||
295 | return (long) (tmp >> NICE_0_SHIFT); | ||
296 | } | ||
297 | /* | ||
298 | * Negative nice level tasks get linearly finer | ||
299 | * granularity: | ||
300 | */ | ||
301 | tmp = curr->load.inv_weight * (u64)granularity; | ||
302 | 249 | ||
303 | /* | 250 | return slice; |
304 | * It will always fit into 'long': | ||
305 | */ | ||
306 | return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); | ||
307 | } | 251 | } |
308 | 252 | ||
309 | static inline void | 253 | /* |
310 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) | 254 | * We calculate the vruntime slice. |
255 | * | ||
256 | * vs = s/w = p/rw | ||
257 | */ | ||
258 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | ||
311 | { | 259 | { |
312 | long limit = sysctl_sched_runtime_limit; | 260 | u64 vslice = __sched_period(nr_running); |
313 | 261 | ||
314 | /* | 262 | do_div(vslice, rq_weight); |
315 | * Niced tasks have the same history dynamic range as | 263 | |
316 | * non-niced tasks: | 264 | return vslice; |
317 | */ | ||
318 | if (unlikely(se->wait_runtime > limit)) { | ||
319 | se->wait_runtime = limit; | ||
320 | schedstat_inc(se, wait_runtime_overruns); | ||
321 | schedstat_inc(cfs_rq, wait_runtime_overruns); | ||
322 | } | ||
323 | if (unlikely(se->wait_runtime < -limit)) { | ||
324 | se->wait_runtime = -limit; | ||
325 | schedstat_inc(se, wait_runtime_underruns); | ||
326 | schedstat_inc(cfs_rq, wait_runtime_underruns); | ||
327 | } | ||
328 | } | 265 | } |
329 | 266 | ||
330 | static inline void | 267 | static u64 sched_vslice(struct cfs_rq *cfs_rq) |
331 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
332 | { | 268 | { |
333 | se->wait_runtime += delta; | 269 | return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); |
334 | schedstat_add(se, sum_wait_runtime, delta); | ||
335 | limit_wait_runtime(cfs_rq, se); | ||
336 | } | 270 | } |
337 | 271 | ||
338 | static void | 272 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
339 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
340 | { | 273 | { |
341 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 274 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, |
342 | __add_wait_runtime(cfs_rq, se, delta); | 275 | cfs_rq->nr_running + 1); |
343 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
344 | } | 276 | } |
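A quick worked example of the two formulas: with four runnable nice-0 entities (weight NICE_0_LOAD, 1024 on this kernel) the period is 20 ms and each gets sched_slice() = 20 ms * 1024 / 4096 = 5 ms. If one of them instead has twice the weight (2048, making the queue weight 5120), its slice becomes 20 ms * 2048 / 5120 = 8 ms while each 1024-weight entity drops to 4 ms. __sched_vslice() is the same period divided by the total weight alone (p/rw); sched_vslice_add() feeds it the queue weight plus the about-to-be-added entity's weight, i.e. the value the queue will have after the enqueue.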
345 | 277 | ||
346 | /* | 278 | /* |
@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | |||
348 | * are not in our scheduling class. | 280 | * are not in our scheduling class. |
349 | */ | 281 | */ |
350 | static inline void | 282 | static inline void |
351 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 283 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, |
284 | unsigned long delta_exec) | ||
352 | { | 285 | { |
353 | unsigned long delta, delta_exec, delta_fair, delta_mine; | 286 | unsigned long delta_exec_weighted; |
354 | struct load_weight *lw = &cfs_rq->load; | 287 | u64 vruntime; |
355 | unsigned long load = lw->weight; | ||
356 | 288 | ||
357 | delta_exec = curr->delta_exec; | ||
358 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 289 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
359 | 290 | ||
360 | curr->sum_exec_runtime += delta_exec; | 291 | curr->sum_exec_runtime += delta_exec; |
361 | cfs_rq->exec_clock += delta_exec; | 292 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
362 | 293 | delta_exec_weighted = delta_exec; | |
363 | if (unlikely(!load)) | 294 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { |
364 | return; | 295 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, |
365 | 296 | &curr->load); | |
366 | delta_fair = calc_delta_fair(delta_exec, lw); | ||
367 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | ||
368 | |||
369 | if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { | ||
370 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); | ||
371 | delta = min(delta, (unsigned long)( | ||
372 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); | ||
373 | cfs_rq->sleeper_bonus -= delta; | ||
374 | delta_mine -= delta; | ||
375 | } | 297 | } |
298 | curr->vruntime += delta_exec_weighted; | ||
376 | 299 | ||
377 | cfs_rq->fair_clock += delta_fair; | ||
378 | /* | 300 | /* |
379 | * We executed delta_exec amount of time on the CPU, | 301 | * maintain cfs_rq->min_vruntime to be a monotonic increasing |
380 | * but we were only entitled to delta_mine amount of | 302 | * value tracking the leftmost vruntime in the tree. |
381 | * time during that period (if nr_running == 1 then | ||
382 | * the two values are equal) | ||
383 | * [Note: delta_mine - delta_exec is negative]: | ||
384 | */ | 303 | */ |
385 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | 304 | if (first_fair(cfs_rq)) { |
305 | vruntime = min_vruntime(curr->vruntime, | ||
306 | __pick_next_entity(cfs_rq)->vruntime); | ||
307 | } else | ||
308 | vruntime = curr->vruntime; | ||
309 | |||
310 | cfs_rq->min_vruntime = | ||
311 | max_vruntime(cfs_rq->min_vruntime, vruntime); | ||
386 | } | 312 | } |
387 | 313 | ||
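__update_curr() above replaces the old wait_runtime bookkeeping: the running entity's vruntime advances by the executed wall time scaled inversely to its weight, and cfs_rq->min_vruntime is then ratcheted forward so it monotonically tracks the leftmost vruntime in the tree. A simplified sketch of both steps, assuming the usual NICE_0_LOAD of 1024 and ignoring the kernel's inv_weight fixed-point shortcut and its overflow-safe min/max helpers:

    #include <stdint.h>

    #define NICE_0_LOAD 1024ULL

    struct entity {
            uint64_t vruntime;
            uint64_t weight;        /* load weight, 1024 == nice 0 */
    };

    /* heavier entities accrue vruntime more slowly than wall time */
    static void update_vruntime(struct entity *curr, uint64_t delta_exec_ns)
    {
            curr->vruntime += delta_exec_ns * NICE_0_LOAD / curr->weight;
    }

    /* keep min_vruntime a monotonically increasing floor of the tree */
    static void update_min_vruntime(uint64_t *min_vruntime,
                                    const struct entity *curr,
                                    const struct entity *leftmost)
    {
            uint64_t v = curr->vruntime;

            if (leftmost && leftmost->vruntime < v)
                    v = leftmost->vruntime;
            if (v > *min_vruntime)
                    *min_vruntime = v;
    }

Keeping min_vruntime monotonic is what lets place_entity() further down hand out starting vruntimes without ever moving the virtual clock backwards.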
388 | static void update_curr(struct cfs_rq *cfs_rq) | 314 | static void update_curr(struct cfs_rq *cfs_rq) |
389 | { | 315 | { |
390 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | 316 | struct sched_entity *curr = cfs_rq->curr; |
317 | u64 now = rq_of(cfs_rq)->clock; | ||
391 | unsigned long delta_exec; | 318 | unsigned long delta_exec; |
392 | 319 | ||
393 | if (unlikely(!curr)) | 320 | if (unlikely(!curr)) |
@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
398 | * since the last time we changed load (this cannot | 325 | * since the last time we changed load (this cannot |
399 | * overflow on 32 bits): | 326 | * overflow on 32 bits): |
400 | */ | 327 | */ |
401 | delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); | 328 | delta_exec = (unsigned long)(now - curr->exec_start); |
402 | 329 | ||
403 | curr->delta_exec += delta_exec; | 330 | __update_curr(cfs_rq, curr, delta_exec); |
404 | 331 | curr->exec_start = now; | |
405 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | ||
406 | __update_curr(cfs_rq, curr); | ||
407 | curr->delta_exec = 0; | ||
408 | } | ||
409 | curr->exec_start = rq_of(cfs_rq)->clock; | ||
410 | } | 332 | } |
411 | 333 | ||
412 | static inline void | 334 | static inline void |
413 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 335 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
414 | { | 336 | { |
415 | se->wait_start_fair = cfs_rq->fair_clock; | ||
416 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); | 337 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); |
417 | } | 338 | } |
418 | 339 | ||
419 | /* | 340 | /* |
420 | * We calculate fair deltas here, so protect against the random effects | ||
421 | * of a multiplication overflow by capping it to the runtime limit: | ||
422 | */ | ||
423 | #if BITS_PER_LONG == 32 | ||
424 | static inline unsigned long | ||
425 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
426 | { | ||
427 | u64 tmp = (u64)delta * weight >> shift; | ||
428 | |||
429 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) | ||
430 | return sysctl_sched_runtime_limit*2; | ||
431 | return tmp; | ||
432 | } | ||
433 | #else | ||
434 | static inline unsigned long | ||
435 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
436 | { | ||
437 | return delta * weight >> shift; | ||
438 | } | ||
439 | #endif | ||
440 | |||
441 | /* | ||
442 | * Task is being enqueued - update stats: | 341 | * Task is being enqueued - update stats: |
443 | */ | 342 | */ |
444 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 343 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
445 | { | 344 | { |
446 | s64 key; | ||
447 | |||
448 | /* | 345 | /* |
449 | * Are we enqueueing a waiting task? (for current tasks | 346 | * Are we enqueueing a waiting task? (for current tasks |
450 | * a dequeue/enqueue event is a NOP) | 347 | * a dequeue/enqueue event is a NOP) |
451 | */ | 348 | */ |
452 | if (se != cfs_rq_curr(cfs_rq)) | 349 | if (se != cfs_rq->curr) |
453 | update_stats_wait_start(cfs_rq, se); | 350 | update_stats_wait_start(cfs_rq, se); |
454 | /* | ||
455 | * Update the key: | ||
456 | */ | ||
457 | key = cfs_rq->fair_clock; | ||
458 | |||
459 | /* | ||
460 | * Optimize the common nice 0 case: | ||
461 | */ | ||
462 | if (likely(se->load.weight == NICE_0_LOAD)) { | ||
463 | key -= se->wait_runtime; | ||
464 | } else { | ||
465 | u64 tmp; | ||
466 | |||
467 | if (se->wait_runtime < 0) { | ||
468 | tmp = -se->wait_runtime; | ||
469 | key += (tmp * se->load.inv_weight) >> | ||
470 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
471 | } else { | ||
472 | tmp = se->wait_runtime; | ||
473 | key -= (tmp * se->load.inv_weight) >> | ||
474 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
475 | } | ||
476 | } | ||
477 | |||
478 | se->fair_key = key; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * Note: must be called with a freshly updated rq->fair_clock. | ||
483 | */ | ||
484 | static inline void | ||
485 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
486 | { | ||
487 | unsigned long delta_fair = se->delta_fair_run; | ||
488 | |||
489 | schedstat_set(se->wait_max, max(se->wait_max, | ||
490 | rq_of(cfs_rq)->clock - se->wait_start)); | ||
491 | |||
492 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
493 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
494 | NICE_0_SHIFT); | ||
495 | |||
496 | add_wait_runtime(cfs_rq, se, delta_fair); | ||
497 | } | 351 | } |
498 | 352 | ||
499 | static void | 353 | static void |
500 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 354 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
501 | { | 355 | { |
502 | unsigned long delta_fair; | 356 | schedstat_set(se->wait_max, max(se->wait_max, |
503 | 357 | rq_of(cfs_rq)->clock - se->wait_start)); | |
504 | if (unlikely(!se->wait_start_fair)) | ||
505 | return; | ||
506 | |||
507 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
508 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | ||
509 | |||
510 | se->delta_fair_run += delta_fair; | ||
511 | if (unlikely(abs(se->delta_fair_run) >= | ||
512 | sysctl_sched_stat_granularity)) { | ||
513 | __update_stats_wait_end(cfs_rq, se); | ||
514 | se->delta_fair_run = 0; | ||
515 | } | ||
516 | |||
517 | se->wait_start_fair = 0; | ||
518 | schedstat_set(se->wait_start, 0); | 358 | schedstat_set(se->wait_start, 0); |
519 | } | 359 | } |
520 | 360 | ||
521 | static inline void | 361 | static inline void |
522 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 362 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
523 | { | 363 | { |
524 | update_curr(cfs_rq); | ||
525 | /* | 364 | /* |
526 | * Mark the end of the wait period if dequeueing a | 365 | * Mark the end of the wait period if dequeueing a |
527 | * waiting task: | 366 | * waiting task: |
528 | */ | 367 | */ |
529 | if (se != cfs_rq_curr(cfs_rq)) | 368 | if (se != cfs_rq->curr) |
530 | update_stats_wait_end(cfs_rq, se); | 369 | update_stats_wait_end(cfs_rq, se); |
531 | } | 370 | } |
532 | 371 | ||
@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
542 | se->exec_start = rq_of(cfs_rq)->clock; | 381 | se->exec_start = rq_of(cfs_rq)->clock; |
543 | } | 382 | } |
544 | 383 | ||
545 | /* | ||
546 | * We are descheduling a task - update its stats: | ||
547 | */ | ||
548 | static inline void | ||
549 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
550 | { | ||
551 | se->exec_start = 0; | ||
552 | } | ||
553 | |||
554 | /************************************************** | 384 | /************************************************** |
555 | * Scheduling class queueing methods: | 385 | * Scheduling class queueing methods: |
556 | */ | 386 | */ |
557 | 387 | ||
558 | static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 388 | static void |
389 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
559 | { | 390 | { |
560 | unsigned long load = cfs_rq->load.weight, delta_fair; | 391 | update_load_add(&cfs_rq->load, se->load.weight); |
561 | long prev_runtime; | 392 | cfs_rq->nr_running++; |
562 | 393 | se->on_rq = 1; | |
563 | /* | 394 | } |
564 | * Do not boost sleepers if there's too much bonus 'in flight' | ||
565 | * already: | ||
566 | */ | ||
567 | if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) | ||
568 | return; | ||
569 | |||
570 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | ||
571 | load = rq_of(cfs_rq)->cpu_load[2]; | ||
572 | |||
573 | delta_fair = se->delta_fair_sleep; | ||
574 | |||
575 | /* | ||
576 | * Fix up delta_fair with the effect of us running | ||
577 | * during the whole sleep period: | ||
578 | */ | ||
579 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) | ||
580 | delta_fair = div64_likely32((u64)delta_fair * load, | ||
581 | load + se->load.weight); | ||
582 | |||
583 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
584 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
585 | NICE_0_SHIFT); | ||
586 | |||
587 | prev_runtime = se->wait_runtime; | ||
588 | __add_wait_runtime(cfs_rq, se, delta_fair); | ||
589 | delta_fair = se->wait_runtime - prev_runtime; | ||
590 | 395 | ||
591 | /* | 396 | static void |
592 | * Track the amount of bonus we've given to sleepers: | 397 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
593 | */ | 398 | { |
594 | cfs_rq->sleeper_bonus += delta_fair; | 399 | update_load_sub(&cfs_rq->load, se->load.weight); |
400 | cfs_rq->nr_running--; | ||
401 | se->on_rq = 0; | ||
595 | } | 402 | } |
596 | 403 | ||
597 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 404 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
598 | { | 405 | { |
599 | struct task_struct *tsk = task_of(se); | ||
600 | unsigned long delta_fair; | ||
601 | |||
602 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || | ||
603 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) | ||
604 | return; | ||
605 | |||
606 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
607 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); | ||
608 | |||
609 | se->delta_fair_sleep += delta_fair; | ||
610 | if (unlikely(abs(se->delta_fair_sleep) >= | ||
611 | sysctl_sched_stat_granularity)) { | ||
612 | __enqueue_sleeper(cfs_rq, se); | ||
613 | se->delta_fair_sleep = 0; | ||
614 | } | ||
615 | |||
616 | se->sleep_start_fair = 0; | ||
617 | |||
618 | #ifdef CONFIG_SCHEDSTATS | 406 | #ifdef CONFIG_SCHEDSTATS |
619 | if (se->sleep_start) { | 407 | if (se->sleep_start) { |
620 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 408 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
646 | * time that the task spent sleeping: | 434 | * time that the task spent sleeping: |
647 | */ | 435 | */ |
648 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 436 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
437 | struct task_struct *tsk = task_of(se); | ||
438 | |||
649 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 439 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
650 | delta >> 20); | 440 | delta >> 20); |
651 | } | 441 | } |
@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
653 | #endif | 443 | #endif |
654 | } | 444 | } |
655 | 445 | ||
446 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
447 | { | ||
448 | #ifdef CONFIG_SCHED_DEBUG | ||
449 | s64 d = se->vruntime - cfs_rq->min_vruntime; | ||
450 | |||
451 | if (d < 0) | ||
452 | d = -d; | ||
453 | |||
454 | if (d > 3*sysctl_sched_latency) | ||
455 | schedstat_inc(cfs_rq, nr_spread_over); | ||
456 | #endif | ||
457 | } | ||
458 | |||
459 | static void | ||
460 | place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | ||
461 | { | ||
462 | u64 vruntime; | ||
463 | |||
464 | vruntime = cfs_rq->min_vruntime; | ||
465 | |||
466 | if (sched_feat(TREE_AVG)) { | ||
467 | struct sched_entity *last = __pick_last_entity(cfs_rq); | ||
468 | if (last) { | ||
469 | vruntime += last->vruntime; | ||
470 | vruntime >>= 1; | ||
471 | } | ||
472 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) | ||
473 | vruntime += sched_vslice(cfs_rq)/2; | ||
474 | |||
475 | if (initial && sched_feat(START_DEBIT)) | ||
476 | vruntime += sched_vslice_add(cfs_rq, se); | ||
477 | |||
478 | if (!initial) { | ||
479 | if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && | ||
480 | task_of(se)->policy != SCHED_BATCH) | ||
481 | vruntime -= sysctl_sched_latency; | ||
482 | |||
483 | vruntime = max_t(s64, vruntime, se->vruntime); | ||
484 | } | ||
485 | |||
486 | se->vruntime = vruntime; | ||
487 | |||
488 | } | ||
489 | |||
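place_entity() above chooses the starting vruntime of an entity that (re)enters the queue: forked tasks start at min_vruntime, pushed back by one slice when START_DEBIT is set, while wakers get a sleeper credit of up to the latency target but are never placed behind their own old vruntime. A reduced sketch of that policy, with the TREE_AVG/APPROX_AVG averaging and the SCHED_BATCH exception left out:

    #include <stdint.h>

    /* assumed latency target, in ns */
    static const int64_t sched_latency_ns = 20000000LL;

    static int64_t place(int64_t min_vruntime, int64_t old_vruntime,
                         int64_t vslice, int initial)
    {
            int64_t vruntime = min_vruntime;

            if (initial) {
                    /* START_DEBIT: a forked task starts one slice behind */
                    vruntime += vslice;
            } else {
                    /*
                     * Sleeper credit of up to one latency period, but the
                     * entity is never placed behind its old vruntime.
                     */
                    vruntime -= sched_latency_ns;
                    if (vruntime < old_vruntime)
                            vruntime = old_vruntime;
            }
            return vruntime;
    }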
656 | static void | 490 | static void |
657 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | 491 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) |
658 | { | 492 | { |
659 | /* | 493 | /* |
660 | * Update the fair clock. | 494 | * Update run-time statistics of the 'current'. |
661 | */ | 495 | */ |
662 | update_curr(cfs_rq); | 496 | update_curr(cfs_rq); |
663 | 497 | ||
664 | if (wakeup) | 498 | if (wakeup) { |
499 | place_entity(cfs_rq, se, 0); | ||
665 | enqueue_sleeper(cfs_rq, se); | 500 | enqueue_sleeper(cfs_rq, se); |
501 | } | ||
666 | 502 | ||
667 | update_stats_enqueue(cfs_rq, se); | 503 | update_stats_enqueue(cfs_rq, se); |
668 | __enqueue_entity(cfs_rq, se); | 504 | check_spread(cfs_rq, se); |
505 | if (se != cfs_rq->curr) | ||
506 | __enqueue_entity(cfs_rq, se); | ||
507 | account_entity_enqueue(cfs_rq, se); | ||
669 | } | 508 | } |
670 | 509 | ||
671 | static void | 510 | static void |
672 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 511 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
673 | { | 512 | { |
513 | /* | ||
514 | * Update run-time statistics of the 'current'. | ||
515 | */ | ||
516 | update_curr(cfs_rq); | ||
517 | |||
674 | update_stats_dequeue(cfs_rq, se); | 518 | update_stats_dequeue(cfs_rq, se); |
675 | if (sleep) { | 519 | if (sleep) { |
676 | se->sleep_start_fair = cfs_rq->fair_clock; | 520 | se->peer_preempt = 0; |
677 | #ifdef CONFIG_SCHEDSTATS | 521 | #ifdef CONFIG_SCHEDSTATS |
678 | if (entity_is_task(se)) { | 522 | if (entity_is_task(se)) { |
679 | struct task_struct *tsk = task_of(se); | 523 | struct task_struct *tsk = task_of(se); |
@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
685 | } | 529 | } |
686 | #endif | 530 | #endif |
687 | } | 531 | } |
688 | __dequeue_entity(cfs_rq, se); | 532 | |
533 | if (se != cfs_rq->curr) | ||
534 | __dequeue_entity(cfs_rq, se); | ||
535 | account_entity_dequeue(cfs_rq, se); | ||
689 | } | 536 | } |
690 | 537 | ||
691 | /* | 538 | /* |
692 | * Preempt the current task with a newly woken task if needed: | 539 | * Preempt the current task with a newly woken task if needed: |
693 | */ | 540 | */ |
694 | static void | 541 | static void |
695 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | 542 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
696 | struct sched_entity *curr, unsigned long granularity) | ||
697 | { | 543 | { |
698 | s64 __delta = curr->fair_key - se->fair_key; | ||
699 | unsigned long ideal_runtime, delta_exec; | 544 | unsigned long ideal_runtime, delta_exec; |
700 | 545 | ||
701 | /* | 546 | ideal_runtime = sched_slice(cfs_rq, curr); |
702 | * ideal_runtime is compared against sum_exec_runtime, which is | ||
703 | * walltime, hence do not scale. | ||
704 | */ | ||
705 | ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, | ||
706 | (unsigned long)sysctl_sched_min_granularity); | ||
707 | |||
708 | /* | ||
709 | * If we executed more than what the latency constraint suggests, | ||
710 | * reduce the rescheduling granularity. This way the total latency | ||
711 | * of how much a task is not scheduled converges to | ||
712 | * sysctl_sched_latency: | ||
713 | */ | ||
714 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 547 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
715 | if (delta_exec > ideal_runtime) | 548 | if (delta_exec > ideal_runtime || |
716 | granularity = 0; | 549 | (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) |
717 | |||
718 | /* | ||
719 | * Take scheduling granularity into account - do not | ||
720 | * preempt the current task unless the best task has | ||
721 | * a larger than sched_granularity fairness advantage: | ||
722 | * | ||
723 | * scale granularity as key space is in fair_clock. | ||
724 | */ | ||
725 | if (__delta > niced_granularity(curr, granularity)) | ||
726 | resched_task(rq_of(cfs_rq)->curr); | 550 | resched_task(rq_of(cfs_rq)->curr); |
551 | curr->peer_preempt = 0; | ||
727 | } | 552 | } |
728 | 553 | ||
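check_preempt_tick() above is the tick-driven half of preemption: once the running entity has consumed the wall-clock slice computed by sched_slice() (or a deferred peer wakeup is pending under PREEMPT_RESTRICT), it is rescheduled. The slice is measured against sum_exec_runtime relative to the snapshot taken when the entity was picked; a minimal restatement of that check:

    #include <stdbool.h>
    #include <stdint.h>

    struct entity {
            uint64_t sum_exec_runtime;      /* total ns executed so far */
            uint64_t prev_sum_exec_runtime; /* snapshot from set_next_entity */
    };

    /* true when the running entity has used up its wall-clock slice */
    static bool slice_expired(const struct entity *curr, uint64_t ideal_runtime)
    {
            uint64_t delta_exec;

            delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
            return delta_exec > ideal_runtime;
    }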
729 | static inline void | 554 | static void |
730 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 555 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
731 | { | 556 | { |
557 | /* 'current' is not kept within the tree. */ | ||
558 | if (se->on_rq) { | ||
559 | /* | ||
560 | * Any task has to be enqueued before it get to execute on | ||
561 | * a CPU. So account for the time it spent waiting on the | ||
562 | * runqueue. | ||
563 | */ | ||
564 | update_stats_wait_end(cfs_rq, se); | ||
565 | __dequeue_entity(cfs_rq, se); | ||
566 | } | ||
567 | |||
568 | update_stats_curr_start(cfs_rq, se); | ||
569 | cfs_rq->curr = se; | ||
570 | #ifdef CONFIG_SCHEDSTATS | ||
732 | /* | 571 | /* |
733 | * Any task has to be enqueued before it get to execute on | 572 | * Track our maximum slice length, if the CPU's load is at |
734 | * a CPU. So account for the time it spent waiting on the | 573 | * least twice that of our own weight (i.e. dont track it |
735 | * runqueue. (note, here we rely on pick_next_task() having | 574 | * when there are only lesser-weight tasks around): |
736 | * done a put_prev_task_fair() shortly before this, which | ||
737 | * updated rq->fair_clock - used by update_stats_wait_end()) | ||
738 | */ | 575 | */ |
739 | update_stats_wait_end(cfs_rq, se); | 576 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
740 | update_stats_curr_start(cfs_rq, se); | 577 | se->slice_max = max(se->slice_max, |
741 | set_cfs_rq_curr(cfs_rq, se); | 578 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
579 | } | ||
580 | #endif | ||
742 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 581 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
743 | } | 582 | } |
744 | 583 | ||
745 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 584 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
746 | { | 585 | { |
747 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 586 | struct sched_entity *se = NULL; |
748 | 587 | ||
749 | set_next_entity(cfs_rq, se); | 588 | if (first_fair(cfs_rq)) { |
589 | se = __pick_next_entity(cfs_rq); | ||
590 | set_next_entity(cfs_rq, se); | ||
591 | } | ||
750 | 592 | ||
751 | return se; | 593 | return se; |
752 | } | 594 | } |
@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
760 | if (prev->on_rq) | 602 | if (prev->on_rq) |
761 | update_curr(cfs_rq); | 603 | update_curr(cfs_rq); |
762 | 604 | ||
763 | update_stats_curr_end(cfs_rq, prev); | 605 | check_spread(cfs_rq, prev); |
764 | 606 | if (prev->on_rq) { | |
765 | if (prev->on_rq) | ||
766 | update_stats_wait_start(cfs_rq, prev); | 607 | update_stats_wait_start(cfs_rq, prev); |
767 | set_cfs_rq_curr(cfs_rq, NULL); | 608 | /* Put 'current' back into the tree. */ |
609 | __enqueue_entity(cfs_rq, prev); | ||
610 | } | ||
611 | cfs_rq->curr = NULL; | ||
768 | } | 612 | } |
769 | 613 | ||
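set_next_entity() and put_prev_entity() above establish the new invariant that the running entity is never kept in the rbtree: it is unlinked when picked and re-inserted, keyed by its advanced vruntime, when it is put back (and only if it is still on the runqueue). A hedged sketch of that pick/put cycle, using a sorted singly linked list as a stand-in for the rbtree and omitting the wait-time statistics the real hooks also update:

    #include <stddef.h>
    #include <stdint.h>

    struct entity {
            int64_t vruntime;
            struct entity *next;     /* stand-in for the rbtree linkage */
    };

    struct cfs_rq {
            struct entity *timeline; /* sorted by vruntime, leftmost first */
            struct entity *curr;     /* the running entity, NOT in the tree */
    };

    static void timeline_insert(struct cfs_rq *rq, struct entity *se)
    {
            struct entity **pp = &rq->timeline;

            while (*pp && (*pp)->vruntime <= se->vruntime)
                    pp = &(*pp)->next;
            se->next = *pp;
            *pp = se;
    }

    /* pick: the leftmost entity leaves the timeline and becomes curr */
    static struct entity *pick_next(struct cfs_rq *rq)
    {
            struct entity *se = rq->timeline;

            if (se) {
                    rq->timeline = se->next;
                    rq->curr = se;
            }
            return se;
    }

    /* put: curr rejoins the timeline keyed by its advanced vruntime */
    static void put_prev(struct cfs_rq *rq, struct entity *prev)
    {
            if (prev)
                    timeline_insert(rq, prev);
            rq->curr = NULL;
    }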
770 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 614 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
771 | { | 615 | { |
772 | struct sched_entity *next; | ||
773 | |||
774 | /* | 616 | /* |
775 | * Dequeue and enqueue the task to update its | 617 | * Update run-time statistics of the 'current'. |
776 | * position within the tree: | ||
777 | */ | 618 | */ |
778 | dequeue_entity(cfs_rq, curr, 0); | 619 | update_curr(cfs_rq); |
779 | enqueue_entity(cfs_rq, curr, 0); | ||
780 | |||
781 | /* | ||
782 | * Reschedule if another task tops the current one. | ||
783 | */ | ||
784 | next = __pick_next_entity(cfs_rq); | ||
785 | if (next == curr) | ||
786 | return; | ||
787 | 620 | ||
788 | __check_preempt_curr_fair(cfs_rq, next, curr, | 621 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
789 | sched_granularity(cfs_rq)); | 622 | check_preempt_tick(cfs_rq, curr); |
790 | } | 623 | } |
791 | 624 | ||
792 | /************************************************** | 625 | /************************************************** |
@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
821 | */ | 654 | */ |
822 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | 655 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) |
823 | { | 656 | { |
824 | /* A later patch will take group into account */ | 657 | return cfs_rq->tg->cfs_rq[this_cpu]; |
825 | return &cpu_rq(this_cpu)->cfs; | ||
826 | } | 658 | } |
827 | 659 | ||
828 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 660 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
829 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 661 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
830 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 662 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
831 | 663 | ||
832 | /* Do the two (enqueued) tasks belong to the same group ? */ | 664 | /* Do the two (enqueued) entities belong to the same group ? */ |
833 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 665 | static inline int |
666 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
834 | { | 667 | { |
835 | if (curr->se.cfs_rq == p->se.cfs_rq) | 668 | if (se->cfs_rq == pse->cfs_rq) |
836 | return 1; | 669 | return 1; |
837 | 670 | ||
838 | return 0; | 671 | return 0; |
839 | } | 672 | } |
840 | 673 | ||
674 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
675 | { | ||
676 | return se->parent; | ||
677 | } | ||
678 | |||
841 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 679 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
842 | 680 | ||
843 | #define for_each_sched_entity(se) \ | 681 | #define for_each_sched_entity(se) \ |
@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
870 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 708 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
871 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 709 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
872 | 710 | ||
873 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 711 | static inline int |
712 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
874 | { | 713 | { |
875 | return 1; | 714 | return 1; |
876 | } | 715 | } |
877 | 716 | ||
717 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
718 | { | ||
719 | return NULL; | ||
720 | } | ||
721 | |||
878 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 722 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
879 | 723 | ||
880 | /* | 724 | /* |
@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
892 | break; | 736 | break; |
893 | cfs_rq = cfs_rq_of(se); | 737 | cfs_rq = cfs_rq_of(se); |
894 | enqueue_entity(cfs_rq, se, wakeup); | 738 | enqueue_entity(cfs_rq, se, wakeup); |
739 | wakeup = 1; | ||
895 | } | 740 | } |
896 | } | 741 | } |
897 | 742 | ||
@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
911 | /* Don't dequeue parent if it has other entities besides us */ | 756 | /* Don't dequeue parent if it has other entities besides us */ |
912 | if (cfs_rq->load.weight) | 757 | if (cfs_rq->load.weight) |
913 | break; | 758 | break; |
759 | sleep = 1; | ||
914 | } | 760 | } |
915 | } | 761 | } |
916 | 762 | ||
@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
919 | * | 765 | * |
920 | * If compat_yield is turned on then we requeue to the end of the tree. | 766 | * If compat_yield is turned on then we requeue to the end of the tree. |
921 | */ | 767 | */ |
922 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | 768 | static void yield_task_fair(struct rq *rq) |
923 | { | 769 | { |
924 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 770 | struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); |
925 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 771 | struct sched_entity *rightmost, *se = &rq->curr->se; |
926 | struct sched_entity *rightmost, *se = &p->se; | ||
927 | struct rb_node *parent; | ||
928 | 772 | ||
929 | /* | 773 | /* |
930 | * Are we the only task in the tree? | 774 | * Are we the only task in the tree? |
@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) | |||
935 | if (likely(!sysctl_sched_compat_yield)) { | 779 | if (likely(!sysctl_sched_compat_yield)) { |
936 | __update_rq_clock(rq); | 780 | __update_rq_clock(rq); |
937 | /* | 781 | /* |
938 | * Dequeue and enqueue the task to update its | 782 | * Update run-time statistics of the 'current'. |
939 | * position within the tree: | ||
940 | */ | 783 | */ |
941 | dequeue_entity(cfs_rq, &p->se, 0); | 784 | update_curr(cfs_rq); |
942 | enqueue_entity(cfs_rq, &p->se, 0); | ||
943 | 785 | ||
944 | return; | 786 | return; |
945 | } | 787 | } |
946 | /* | 788 | /* |
947 | * Find the rightmost entry in the rbtree: | 789 | * Find the rightmost entry in the rbtree: |
948 | */ | 790 | */ |
949 | do { | 791 | rightmost = __pick_last_entity(cfs_rq); |
950 | parent = *link; | ||
951 | link = &parent->rb_right; | ||
952 | } while (*link); | ||
953 | |||
954 | rightmost = rb_entry(parent, struct sched_entity, run_node); | ||
955 | /* | 792 | /* |
956 | * Already in the rightmost position? | 793 | * Already in the rightmost position? |
957 | */ | 794 | */ |
958 | if (unlikely(rightmost == se)) | 795 | if (unlikely(rightmost->vruntime < se->vruntime)) |
959 | return; | 796 | return; |
960 | 797 | ||
961 | /* | 798 | /* |
962 | * Minimally necessary key value to be last in the tree: | 799 | * Minimally necessary key value to be last in the tree: |
800 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
801 | * 'current' within the tree based on its new key value. | ||
963 | */ | 802 | */ |
964 | se->fair_key = rightmost->fair_key + 1; | 803 | se->vruntime = rightmost->vruntime + 1; |
965 | |||
966 | if (cfs_rq->rb_leftmost == &se->run_node) | ||
967 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | ||
968 | /* | ||
969 | * Relink the task to the rightmost position: | ||
970 | */ | ||
971 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
972 | rb_link_node(&se->run_node, parent, link); | ||
973 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | ||
974 | } | 804 | } |
975 | 805 | ||
976 | /* | 806 | /* |
977 | * Preempt the current task with a newly woken task if needed: | 807 | * Preempt the current task with a newly woken task if needed: |
978 | */ | 808 | */ |
979 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | 809 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
980 | { | 810 | { |
981 | struct task_struct *curr = rq->curr; | 811 | struct task_struct *curr = rq->curr; |
982 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 812 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
983 | unsigned long gran; | 813 | struct sched_entity *se = &curr->se, *pse = &p->se; |
814 | s64 delta, gran; | ||
984 | 815 | ||
985 | if (unlikely(rt_prio(p->prio))) { | 816 | if (unlikely(rt_prio(p->prio))) { |
986 | update_rq_clock(rq); | 817 | update_rq_clock(rq); |
@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | |||
988 | resched_task(curr); | 819 | resched_task(curr); |
989 | return; | 820 | return; |
990 | } | 821 | } |
991 | |||
992 | gran = sysctl_sched_wakeup_granularity; | ||
993 | /* | 822 | /* |
994 | * Batch tasks prefer throughput over latency: | 823 | * Batch tasks do not preempt (their preemption is driven by |
824 | * the tick): | ||
995 | */ | 825 | */ |
996 | if (unlikely(p->policy == SCHED_BATCH)) | 826 | if (unlikely(p->policy == SCHED_BATCH)) |
997 | gran = sysctl_sched_batch_wakeup_granularity; | 827 | return; |
828 | |||
829 | if (sched_feat(WAKEUP_PREEMPT)) { | ||
830 | while (!is_same_group(se, pse)) { | ||
831 | se = parent_entity(se); | ||
832 | pse = parent_entity(pse); | ||
833 | } | ||
998 | 834 | ||
999 | if (is_same_group(curr, p)) | 835 | delta = se->vruntime - pse->vruntime; |
1000 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | 836 | gran = sysctl_sched_wakeup_granularity; |
837 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
838 | gran = calc_delta_fair(gran, &se->load); | ||
839 | |||
840 | if (delta > gran) { | ||
841 | int now = !sched_feat(PREEMPT_RESTRICT); | ||
842 | |||
843 | if (now || p->prio < curr->prio || !se->peer_preempt++) | ||
844 | resched_task(curr); | ||
845 | } | ||
846 | } | ||
1001 | } | 847 | } |
1002 | 848 | ||
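check_preempt_wakeup() above preempts the running task on wakeup only when the woken entity's vruntime lead exceeds the wakeup granularity, and that granularity is scaled by the current entity's weight so heavy (negative-nice) tasks are not preempted too eagerly. A simplified version of the test, omitting the PREEMPT_RESTRICT throttling and the walk up to a common group cfs_rq, with a 10 ms granularity assumed:

    #include <stdbool.h>
    #include <stdint.h>

    #define NICE_0_LOAD 1024LL

    /* assumed wakeup granularity: 10 ms */
    static const int64_t wakeup_gran_ns = 10000000LL;

    static bool should_preempt(int64_t curr_vruntime, int64_t curr_weight,
                               int64_t woken_vruntime)
    {
            int64_t delta = curr_vruntime - woken_vruntime;
            int64_t gran = wakeup_gran_ns;

            /* scale the granularity for non-nice-0 current tasks */
            if (curr_weight != NICE_0_LOAD)
                    gran = gran * NICE_0_LOAD / curr_weight;

            return delta > gran;
    }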
1003 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 849 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1041 | * achieve that by always pre-iterating before returning | 887 | * achieve that by always pre-iterating before returning |
1042 | * the current task: | 888 | * the current task: |
1043 | */ | 889 | */ |
1044 | static inline struct task_struct * | 890 | static struct task_struct * |
1045 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 891 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
1046 | { | 892 | { |
1047 | struct task_struct *p; | 893 | struct task_struct *p; |
@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | |||
1078 | if (!cfs_rq->nr_running) | 924 | if (!cfs_rq->nr_running) |
1079 | return MAX_PRIO; | 925 | return MAX_PRIO; |
1080 | 926 | ||
1081 | curr = __pick_next_entity(cfs_rq); | 927 | curr = cfs_rq->curr; |
928 | if (!curr) | ||
929 | curr = __pick_next_entity(cfs_rq); | ||
930 | |||
1082 | p = task_of(curr); | 931 | p = task_of(curr); |
1083 | 932 | ||
1084 | return p->prio; | 933 | return p->prio; |
@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1153 | } | 1002 | } |
1154 | } | 1003 | } |
1155 | 1004 | ||
1005 | #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) | ||
1006 | |||
1156 | /* | 1007 | /* |
1157 | * Share the fairness runtime between parent and child, thus the | 1008 | * Share the fairness runtime between parent and child, thus the |
1158 | * total amount of pressure for CPU stays equal - new tasks | 1009 | * total amount of pressure for CPU stays equal - new tasks |
@@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1163 | static void task_new_fair(struct rq *rq, struct task_struct *p) | 1014 | static void task_new_fair(struct rq *rq, struct task_struct *p) |
1164 | { | 1015 | { |
1165 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1016 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
1166 | struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); | 1017 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; |
1018 | int this_cpu = smp_processor_id(); | ||
1167 | 1019 | ||
1168 | sched_info_queued(p); | 1020 | sched_info_queued(p); |
1169 | 1021 | ||
1170 | update_curr(cfs_rq); | 1022 | update_curr(cfs_rq); |
1171 | update_stats_enqueue(cfs_rq, se); | 1023 | place_entity(cfs_rq, se, 1); |
1172 | /* | ||
1173 | * Child runs first: we let it run before the parent | ||
1174 | * until it reschedules once. We set up the key so that | ||
1175 | * it will preempt the parent: | ||
1176 | */ | ||
1177 | se->fair_key = curr->fair_key - | ||
1178 | niced_granularity(curr, sched_granularity(cfs_rq)) - 1; | ||
1179 | /* | ||
1180 | * The first wait is dominated by the child-runs-first logic, | ||
1181 | * so do not credit it with that waiting time yet: | ||
1182 | */ | ||
1183 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | ||
1184 | se->wait_start_fair = 0; | ||
1185 | 1024 | ||
1186 | /* | 1025 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && |
1187 | * The statistical average of wait_runtime is about | 1026 | curr->vruntime < se->vruntime) { |
1188 | * -granularity/2, so initialize the task with that: | 1027 | /* |
1189 | */ | 1028 | * Upon rescheduling, sched_class::put_prev_task() will place |
1190 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1029 | * 'current' within the tree based on its new key value. |
1191 | se->wait_runtime = -(sched_granularity(cfs_rq) / 2); | 1030 | */ |
1031 | swap(curr->vruntime, se->vruntime); | ||
1032 | } | ||
1192 | 1033 | ||
1034 | update_stats_enqueue(cfs_rq, se); | ||
1035 | check_spread(cfs_rq, se); | ||
1036 | check_spread(cfs_rq, curr); | ||
1193 | __enqueue_entity(cfs_rq, se); | 1037 | __enqueue_entity(cfs_rq, se); |
1038 | account_entity_enqueue(cfs_rq, se); | ||
1039 | se->peer_preempt = 0; | ||
1040 | resched_task(rq->curr); | ||
1194 | } | 1041 | } |
1195 | 1042 | ||
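task_new_fair() above keeps the "child runs first" behaviour without the old key arithmetic: the child is placed with place_entity(..., 1), and if the parent would still own the smaller vruntime on the same CPU, the two vruntimes are swapped before the child is enqueued and the parent is rescheduled. A small sketch of that decision, assuming both entities share a runqueue:

    #include <stdint.h>

    #define swap_s64(a, b) \
            do { int64_t __t = (a); (a) = (b); (b) = __t; } while (0)

    /* the smaller vruntime is leftmost in the tree, i.e. runs next */
    static void give_child_head_start(int64_t *parent_vruntime,
                                      int64_t *child_vruntime,
                                      int child_runs_first)
    {
            if (child_runs_first && *parent_vruntime < *child_vruntime)
                    swap_s64(*parent_vruntime, *child_vruntime);
    }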
1196 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1197 | /* Account for a task changing its policy or group. | 1043 | /* Account for a task changing its policy or group. |
1198 | * | 1044 | * |
1199 | * This routine is mostly called to set cfs_rq->curr field when a task | 1045 | * This routine is mostly called to set cfs_rq->curr field when a task |
@@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq) | |||
1206 | for_each_sched_entity(se) | 1052 | for_each_sched_entity(se) |
1207 | set_next_entity(cfs_rq_of(se), se); | 1053 | set_next_entity(cfs_rq_of(se), se); |
1208 | } | 1054 | } |
1209 | #else | ||
1210 | static void set_curr_task_fair(struct rq *rq) | ||
1211 | { | ||
1212 | } | ||
1213 | #endif | ||
1214 | 1055 | ||
1215 | /* | 1056 | /* |
1216 | * All the scheduling class methods: | 1057 | * All the scheduling class methods: |
1217 | */ | 1058 | */ |
1218 | struct sched_class fair_sched_class __read_mostly = { | 1059 | static const struct sched_class fair_sched_class = { |
1060 | .next = &idle_sched_class, | ||
1219 | .enqueue_task = enqueue_task_fair, | 1061 | .enqueue_task = enqueue_task_fair, |
1220 | .dequeue_task = dequeue_task_fair, | 1062 | .dequeue_task = dequeue_task_fair, |
1221 | .yield_task = yield_task_fair, | 1063 | .yield_task = yield_task_fair, |
1222 | 1064 | ||
1223 | .check_preempt_curr = check_preempt_curr_fair, | 1065 | .check_preempt_curr = check_preempt_wakeup, |
1224 | 1066 | ||
1225 | .pick_next_task = pick_next_task_fair, | 1067 | .pick_next_task = pick_next_task_fair, |
1226 | .put_prev_task = put_prev_task_fair, | 1068 | .put_prev_task = put_prev_task_fair, |
@@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1237 | { | 1079 | { |
1238 | struct cfs_rq *cfs_rq; | 1080 | struct cfs_rq *cfs_rq; |
1239 | 1081 | ||
1082 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1083 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
1084 | #endif | ||
1240 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1085 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1241 | print_cfs_rq(m, cpu, cfs_rq); | 1086 | print_cfs_rq(m, cpu, cfs_rq); |
1242 | } | 1087 | } |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2d9f96..6e2ead41516e 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) | |||
50 | { | 50 | { |
51 | } | 51 | } |
52 | 52 | ||
53 | static void set_curr_task_idle(struct rq *rq) | ||
54 | { | ||
55 | } | ||
56 | |||
53 | /* | 57 | /* |
54 | * Simple, special scheduling class for the per-CPU idle tasks: | 58 | * Simple, special scheduling class for the per-CPU idle tasks: |
55 | */ | 59 | */ |
56 | static struct sched_class idle_sched_class __read_mostly = { | 60 | const struct sched_class idle_sched_class = { |
61 | /* .next is NULL */ | ||
57 | /* no enqueue/yield_task for idle tasks */ | 62 | /* no enqueue/yield_task for idle tasks */ |
58 | 63 | ||
59 | /* dequeue is not valid, we print a debug message there: */ | 64 | /* dequeue is not valid, we print a debug message there: */ |
@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = { | |||
66 | 71 | ||
67 | .load_balance = load_balance_idle, | 72 | .load_balance = load_balance_idle, |
68 | 73 | ||
74 | .set_curr_task = set_curr_task_idle, | ||
69 | .task_tick = task_tick_idle, | 75 | .task_tick = task_tick_idle, |
70 | /* no .task_new for idle tasks */ | 76 | /* no .task_new for idle tasks */ |
71 | }; | 77 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476a02d0..d0097a0634e5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Update the current task's runtime statistics. Skip current tasks that | 7 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 8 | * are not in our scheduling class. |
9 | */ | 9 | */ |
10 | static inline void update_curr_rt(struct rq *rq) | 10 | static void update_curr_rt(struct rq *rq) |
11 | { | 11 | { |
12 | struct task_struct *curr = rq->curr; | 12 | struct task_struct *curr = rq->curr; |
13 | u64 delta_exec; | 13 | u64 delta_exec; |
@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p) | |||
59 | } | 59 | } |
60 | 60 | ||
61 | static void | 61 | static void |
62 | yield_task_rt(struct rq *rq, struct task_struct *p) | 62 | yield_task_rt(struct rq *rq) |
63 | { | 63 | { |
64 | requeue_task_rt(rq, p); | 64 | requeue_task_rt(rq, rq->curr); |
65 | } | 65 | } |
66 | 66 | ||
67 | /* | 67 | /* |
@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
206 | if (--p->time_slice) | 206 | if (--p->time_slice) |
207 | return; | 207 | return; |
208 | 208 | ||
209 | p->time_slice = static_prio_timeslice(p->static_prio); | 209 | p->time_slice = DEF_TIMESLICE; |
210 | 210 | ||
211 | /* | 211 | /* |
212 | * Requeue to the end of queue if we are not the only element | 212 | * Requeue to the end of queue if we are not the only element |
@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
218 | } | 218 | } |
219 | } | 219 | } |
220 | 220 | ||
221 | static struct sched_class rt_sched_class __read_mostly = { | 221 | static void set_curr_task_rt(struct rq *rq) |
222 | { | ||
223 | struct task_struct *p = rq->curr; | ||
224 | |||
225 | p->se.exec_start = rq->clock; | ||
226 | } | ||
227 | |||
228 | const struct sched_class rt_sched_class = { | ||
229 | .next = &fair_sched_class, | ||
222 | .enqueue_task = enqueue_task_rt, | 230 | .enqueue_task = enqueue_task_rt, |
223 | .dequeue_task = dequeue_task_rt, | 231 | .dequeue_task = dequeue_task_rt, |
224 | .yield_task = yield_task_rt, | 232 | .yield_task = yield_task_rt, |
@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = { | |||
230 | 238 | ||
231 | .load_balance = load_balance_rt, | 239 | .load_balance = load_balance_rt, |
232 | 240 | ||
241 | .set_curr_task = set_curr_task_rt, | ||
233 | .task_tick = task_tick_rt, | 242 | .task_tick = task_tick_rt, |
234 | }; | 243 | }; |
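Both class definitions above gain a .next pointer (rt -> fair -> idle), turning the scheduling classes into a singly linked priority list that the core scheduler can walk instead of hard-coding the order. A hedged sketch of how such a chain is typically consumed when picking the next task; the struct layout and pick_next() below are illustrative placeholders, not the kernel's exact API:

    #include <stddef.h>

    struct rq;
    struct task;

    struct sched_class {
            const struct sched_class *next;
            struct task *(*pick_next_task)(struct rq *rq);
    };

    /* walk the classes from highest to lowest priority */
    static struct task *pick_next(struct rq *rq,
                                  const struct sched_class *highest)
    {
            const struct sched_class *class;
            struct task *p;

            for (class = highest; class; class = class->next) {
                    p = class->pick_next_task(rq);
                    if (p)
                            return p;
            }
            return NULL;    /* unreachable if the idle class is last */
    }

Because the idle class sits at the tail and always has a runnable task, the walk is guaranteed to end with a pick.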
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94dda61e..1c084842c3e7 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
16 | struct rq *rq = cpu_rq(cpu); | 16 | struct rq *rq = cpu_rq(cpu); |
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | struct sched_domain *sd; | 18 | struct sched_domain *sd; |
19 | int dcnt = 0; | 19 | int dcount = 0; |
20 | #endif | 20 | #endif |
21 | 21 | ||
22 | /* runqueue-specific stats */ | 22 | /* runqueue-specific stats */ |
23 | seq_printf(seq, | 23 | seq_printf(seq, |
24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", | 24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", |
25 | cpu, rq->yld_both_empty, | 25 | cpu, rq->yld_both_empty, |
26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | 26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, |
27 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | 27 | rq->sched_switch, rq->sched_count, rq->sched_goidle, |
28 | rq->ttwu_cnt, rq->ttwu_local, | 28 | rq->ttwu_count, rq->ttwu_local, |
29 | rq->rq_sched_info.cpu_time, | 29 | rq->rq_sched_info.cpu_time, |
30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | 30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); |
31 | 31 | ||
32 | seq_printf(seq, "\n"); | 32 | seq_printf(seq, "\n"); |
33 | 33 | ||
@@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
39 | char mask_str[NR_CPUS]; | 39 | char mask_str[NR_CPUS]; |
40 | 40 | ||
41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); |
42 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
44 | itype++) { | 44 | itype++) { |
45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " | 45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
46 | "%lu", | 46 | "%lu", |
47 | sd->lb_cnt[itype], | 47 | sd->lb_count[itype], |
48 | sd->lb_balanced[itype], | 48 | sd->lb_balanced[itype], |
49 | sd->lb_failed[itype], | 49 | sd->lb_failed[itype], |
50 | sd->lb_imbalance[itype], | 50 | sd->lb_imbalance[itype], |
@@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
55 | } | 55 | } |
56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" | 56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
57 | " %lu %lu %lu\n", | 57 | " %lu %lu %lu\n", |
58 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 58 | sd->alb_count, sd->alb_failed, sd->alb_pushed, |
59 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 59 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, |
60 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 60 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, |
61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
62 | sd->ttwu_move_balance); | 62 | sd->ttwu_move_balance); |
63 | } | 63 | } |
@@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | |||
101 | { | 101 | { |
102 | if (rq) { | 102 | if (rq) { |
103 | rq->rq_sched_info.run_delay += delta; | 103 | rq->rq_sched_info.run_delay += delta; |
104 | rq->rq_sched_info.pcnt++; | 104 | rq->rq_sched_info.pcount++; |
105 | } | 105 | } |
106 | } | 106 | } |
107 | 107 | ||
@@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
129 | # define schedstat_set(var, val) do { } while (0) | 129 | # define schedstat_set(var, val) do { } while (0) |
130 | #endif | 130 | #endif |
131 | 131 | ||
132 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 132 | #ifdef CONFIG_SCHEDSTATS |
133 | /* | 133 | /* |
134 | * Called when a process is dequeued from the active array and given | 134 | * Called when a process is dequeued from the active array and given |
135 | * the cpu. We should note that with the exception of interactive | 135 | * the cpu. We should note that with the exception of interactive |
@@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
164 | sched_info_dequeued(t); | 164 | sched_info_dequeued(t); |
165 | t->sched_info.run_delay += delta; | 165 | t->sched_info.run_delay += delta; |
166 | t->sched_info.last_arrival = now; | 166 | t->sched_info.last_arrival = now; |
167 | t->sched_info.pcnt++; | 167 | t->sched_info.pcount++; |
168 | 168 | ||
169 | rq_sched_info_arrive(task_rq(t), delta); | 169 | rq_sched_info_arrive(task_rq(t), delta); |
170 | } | 170 | } |
@@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
233 | #else | 233 | #else |
234 | #define sched_info_queued(t) do { } while (0) | 234 | #define sched_info_queued(t) do { } while (0) |
235 | #define sched_info_switch(t, next) do { } while (0) | 235 | #define sched_info_switch(t, next) do { } while (0) |
236 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 236 | #endif /* CONFIG_SCHEDSTATS */ |
237 | 237 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f546ddea43d..bd89bc4eb0b9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -271,8 +271,6 @@ asmlinkage void do_softirq(void) | |||
271 | local_irq_restore(flags); | 271 | local_irq_restore(flags); |
272 | } | 272 | } |
273 | 273 | ||
274 | EXPORT_SYMBOL(do_softirq); | ||
275 | |||
276 | #endif | 274 | #endif |
277 | 275 | ||
278 | /* | 276 | /* |
@@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) | |||
332 | wakeup_softirqd(); | 330 | wakeup_softirqd(); |
333 | } | 331 | } |
334 | 332 | ||
335 | EXPORT_SYMBOL(raise_softirq_irqoff); | ||
336 | |||
337 | void fastcall raise_softirq(unsigned int nr) | 333 | void fastcall raise_softirq(unsigned int nr) |
338 | { | 334 | { |
339 | unsigned long flags; | 335 | unsigned long flags; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53a456ebf6d5..ec14aa8ac51f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -222,14 +222,11 @@ static ctl_table kern_table[] = { | |||
222 | #ifdef CONFIG_SCHED_DEBUG | 222 | #ifdef CONFIG_SCHED_DEBUG |
223 | { | 223 | { |
224 | .ctl_name = CTL_UNNUMBERED, | 224 | .ctl_name = CTL_UNNUMBERED, |
225 | .procname = "sched_min_granularity_ns", | 225 | .procname = "sched_nr_latency", |
226 | .data = &sysctl_sched_min_granularity, | 226 | .data = &sysctl_sched_nr_latency, |
227 | .maxlen = sizeof(unsigned int), | 227 | .maxlen = sizeof(unsigned int), |
228 | .mode = 0644, | 228 | .mode = 0644, |
229 | .proc_handler = &proc_dointvec_minmax, | 229 | .proc_handler = &proc_dointvec, |
230 | .strategy = &sysctl_intvec, | ||
231 | .extra1 = &min_sched_granularity_ns, | ||
232 | .extra2 = &max_sched_granularity_ns, | ||
233 | }, | 230 | }, |
234 | { | 231 | { |
235 | .ctl_name = CTL_UNNUMBERED, | 232 | .ctl_name = CTL_UNNUMBERED, |
@@ -266,38 +263,24 @@ static ctl_table kern_table[] = { | |||
266 | }, | 263 | }, |
267 | { | 264 | { |
268 | .ctl_name = CTL_UNNUMBERED, | 265 | .ctl_name = CTL_UNNUMBERED, |
269 | .procname = "sched_stat_granularity_ns", | 266 | .procname = "sched_child_runs_first", |
270 | .data = &sysctl_sched_stat_granularity, | 267 | .data = &sysctl_sched_child_runs_first, |
271 | .maxlen = sizeof(unsigned int), | ||
272 | .mode = 0644, | ||
273 | .proc_handler = &proc_dointvec_minmax, | ||
274 | .strategy = &sysctl_intvec, | ||
275 | .extra1 = &min_wakeup_granularity_ns, | ||
276 | .extra2 = &max_wakeup_granularity_ns, | ||
277 | }, | ||
278 | { | ||
279 | .ctl_name = CTL_UNNUMBERED, | ||
280 | .procname = "sched_runtime_limit_ns", | ||
281 | .data = &sysctl_sched_runtime_limit, | ||
282 | .maxlen = sizeof(unsigned int), | 268 | .maxlen = sizeof(unsigned int), |
283 | .mode = 0644, | 269 | .mode = 0644, |
284 | .proc_handler = &proc_dointvec_minmax, | 270 | .proc_handler = &proc_dointvec, |
285 | .strategy = &sysctl_intvec, | ||
286 | .extra1 = &min_sched_granularity_ns, | ||
287 | .extra2 = &max_sched_granularity_ns, | ||
288 | }, | 271 | }, |
289 | { | 272 | { |
290 | .ctl_name = CTL_UNNUMBERED, | 273 | .ctl_name = CTL_UNNUMBERED, |
291 | .procname = "sched_child_runs_first", | 274 | .procname = "sched_features", |
292 | .data = &sysctl_sched_child_runs_first, | 275 | .data = &sysctl_sched_features, |
293 | .maxlen = sizeof(unsigned int), | 276 | .maxlen = sizeof(unsigned int), |
294 | .mode = 0644, | 277 | .mode = 0644, |
295 | .proc_handler = &proc_dointvec, | 278 | .proc_handler = &proc_dointvec, |
296 | }, | 279 | }, |
297 | { | 280 | { |
298 | .ctl_name = CTL_UNNUMBERED, | 281 | .ctl_name = CTL_UNNUMBERED, |
299 | .procname = "sched_features", | 282 | .procname = "sched_migration_cost", |
300 | .data = &sysctl_sched_features, | 283 | .data = &sysctl_sched_migration_cost, |
301 | .maxlen = sizeof(unsigned int), | 284 | .maxlen = sizeof(unsigned int), |
302 | .mode = 0644, | 285 | .mode = 0644, |
303 | .proc_handler = &proc_dointvec, | 286 | .proc_handler = &proc_dointvec, |
@@ -1053,7 +1036,7 @@ static ctl_table vm_table[] = { | |||
1053 | .strategy = &sysctl_string, | 1036 | .strategy = &sysctl_string, |
1054 | }, | 1037 | }, |
1055 | #endif | 1038 | #endif |
1056 | #if defined(CONFIG_X86_32) || \ | 1039 | #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ |
1057 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) | 1040 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) |
1058 | { | 1041 | { |
1059 | .ctl_name = VM_VDSO_ENABLED, | 1042 | .ctl_name = VM_VDSO_ENABLED, |
@@ -1221,7 +1204,7 @@ static ctl_table fs_table[] = { | |||
1221 | }; | 1204 | }; |
1222 | 1205 | ||
1223 | static ctl_table debug_table[] = { | 1206 | static ctl_table debug_table[] = { |
1224 | #ifdef CONFIG_X86 | 1207 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) |
1225 | { | 1208 | { |
1226 | .ctl_name = CTL_UNNUMBERED, | 1209 | .ctl_name = CTL_UNNUMBERED, |
1227 | .procname = "exception-trace", | 1210 | .procname = "exception-trace", |
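The sysctl hunks above retire the old granularity/limit knobs and expose sched_nr_latency plus sched_features, the runtime bitmask behind the sched_feat() checks used throughout sched_fair.c (TREE_AVG, START_DEBIT, WAKEUP_PREEMPT, ...). A minimal sketch of that pattern; the bit assignments below are made up for illustration and do not match the kernel's enum:

    /* illustrative feature bits; the real values live in kernel/sched.c */
    enum {
            FEAT_NEW_FAIR_SLEEPERS  = 1 << 0,
            FEAT_START_DEBIT        = 1 << 2,
            FEAT_WAKEUP_PREEMPT     = 1 << 6,
    };

    /* runtime-tunable bitmask behind /proc/sys/kernel/sched_features */
    static unsigned int sysctl_sched_features =
            FEAT_NEW_FAIR_SLEEPERS | FEAT_START_DEBIT | FEAT_WAKEUP_PREEMPT;

    #define sched_feat(x) (sysctl_sched_features & FEAT_##x)

    /* usage: if (sched_feat(START_DEBIT)) vruntime += vslice; */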
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f66351126544..8d53106a0a92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -23,3 +23,8 @@ config HIGH_RES_TIMERS | |||
23 | hardware is not capable then this option only increases | 23 | hardware is not capable then this option only increases |
24 | the size of the kernel image. | 24 | the size of the kernel image. |
25 | 25 | ||
26 | config GENERIC_CLOCKEVENTS_BUILD | ||
27 | bool | ||
28 | default y | ||
29 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR | ||
30 | |||
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 99b6034fc86b..905b0b50792d 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,6 +1,6 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | 2 | ||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o | 3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o |
6 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 6 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 41dd3105ce7f..822beebe664a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -194,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
194 | local_irq_restore(flags); | 194 | local_irq_restore(flags); |
195 | } | 195 | } |
196 | 196 | ||
197 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
197 | /** | 198 | /** |
198 | * clockevents_notify - notification about relevant events | 199 | * clockevents_notify - notification about relevant events |
199 | */ | 200 | */ |
@@ -222,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg) | |||
222 | spin_unlock(&clockevents_lock); | 223 | spin_unlock(&clockevents_lock); |
223 | } | 224 | } |
224 | EXPORT_SYMBOL_GPL(clockevents_notify); | 225 | EXPORT_SYMBOL_GPL(clockevents_notify); |
225 | 226 | #endif | |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0962e0577660..fc3fc79b3d59 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -64,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) | |||
64 | */ | 64 | */ |
65 | int tick_check_broadcast_device(struct clock_event_device *dev) | 65 | int tick_check_broadcast_device(struct clock_event_device *dev) |
66 | { | 66 | { |
67 | if (tick_broadcast_device.evtdev || | 67 | if ((tick_broadcast_device.evtdev && |
68 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 68 | tick_broadcast_device.evtdev->rating >= dev->rating) || |
69 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
69 | return 0; | 70 | return 0; |
70 | 71 | ||
71 | clockevents_exchange_device(NULL, dev); | 72 | clockevents_exchange_device(NULL, dev); |
@@ -176,8 +177,6 @@ static void tick_do_periodic_broadcast(void) | |||
176 | */ | 177 | */ |
177 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | 178 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) |
178 | { | 179 | { |
179 | dev->next_event.tv64 = KTIME_MAX; | ||
180 | |||
181 | tick_do_periodic_broadcast(); | 180 | tick_do_periodic_broadcast(); |
182 | 181 | ||
183 | /* | 182 | /* |
@@ -218,26 +217,43 @@ static void tick_do_broadcast_on_off(void *why) | |||
218 | bc = tick_broadcast_device.evtdev; | 217 | bc = tick_broadcast_device.evtdev; |
219 | 218 | ||
220 | /* | 219 | /* |
221 | * Is the device in broadcast mode forever or is it not | 220 | * Is the device not affected by the powerstate ? |
222 | * affected by the powerstate ? | ||
223 | */ | 221 | */ |
224 | if (!dev || !tick_device_is_functional(dev) || | 222 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
225 | !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
226 | goto out; | 223 | goto out; |
227 | 224 | ||
228 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { | 225 | /* |
226 | * Defect device ? | ||
227 | */ | ||
228 | if (!tick_device_is_functional(dev)) { | ||
229 | /* | ||
230 | * AMD C1E wreckage fixup: | ||
231 | * | ||
232 | * Device was registered functional in the first | ||
233 | * place. Now the secondary CPU detected the C1E | ||
234 | * misfeature and notifies us to fix it up | ||
235 | */ | ||
236 | if (*reason != CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | ||
237 | goto out; | ||
238 | } | ||
239 | |||
240 | switch (*reason) { | ||
241 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | ||
242 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | ||
229 | if (!cpu_isset(cpu, tick_broadcast_mask)) { | 243 | if (!cpu_isset(cpu, tick_broadcast_mask)) { |
230 | cpu_set(cpu, tick_broadcast_mask); | 244 | cpu_set(cpu, tick_broadcast_mask); |
231 | if (td->mode == TICKDEV_MODE_PERIODIC) | 245 | if (td->mode == TICKDEV_MODE_PERIODIC) |
232 | clockevents_set_mode(dev, | 246 | clockevents_set_mode(dev, |
233 | CLOCK_EVT_MODE_SHUTDOWN); | 247 | CLOCK_EVT_MODE_SHUTDOWN); |
234 | } | 248 | } |
235 | } else { | 249 | break; |
250 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | ||
236 | if (cpu_isset(cpu, tick_broadcast_mask)) { | 251 | if (cpu_isset(cpu, tick_broadcast_mask)) { |
237 | cpu_clear(cpu, tick_broadcast_mask); | 252 | cpu_clear(cpu, tick_broadcast_mask); |
238 | if (td->mode == TICKDEV_MODE_PERIODIC) | 253 | if (td->mode == TICKDEV_MODE_PERIODIC) |
239 | tick_setup_periodic(dev, 0); | 254 | tick_setup_periodic(dev, 0); |
240 | } | 255 | } |
256 | break; | ||
241 | } | 257 | } |
242 | 258 | ||
243 | if (cpus_empty(tick_broadcast_mask)) | 259 | if (cpus_empty(tick_broadcast_mask)) |
@@ -515,11 +531,9 @@ static void tick_broadcast_clear_oneshot(int cpu) | |||
515 | */ | 531 | */ |
516 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 532 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
517 | { | 533 | { |
518 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { | 534 | bc->event_handler = tick_handle_oneshot_broadcast; |
519 | bc->event_handler = tick_handle_oneshot_broadcast; | 535 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
520 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 536 | bc->next_event.tv64 = KTIME_MAX; |
521 | bc->next_event.tv64 = KTIME_MAX; | ||
522 | } | ||
523 | } | 537 | } |
524 | 538 | ||
525 | /* | 539 | /* |
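The new CLOCK_EVT_NOTIFY_BROADCAST_FORCE reason handled above allows a CPU to be placed into broadcast mode even though its tick device was registered as functional. A minimal sketch of how platform idle code might trigger it; example_c1e_fixup() is a hypothetical name, while clockevents_notify() and the notify reason come from this tree:

	/* Hypothetical caller, not part of this patch. */
	#include <linux/clockchips.h>
	#include <linux/smp.h>

	static void example_c1e_fixup(void)
	{
		int cpu = smp_processor_id();

		/*
		 * Force this CPU into the broadcast mask; the FORCE reason is
		 * honoured by tick_do_broadcast_on_off() even when the local
		 * device has been flagged as non-functional.
		 */
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu);
	}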
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 77a21abc8716..1bea399a9ef0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
200 | 200 | ||
201 | cpu = smp_processor_id(); | 201 | cpu = smp_processor_id(); |
202 | if (!cpu_isset(cpu, newdev->cpumask)) | 202 | if (!cpu_isset(cpu, newdev->cpumask)) |
203 | goto out; | 203 | goto out_bc; |
204 | 204 | ||
205 | td = &per_cpu(tick_cpu_device, cpu); | 205 | td = &per_cpu(tick_cpu_device, cpu); |
206 | curdev = td->evtdev; | 206 | curdev = td->evtdev; |
@@ -265,7 +265,7 @@ out_bc: | |||
265 | */ | 265 | */ |
266 | if (tick_check_broadcast_device(newdev)) | 266 | if (tick_check_broadcast_device(newdev)) |
267 | ret = NOTIFY_STOP; | 267 | ret = NOTIFY_STOP; |
268 | out: | 268 | |
269 | spin_unlock_irqrestore(&tick_device_lock, flags); | 269 | spin_unlock_irqrestore(&tick_device_lock, flags); |
270 | 270 | ||
271 | return ret; | 271 | return ret; |
@@ -345,6 +345,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, | |||
345 | 345 | ||
346 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 346 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: |
347 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 347 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: |
348 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | ||
348 | tick_broadcast_on_off(reason, dev); | 349 | tick_broadcast_on_off(reason, dev); |
349 | break; | 350 | break; |
350 | 351 | ||
diff --git a/kernel/user.c b/kernel/user.c index 9ca2848fc356..f0e561e6d085 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -50,12 +50,16 @@ struct user_struct root_user = { | |||
50 | .uid_keyring = &root_user_keyring, | 50 | .uid_keyring = &root_user_keyring, |
51 | .session_keyring = &root_session_keyring, | 51 | .session_keyring = &root_session_keyring, |
52 | #endif | 52 | #endif |
53 | #ifdef CONFIG_FAIR_USER_SCHED | ||
54 | .tg = &init_task_group, | ||
55 | #endif | ||
53 | }; | 56 | }; |
54 | 57 | ||
55 | /* | 58 | /* |
56 | * These routines must be called with the uidhash spinlock held! | 59 | * These routines must be called with the uidhash spinlock held! |
57 | */ | 60 | */ |
58 | static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | 61 | static inline void uid_hash_insert(struct user_struct *up, |
62 | struct hlist_head *hashent) | ||
59 | { | 63 | { |
60 | hlist_add_head(&up->uidhash_node, hashent); | 64 | hlist_add_head(&up->uidhash_node, hashent); |
61 | } | 65 | } |
@@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up) | |||
65 | hlist_del_init(&up->uidhash_node); | 69 | hlist_del_init(&up->uidhash_node); |
66 | } | 70 | } |
67 | 71 | ||
68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 72 | static inline struct user_struct *uid_hash_find(uid_t uid, |
73 | struct hlist_head *hashent) | ||
69 | { | 74 | { |
70 | struct user_struct *user; | 75 | struct user_struct *user; |
71 | struct hlist_node *h; | 76 | struct hlist_node *h; |
72 | 77 | ||
73 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 78 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
74 | if(user->uid == uid) { | 79 | if (user->uid == uid) { |
75 | atomic_inc(&user->__count); | 80 | atomic_inc(&user->__count); |
76 | return user; | 81 | return user; |
77 | } | 82 | } |
@@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha | |||
80 | return NULL; | 85 | return NULL; |
81 | } | 86 | } |
82 | 87 | ||
88 | #ifdef CONFIG_FAIR_USER_SCHED | ||
89 | |||
90 | static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ | ||
91 | static DEFINE_MUTEX(uids_mutex); | ||
92 | |||
93 | static void sched_destroy_user(struct user_struct *up) | ||
94 | { | ||
95 | sched_destroy_group(up->tg); | ||
96 | } | ||
97 | |||
98 | static int sched_create_user(struct user_struct *up) | ||
99 | { | ||
100 | int rc = 0; | ||
101 | |||
102 | up->tg = sched_create_group(); | ||
103 | if (IS_ERR(up->tg)) | ||
104 | rc = -ENOMEM; | ||
105 | |||
106 | return rc; | ||
107 | } | ||
108 | |||
109 | static void sched_switch_user(struct task_struct *p) | ||
110 | { | ||
111 | sched_move_task(p); | ||
112 | } | ||
113 | |||
114 | static inline void uids_mutex_lock(void) | ||
115 | { | ||
116 | mutex_lock(&uids_mutex); | ||
117 | } | ||
118 | |||
119 | static inline void uids_mutex_unlock(void) | ||
120 | { | ||
121 | mutex_unlock(&uids_mutex); | ||
122 | } | ||
123 | |||
124 | /* return cpu shares held by the user */ | ||
125 | ssize_t cpu_shares_show(struct kset *kset, char *buffer) | ||
126 | { | ||
127 | struct user_struct *up = container_of(kset, struct user_struct, kset); | ||
128 | |||
129 | return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); | ||
130 | } | ||
131 | |||
132 | /* modify cpu shares held by the user */ | ||
133 | ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) | ||
134 | { | ||
135 | struct user_struct *up = container_of(kset, struct user_struct, kset); | ||
136 | unsigned long shares; | ||
137 | int rc; | ||
138 | |||
139 | sscanf(buffer, "%lu", &shares); | ||
140 | |||
141 | rc = sched_group_set_shares(up->tg, shares); | ||
142 | |||
143 | return (rc ? rc : size); | ||
144 | } | ||
145 | |||
146 | static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) | ||
147 | { | ||
148 | sa->attr.name = name; | ||
149 | sa->attr.mode = mode; | ||
150 | sa->show = cpu_shares_show; | ||
151 | sa->store = cpu_shares_store; | ||
152 | } | ||
153 | |||
154 | /* Create "/sys/kernel/uids/<uid>" directory and | ||
155 | * "/sys/kernel/uids/<uid>/cpu_share" file for this user. | ||
156 | */ | ||
157 | static int user_kobject_create(struct user_struct *up) | ||
158 | { | ||
159 | struct kset *kset = &up->kset; | ||
160 | struct kobject *kobj = &kset->kobj; | ||
161 | int error; | ||
162 | |||
163 | memset(kset, 0, sizeof(struct kset)); | ||
164 | kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ | ||
165 | kobject_set_name(kobj, "%d", up->uid); | ||
166 | kset_init(kset); | ||
167 | user_attr_init(&up->user_attr, "cpu_share", 0644); | ||
168 | |||
169 | error = kobject_add(kobj); | ||
170 | if (error) | ||
171 | goto done; | ||
172 | |||
173 | error = sysfs_create_file(kobj, &up->user_attr.attr); | ||
174 | if (error) | ||
175 | kobject_del(kobj); | ||
176 | |||
177 | kobject_uevent(kobj, KOBJ_ADD); | ||
178 | |||
179 | done: | ||
180 | return error; | ||
181 | } | ||
182 | |||
183 | /* create these in sysfs filesystem: | ||
184 | * "/sys/kernel/uids" directory | ||
185 | * "/sys/kernel/uids/0" directory (for root user) | ||
186 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | ||
187 | */ | ||
188 | int __init uids_kobject_init(void) | ||
189 | { | ||
190 | int error; | ||
191 | |||
192 | /* create under /sys/kernel dir */ | ||
193 | uids_kobject.parent = &kernel_subsys.kobj; | ||
194 | uids_kobject.kset = &kernel_subsys; | ||
195 | kobject_set_name(&uids_kobject, "uids"); | ||
196 | kobject_init(&uids_kobject); | ||
197 | |||
198 | error = kobject_add(&uids_kobject); | ||
199 | if (!error) | ||
200 | error = user_kobject_create(&root_user); | ||
201 | |||
202 | return error; | ||
203 | } | ||
204 | |||
205 | /* Work function to remove the sysfs directory for a user and free | ||
206 | * up the corresponding structures. | ||
207 | */ | ||
208 | static void remove_user_sysfs_dir(struct work_struct *w) | ||
209 | { | ||
210 | struct user_struct *up = container_of(w, struct user_struct, work); | ||
211 | struct kobject *kobj = &up->kset.kobj; | ||
212 | unsigned long flags; | ||
213 | int remove_user = 0; | ||
214 | |||
215 | /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() | ||
216 | * atomic. | ||
217 | */ | ||
218 | uids_mutex_lock(); | ||
219 | |||
220 | local_irq_save(flags); | ||
221 | |||
222 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | ||
223 | uid_hash_remove(up); | ||
224 | remove_user = 1; | ||
225 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
226 | } else { | ||
227 | local_irq_restore(flags); | ||
228 | } | ||
229 | |||
230 | if (!remove_user) | ||
231 | goto done; | ||
232 | |||
233 | sysfs_remove_file(kobj, &up->user_attr.attr); | ||
234 | kobject_uevent(kobj, KOBJ_REMOVE); | ||
235 | kobject_del(kobj); | ||
236 | |||
237 | sched_destroy_user(up); | ||
238 | key_put(up->uid_keyring); | ||
239 | key_put(up->session_keyring); | ||
240 | kmem_cache_free(uid_cachep, up); | ||
241 | |||
242 | done: | ||
243 | uids_mutex_unlock(); | ||
244 | } | ||
245 | |||
246 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
247 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
248 | * upon function exit. | ||
249 | */ | ||
250 | static inline void free_user(struct user_struct *up, unsigned long flags) | ||
251 | { | ||
252 | /* re-take the reference dropped by free_uid() */ | ||
253 | atomic_inc(&up->__count); | ||
254 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
255 | |||
256 | INIT_WORK(&up->work, remove_user_sysfs_dir); | ||
257 | schedule_work(&up->work); | ||
258 | } | ||
259 | |||
260 | #else /* CONFIG_FAIR_USER_SCHED */ | ||
261 | |||
262 | static void sched_destroy_user(struct user_struct *up) { } | ||
263 | static int sched_create_user(struct user_struct *up) { return 0; } | ||
264 | static void sched_switch_user(struct task_struct *p) { } | ||
265 | static inline int user_kobject_create(struct user_struct *up) { return 0; } | ||
266 | static inline void uids_mutex_lock(void) { } | ||
267 | static inline void uids_mutex_unlock(void) { } | ||
268 | |||
269 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
270 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
271 | * upon function exit. | ||
272 | */ | ||
273 | static inline void free_user(struct user_struct *up, unsigned long flags) | ||
274 | { | ||
275 | uid_hash_remove(up); | ||
276 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
277 | sched_destroy_user(up); | ||
278 | key_put(up->uid_keyring); | ||
279 | key_put(up->session_keyring); | ||
280 | kmem_cache_free(uid_cachep, up); | ||
281 | } | ||
282 | |||
283 | #endif /* CONFIG_FAIR_USER_SCHED */ | ||
284 | |||
83 | /* | 285 | /* |
84 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | 286 | * Locate the user_struct for the passed UID. If found, take a ref on it. The |
85 | * caller must undo that ref with free_uid(). | 287 | * caller must undo that ref with free_uid(). |
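For illustration, a hypothetical userspace sketch of the interface created above; the uid 1000 path and the value 2048 are examples only, while the cpu_share file name matches the attribute registered in user_kobject_create():

	#include <stdio.h>

	int main(void)
	{
		unsigned long shares = 0;
		FILE *f = fopen("/sys/kernel/uids/1000/cpu_share", "r");

		if (f) {
			if (fscanf(f, "%lu", &shares) == 1)
				printf("current cpu_share: %lu\n", shares);
			fclose(f);
		}

		f = fopen("/sys/kernel/uids/1000/cpu_share", "w");
		if (f) {
			fprintf(f, "2048\n");	/* grant uid 1000 a larger share */
			fclose(f);
		}
		return 0;
	}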
@@ -106,15 +308,10 @@ void free_uid(struct user_struct *up) | |||
106 | return; | 308 | return; |
107 | 309 | ||
108 | local_irq_save(flags); | 310 | local_irq_save(flags); |
109 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | 311 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) |
110 | uid_hash_remove(up); | 312 | free_user(up, flags); |
111 | spin_unlock_irqrestore(&uidhash_lock, flags); | 313 | else |
112 | key_put(up->uid_keyring); | ||
113 | key_put(up->session_keyring); | ||
114 | kmem_cache_free(uid_cachep, up); | ||
115 | } else { | ||
116 | local_irq_restore(flags); | 314 | local_irq_restore(flags); |
117 | } | ||
118 | } | 315 | } |
119 | 316 | ||
120 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 317 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
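The CONFIG_FAIR_USER_SCHED variant of free_user() above defers the sysfs teardown to a work item because sysfs removal may sleep, while free_uid() runs with IRQs disabled and uidhash_lock held. A minimal sketch of that deferral pattern, with illustrative names only:

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct example_obj {
		struct work_struct work;
		/* ... payload ... */
	};

	static void example_release(struct work_struct *w)
	{
		struct example_obj *obj = container_of(w, struct example_obj, work);

		/* Process context: sleeping operations (sysfs removal etc.) are fine here. */
		kfree(obj);
	}

	static void example_put_atomic(struct example_obj *obj)
	{
		/* May be called with spinlocks held or IRQs disabled. */
		INIT_WORK(&obj->work, example_release);
		schedule_work(&obj->work);
	}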
@@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
122 | struct hlist_head *hashent = uidhashentry(ns, uid); | 319 | struct hlist_head *hashent = uidhashentry(ns, uid); |
123 | struct user_struct *up; | 320 | struct user_struct *up; |
124 | 321 | ||
322 | /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() | ||
323 | * atomic. | ||
324 | */ | ||
325 | uids_mutex_lock(); | ||
326 | |||
125 | spin_lock_irq(&uidhash_lock); | 327 | spin_lock_irq(&uidhash_lock); |
126 | up = uid_hash_find(uid, hashent); | 328 | up = uid_hash_find(uid, hashent); |
127 | spin_unlock_irq(&uidhash_lock); | 329 | spin_unlock_irq(&uidhash_lock); |
@@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
150 | return NULL; | 352 | return NULL; |
151 | } | 353 | } |
152 | 354 | ||
355 | if (sched_create_user(new) < 0) { | ||
356 | key_put(new->uid_keyring); | ||
357 | key_put(new->session_keyring); | ||
358 | kmem_cache_free(uid_cachep, new); | ||
359 | return NULL; | ||
360 | } | ||
361 | |||
362 | if (user_kobject_create(new)) { | ||
363 | sched_destroy_user(new); | ||
364 | key_put(new->uid_keyring); | ||
365 | key_put(new->session_keyring); | ||
366 | kmem_cache_free(uid_cachep, new); | ||
367 | uids_mutex_unlock(); | ||
368 | return NULL; | ||
369 | } | ||
370 | |||
153 | /* | 371 | /* |
154 | * Before adding this, check whether we raced | 372 | * Before adding this, check whether we raced |
155 | * on adding the same user already.. | 373 | * on adding the same user already.. |
@@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
157 | spin_lock_irq(&uidhash_lock); | 375 | spin_lock_irq(&uidhash_lock); |
158 | up = uid_hash_find(uid, hashent); | 376 | up = uid_hash_find(uid, hashent); |
159 | if (up) { | 377 | if (up) { |
378 | /* This case is not possible when CONFIG_FAIR_USER_SCHED | ||
379 | * is defined, since we serialize alloc_uid() using | ||
380 | * uids_mutex. Hence no need to call | ||
381 | * sched_destroy_user() or remove_user_sysfs_dir(). | ||
382 | */ | ||
160 | key_put(new->uid_keyring); | 383 | key_put(new->uid_keyring); |
161 | key_put(new->session_keyring); | 384 | key_put(new->session_keyring); |
162 | kmem_cache_free(uid_cachep, new); | 385 | kmem_cache_free(uid_cachep, new); |
@@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
167 | spin_unlock_irq(&uidhash_lock); | 390 | spin_unlock_irq(&uidhash_lock); |
168 | 391 | ||
169 | } | 392 | } |
393 | |||
394 | uids_mutex_unlock(); | ||
395 | |||
170 | return up; | 396 | return up; |
171 | } | 397 | } |
172 | 398 | ||
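One detail worth flagging in the allocation hunk above: the user_kobject_create() failure branch releases uids_mutex before returning NULL, but the sched_create_user() failure branch appears to return with the mutex still held. A hypothetical consolidated unwind; example_setup_uid() and the labels are illustrative, not part of the patch:

	static struct user_struct *example_setup_uid(struct user_struct *new)
	{
		if (sched_create_user(new) < 0)
			goto out_put_keys;

		if (user_kobject_create(new))
			goto out_destroy_sched;

		return new;

	out_destroy_sched:
		sched_destroy_user(new);
	out_put_keys:
		key_put(new->uid_keyring);
		key_put(new->session_keyring);
		kmem_cache_free(uid_cachep, new);
		uids_mutex_unlock();
		return NULL;
	}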
@@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user) | |||
184 | atomic_dec(&old_user->processes); | 410 | atomic_dec(&old_user->processes); |
185 | switch_uid_keyring(new_user); | 411 | switch_uid_keyring(new_user); |
186 | current->user = new_user; | 412 | current->user = new_user; |
413 | sched_switch_user(current); | ||
187 | 414 | ||
188 | /* | 415 | /* |
189 | * We need to synchronize with __sigqueue_alloc() | 416 | * We need to synchronize with __sigqueue_alloc() |