Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c               23
-rw-r--r--  kernel/audit.c              17
-rw-r--r--  kernel/cgroup.c              4
-rw-r--r--  kernel/fork.c                2
-rw-r--r--  kernel/futex.c               6
-rw-r--r--  kernel/futex_compat.c        2
-rw-r--r--  kernel/marker.c             31
-rw-r--r--  kernel/power/Kconfig         2
-rw-r--r--  kernel/power/snapshot.c     41
-rw-r--r--  kernel/printk.c             83
-rw-r--r--  kernel/relay.c              12
-rw-r--r--  kernel/sched.c             115
-rw-r--r--  kernel/sched_debug.c         1
-rw-r--r--  kernel/sched_fair.c        291
-rw-r--r--  kernel/time/clocksource.c   16
-rw-r--r--  kernel/time/timekeeping.c    4
-rw-r--r--  kernel/timer.c              10
17 files changed, 420 insertions(+), 240 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index 521dfa53cb99..91e1cfd734d2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -58,6 +58,7 @@
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
+#include <linux/pid_namespace.h>
 
 /*
  * These constants control the amount of freespace that suspend and
@@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct file *);
+static void do_acct_process(struct pid_namespace *ns, struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -86,6 +87,7 @@ struct acct_glbs {
 	volatile int active;
 	volatile int needcheck;
 	struct file *file;
+	struct pid_namespace *ns;
 	struct timer_list timer;
 };
 
@@ -175,9 +177,11 @@ out:
 static void acct_file_reopen(struct file *file)
 {
 	struct file *old_acct = NULL;
+	struct pid_namespace *old_ns = NULL;
 
 	if (acct_globals.file) {
 		old_acct = acct_globals.file;
+		old_ns = acct_globals.ns;
 		del_timer(&acct_globals.timer);
 		acct_globals.active = 0;
 		acct_globals.needcheck = 0;
@@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file)
 	}
 	if (file) {
 		acct_globals.file = file;
+		acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
 		acct_globals.needcheck = 0;
 		acct_globals.active = 1;
 		/* It's been deleted if it was used before so this is safe */
@@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_path.mnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(old_acct);
+		do_acct_process(old_ns, old_acct);
 		filp_close(old_acct, NULL);
+		put_pid_ns(old_ns);
 		spin_lock(&acct_globals.lock);
 	}
 }
@@ -419,7 +425,7 @@ static u32 encode_float(u64 value)
 /*
  * do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(struct file *file)
+static void do_acct_process(struct pid_namespace *ns, struct file *file)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
@@ -481,8 +487,10 @@ static void do_acct_process(struct file *file)
 	ac.ac_gid16 = current->gid;
 #endif
 #if ACCT_VERSION==3
-	ac.ac_pid = current->tgid;
-	ac.ac_ppid = current->real_parent->tgid;
+	ac.ac_pid = task_tgid_nr_ns(current, ns);
+	rcu_read_lock();
+	ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
+	rcu_read_unlock();
 #endif
 
 	spin_lock_irq(&current->sighand->siglock);
@@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead)
 void acct_process(void)
 {
 	struct file *file = NULL;
+	struct pid_namespace *ns;
 
 	/*
	 * accelerate the common fastpath:
@@ -592,8 +601,10 @@ void acct_process(void)
 		return;
 	}
 	get_file(file);
+	ns = get_pid_ns(acct_globals.ns);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(file);
+	do_acct_process(ns, file);
 	fput(file);
+	put_pid_ns(ns);
 }
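
Note (not part of the patch): the acct.c hunks above make BSD process accounting report pids relative to the pid namespace that was active when the accounting file was opened, pinning that namespace with get_pid_ns()/put_pid_ns() and translating ids with task_tgid_nr_ns(). A minimal sketch of the lookup pattern, assuming kernel context; the helper name acct_tgid_in_ns is hypothetical and only illustrates how the calls shown in the diff fit together:

#include <linux/sched.h>
#include <linux/pid_namespace.h>

/* Hypothetical helper: tgid of @task as seen from @ns
 * (task_tgid_nr_ns() returns 0 if the task has no pid in that
 * namespace). */
static pid_t acct_tgid_in_ns(struct task_struct *task,
			     struct pid_namespace *ns)
{
	pid_t nr;

	rcu_read_lock();
	nr = task_tgid_nr_ns(task, ns);
	rcu_read_unlock();

	return nr;
}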
diff --git a/kernel/audit.c b/kernel/audit.c
index 10c4930c2bbf..b782b046543d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -78,9 +78,13 @@ static int audit_default;
 /* If auditing cannot proceed, audit_failure selects what happens. */
 static int audit_failure = AUDIT_FAIL_PRINTK;
 
-/* If audit records are to be written to the netlink socket, audit_pid
- * contains the (non-zero) pid. */
+/*
+ * If audit records are to be written to the netlink socket, audit_pid
+ * contains the pid of the auditd process and audit_nlk_pid contains
+ * the pid to use to send netlink messages to that process.
+ */
 int audit_pid;
+static int audit_nlk_pid;
 
 /* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second. This prevents DoS attacks, but results in
@@ -350,7 +354,7 @@ static int kauditd_thread(void *dummy)
 			wake_up(&audit_backlog_wait);
 		if (skb) {
 			if (audit_pid) {
-				int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+				int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
 				if (err < 0) {
 					BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
 					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -626,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 						  sid, 1);
 
 			audit_pid = new_pid;
+			audit_nlk_pid = NETLINK_CB(skb).pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
 			err = audit_set_rate_limit(status_get->rate_limit,
@@ -1264,8 +1269,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
 
 /**
  * audit_string_contains_control - does a string need to be logged in hex
- * @string - string to be checked
- * @len - max length of the string to check
+ * @string: string to be checked
+ * @len: max length of the string to check
  */
 int audit_string_contains_control(const char *string, size_t len)
 {
@@ -1280,7 +1285,7 @@ int audit_string_contains_control(const char *string, size_t len)
 /**
  * audit_log_n_untrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
- * @len: lenth of string (not including trailing null)
+ * @len: length of string (not including trailing null)
  * @string: string to be logged
  *
  * This code will escape a string that is passed to it if the string
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e9c2fb01e89b..53d86b4b0ce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2082,7 +2082,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 
 		kfree(pidarray);
 	} else {
-		ctr->buf = 0;
+		ctr->buf = NULL;
 		ctr->bufsz = 0;
 	}
 	file->private_data = ctr;
@@ -2614,7 +2614,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 
 static int cgroupstats_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, proc_cgroupstats_show, 0);
+	return single_open(file, proc_cgroupstats_show, NULL);
 }
 
 static struct file_operations proc_cgroupstats_operations = {
diff --git a/kernel/fork.c b/kernel/fork.c
index dd249c37b3a3..9c042f901570 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -394,7 +394,6 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	mm_free_pgd(mm);
-	mm_free_cgroup(mm);
 	destroy_context(mm);
 	free_mm(mm);
 }
@@ -416,6 +415,7 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
+		mm_free_cgroup(mm);
 		mmdrop(mm);
 	}
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 06968cd79200..e43945e995f5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
  */
 static void get_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr == 0)
+	if (key->both.ptr == NULL)
 		return;
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
@@ -2158,7 +2158,7 @@ static struct file_system_type futex_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static int __init init(void)
+static int __init futex_init(void)
 {
 	u32 curval;
 	int i;
@@ -2194,4 +2194,4 @@ static int __init init(void)
 
 	return 0;
 }
-__initcall(init);
+__initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index ff90f049f8f6..04ac3a9e42cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 	return 0;
 }
 
-static void __user *futex_uaddr(struct robust_list *entry,
+static void __user *futex_uaddr(struct robust_list __user *entry,
 				compat_long_t futex_offset)
 {
 	compat_uptr_t base = ptr_to_compat(entry);
diff --git a/kernel/marker.c b/kernel/marker.c
index 48a4ea5afffd..041c33e3e95c 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 	char ptype;
 
 	/*
-	 * disabling preemption to make sure the teardown of the callbacks can
-	 * be done correctly when they are in modules and they insure RCU read
-	 * coherency.
+	 * preempt_disable does two things : disabling preemption to make sure
+	 * the teardown of the callbacks can be done correctly when they are in
+	 * modules and they insure RCU read coherency.
 	 */
 	preempt_disable();
-	ptype = ACCESS_ONCE(mdata->ptype);
+	ptype = mdata->ptype;
 	if (likely(!ptype)) {
 		marker_probe_func *func;
 		/* Must read the ptype before ptr. They are not data dependant,
 		 * so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		func = ACCESS_ONCE(mdata->single.func);
+		func = mdata->single.func;
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
@@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		 * in the fast path, so put the explicit barrier here.
 		 */
 		smp_read_barrier_depends();
-		multi = ACCESS_ONCE(mdata->multi);
+		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++) {
 			va_start(args, fmt);
 			multi[i].func(multi[i].probe_private, call_private, fmt,
@@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 	char ptype;
 
 	preempt_disable();
-	ptype = ACCESS_ONCE(mdata->ptype);
+	ptype = mdata->ptype;
 	if (likely(!ptype)) {
 		marker_probe_func *func;
 		/* Must read the ptype before ptr. They are not data dependant,
 		 * so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		func = ACCESS_ONCE(mdata->single.func);
+		func = mdata->single.func;
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
@@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		 * in the fast path, so put the explicit barrier here.
 		 */
 		smp_read_barrier_depends();
-		multi = ACCESS_ONCE(mdata->multi);
+		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++)
 			multi[i].func(multi[i].probe_private, call_private, fmt,
 					&args);
@@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
 
 /*
  * Disable a marker and its probe callback.
- * Note: only after a synchronize_sched() issued after setting elem->call to the
- * empty function insures that the original callback is not used anymore. This
- * insured by preemption disabling around the call site.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function insures that the original callback is not used anymore. This insured
+ * by preempt_disable around the call site.
  */
 static void disable_marker(struct marker *elem)
 {
@@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem)
 	elem->ptype = 0;	/* single probe */
 	/*
 	 * Leave the private data and id there, because removal is racy and
-	 * should be done only after a synchronize_sched(). These are never used
-	 * until the next initialization anyway.
+	 * should be done only after an RCU period. These are never used until
+	 * the next initialization anyway.
 	 */
 }
 
@@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin,
 
 /*
  * Update probes, removing the faulty probes.
- * Issues a synchronize_sched() when no reference to the module passed
- * as parameter is found in the probes so the probe module can be
- * safely unloaded from now on.
 *
  * Internal callback only changed before the first probe is connected to it.
  * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/pm.txt> and the
+	  and more information, read <file:Documentation/power/pm.txt> and the
 	  Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4c..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  * of @bm->cur_zone_bm are updated.
  */
 
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 				void **addr, unsigned int *bit_nr)
 {
 	struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
 			zone_bm = zone_bm->next;
 
-			BUG_ON(!zone_bm);
+			if (!zone_bm)
+				return -EFAULT;
 		}
 		bm->cur.zone_bm = zone_bm;
 	}
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 	pfn -= bb->start_pfn;
 	*bit_nr = pfn % BM_BITS_PER_CHUNK;
 	*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+	return 0;
 }
 
 static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	set_bit(bit, addr);
 }
 
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+	void *addr;
+	unsigned int bit;
+	int error;
+
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	if (!error)
+		set_bit(bit, addr);
+	return error;
+}
+
 static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	clear_bit(bit, addr);
 }
 
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	return test_bit(bit, addr);
 }
 
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 				region->end_pfn << PAGE_SHIFT);
 
 		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
-			if (pfn_valid(pfn))
-				memory_bm_set_bit(bm, pfn);
+			if (pfn_valid(pfn)) {
+				/*
+				 * It is safe to ignore the result of
+				 * mem_bm_set_bit_check() here, since we won't
+				 * touch the PFNs for which the error is
+				 * returned anyway.
+				 */
+				mem_bm_set_bit_check(bm, pfn);
+			}
 	}
 }
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 9adc2a473e6e..c46a20a19a15 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -616,6 +616,40 @@ asmlinkage int printk(const char *fmt, ...)
 /* cpu currently holding logbuf_lock */
 static volatile unsigned int printk_cpu = UINT_MAX;
 
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
+ */
+static inline int can_use_console(unsigned int cpu)
+{
+	return cpu_online(cpu) || have_callable_console();
+}
+
+/*
+ * Try to get console ownership to actually show the kernel
+ * messages from a 'printk'. Return true (and with the
+ * console_semaphore held, and 'console_locked' set) if it
+ * is successful, false otherwise.
+ *
+ * This gets called with the 'logbuf_lock' spinlock held and
+ * interrupts disabled. It should return with 'lockbuf_lock'
+ * released but interrupts still disabled.
+ */
+static int acquire_console_semaphore_for_printk(unsigned int cpu)
+{
+	int retval = 0;
+
+	if (can_use_console(cpu))
+		retval = !try_acquire_console_sem();
+	printk_cpu = UINT_MAX;
+	spin_unlock(&logbuf_lock);
+	return retval;
+}
+
 const char printk_recursion_bug_msg [] =
 	KERN_CRIT "BUG: recent printk recursion!\n";
 static int printk_recursion_bug;
@@ -725,43 +759,22 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 			log_level_unknown = 1;
 		}
 
-	if (!down_trylock(&console_sem)) {
-		/*
-		 * We own the drivers. We can drop the spinlock and
-		 * let release_console_sem() print the text, maybe ...
-		 */
-		console_locked = 1;
-		printk_cpu = UINT_MAX;
-		spin_unlock(&logbuf_lock);
+	/*
+	 * Try to acquire and then immediately release the
+	 * console semaphore. The release will do all the
+	 * actual magic (print out buffers, wake up klogd,
+	 * etc).
+	 *
+	 * The acquire_console_semaphore_for_printk() function
+	 * will release 'logbuf_lock' regardless of whether it
+	 * actually gets the semaphore or not.
+	 */
+	if (acquire_console_semaphore_for_printk(this_cpu))
+		release_console_sem();
 
-		/*
-		 * Console drivers may assume that per-cpu resources have
-		 * been allocated. So unless they're explicitly marked as
-		 * being able to cope (CON_ANYTIME) don't call them until
-		 * this CPU is officially up.
-		 */
-		if (cpu_online(smp_processor_id()) || have_callable_console()) {
-			console_may_schedule = 0;
-			release_console_sem();
-		} else {
-			/* Release by hand to avoid flushing the buffer. */
-			console_locked = 0;
-			up(&console_sem);
-		}
-		lockdep_on();
-		raw_local_irq_restore(flags);
-	} else {
-		/*
-		 * Someone else owns the drivers. We drop the spinlock, which
-		 * allows the semaphore holder to proceed and to call the
-		 * console drivers with the output which we just produced.
-		 */
-		printk_cpu = UINT_MAX;
-		spin_unlock(&logbuf_lock);
-		lockdep_on();
+	lockdep_on();
 out_restore_irqs:
 	raw_local_irq_restore(flags);
-	}
 
 	preempt_enable();
 	return printed_len;
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a7..d6204a485818 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
 	kref_get(&buf->kref);
 	filp->private_data = buf;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 }
 
 /**
@@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
 /*
  * subbuf_splice_actor - splice up to one subbuf's worth of data
  */
@@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in,
 			       unsigned int flags,
 			       int *nonpad_ret)
 {
-	unsigned int pidx, poff, total_len, subbuf_pages, ret;
+	unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
 	struct rchan_buf *rbuf = in->private_data;
 	unsigned int subbuf_size = rbuf->chan->subbuf_size;
 	uint64_t pos = (uint64_t) *ppos;
@@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in,
 		.partial = partial,
 		.flags = flags,
 		.ops = &relay_pipe_buf_ops,
+		.spd_release = relay_page_release,
 	};
 
 	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
@@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
+	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
 
-	for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
 		unsigned int cur_pos = read_start + total_len;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 1cb53fb1fe3d..8dcdec6fe0fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct cfs_rq {
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
 
 	unsigned long nr_spread_over;
 
@@ -594,18 +594,14 @@ enum {
 	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
 	SCHED_FEAT_START_DEBIT		= 4,
-	SCHED_FEAT_TREE_AVG		= 8,
-	SCHED_FEAT_APPROX_AVG		= 16,
-	SCHED_FEAT_HRTICK		= 32,
-	SCHED_FEAT_DOUBLE_TICK		= 64,
+	SCHED_FEAT_HRTICK		= 8,
+	SCHED_FEAT_DOUBLE_TICK		= 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_NEW_FAIR_SLEEPERS	* 1 |
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
-		SCHED_FEAT_TREE_AVG		* 0 |
-		SCHED_FEAT_APPROX_AVG		* 0 |
 		SCHED_FEAT_HRTICK		* 1 |
 		SCHED_FEAT_DOUBLE_TICK		* 0;
 
@@ -1056,6 +1052,49 @@ static void resched_cpu(int cpu)
 	resched_task(cpu_curr(cpu));
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (cpu == smp_processor_id())
+		return;
+
+	/*
+	 * This is safe, as this function is called with the timer
+	 * wheel base lock of (cpu) held. When the CPU is on the way
+	 * to idle and has not yet set rq->curr to idle then it will
+	 * be serialized on the timer wheel base lock and take the new
+	 * timer into account automatically.
+	 */
+	if (rq->curr != rq->idle)
+		return;
+
+	/*
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
+	 */
+	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(rq->idle))
+		smp_send_reschedule(cpu);
+}
+#endif
+
 #else
 static void __resched_task(struct task_struct *p, int tif_bit)
 {
@@ -1084,7 +1123,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;
 
 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1108,11 +1147,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }
 
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }
 
 /*
@@ -1394,6 +1435,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 	s64 delta;
 
+	/*
+	 * Buddy candidates are cache hot:
+	 */
+	if (&p->se == cfs_rq_of(&p->se)->next)
+		return 1;
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -1853,10 +1900,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
-	check_preempt_curr(rq, p);
 	success = 1;
 
 out_running:
+	check_preempt_curr(rq, p);
+
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -1890,6 +1938,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.last_wakeup		= 0;
+	p->se.avg_overlap		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start		= 0;
@@ -3875,7 +3925,7 @@ need_resched_nonpreemptible:
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-				unlikely(signal_pending(prev)))) {
+				signal_pending(prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate_task(rq, prev, 1);
@@ -4268,11 +4318,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4281,10 +4330,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	p->prio = prio;
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		enqueue_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4581,19 +4629,17 @@ recheck:
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		deactivate_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		activate_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -6804,6 +6850,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */
  */
 static cpumask_t fallback_doms;
 
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -6813,6 +6863,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	int err;
 
+	arch_update_cpu_topology();
 	ndoms_cur = 1;
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
@@ -6917,7 +6968,7 @@ match2:
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static int arch_reinit_sched_domains(void)
+int arch_reinit_sched_domains(void)
 {
 	int err;
 
@@ -7618,11 +7669,10 @@ void sched_move_task(struct task_struct *tsk)
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, tsk, 0);
-		if (unlikely(running))
-			tsk->sched_class->put_prev_task(rq, tsk);
-	}
+	if (unlikely(running))
+		tsk->sched_class->put_prev_task(rq, tsk);
 
 	set_task_rq(tsk, task_cpu(tsk));
 
@@ -7631,11 +7681,10 @@ void sched_move_task(struct task_struct *tsk)
 		tsk->sched_class->moved_group(tsk);
 #endif
 
-	if (on_rq) {
-		if (unlikely(running))
-			tsk->sched_class->set_curr_task(rq);
-		enqueue_task(rq, tsk, 0);
-	}
+	if (unlikely(running))
+		tsk->sched_class->set_curr_task(rq);
+	if (on_rq)
+		enqueue_task(rq, tsk, 0);
 
 	task_rq_unlock(rq, &flags);
 }
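
Note (not part of the patch): arch_update_cpu_topology() above is introduced as a weak, empty default so an architecture can supply its own version. A minimal sketch of that linker pattern, with the overriding file purely hypothetical:

/* generic code (as in kernel/sched.c above): empty weak default */
void __attribute__((weak)) arch_update_cpu_topology(void)
{
}

/* in a separate, architecture-specific object file, a strong
 * definition replaces the weak default at link time, so
 * arch_init_sched_domains() then calls the arch hook instead
 * (hypothetical override, shown only to illustrate the mechanism) */
void arch_update_cpu_topology(void)
{
	/* refresh architecture-specific CPU topology tables here */
}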
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
+	PN(se.avg_overlap);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e2a530515619..86a93376282c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost)
+	if (leftmost) {
 		cfs_rq->rb_leftmost = &se->run_node;
+		/*
+		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
+		 * value tracking the leftmost vruntime in the tree.
+		 */
+		cfs_rq->min_vruntime =
+			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (cfs_rq->rb_leftmost == &se->run_node)
-		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	if (cfs_rq->rb_leftmost == &se->run_node) {
+		struct rb_node *next_node;
+		struct sched_entity *next;
+
+		next_node = rb_next(&se->run_node);
+		cfs_rq->rb_leftmost = next_node;
+
+		if (next_node) {
+			next = rb_entry(next_node,
+					struct sched_entity, run_node);
+			cfs_rq->min_vruntime =
+				max_vruntime(cfs_rq->min_vruntime,
+					     next->vruntime);
+		}
+	}
+
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
 
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
@@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	slice *= se->load.weight;
-	do_div(slice, cfs_rq->load.weight);
-
-	return slice;
+	return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+			       se->load.weight, &cfs_rq->load);
 }
 
 /*
@@ -283,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 	return vslice;
 }
 
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
-	return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -303,7 +317,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	      unsigned long delta_exec)
 {
 	unsigned long delta_exec_weighted;
-	u64 vruntime;
 
 	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 
@@ -315,19 +328,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 							&curr->load);
 	}
 	curr->vruntime += delta_exec_weighted;
-
-	/*
-	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-	 * value tracking the leftmost vruntime in the tree.
-	 */
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(curr->vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = curr->vruntime;
-
-	cfs_rq->min_vruntime =
-		max_vruntime(cfs_rq->min_vruntime, vruntime);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -493,16 +493,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime;
 
-	vruntime = cfs_rq->min_vruntime;
-
-	if (sched_feat(TREE_AVG)) {
-		struct sched_entity *last = __pick_last_entity(cfs_rq);
-		if (last) {
-			vruntime += last->vruntime;
-			vruntime >>= 1;
-		}
-	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
-		vruntime += sched_vslice(cfs_rq)/2;
+	if (first_fair(cfs_rq)) {
+		vruntime = min_vruntime(cfs_rq->min_vruntime,
+				__pick_next_entity(cfs_rq)->vruntime);
+	} else
+		vruntime = cfs_rq->min_vruntime;
 
 	/*
	 * The 'current' period is already promised to the current tasks,
@@ -515,8 +510,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			vruntime -= calc_delta_fair(sysctl_sched_latency,
+						    &cfs_rq->load);
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -545,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -555,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -616,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 diff, gran;
+
+	if (!cfs_rq->next)
+		return se;
+
+	diff = cfs_rq->next->vruntime - se->vruntime;
+	if (diff < 0)
+		return se;
+
+	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+	if (diff > gran)
+		return se;
+
+	return cfs_rq->next;
+}
+
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = NULL;
 
 	if (first_fair(cfs_rq)) {
 		se = __pick_next_entity(cfs_rq);
+		se = pick_next(cfs_rq, se);
 		set_next_entity(cfs_rq, se);
 	}
 
@@ -949,96 +982,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
+	    unsigned int imbalance)
 {
-	int cpu, this_cpu;
-	struct rq *rq;
-	struct sched_domain *sd, *this_sd = NULL;
-	int new_cpu;
+	struct task_struct *curr = this_rq->curr;
+	unsigned long tl = this_load;
+	unsigned long tl_per_task;
+
+	if (!(this_sd->flags & SD_WAKE_AFFINE))
+		return 0;
+
+	/*
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
+	 */
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
+
+	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	cpu = task_cpu(p);
-	rq = task_rq(p);
-	this_cpu = smp_processor_id();
-	new_cpu = cpu;
+	/*
	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+			100*(tl + p->se.load.weight) <= imbalance*load) {
+		/*
		 * This domain has SD_WAKE_AFFINE and
+		 * p is cache cold in this domain, and
+		 * there is no bad imbalance.
+		 */
+		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(p, se.nr_wakeups_affine);
+
+		return 1;
+	}
+	return 0;
+}
 
-	if (cpu == this_cpu)
-		goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	struct sched_domain *sd, *this_sd = NULL;
+	int prev_cpu, this_cpu, new_cpu;
+	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
+	unsigned int imbalance;
+	int idx;
+
+	prev_cpu	= task_cpu(p);
+	rq		= task_rq(p);
+	this_cpu	= smp_processor_id();
+	this_rq		= cpu_rq(this_cpu);
+	new_cpu		= prev_cpu;
 
+	/*
	 * 'this_sd' is the first domain that both
+	 * this_cpu and prev_cpu are present in:
+	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
+		if (cpu_isset(prev_cpu, sd->span)) {
 			this_sd = sd;
 			break;
 		}
 	}
 
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+		goto out;
 
 	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-		unsigned long load, this_load;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			    100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
+	if (!this_sd)
+		goto out;
 
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
+	idx = this_sd->wake_idx;
+
+	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
+
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+				load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
+		goto out;
+
+	/*
	 * Start passive balancing when half the imbalance_pct
+	 * limit is reached.
+	 */
+	if (this_sd->flags & SD_WAKE_BALANCE) {
+		if (imbalance*this_load <= 100*load) {
+			schedstat_inc(this_sd, ttwu_move_balance);
+			schedstat_inc(p, se.nr_wakeups_passive);
+			return this_cpu;
 		}
 	}
 
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
 	return wake_idle(new_cpu, p);
 }
 #endif /* CONFIG_SMP */
@@ -1060,6 +1118,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		resched_task(curr);
 		return;
 	}
+
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
+	cfs_rq_of(pse)->next = pse;
+
 	/*
	 * Batch tasks do not preempt (their preemption is driven by
	 * the tick):
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 548c436a776b..7f60097d443a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data)
 	}
 
 	if (!list_empty(&watchdog_list)) {
-		/* Cycle through CPUs to check if the CPUs stay synchronized to
-		 * each other. */
-		int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
-		if (next_cpu >= NR_CPUS)
-			next_cpu = first_cpu(cpu_online_map);
-		watchdog_timer.expires += WATCHDOG_INTERVAL;
-		add_timer_on(&watchdog_timer, next_cpu);
+		__mod_timer(&watchdog_timer,
+			    watchdog_timer.expires + WATCHDOG_INTERVAL);
 	}
 	spin_unlock(&watchdog_lock);
 }
@@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 		if (!started && watchdog) {
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-			add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
+			add_timer(&watchdog_timer);
 		}
 	} else {
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -179,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 		if (watchdog)
 			del_timer(&watchdog_timer);
 		watchdog = cs;
-		init_timer_deferrable(&watchdog_timer);
+		init_timer(&watchdog_timer);
 		watchdog_timer.function = clocksource_watchdog;
 
 		/* Reset watchdog cycles */
@@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires =
 				jiffies + WATCHDOG_INTERVAL;
-			add_timer_on(&watchdog_timer,
-				     first_cpu(cpu_online_map));
+			add_timer(&watchdog_timer);
 		}
 	}
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 671af612b768..a3fa587c350c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -191,8 +191,12 @@ static void change_clocksource(void)
 
 	tick_clock_notify();
 
+	/*
+	 * We're holding xtime lock and waking up klogd would deadlock
+	 * us on enqueue.  So no printing!
 	printk(KERN_INFO "Time: %s clocksource has been installed.\n",
 	       clock->name);
+	 */
 }
 #else
 static inline void change_clocksource(void) { }
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..b024106daa70 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	spin_lock_irqsave(&base->lock, flags);
 	timer_set_base(timer, base);
 	internal_add_timer(base, timer);
+	/*
	 * Check whether the other CPU is idle and needs to be
+	 * triggered to reevaluate the timer wheel when nohz is
+	 * active. We are protected against the other CPU fiddling
+	 * with the timer by holding the timer base lock. This also
+	 * makes sure that a CPU on the way to idle can not evaluate
+	 * the timer wheel.
+	 */
+	wake_up_idle_cpu(cpu);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified