Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c             |  23
-rw-r--r--  kernel/audit.c            |  17
-rw-r--r--  kernel/cgroup.c           |   4
-rw-r--r--  kernel/fork.c             |   2
-rw-r--r--  kernel/futex.c            |   6
-rw-r--r--  kernel/futex_compat.c     |   2
-rw-r--r--  kernel/marker.c           |  31
-rw-r--r--  kernel/power/Kconfig      |   2
-rw-r--r--  kernel/power/snapshot.c   |  41
-rw-r--r--  kernel/printk.c           |  83
-rw-r--r--  kernel/relay.c            |  12
-rw-r--r--  kernel/sched.c            | 115
-rw-r--r--  kernel/sched_debug.c      |   1
-rw-r--r--  kernel/sched_fair.c       | 291
-rw-r--r--  kernel/time/clocksource.c |  16
-rw-r--r--  kernel/time/timekeeping.c |   4
-rw-r--r--  kernel/timer.c            |  10
17 files changed, 420 insertions, 240 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 521dfa53cb99..91e1cfd734d2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -58,6 +58,7 @@ | |||
| 58 | #include <asm/uaccess.h> | 58 | #include <asm/uaccess.h> |
| 59 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
| 60 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
| 61 | #include <linux/pid_namespace.h> | ||
| 61 | 62 | ||
| 62 | /* | 63 | /* |
| 63 | * These constants control the amount of freespace that suspend and | 64 | * These constants control the amount of freespace that suspend and |
| @@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; | |||
| 74 | /* | 75 | /* |
| 75 | * External references and all of the globals. | 76 | * External references and all of the globals. |
| 76 | */ | 77 | */ |
| 77 | static void do_acct_process(struct file *); | 78 | static void do_acct_process(struct pid_namespace *ns, struct file *); |
| 78 | 79 | ||
| 79 | /* | 80 | /* |
| 80 | * This structure is used so that all the data protected by lock | 81 | * This structure is used so that all the data protected by lock |
| @@ -86,6 +87,7 @@ struct acct_glbs { | |||
| 86 | volatile int active; | 87 | volatile int active; |
| 87 | volatile int needcheck; | 88 | volatile int needcheck; |
| 88 | struct file *file; | 89 | struct file *file; |
| 90 | struct pid_namespace *ns; | ||
| 89 | struct timer_list timer; | 91 | struct timer_list timer; |
| 90 | }; | 92 | }; |
| 91 | 93 | ||
| @@ -175,9 +177,11 @@ out: | |||
| 175 | static void acct_file_reopen(struct file *file) | 177 | static void acct_file_reopen(struct file *file) |
| 176 | { | 178 | { |
| 177 | struct file *old_acct = NULL; | 179 | struct file *old_acct = NULL; |
| 180 | struct pid_namespace *old_ns = NULL; | ||
| 178 | 181 | ||
| 179 | if (acct_globals.file) { | 182 | if (acct_globals.file) { |
| 180 | old_acct = acct_globals.file; | 183 | old_acct = acct_globals.file; |
| 184 | old_ns = acct_globals.ns; | ||
| 181 | del_timer(&acct_globals.timer); | 185 | del_timer(&acct_globals.timer); |
| 182 | acct_globals.active = 0; | 186 | acct_globals.active = 0; |
| 183 | acct_globals.needcheck = 0; | 187 | acct_globals.needcheck = 0; |
| @@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file) | |||
| 185 | } | 189 | } |
| 186 | if (file) { | 190 | if (file) { |
| 187 | acct_globals.file = file; | 191 | acct_globals.file = file; |
| 192 | acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); | ||
| 188 | acct_globals.needcheck = 0; | 193 | acct_globals.needcheck = 0; |
| 189 | acct_globals.active = 1; | 194 | acct_globals.active = 1; |
| 190 | /* It's been deleted if it was used before so this is safe */ | 195 | /* It's been deleted if it was used before so this is safe */ |
| @@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file) | |||
| 196 | if (old_acct) { | 201 | if (old_acct) { |
| 197 | mnt_unpin(old_acct->f_path.mnt); | 202 | mnt_unpin(old_acct->f_path.mnt); |
| 198 | spin_unlock(&acct_globals.lock); | 203 | spin_unlock(&acct_globals.lock); |
| 199 | do_acct_process(old_acct); | 204 | do_acct_process(old_ns, old_acct); |
| 200 | filp_close(old_acct, NULL); | 205 | filp_close(old_acct, NULL); |
| 206 | put_pid_ns(old_ns); | ||
| 201 | spin_lock(&acct_globals.lock); | 207 | spin_lock(&acct_globals.lock); |
| 202 | } | 208 | } |
| 203 | } | 209 | } |
| @@ -419,7 +425,7 @@ static u32 encode_float(u64 value) | |||
| 419 | /* | 425 | /* |
| 420 | * do_acct_process does all actual work. Caller holds the reference to file. | 426 | * do_acct_process does all actual work. Caller holds the reference to file. |
| 421 | */ | 427 | */ |
| 422 | static void do_acct_process(struct file *file) | 428 | static void do_acct_process(struct pid_namespace *ns, struct file *file) |
| 423 | { | 429 | { |
| 424 | struct pacct_struct *pacct = ¤t->signal->pacct; | 430 | struct pacct_struct *pacct = ¤t->signal->pacct; |
| 425 | acct_t ac; | 431 | acct_t ac; |
| @@ -481,8 +487,10 @@ static void do_acct_process(struct file *file) | |||
| 481 | ac.ac_gid16 = current->gid; | 487 | ac.ac_gid16 = current->gid; |
| 482 | #endif | 488 | #endif |
| 483 | #if ACCT_VERSION==3 | 489 | #if ACCT_VERSION==3 |
| 484 | ac.ac_pid = current->tgid; | 490 | ac.ac_pid = task_tgid_nr_ns(current, ns); |
| 485 | ac.ac_ppid = current->real_parent->tgid; | 491 | rcu_read_lock(); |
| 492 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); | ||
| 493 | rcu_read_unlock(); | ||
| 486 | #endif | 494 | #endif |
| 487 | 495 | ||
| 488 | spin_lock_irq(¤t->sighand->siglock); | 496 | spin_lock_irq(¤t->sighand->siglock); |
| @@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead) | |||
| 578 | void acct_process(void) | 586 | void acct_process(void) |
| 579 | { | 587 | { |
| 580 | struct file *file = NULL; | 588 | struct file *file = NULL; |
| 589 | struct pid_namespace *ns; | ||
| 581 | 590 | ||
| 582 | /* | 591 | /* |
| 583 | * accelerate the common fastpath: | 592 | * accelerate the common fastpath: |
| @@ -592,8 +601,10 @@ void acct_process(void) | |||
| 592 | return; | 601 | return; |
| 593 | } | 602 | } |
| 594 | get_file(file); | 603 | get_file(file); |
| 604 | ns = get_pid_ns(acct_globals.ns); | ||
| 595 | spin_unlock(&acct_globals.lock); | 605 | spin_unlock(&acct_globals.lock); |
| 596 | 606 | ||
| 597 | do_acct_process(file); | 607 | do_acct_process(ns, file); |
| 598 | fput(file); | 608 | fput(file); |
| 609 | put_pid_ns(ns); | ||
| 599 | } | 610 | } |
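The acct.c hunks above record the pid namespace that was active when accounting was enabled and pass it to do_acct_process(), so ac_pid and ac_ppid are filled in with task_tgid_nr_ns() relative to that namespace. A minimal userspace sketch (not part of the patch; needs CAP_SYS_ADMIN) showing why a namespace argument is needed at all: the same task has different pid numbers in different pid namespaces.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    /* Create a new pid namespace; children of this process will live in it. */
    if (unshare(CLONE_NEWPID) != 0) {
        perror("unshare(CLONE_NEWPID)");
        return 1;
    }

    pid_t child = fork();   /* first child becomes pid 1 inside the new namespace */
    if (child == 0) {
        printf("inside new ns : getpid() = %d\n", getpid());   /* prints 1 */
        return 0;
    }

    printf("parent ns view: child pid   = %d\n", child);       /* a normal pid number */
    waitpid(child, NULL, 0);
    return 0;
}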
diff --git a/kernel/audit.c b/kernel/audit.c
index 10c4930c2bbf..b782b046543d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -78,9 +78,13 @@ static int audit_default; | |||
| 78 | /* If auditing cannot proceed, audit_failure selects what happens. */ | 78 | /* If auditing cannot proceed, audit_failure selects what happens. */ |
| 79 | static int audit_failure = AUDIT_FAIL_PRINTK; | 79 | static int audit_failure = AUDIT_FAIL_PRINTK; |
| 80 | 80 | ||
| 81 | /* If audit records are to be written to the netlink socket, audit_pid | 81 | /* |
| 82 | * contains the (non-zero) pid. */ | 82 | * If audit records are to be written to the netlink socket, audit_pid |
| 83 | * contains the pid of the auditd process and audit_nlk_pid contains | ||
| 84 | * the pid to use to send netlink messages to that process. | ||
| 85 | */ | ||
| 83 | int audit_pid; | 86 | int audit_pid; |
| 87 | static int audit_nlk_pid; | ||
| 84 | 88 | ||
| 85 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records | 89 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records |
| 86 | * to that number per second. This prevents DoS attacks, but results in | 90 | * to that number per second. This prevents DoS attacks, but results in |
| @@ -350,7 +354,7 @@ static int kauditd_thread(void *dummy) | |||
| 350 | wake_up(&audit_backlog_wait); | 354 | wake_up(&audit_backlog_wait); |
| 351 | if (skb) { | 355 | if (skb) { |
| 352 | if (audit_pid) { | 356 | if (audit_pid) { |
| 353 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); | 357 | int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); |
| 354 | if (err < 0) { | 358 | if (err < 0) { |
| 355 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ | 359 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ |
| 356 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 360 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
| @@ -626,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 626 | sid, 1); | 630 | sid, 1); |
| 627 | 631 | ||
| 628 | audit_pid = new_pid; | 632 | audit_pid = new_pid; |
| 633 | audit_nlk_pid = NETLINK_CB(skb).pid; | ||
| 629 | } | 634 | } |
| 630 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 635 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
| 631 | err = audit_set_rate_limit(status_get->rate_limit, | 636 | err = audit_set_rate_limit(status_get->rate_limit, |
| @@ -1264,8 +1269,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | |||
| 1264 | 1269 | ||
| 1265 | /** | 1270 | /** |
| 1266 | * audit_string_contains_control - does a string need to be logged in hex | 1271 | * audit_string_contains_control - does a string need to be logged in hex |
| 1267 | * @string - string to be checked | 1272 | * @string: string to be checked |
| 1268 | * @len - max length of the string to check | 1273 | * @len: max length of the string to check |
| 1269 | */ | 1274 | */ |
| 1270 | int audit_string_contains_control(const char *string, size_t len) | 1275 | int audit_string_contains_control(const char *string, size_t len) |
| 1271 | { | 1276 | { |
| @@ -1280,7 +1285,7 @@ int audit_string_contains_control(const char *string, size_t len) | |||
| 1280 | /** | 1285 | /** |
| 1281 | * audit_log_n_untrustedstring - log a string that may contain random characters | 1286 | * audit_log_n_untrustedstring - log a string that may contain random characters |
| 1282 | * @ab: audit_buffer | 1287 | * @ab: audit_buffer |
| 1283 | * @len: lenth of string (not including trailing null) | 1288 | * @len: length of string (not including trailing null) |
| 1284 | * @string: string to be logged | 1289 | * @string: string to be logged |
| 1285 | * | 1290 | * |
| 1286 | * This code will escape a string that is passed to it if the string | 1291 | * This code will escape a string that is passed to it if the string |
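The audit.c change above stops using the daemon-reported audit_pid as a netlink destination and instead remembers NETLINK_CB(skb).pid, because a netlink "pid" is really a socket port id that only defaults to the process pid. A small userspace sketch (not from the patch; NETLINK_ROUTE used purely for illustration) that prints both values:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    if (fd < 0) { perror("socket"); return 1; }

    struct sockaddr_nl sa = { .nl_family = AF_NETLINK, .nl_pid = 0 /* kernel picks */ };
    if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) { perror("bind"); return 1; }

    /* The kernel assigns a unique port id; for a process's first netlink socket it
     * usually equals getpid(), but that is not guaranteed. */
    struct sockaddr_nl bound;
    socklen_t len = sizeof(bound);
    getsockname(fd, (struct sockaddr *)&bound, &len);

    printf("process pid: %d, netlink port id: %u\n", getpid(), bound.nl_pid);
    close(fd);
    return 0;
}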
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e9c2fb01e89b..53d86b4b0ce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -2082,7 +2082,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) | |||
| 2082 | 2082 | ||
| 2083 | kfree(pidarray); | 2083 | kfree(pidarray); |
| 2084 | } else { | 2084 | } else { |
| 2085 | ctr->buf = 0; | 2085 | ctr->buf = NULL; |
| 2086 | ctr->bufsz = 0; | 2086 | ctr->bufsz = 0; |
| 2087 | } | 2087 | } |
| 2088 | file->private_data = ctr; | 2088 | file->private_data = ctr; |
| @@ -2614,7 +2614,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
| 2614 | 2614 | ||
| 2615 | static int cgroupstats_open(struct inode *inode, struct file *file) | 2615 | static int cgroupstats_open(struct inode *inode, struct file *file) |
| 2616 | { | 2616 | { |
| 2617 | return single_open(file, proc_cgroupstats_show, 0); | 2617 | return single_open(file, proc_cgroupstats_show, NULL); |
| 2618 | } | 2618 | } |
| 2619 | 2619 | ||
| 2620 | static struct file_operations proc_cgroupstats_operations = { | 2620 | static struct file_operations proc_cgroupstats_operations = { |
diff --git a/kernel/fork.c b/kernel/fork.c
index dd249c37b3a3..9c042f901570 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -394,7 +394,6 @@ void __mmdrop(struct mm_struct *mm) | |||
| 394 | { | 394 | { |
| 395 | BUG_ON(mm == &init_mm); | 395 | BUG_ON(mm == &init_mm); |
| 396 | mm_free_pgd(mm); | 396 | mm_free_pgd(mm); |
| 397 | mm_free_cgroup(mm); | ||
| 398 | destroy_context(mm); | 397 | destroy_context(mm); |
| 399 | free_mm(mm); | 398 | free_mm(mm); |
| 400 | } | 399 | } |
| @@ -416,6 +415,7 @@ void mmput(struct mm_struct *mm) | |||
| 416 | spin_unlock(&mmlist_lock); | 415 | spin_unlock(&mmlist_lock); |
| 417 | } | 416 | } |
| 418 | put_swap_token(mm); | 417 | put_swap_token(mm); |
| 418 | mm_free_cgroup(mm); | ||
| 419 | mmdrop(mm); | 419 | mmdrop(mm); |
| 420 | } | 420 | } |
| 421 | } | 421 | } |
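The fork.c hunks above move mm_free_cgroup() from __mmdrop() (last mm_count reference) into mmput() (last mm_users reference), so the cgroup association is released as soon as the last real user of the address space is gone rather than when the last lazy reference is dropped. A toy standalone model of that two-level refcount split (names are illustrative, not kernel API):

#include <stdio.h>
#include <stdlib.h>

/* "users" counts real users of the address space (mmput path), "count" counts
 * references to the structure itself (mmdrop path). Per the patch, per-user
 * state such as the cgroup association belongs in the users==0 path. */
struct toy_mm {
    int users;          /* like mm_users */
    int count;          /* like mm_count */
    void *cgroup_state; /* stand-in for the mm's cgroup association */
};

static void toy_mmdrop(struct toy_mm *mm)
{
    if (--mm->count == 0)
        free(mm);                   /* last structural reference: free the struct */
}

static void toy_mmput(struct toy_mm *mm)
{
    if (--mm->users == 0) {
        free(mm->cgroup_state);     /* release per-user state promptly ... */
        mm->cgroup_state = NULL;
        toy_mmdrop(mm);             /* ... then drop the structural reference */
    }
}

int main(void)
{
    struct toy_mm *mm = calloc(1, sizeof(*mm));
    mm->users = 1;
    mm->count = 2;                  /* one ref for the user, one lazy ref */
    mm->cgroup_state = malloc(16);

    toy_mmput(mm);                  /* cgroup state freed here */
    toy_mmdrop(mm);                 /* lazy holder lets go later; struct freed */
    printf("done\n");
    return 0;
}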
diff --git a/kernel/futex.c b/kernel/futex.c
index 06968cd79200..e43945e995f5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
| @@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 281 | */ | 281 | */ |
| 282 | static void get_futex_key_refs(union futex_key *key) | 282 | static void get_futex_key_refs(union futex_key *key) |
| 283 | { | 283 | { |
| 284 | if (key->both.ptr == 0) | 284 | if (key->both.ptr == NULL) |
| 285 | return; | 285 | return; |
| 286 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 286 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
| 287 | case FUT_OFF_INODE: | 287 | case FUT_OFF_INODE: |
| @@ -2158,7 +2158,7 @@ static struct file_system_type futex_fs_type = { | |||
| 2158 | .kill_sb = kill_anon_super, | 2158 | .kill_sb = kill_anon_super, |
| 2159 | }; | 2159 | }; |
| 2160 | 2160 | ||
| 2161 | static int __init init(void) | 2161 | static int __init futex_init(void) |
| 2162 | { | 2162 | { |
| 2163 | u32 curval; | 2163 | u32 curval; |
| 2164 | int i; | 2164 | int i; |
| @@ -2194,4 +2194,4 @@ static int __init init(void) | |||
| 2194 | 2194 | ||
| 2195 | return 0; | 2195 | return 0; |
| 2196 | } | 2196 | } |
| 2197 | __initcall(init); | 2197 | __initcall(futex_init); |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index ff90f049f8f6..04ac3a9e42cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
| @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | |||
| 30 | return 0; | 30 | return 0; |
| 31 | } | 31 | } |
| 32 | 32 | ||
| 33 | static void __user *futex_uaddr(struct robust_list *entry, | 33 | static void __user *futex_uaddr(struct robust_list __user *entry, |
| 34 | compat_long_t futex_offset) | 34 | compat_long_t futex_offset) |
| 35 | { | 35 | { |
| 36 | compat_uptr_t base = ptr_to_compat(entry); | 36 | compat_uptr_t base = ptr_to_compat(entry); |
diff --git a/kernel/marker.c b/kernel/marker.c
index 48a4ea5afffd..041c33e3e95c 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
| @@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, | |||
| 104 | char ptype; | 104 | char ptype; |
| 105 | 105 | ||
| 106 | /* | 106 | /* |
| 107 | * disabling preemption to make sure the teardown of the callbacks can | 107 | * preempt_disable does two things : disabling preemption to make sure |
| 108 | * be done correctly when they are in modules and they insure RCU read | 108 | * the teardown of the callbacks can be done correctly when they are in |
| 109 | * coherency. | 109 | * modules and they insure RCU read coherency. |
| 110 | */ | 110 | */ |
| 111 | preempt_disable(); | 111 | preempt_disable(); |
| 112 | ptype = ACCESS_ONCE(mdata->ptype); | 112 | ptype = mdata->ptype; |
| 113 | if (likely(!ptype)) { | 113 | if (likely(!ptype)) { |
| 114 | marker_probe_func *func; | 114 | marker_probe_func *func; |
| 115 | /* Must read the ptype before ptr. They are not data dependant, | 115 | /* Must read the ptype before ptr. They are not data dependant, |
| 116 | * so we put an explicit smp_rmb() here. */ | 116 | * so we put an explicit smp_rmb() here. */ |
| 117 | smp_rmb(); | 117 | smp_rmb(); |
| 118 | func = ACCESS_ONCE(mdata->single.func); | 118 | func = mdata->single.func; |
| 119 | /* Must read the ptr before private data. They are not data | 119 | /* Must read the ptr before private data. They are not data |
| 120 | * dependant, so we put an explicit smp_rmb() here. */ | 120 | * dependant, so we put an explicit smp_rmb() here. */ |
| 121 | smp_rmb(); | 121 | smp_rmb(); |
| @@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, | |||
| 133 | * in the fast path, so put the explicit barrier here. | 133 | * in the fast path, so put the explicit barrier here. |
| 134 | */ | 134 | */ |
| 135 | smp_read_barrier_depends(); | 135 | smp_read_barrier_depends(); |
| 136 | multi = ACCESS_ONCE(mdata->multi); | 136 | multi = mdata->multi; |
| 137 | for (i = 0; multi[i].func; i++) { | 137 | for (i = 0; multi[i].func; i++) { |
| 138 | va_start(args, fmt); | 138 | va_start(args, fmt); |
| 139 | multi[i].func(multi[i].probe_private, call_private, fmt, | 139 | multi[i].func(multi[i].probe_private, call_private, fmt, |
| @@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata, | |||
| 161 | char ptype; | 161 | char ptype; |
| 162 | 162 | ||
| 163 | preempt_disable(); | 163 | preempt_disable(); |
| 164 | ptype = ACCESS_ONCE(mdata->ptype); | 164 | ptype = mdata->ptype; |
| 165 | if (likely(!ptype)) { | 165 | if (likely(!ptype)) { |
| 166 | marker_probe_func *func; | 166 | marker_probe_func *func; |
| 167 | /* Must read the ptype before ptr. They are not data dependant, | 167 | /* Must read the ptype before ptr. They are not data dependant, |
| 168 | * so we put an explicit smp_rmb() here. */ | 168 | * so we put an explicit smp_rmb() here. */ |
| 169 | smp_rmb(); | 169 | smp_rmb(); |
| 170 | func = ACCESS_ONCE(mdata->single.func); | 170 | func = mdata->single.func; |
| 171 | /* Must read the ptr before private data. They are not data | 171 | /* Must read the ptr before private data. They are not data |
| 172 | * dependant, so we put an explicit smp_rmb() here. */ | 172 | * dependant, so we put an explicit smp_rmb() here. */ |
| 173 | smp_rmb(); | 173 | smp_rmb(); |
| @@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, | |||
| 183 | * in the fast path, so put the explicit barrier here. | 183 | * in the fast path, so put the explicit barrier here. |
| 184 | */ | 184 | */ |
| 185 | smp_read_barrier_depends(); | 185 | smp_read_barrier_depends(); |
| 186 | multi = ACCESS_ONCE(mdata->multi); | 186 | multi = mdata->multi; |
| 187 | for (i = 0; multi[i].func; i++) | 187 | for (i = 0; multi[i].func; i++) |
| 188 | multi[i].func(multi[i].probe_private, call_private, fmt, | 188 | multi[i].func(multi[i].probe_private, call_private, fmt, |
| 189 | &args); | 189 | &args); |
| @@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, | |||
| 551 | 551 | ||
| 552 | /* | 552 | /* |
| 553 | * Disable a marker and its probe callback. | 553 | * Disable a marker and its probe callback. |
| 554 | * Note: only after a synchronize_sched() issued after setting elem->call to the | 554 | * Note: only waiting an RCU period after setting elem->call to the empty |
| 555 | * empty function insures that the original callback is not used anymore. This | 555 | * function insures that the original callback is not used anymore. This insured |
| 556 | * insured by preemption disabling around the call site. | 556 | * by preempt_disable around the call site. |
| 557 | */ | 557 | */ |
| 558 | static void disable_marker(struct marker *elem) | 558 | static void disable_marker(struct marker *elem) |
| 559 | { | 559 | { |
| @@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem) | |||
| 565 | elem->ptype = 0; /* single probe */ | 565 | elem->ptype = 0; /* single probe */ |
| 566 | /* | 566 | /* |
| 567 | * Leave the private data and id there, because removal is racy and | 567 | * Leave the private data and id there, because removal is racy and |
| 568 | * should be done only after a synchronize_sched(). These are never used | 568 | * should be done only after an RCU period. These are never used until |
| 569 | * until the next initialization anyway. | 569 | * the next initialization anyway. |
| 570 | */ | 570 | */ |
| 571 | } | 571 | } |
| 572 | 572 | ||
| @@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin, | |||
| 601 | 601 | ||
| 602 | /* | 602 | /* |
| 603 | * Update probes, removing the faulty probes. | 603 | * Update probes, removing the faulty probes. |
| 604 | * Issues a synchronize_sched() when no reference to the module passed | ||
| 605 | * as parameter is found in the probes so the probe module can be | ||
| 606 | * safely unloaded from now on. | ||
| 607 | * | 604 | * |
| 608 | * Internal callback only changed before the first probe is connected to it. | 605 | * Internal callback only changed before the first probe is connected to it. |
| 609 | * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 | 606 | * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
| @@ -190,7 +190,7 @@ config APM_EMULATION | |||
| 190 | notification of APM "events" (e.g. battery status change). | 190 | notification of APM "events" (e.g. battery status change). |
| 191 | 191 | ||
| 192 | In order to use APM, you will need supporting software. For location | 192 | In order to use APM, you will need supporting software. For location |
| 193 | and more information, read <file:Documentation/pm.txt> and the | 193 | and more information, read <file:Documentation/power/pm.txt> and the |
| 194 | Battery Powered Linux mini-HOWTO, available from | 194 | Battery Powered Linux mini-HOWTO, available from |
| 195 | <http://www.tldp.org/docs.html#howto>. | 195 | <http://www.tldp.org/docs.html#howto>. |
| 196 | 196 | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4c..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
| @@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | |||
| 447 | * of @bm->cur_zone_bm are updated. | 447 | * of @bm->cur_zone_bm are updated. |
| 448 | */ | 448 | */ |
| 449 | 449 | ||
| 450 | static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | 450 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, |
| 451 | void **addr, unsigned int *bit_nr) | 451 | void **addr, unsigned int *bit_nr) |
| 452 | { | 452 | { |
| 453 | struct zone_bitmap *zone_bm; | 453 | struct zone_bitmap *zone_bm; |
| @@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | |||
| 461 | while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { | 461 | while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { |
| 462 | zone_bm = zone_bm->next; | 462 | zone_bm = zone_bm->next; |
| 463 | 463 | ||
| 464 | BUG_ON(!zone_bm); | 464 | if (!zone_bm) |
| 465 | return -EFAULT; | ||
| 465 | } | 466 | } |
| 466 | bm->cur.zone_bm = zone_bm; | 467 | bm->cur.zone_bm = zone_bm; |
| 467 | } | 468 | } |
| @@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | |||
| 479 | pfn -= bb->start_pfn; | 480 | pfn -= bb->start_pfn; |
| 480 | *bit_nr = pfn % BM_BITS_PER_CHUNK; | 481 | *bit_nr = pfn % BM_BITS_PER_CHUNK; |
| 481 | *addr = bb->data + pfn / BM_BITS_PER_CHUNK; | 482 | *addr = bb->data + pfn / BM_BITS_PER_CHUNK; |
| 483 | return 0; | ||
| 482 | } | 484 | } |
| 483 | 485 | ||
| 484 | static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) | 486 | static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) |
| 485 | { | 487 | { |
| 486 | void *addr; | 488 | void *addr; |
| 487 | unsigned int bit; | 489 | unsigned int bit; |
| 490 | int error; | ||
| 488 | 491 | ||
| 489 | memory_bm_find_bit(bm, pfn, &addr, &bit); | 492 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); |
| 493 | BUG_ON(error); | ||
| 490 | set_bit(bit, addr); | 494 | set_bit(bit, addr); |
| 491 | } | 495 | } |
| 492 | 496 | ||
| 497 | static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) | ||
| 498 | { | ||
| 499 | void *addr; | ||
| 500 | unsigned int bit; | ||
| 501 | int error; | ||
| 502 | |||
| 503 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); | ||
| 504 | if (!error) | ||
| 505 | set_bit(bit, addr); | ||
| 506 | return error; | ||
| 507 | } | ||
| 508 | |||
| 493 | static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) | 509 | static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) |
| 494 | { | 510 | { |
| 495 | void *addr; | 511 | void *addr; |
| 496 | unsigned int bit; | 512 | unsigned int bit; |
| 513 | int error; | ||
| 497 | 514 | ||
| 498 | memory_bm_find_bit(bm, pfn, &addr, &bit); | 515 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); |
| 516 | BUG_ON(error); | ||
| 499 | clear_bit(bit, addr); | 517 | clear_bit(bit, addr); |
| 500 | } | 518 | } |
| 501 | 519 | ||
| @@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) | |||
| 503 | { | 521 | { |
| 504 | void *addr; | 522 | void *addr; |
| 505 | unsigned int bit; | 523 | unsigned int bit; |
| 524 | int error; | ||
| 506 | 525 | ||
| 507 | memory_bm_find_bit(bm, pfn, &addr, &bit); | 526 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); |
| 527 | BUG_ON(error); | ||
| 508 | return test_bit(bit, addr); | 528 | return test_bit(bit, addr); |
| 509 | } | 529 | } |
| 510 | 530 | ||
| @@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
| 709 | region->end_pfn << PAGE_SHIFT); | 729 | region->end_pfn << PAGE_SHIFT); |
| 710 | 730 | ||
| 711 | for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) | 731 | for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) |
| 712 | if (pfn_valid(pfn)) | 732 | if (pfn_valid(pfn)) { |
| 713 | memory_bm_set_bit(bm, pfn); | 733 | /* |
| 734 | * It is safe to ignore the result of | ||
| 735 | * mem_bm_set_bit_check() here, since we won't | ||
| 736 | * touch the PFNs for which the error is | ||
| 737 | * returned anyway. | ||
| 738 | */ | ||
| 739 | mem_bm_set_bit_check(bm, pfn); | ||
| 740 | } | ||
| 714 | } | 741 | } |
| 715 | } | 742 | } |
| 716 | 743 | ||
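The snapshot.c change above converts memory_bm_find_bit() from BUG_ON-on-miss to returning -EFAULT, and adds mem_bm_set_bit_check() for callers such as mark_nosave_pages() that may legitimately pass pfns the bitmap does not cover. A standalone sketch (illustrative names, not kernel code) of that strict vs. checking split:

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>

#define NBITS 128
static unsigned long bits[NBITS / (sizeof(unsigned long) * CHAR_BIT)];

/* Locate the word and bit for pfn; return -EFAULT instead of crashing when the
 * pfn falls outside the region the bitmap covers. */
static int find_bit(unsigned long pfn, unsigned long **addr, unsigned int *bit)
{
    if (pfn >= NBITS)
        return -EFAULT;
    *addr = &bits[pfn / (sizeof(unsigned long) * CHAR_BIT)];
    *bit = pfn % (sizeof(unsigned long) * CHAR_BIT);
    return 0;
}

/* Caller guarantees the pfn is covered: treat a miss as a bug. */
static void set_bit_strict(unsigned long pfn)
{
    unsigned long *addr; unsigned int bit;
    int err = find_bit(pfn, &addr, &bit);
    assert(!err);
    *addr |= 1UL << bit;
}

/* Caller walks arbitrary regions: quietly skip uncovered pfns, report the error. */
static int set_bit_check(unsigned long pfn)
{
    unsigned long *addr; unsigned int bit;
    int err = find_bit(pfn, &addr, &bit);
    if (!err)
        *addr |= 1UL << bit;
    return err;
}

int main(void)
{
    set_bit_strict(10);
    printf("pfn 500: %s\n", set_bit_check(500) ? "skipped" : "set");
    return 0;
}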
diff --git a/kernel/printk.c b/kernel/printk.c
index 9adc2a473e6e..c46a20a19a15 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
| @@ -616,6 +616,40 @@ asmlinkage int printk(const char *fmt, ...) | |||
| 616 | /* cpu currently holding logbuf_lock */ | 616 | /* cpu currently holding logbuf_lock */ |
| 617 | static volatile unsigned int printk_cpu = UINT_MAX; | 617 | static volatile unsigned int printk_cpu = UINT_MAX; |
| 618 | 618 | ||
| 619 | /* | ||
| 620 | * Can we actually use the console at this time on this cpu? | ||
| 621 | * | ||
| 622 | * Console drivers may assume that per-cpu resources have | ||
| 623 | * been allocated. So unless they're explicitly marked as | ||
| 624 | * being able to cope (CON_ANYTIME) don't call them until | ||
| 625 | * this CPU is officially up. | ||
| 626 | */ | ||
| 627 | static inline int can_use_console(unsigned int cpu) | ||
| 628 | { | ||
| 629 | return cpu_online(cpu) || have_callable_console(); | ||
| 630 | } | ||
| 631 | |||
| 632 | /* | ||
| 633 | * Try to get console ownership to actually show the kernel | ||
| 634 | * messages from a 'printk'. Return true (and with the | ||
| 635 | * console_semaphore held, and 'console_locked' set) if it | ||
| 636 | * is successful, false otherwise. | ||
| 637 | * | ||
| 638 | * This gets called with the 'logbuf_lock' spinlock held and | ||
| 639 | * interrupts disabled. It should return with 'lockbuf_lock' | ||
| 640 | * released but interrupts still disabled. | ||
| 641 | */ | ||
| 642 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | ||
| 643 | { | ||
| 644 | int retval = 0; | ||
| 645 | |||
| 646 | if (can_use_console(cpu)) | ||
| 647 | retval = !try_acquire_console_sem(); | ||
| 648 | printk_cpu = UINT_MAX; | ||
| 649 | spin_unlock(&logbuf_lock); | ||
| 650 | return retval; | ||
| 651 | } | ||
| 652 | |||
| 619 | const char printk_recursion_bug_msg [] = | 653 | const char printk_recursion_bug_msg [] = |
| 620 | KERN_CRIT "BUG: recent printk recursion!\n"; | 654 | KERN_CRIT "BUG: recent printk recursion!\n"; |
| 621 | static int printk_recursion_bug; | 655 | static int printk_recursion_bug; |
| @@ -725,43 +759,22 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 725 | log_level_unknown = 1; | 759 | log_level_unknown = 1; |
| 726 | } | 760 | } |
| 727 | 761 | ||
| 728 | if (!down_trylock(&console_sem)) { | 762 | /* |
| 729 | /* | 763 | * Try to acquire and then immediately release the |
| 730 | * We own the drivers. We can drop the spinlock and | 764 | * console semaphore. The release will do all the |
| 731 | * let release_console_sem() print the text, maybe ... | 765 | * actual magic (print out buffers, wake up klogd, |
| 732 | */ | 766 | * etc). |
| 733 | console_locked = 1; | 767 | * |
| 734 | printk_cpu = UINT_MAX; | 768 | * The acquire_console_semaphore_for_printk() function |
| 735 | spin_unlock(&logbuf_lock); | 769 | * will release 'logbuf_lock' regardless of whether it |
| 770 | * actually gets the semaphore or not. | ||
| 771 | */ | ||
| 772 | if (acquire_console_semaphore_for_printk(this_cpu)) | ||
| 773 | release_console_sem(); | ||
| 736 | 774 | ||
| 737 | /* | 775 | lockdep_on(); |
| 738 | * Console drivers may assume that per-cpu resources have | ||
| 739 | * been allocated. So unless they're explicitly marked as | ||
| 740 | * being able to cope (CON_ANYTIME) don't call them until | ||
| 741 | * this CPU is officially up. | ||
| 742 | */ | ||
| 743 | if (cpu_online(smp_processor_id()) || have_callable_console()) { | ||
| 744 | console_may_schedule = 0; | ||
| 745 | release_console_sem(); | ||
| 746 | } else { | ||
| 747 | /* Release by hand to avoid flushing the buffer. */ | ||
| 748 | console_locked = 0; | ||
| 749 | up(&console_sem); | ||
| 750 | } | ||
| 751 | lockdep_on(); | ||
| 752 | raw_local_irq_restore(flags); | ||
| 753 | } else { | ||
| 754 | /* | ||
| 755 | * Someone else owns the drivers. We drop the spinlock, which | ||
| 756 | * allows the semaphore holder to proceed and to call the | ||
| 757 | * console drivers with the output which we just produced. | ||
| 758 | */ | ||
| 759 | printk_cpu = UINT_MAX; | ||
| 760 | spin_unlock(&logbuf_lock); | ||
| 761 | lockdep_on(); | ||
| 762 | out_restore_irqs: | 776 | out_restore_irqs: |
| 763 | raw_local_irq_restore(flags); | 777 | raw_local_irq_restore(flags); |
| 764 | } | ||
| 765 | 778 | ||
| 766 | preempt_enable(); | 779 | preempt_enable(); |
| 767 | return printed_len; | 780 | return printed_len; |
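The printk.c rework above factors the "try to become console owner, but always drop logbuf_lock" step into acquire_console_semaphore_for_printk(); a caller that loses the race simply lets the current owner flush its message. A much-simplified pthread sketch of that hand-off pattern (it ignores interrupts, lockdep, and the fact that the real helper drops logbuf_lock itself):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Every caller appends to the log under a short lock, then *tries* to take
 * console ownership. Whoever holds the console mutex flushes everything, so a
 * failed trylock is fine: the current owner will print our message for us. */
static pthread_mutex_t toy_logbuf_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t toy_console_sem = PTHREAD_MUTEX_INITIALIZER;
static char toy_logbuf[4096];

static void toy_printk(const char *msg)
{
    pthread_mutex_lock(&toy_logbuf_lock);
    strncat(toy_logbuf, msg, sizeof(toy_logbuf) - strlen(toy_logbuf) - 1);
    pthread_mutex_unlock(&toy_logbuf_lock);

    /* Analogue of acquire_console_semaphore_for_printk(): only flush if we can
     * get ownership without blocking. */
    if (pthread_mutex_trylock(&toy_console_sem) == 0) {
        pthread_mutex_lock(&toy_logbuf_lock);
        fputs(toy_logbuf, stdout);
        toy_logbuf[0] = '\0';
        pthread_mutex_unlock(&toy_logbuf_lock);
        pthread_mutex_unlock(&toy_console_sem);
    }
}

int main(void)
{
    toy_printk("hello ");
    toy_printk("world\n");
    return 0;
}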
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a7..d6204a485818 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
| @@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) | |||
| 736 | kref_get(&buf->kref); | 736 | kref_get(&buf->kref); |
| 737 | filp->private_data = buf; | 737 | filp->private_data = buf; |
| 738 | 738 | ||
| 739 | return 0; | 739 | return nonseekable_open(inode, filp); |
| 740 | } | 740 | } |
| 741 | 741 | ||
| 742 | /** | 742 | /** |
| @@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { | |||
| 1056 | .get = generic_pipe_buf_get, | 1056 | .get = generic_pipe_buf_get, |
| 1057 | }; | 1057 | }; |
| 1058 | 1058 | ||
| 1059 | static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) | ||
| 1060 | { | ||
| 1061 | } | ||
| 1062 | |||
| 1059 | /* | 1063 | /* |
| 1060 | * subbuf_splice_actor - splice up to one subbuf's worth of data | 1064 | * subbuf_splice_actor - splice up to one subbuf's worth of data |
| 1061 | */ | 1065 | */ |
| @@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in, | |||
| 1066 | unsigned int flags, | 1070 | unsigned int flags, |
| 1067 | int *nonpad_ret) | 1071 | int *nonpad_ret) |
| 1068 | { | 1072 | { |
| 1069 | unsigned int pidx, poff, total_len, subbuf_pages, ret; | 1073 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; |
| 1070 | struct rchan_buf *rbuf = in->private_data; | 1074 | struct rchan_buf *rbuf = in->private_data; |
| 1071 | unsigned int subbuf_size = rbuf->chan->subbuf_size; | 1075 | unsigned int subbuf_size = rbuf->chan->subbuf_size; |
| 1072 | uint64_t pos = (uint64_t) *ppos; | 1076 | uint64_t pos = (uint64_t) *ppos; |
| @@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in, | |||
| 1083 | .partial = partial, | 1087 | .partial = partial, |
| 1084 | .flags = flags, | 1088 | .flags = flags, |
| 1085 | .ops = &relay_pipe_buf_ops, | 1089 | .ops = &relay_pipe_buf_ops, |
| 1090 | .spd_release = relay_page_release, | ||
| 1086 | }; | 1091 | }; |
| 1087 | 1092 | ||
| 1088 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1093 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
| @@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in, | |||
| 1097 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; | 1102 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; |
| 1098 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; | 1103 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; |
| 1099 | poff = read_start & ~PAGE_MASK; | 1104 | poff = read_start & ~PAGE_MASK; |
| 1105 | nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); | ||
| 1100 | 1106 | ||
| 1101 | for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { | 1107 | for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { |
| 1102 | unsigned int this_len, this_end, private; | 1108 | unsigned int this_len, this_end, private; |
| 1103 | unsigned int cur_pos = read_start + total_len; | 1109 | unsigned int cur_pos = read_start + total_len; |
| 1104 | 1110 | ||
diff --git a/kernel/sched.c b/kernel/sched.c
index 1cb53fb1fe3d..8dcdec6fe0fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -301,7 +301,7 @@ struct cfs_rq { | |||
| 301 | /* 'curr' points to currently running entity on this cfs_rq. | 301 | /* 'curr' points to currently running entity on this cfs_rq. |
| 302 | * It is set to NULL otherwise (i.e when none are currently running). | 302 | * It is set to NULL otherwise (i.e when none are currently running). |
| 303 | */ | 303 | */ |
| 304 | struct sched_entity *curr; | 304 | struct sched_entity *curr, *next; |
| 305 | 305 | ||
| 306 | unsigned long nr_spread_over; | 306 | unsigned long nr_spread_over; |
| 307 | 307 | ||
| @@ -594,18 +594,14 @@ enum { | |||
| 594 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 594 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, |
| 595 | SCHED_FEAT_WAKEUP_PREEMPT = 2, | 595 | SCHED_FEAT_WAKEUP_PREEMPT = 2, |
| 596 | SCHED_FEAT_START_DEBIT = 4, | 596 | SCHED_FEAT_START_DEBIT = 4, |
| 597 | SCHED_FEAT_TREE_AVG = 8, | 597 | SCHED_FEAT_HRTICK = 8, |
| 598 | SCHED_FEAT_APPROX_AVG = 16, | 598 | SCHED_FEAT_DOUBLE_TICK = 16, |
| 599 | SCHED_FEAT_HRTICK = 32, | ||
| 600 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
| 601 | }; | 599 | }; |
| 602 | 600 | ||
| 603 | const_debug unsigned int sysctl_sched_features = | 601 | const_debug unsigned int sysctl_sched_features = |
| 604 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 602 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | |
| 605 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 603 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
| 606 | SCHED_FEAT_START_DEBIT * 1 | | 604 | SCHED_FEAT_START_DEBIT * 1 | |
| 607 | SCHED_FEAT_TREE_AVG * 0 | | ||
| 608 | SCHED_FEAT_APPROX_AVG * 0 | | ||
| 609 | SCHED_FEAT_HRTICK * 1 | | 605 | SCHED_FEAT_HRTICK * 1 | |
| 610 | SCHED_FEAT_DOUBLE_TICK * 0; | 606 | SCHED_FEAT_DOUBLE_TICK * 0; |
| 611 | 607 | ||
| @@ -1056,6 +1052,49 @@ static void resched_cpu(int cpu) | |||
| 1056 | resched_task(cpu_curr(cpu)); | 1052 | resched_task(cpu_curr(cpu)); |
| 1057 | spin_unlock_irqrestore(&rq->lock, flags); | 1053 | spin_unlock_irqrestore(&rq->lock, flags); |
| 1058 | } | 1054 | } |
| 1055 | |||
| 1056 | #ifdef CONFIG_NO_HZ | ||
| 1057 | /* | ||
| 1058 | * When add_timer_on() enqueues a timer into the timer wheel of an | ||
| 1059 | * idle CPU then this timer might expire before the next timer event | ||
| 1060 | * which is scheduled to wake up that CPU. In case of a completely | ||
| 1061 | * idle system the next event might even be infinite time into the | ||
| 1062 | * future. wake_up_idle_cpu() ensures that the CPU is woken up and | ||
| 1063 | * leaves the inner idle loop so the newly added timer is taken into | ||
| 1064 | * account when the CPU goes back to idle and evaluates the timer | ||
| 1065 | * wheel for the next timer event. | ||
| 1066 | */ | ||
| 1067 | void wake_up_idle_cpu(int cpu) | ||
| 1068 | { | ||
| 1069 | struct rq *rq = cpu_rq(cpu); | ||
| 1070 | |||
| 1071 | if (cpu == smp_processor_id()) | ||
| 1072 | return; | ||
| 1073 | |||
| 1074 | /* | ||
| 1075 | * This is safe, as this function is called with the timer | ||
| 1076 | * wheel base lock of (cpu) held. When the CPU is on the way | ||
| 1077 | * to idle and has not yet set rq->curr to idle then it will | ||
| 1078 | * be serialized on the timer wheel base lock and take the new | ||
| 1079 | * timer into account automatically. | ||
| 1080 | */ | ||
| 1081 | if (rq->curr != rq->idle) | ||
| 1082 | return; | ||
| 1083 | |||
| 1084 | /* | ||
| 1085 | * We can set TIF_RESCHED on the idle task of the other CPU | ||
| 1086 | * lockless. The worst case is that the other CPU runs the | ||
| 1087 | * idle task through an additional NOOP schedule() | ||
| 1088 | */ | ||
| 1089 | set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); | ||
| 1090 | |||
| 1091 | /* NEED_RESCHED must be visible before we test polling */ | ||
| 1092 | smp_mb(); | ||
| 1093 | if (!tsk_is_polling(rq->idle)) | ||
| 1094 | smp_send_reschedule(cpu); | ||
| 1095 | } | ||
| 1096 | #endif | ||
| 1097 | |||
| 1059 | #else | 1098 | #else |
| 1060 | static void __resched_task(struct task_struct *p, int tif_bit) | 1099 | static void __resched_task(struct task_struct *p, int tif_bit) |
| 1061 | { | 1100 | { |
| @@ -1084,7 +1123,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
| 1084 | u64 tmp; | 1123 | u64 tmp; |
| 1085 | 1124 | ||
| 1086 | if (unlikely(!lw->inv_weight)) | 1125 | if (unlikely(!lw->inv_weight)) |
| 1087 | lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; | 1126 | lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); |
| 1088 | 1127 | ||
| 1089 | tmp = (u64)delta_exec * weight; | 1128 | tmp = (u64)delta_exec * weight; |
| 1090 | /* | 1129 | /* |
| @@ -1108,11 +1147,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | |||
| 1108 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1147 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 1109 | { | 1148 | { |
| 1110 | lw->weight += inc; | 1149 | lw->weight += inc; |
| 1150 | lw->inv_weight = 0; | ||
| 1111 | } | 1151 | } |
| 1112 | 1152 | ||
| 1113 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | 1153 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
| 1114 | { | 1154 | { |
| 1115 | lw->weight -= dec; | 1155 | lw->weight -= dec; |
| 1156 | lw->inv_weight = 0; | ||
| 1116 | } | 1157 | } |
| 1117 | 1158 | ||
| 1118 | /* | 1159 | /* |
| @@ -1394,6 +1435,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1394 | { | 1435 | { |
| 1395 | s64 delta; | 1436 | s64 delta; |
| 1396 | 1437 | ||
| 1438 | /* | ||
| 1439 | * Buddy candidates are cache hot: | ||
| 1440 | */ | ||
| 1441 | if (&p->se == cfs_rq_of(&p->se)->next) | ||
| 1442 | return 1; | ||
| 1443 | |||
| 1397 | if (p->sched_class != &fair_sched_class) | 1444 | if (p->sched_class != &fair_sched_class) |
| 1398 | return 0; | 1445 | return 0; |
| 1399 | 1446 | ||
| @@ -1853,10 +1900,11 @@ out_activate: | |||
| 1853 | schedstat_inc(p, se.nr_wakeups_remote); | 1900 | schedstat_inc(p, se.nr_wakeups_remote); |
| 1854 | update_rq_clock(rq); | 1901 | update_rq_clock(rq); |
| 1855 | activate_task(rq, p, 1); | 1902 | activate_task(rq, p, 1); |
| 1856 | check_preempt_curr(rq, p); | ||
| 1857 | success = 1; | 1903 | success = 1; |
| 1858 | 1904 | ||
| 1859 | out_running: | 1905 | out_running: |
| 1906 | check_preempt_curr(rq, p); | ||
| 1907 | |||
| 1860 | p->state = TASK_RUNNING; | 1908 | p->state = TASK_RUNNING; |
| 1861 | #ifdef CONFIG_SMP | 1909 | #ifdef CONFIG_SMP |
| 1862 | if (p->sched_class->task_wake_up) | 1910 | if (p->sched_class->task_wake_up) |
| @@ -1890,6 +1938,8 @@ static void __sched_fork(struct task_struct *p) | |||
| 1890 | p->se.exec_start = 0; | 1938 | p->se.exec_start = 0; |
| 1891 | p->se.sum_exec_runtime = 0; | 1939 | p->se.sum_exec_runtime = 0; |
| 1892 | p->se.prev_sum_exec_runtime = 0; | 1940 | p->se.prev_sum_exec_runtime = 0; |
| 1941 | p->se.last_wakeup = 0; | ||
| 1942 | p->se.avg_overlap = 0; | ||
| 1893 | 1943 | ||
| 1894 | #ifdef CONFIG_SCHEDSTATS | 1944 | #ifdef CONFIG_SCHEDSTATS |
| 1895 | p->se.wait_start = 0; | 1945 | p->se.wait_start = 0; |
| @@ -3875,7 +3925,7 @@ need_resched_nonpreemptible: | |||
| 3875 | 3925 | ||
| 3876 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3926 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 3877 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3927 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
| 3878 | unlikely(signal_pending(prev)))) { | 3928 | signal_pending(prev))) { |
| 3879 | prev->state = TASK_RUNNING; | 3929 | prev->state = TASK_RUNNING; |
| 3880 | } else { | 3930 | } else { |
| 3881 | deactivate_task(rq, prev, 1); | 3931 | deactivate_task(rq, prev, 1); |
| @@ -4268,11 +4318,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4268 | oldprio = p->prio; | 4318 | oldprio = p->prio; |
| 4269 | on_rq = p->se.on_rq; | 4319 | on_rq = p->se.on_rq; |
| 4270 | running = task_current(rq, p); | 4320 | running = task_current(rq, p); |
| 4271 | if (on_rq) { | 4321 | if (on_rq) |
| 4272 | dequeue_task(rq, p, 0); | 4322 | dequeue_task(rq, p, 0); |
| 4273 | if (running) | 4323 | if (running) |
| 4274 | p->sched_class->put_prev_task(rq, p); | 4324 | p->sched_class->put_prev_task(rq, p); |
| 4275 | } | ||
| 4276 | 4325 | ||
| 4277 | if (rt_prio(prio)) | 4326 | if (rt_prio(prio)) |
| 4278 | p->sched_class = &rt_sched_class; | 4327 | p->sched_class = &rt_sched_class; |
| @@ -4281,10 +4330,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4281 | 4330 | ||
| 4282 | p->prio = prio; | 4331 | p->prio = prio; |
| 4283 | 4332 | ||
| 4333 | if (running) | ||
| 4334 | p->sched_class->set_curr_task(rq); | ||
| 4284 | if (on_rq) { | 4335 | if (on_rq) { |
| 4285 | if (running) | ||
| 4286 | p->sched_class->set_curr_task(rq); | ||
| 4287 | |||
| 4288 | enqueue_task(rq, p, 0); | 4336 | enqueue_task(rq, p, 0); |
| 4289 | 4337 | ||
| 4290 | check_class_changed(rq, p, prev_class, oldprio, running); | 4338 | check_class_changed(rq, p, prev_class, oldprio, running); |
| @@ -4581,19 +4629,17 @@ recheck: | |||
| 4581 | update_rq_clock(rq); | 4629 | update_rq_clock(rq); |
| 4582 | on_rq = p->se.on_rq; | 4630 | on_rq = p->se.on_rq; |
| 4583 | running = task_current(rq, p); | 4631 | running = task_current(rq, p); |
| 4584 | if (on_rq) { | 4632 | if (on_rq) |
| 4585 | deactivate_task(rq, p, 0); | 4633 | deactivate_task(rq, p, 0); |
| 4586 | if (running) | 4634 | if (running) |
| 4587 | p->sched_class->put_prev_task(rq, p); | 4635 | p->sched_class->put_prev_task(rq, p); |
| 4588 | } | ||
| 4589 | 4636 | ||
| 4590 | oldprio = p->prio; | 4637 | oldprio = p->prio; |
| 4591 | __setscheduler(rq, p, policy, param->sched_priority); | 4638 | __setscheduler(rq, p, policy, param->sched_priority); |
| 4592 | 4639 | ||
| 4640 | if (running) | ||
| 4641 | p->sched_class->set_curr_task(rq); | ||
| 4593 | if (on_rq) { | 4642 | if (on_rq) { |
| 4594 | if (running) | ||
| 4595 | p->sched_class->set_curr_task(rq); | ||
| 4596 | |||
| 4597 | activate_task(rq, p, 0); | 4643 | activate_task(rq, p, 0); |
| 4598 | 4644 | ||
| 4599 | check_class_changed(rq, p, prev_class, oldprio, running); | 4645 | check_class_changed(rq, p, prev_class, oldprio, running); |
| @@ -6804,6 +6850,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | |||
| 6804 | */ | 6850 | */ |
| 6805 | static cpumask_t fallback_doms; | 6851 | static cpumask_t fallback_doms; |
| 6806 | 6852 | ||
| 6853 | void __attribute__((weak)) arch_update_cpu_topology(void) | ||
| 6854 | { | ||
| 6855 | } | ||
| 6856 | |||
| 6807 | /* | 6857 | /* |
| 6808 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6858 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| 6809 | * For now this just excludes isolated cpus, but could be used to | 6859 | * For now this just excludes isolated cpus, but could be used to |
| @@ -6813,6 +6863,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 6813 | { | 6863 | { |
| 6814 | int err; | 6864 | int err; |
| 6815 | 6865 | ||
| 6866 | arch_update_cpu_topology(); | ||
| 6816 | ndoms_cur = 1; | 6867 | ndoms_cur = 1; |
| 6817 | doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 6868 | doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
| 6818 | if (!doms_cur) | 6869 | if (!doms_cur) |
| @@ -6917,7 +6968,7 @@ match2: | |||
| 6917 | } | 6968 | } |
| 6918 | 6969 | ||
| 6919 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6970 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| 6920 | static int arch_reinit_sched_domains(void) | 6971 | int arch_reinit_sched_domains(void) |
| 6921 | { | 6972 | { |
| 6922 | int err; | 6973 | int err; |
| 6923 | 6974 | ||
| @@ -7618,11 +7669,10 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7618 | running = task_current(rq, tsk); | 7669 | running = task_current(rq, tsk); |
| 7619 | on_rq = tsk->se.on_rq; | 7670 | on_rq = tsk->se.on_rq; |
| 7620 | 7671 | ||
| 7621 | if (on_rq) { | 7672 | if (on_rq) |
| 7622 | dequeue_task(rq, tsk, 0); | 7673 | dequeue_task(rq, tsk, 0); |
| 7623 | if (unlikely(running)) | 7674 | if (unlikely(running)) |
| 7624 | tsk->sched_class->put_prev_task(rq, tsk); | 7675 | tsk->sched_class->put_prev_task(rq, tsk); |
| 7625 | } | ||
| 7626 | 7676 | ||
| 7627 | set_task_rq(tsk, task_cpu(tsk)); | 7677 | set_task_rq(tsk, task_cpu(tsk)); |
| 7628 | 7678 | ||
| @@ -7631,11 +7681,10 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7631 | tsk->sched_class->moved_group(tsk); | 7681 | tsk->sched_class->moved_group(tsk); |
| 7632 | #endif | 7682 | #endif |
| 7633 | 7683 | ||
| 7634 | if (on_rq) { | 7684 | if (unlikely(running)) |
| 7635 | if (unlikely(running)) | 7685 | tsk->sched_class->set_curr_task(rq); |
| 7636 | tsk->sched_class->set_curr_task(rq); | 7686 | if (on_rq) |
| 7637 | enqueue_task(rq, tsk, 0); | 7687 | enqueue_task(rq, tsk, 0); |
| 7638 | } | ||
| 7639 | 7688 | ||
| 7640 | task_rq_unlock(rq, &flags); | 7689 | task_rq_unlock(rq, &flags); |
| 7641 | } | 7690 | } |
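Among the sched.c changes, the new wake_up_idle_cpu() uses a common lockless idiom: publish the flag first, issue a full barrier, and only pay for a cross-CPU interrupt if the target is not already polling the flag. A standalone C11 sketch of that ordering (toy types; the real code uses TIF_NEED_RESCHED, tsk_is_polling() and smp_send_reschedule()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_cpu {
    atomic_bool need_resched;
    atomic_bool polling;        /* target spins on need_resched when true */
};

static void send_ipi(int cpu) { printf("IPI -> cpu %d\n", cpu); }

static void toy_wake_idle_cpu(struct toy_cpu *c, int cpu)
{
    atomic_store_explicit(&c->need_resched, true, memory_order_relaxed);

    /* need_resched must be visible before we test polling */
    atomic_thread_fence(memory_order_seq_cst);

    if (!atomic_load_explicit(&c->polling, memory_order_relaxed))
        send_ipi(cpu);
}

int main(void)
{
    struct toy_cpu c = { .need_resched = false, .polling = true };

    toy_wake_idle_cpu(&c, 1);   /* target is polling: no IPI needed */
    atomic_store(&c.polling, false);
    toy_wake_idle_cpu(&c, 1);   /* not polling: IPI sent */
    return 0;
}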
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
| @@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 288 | PN(se.exec_start); | 288 | PN(se.exec_start); |
| 289 | PN(se.vruntime); | 289 | PN(se.vruntime); |
| 290 | PN(se.sum_exec_runtime); | 290 | PN(se.sum_exec_runtime); |
| 291 | PN(se.avg_overlap); | ||
| 291 | 292 | ||
| 292 | nr_switches = p->nvcsw + p->nivcsw; | 293 | nr_switches = p->nvcsw + p->nivcsw; |
| 293 | 294 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e2a530515619..86a93376282c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
| @@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | |||
| 73 | 73 | ||
| 74 | /* | 74 | /* |
| 75 | * SCHED_OTHER wake-up granularity. | 75 | * SCHED_OTHER wake-up granularity. |
| 76 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | 76 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 77 | * | 77 | * |
| 78 | * This option delays the preemption effects of decoupled workloads | 78 | * This option delays the preemption effects of decoupled workloads |
| 79 | * and reduces their over-scheduling. Synchronous workloads will still | 79 | * and reduces their over-scheduling. Synchronous workloads will still |
| 80 | * have immediate wakeup/sleep latencies. | 80 | * have immediate wakeup/sleep latencies. |
| 81 | */ | 81 | */ |
| 82 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; | 82 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; |
| 83 | 83 | ||
| 84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 85 | 85 | ||
| @@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 175 | * Maintain a cache of leftmost tree entries (it is frequently | 175 | * Maintain a cache of leftmost tree entries (it is frequently |
| 176 | * used): | 176 | * used): |
| 177 | */ | 177 | */ |
| 178 | if (leftmost) | 178 | if (leftmost) { |
| 179 | cfs_rq->rb_leftmost = &se->run_node; | 179 | cfs_rq->rb_leftmost = &se->run_node; |
| 180 | /* | ||
| 181 | * maintain cfs_rq->min_vruntime to be a monotonic increasing | ||
| 182 | * value tracking the leftmost vruntime in the tree. | ||
| 183 | */ | ||
| 184 | cfs_rq->min_vruntime = | ||
| 185 | max_vruntime(cfs_rq->min_vruntime, se->vruntime); | ||
| 186 | } | ||
| 180 | 187 | ||
| 181 | rb_link_node(&se->run_node, parent, link); | 188 | rb_link_node(&se->run_node, parent, link); |
| 182 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | 189 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); |
| @@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 184 | 191 | ||
| 185 | static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 192 | static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 186 | { | 193 | { |
| 187 | if (cfs_rq->rb_leftmost == &se->run_node) | 194 | if (cfs_rq->rb_leftmost == &se->run_node) { |
| 188 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | 195 | struct rb_node *next_node; |
| 196 | struct sched_entity *next; | ||
| 197 | |||
| 198 | next_node = rb_next(&se->run_node); | ||
| 199 | cfs_rq->rb_leftmost = next_node; | ||
| 200 | |||
| 201 | if (next_node) { | ||
| 202 | next = rb_entry(next_node, | ||
| 203 | struct sched_entity, run_node); | ||
| 204 | cfs_rq->min_vruntime = | ||
| 205 | max_vruntime(cfs_rq->min_vruntime, | ||
| 206 | next->vruntime); | ||
| 207 | } | ||
| 208 | } | ||
| 209 | |||
| 210 | if (cfs_rq->next == se) | ||
| 211 | cfs_rq->next = NULL; | ||
| 189 | 212 | ||
| 190 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 213 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
| 191 | } | 214 | } |
| @@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 260 | */ | 283 | */ |
| 261 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 284 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 262 | { | 285 | { |
| 263 | u64 slice = __sched_period(cfs_rq->nr_running); | 286 | return calc_delta_mine(__sched_period(cfs_rq->nr_running), |
| 264 | 287 | se->load.weight, &cfs_rq->load); | |
| 265 | slice *= se->load.weight; | ||
| 266 | do_div(slice, cfs_rq->load.weight); | ||
| 267 | |||
| 268 | return slice; | ||
| 269 | } | 288 | } |
| 270 | 289 | ||
| 271 | /* | 290 | /* |
| @@ -283,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | |||
| 283 | return vslice; | 302 | return vslice; |
| 284 | } | 303 | } |
| 285 | 304 | ||
| 286 | static u64 sched_vslice(struct cfs_rq *cfs_rq) | ||
| 287 | { | ||
| 288 | return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); | ||
| 289 | } | ||
| 290 | |||
| 291 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 305 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 292 | { | 306 | { |
| 293 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, | 307 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, |
| @@ -303,7 +317,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 303 | unsigned long delta_exec) | 317 | unsigned long delta_exec) |
| 304 | { | 318 | { |
| 305 | unsigned long delta_exec_weighted; | 319 | unsigned long delta_exec_weighted; |
| 306 | u64 vruntime; | ||
| 307 | 320 | ||
| 308 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 321 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
| 309 | 322 | ||
| @@ -315,19 +328,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 315 | &curr->load); | 328 | &curr->load); |
| 316 | } | 329 | } |
| 317 | curr->vruntime += delta_exec_weighted; | 330 | curr->vruntime += delta_exec_weighted; |
| 318 | |||
| 319 | /* | ||
| 320 | * maintain cfs_rq->min_vruntime to be a monotonic increasing | ||
| 321 | * value tracking the leftmost vruntime in the tree. | ||
| 322 | */ | ||
| 323 | if (first_fair(cfs_rq)) { | ||
| 324 | vruntime = min_vruntime(curr->vruntime, | ||
| 325 | __pick_next_entity(cfs_rq)->vruntime); | ||
| 326 | } else | ||
| 327 | vruntime = curr->vruntime; | ||
| 328 | |||
| 329 | cfs_rq->min_vruntime = | ||
| 330 | max_vruntime(cfs_rq->min_vruntime, vruntime); | ||
| 331 | } | 331 | } |
| 332 | 332 | ||
| 333 | static void update_curr(struct cfs_rq *cfs_rq) | 333 | static void update_curr(struct cfs_rq *cfs_rq) |
| @@ -493,16 +493,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 493 | { | 493 | { |
| 494 | u64 vruntime; | 494 | u64 vruntime; |
| 495 | 495 | ||
| 496 | vruntime = cfs_rq->min_vruntime; | 496 | if (first_fair(cfs_rq)) { |
| 497 | 497 | vruntime = min_vruntime(cfs_rq->min_vruntime, | |
| 498 | if (sched_feat(TREE_AVG)) { | 498 | __pick_next_entity(cfs_rq)->vruntime); |
| 499 | struct sched_entity *last = __pick_last_entity(cfs_rq); | 499 | } else |
| 500 | if (last) { | 500 | vruntime = cfs_rq->min_vruntime; |
| 501 | vruntime += last->vruntime; | ||
| 502 | vruntime >>= 1; | ||
| 503 | } | ||
| 504 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) | ||
| 505 | vruntime += sched_vslice(cfs_rq)/2; | ||
| 506 | 501 | ||
| 507 | /* | 502 | /* |
| 508 | * The 'current' period is already promised to the current tasks, | 503 | * The 'current' period is already promised to the current tasks, |
| @@ -515,8 +510,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 515 | 510 | ||
| 516 | if (!initial) { | 511 | if (!initial) { |
| 517 | /* sleeps upto a single latency don't count. */ | 512 | /* sleeps upto a single latency don't count. */ |
| 518 | if (sched_feat(NEW_FAIR_SLEEPERS)) | 513 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
| 519 | vruntime -= sysctl_sched_latency; | 514 | vruntime -= calc_delta_fair(sysctl_sched_latency, |
| 515 | &cfs_rq->load); | ||
| 516 | } | ||
| 520 | 517 | ||
| 521 | /* ensure we never gain time by being placed backwards. */ | 518 | /* ensure we never gain time by being placed backwards. */ |
| 522 | vruntime = max_vruntime(se->vruntime, vruntime); | 519 | vruntime = max_vruntime(se->vruntime, vruntime); |
| @@ -545,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | |||
| 545 | account_entity_enqueue(cfs_rq, se); | 542 | account_entity_enqueue(cfs_rq, se); |
| 546 | } | 543 | } |
| 547 | 544 | ||
| 545 | static void update_avg(u64 *avg, u64 sample) | ||
| 546 | { | ||
| 547 | s64 diff = sample - *avg; | ||
| 548 | *avg += diff >> 3; | ||
| 549 | } | ||
| 550 | |||
| 551 | static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 552 | { | ||
| 553 | if (!se->last_wakeup) | ||
| 554 | return; | ||
| 555 | |||
| 556 | update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); | ||
| 557 | se->last_wakeup = 0; | ||
| 558 | } | ||
| 559 | |||
| 548 | static void | 560 | static void |
| 549 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 561 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
| 550 | { | 562 | { |
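The update_avg() helper added above is an exponential moving average with weight 1/8 (avg += (sample - avg) >> 3), used to track how long a task typically overlaps with its waker. A standalone sketch of how the average follows a changing sample stream:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the kernel helper: move the average 1/8 of the way
 * toward each new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
    int64_t diff = (int64_t)(sample - *avg);
    *avg += diff >> 3;
}

int main(void)
{
    uint64_t avg = 0;
    uint64_t samples[] = { 800000, 600000, 700000, 50000 };  /* nanoseconds */

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        update_avg(&avg, samples[i]);
        printf("sample %llu -> avg %llu\n",
               (unsigned long long)samples[i], (unsigned long long)avg);
    }
    return 0;
}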
| @@ -555,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
| 555 | 567 | ||
| 556 | update_stats_dequeue(cfs_rq, se); | 568 | update_stats_dequeue(cfs_rq, se); |
| 557 | if (sleep) { | 569 | if (sleep) { |
| 570 | update_avg_stats(cfs_rq, se); | ||
| 558 | #ifdef CONFIG_SCHEDSTATS | 571 | #ifdef CONFIG_SCHEDSTATS |
| 559 | if (entity_is_task(se)) { | 572 | if (entity_is_task(se)) { |
| 560 | struct task_struct *tsk = task_of(se); | 573 | struct task_struct *tsk = task_of(se); |
| @@ -616,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 616 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 629 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 617 | } | 630 | } |
| 618 | 631 | ||
| 632 | static struct sched_entity * | ||
| 633 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 634 | { | ||
| 635 | s64 diff, gran; | ||
| 636 | |||
| 637 | if (!cfs_rq->next) | ||
| 638 | return se; | ||
| 639 | |||
| 640 | diff = cfs_rq->next->vruntime - se->vruntime; | ||
| 641 | if (diff < 0) | ||
| 642 | return se; | ||
| 643 | |||
| 644 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); | ||
| 645 | if (diff > gran) | ||
| 646 | return se; | ||
| 647 | |||
| 648 | return cfs_rq->next; | ||
| 649 | } | ||
| 650 | |||
| 619 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 651 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
| 620 | { | 652 | { |
| 621 | struct sched_entity *se = NULL; | 653 | struct sched_entity *se = NULL; |
| 622 | 654 | ||
| 623 | if (first_fair(cfs_rq)) { | 655 | if (first_fair(cfs_rq)) { |
| 624 | se = __pick_next_entity(cfs_rq); | 656 | se = __pick_next_entity(cfs_rq); |
| 657 | se = pick_next(cfs_rq, se); | ||
| 625 | set_next_entity(cfs_rq, se); | 658 | set_next_entity(cfs_rq, se); |
| 626 | } | 659 | } |
| 627 | 660 | ||
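pick_next() above introduces a "next buddy": if check_preempt_wakeup() marked a just-woken entity in cfs_rq->next, it is chosen instead of the leftmost entity, but only when its vruntime is no more than a load-scaled wakeup granularity ahead of the leftmost one, which bounds the unfairness the shortcut can introduce. A sketch of just that comparison; the granularity is passed in as a plain number here, whereas the kernel derives it from sysctl_sched_wakeup_granularity and the queue load:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;
typedef int64_t  s64;

struct entity { const char *name; u64 vruntime; };

/*
 * Prefer the wakeup buddy 'next' over the leftmost entity 'se' only while
 * next's vruntime is at most 'gran' ahead of se's.
 */
static struct entity *pick_next(struct entity *se, struct entity *next, u64 gran)
{
        s64 diff;

        if (!next)
                return se;

        diff = next->vruntime - se->vruntime;
        if (diff < 0)                   /* buddy not ahead of the leftmost */
                return se;
        if (diff > (s64)gran)           /* buddy too far ahead: unfair to pick */
                return se;

        return next;
}

int main(void)
{
        struct entity leftmost = { "leftmost", 1000 };
        struct entity buddy    = { "buddy",    1003 };

        printf("%s\n", pick_next(&leftmost, &buddy, 5)->name);  /* buddy    */
        printf("%s\n", pick_next(&leftmost, &buddy, 2)->name);  /* leftmost */
        return 0;
}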
| @@ -949,96 +982,121 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
| 949 | #endif | 982 | #endif |
| 950 | 983 | ||
| 951 | #ifdef CONFIG_SMP | 984 | #ifdef CONFIG_SMP |
| 952 | static int select_task_rq_fair(struct task_struct *p, int sync) | 985 | |
| 986 | static const struct sched_class fair_sched_class; | ||
| 987 | |||
| 988 | static int | ||
| 989 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | ||
| 990 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | ||
| 991 | int idx, unsigned long load, unsigned long this_load, | ||
| 992 | unsigned int imbalance) | ||
| 953 | { | 993 | { |
| 954 | int cpu, this_cpu; | 994 | struct task_struct *curr = this_rq->curr; |
| 955 | struct rq *rq; | 995 | unsigned long tl = this_load; |
| 956 | struct sched_domain *sd, *this_sd = NULL; | 996 | unsigned long tl_per_task; |
| 957 | int new_cpu; | 997 | |
| 998 | if (!(this_sd->flags & SD_WAKE_AFFINE)) | ||
| 999 | return 0; | ||
| 1000 | |||
| 1001 | /* | ||
| 1002 | * If the currently running task will sleep within | ||
| 1003 | * a reasonable amount of time then attract this newly | ||
| 1004 | * woken task: | ||
| 1005 | */ | ||
| 1006 | if (sync && curr->sched_class == &fair_sched_class) { | ||
| 1007 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | ||
| 1008 | p->se.avg_overlap < sysctl_sched_migration_cost) | ||
| 1009 | return 1; | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
| 1013 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 958 | 1014 | ||
| 959 | cpu = task_cpu(p); | 1015 | /* |
| 960 | rq = task_rq(p); | 1016 | * If sync wakeup then subtract the (maximum possible) |
| 961 | this_cpu = smp_processor_id(); | 1017 | * effect of the currently running task from the load |
| 962 | new_cpu = cpu; | 1018 | * of the current CPU: |
| 1019 | */ | ||
| 1020 | if (sync) | ||
| 1021 | tl -= current->se.load.weight; | ||
| 1022 | |||
| 1023 | if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || | ||
| 1024 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
| 1025 | /* | ||
| 1026 | * This domain has SD_WAKE_AFFINE and | ||
| 1027 | * p is cache cold in this domain, and | ||
| 1028 | * there is no bad imbalance. | ||
| 1029 | */ | ||
| 1030 | schedstat_inc(this_sd, ttwu_move_affine); | ||
| 1031 | schedstat_inc(p, se.nr_wakeups_affine); | ||
| 1032 | |||
| 1033 | return 1; | ||
| 1034 | } | ||
| 1035 | return 0; | ||
| 1036 | } | ||
| 963 | 1037 | ||
| 964 | if (cpu == this_cpu) | 1038 | static int select_task_rq_fair(struct task_struct *p, int sync) |
| 965 | goto out_set_cpu; | 1039 | { |
| 1040 | struct sched_domain *sd, *this_sd = NULL; | ||
| 1041 | int prev_cpu, this_cpu, new_cpu; | ||
| 1042 | unsigned long load, this_load; | ||
| 1043 | struct rq *rq, *this_rq; | ||
| 1044 | unsigned int imbalance; | ||
| 1045 | int idx; | ||
| 1046 | |||
| 1047 | prev_cpu = task_cpu(p); | ||
| 1048 | rq = task_rq(p); | ||
| 1049 | this_cpu = smp_processor_id(); | ||
| 1050 | this_rq = cpu_rq(this_cpu); | ||
| 1051 | new_cpu = prev_cpu; | ||
| 966 | 1052 | ||
| 1053 | /* | ||
| 1054 | * 'this_sd' is the first domain that both | ||
| 1055 | * this_cpu and prev_cpu are present in: | ||
| 1056 | */ | ||
| 967 | for_each_domain(this_cpu, sd) { | 1057 | for_each_domain(this_cpu, sd) { |
| 968 | if (cpu_isset(cpu, sd->span)) { | 1058 | if (cpu_isset(prev_cpu, sd->span)) { |
| 969 | this_sd = sd; | 1059 | this_sd = sd; |
| 970 | break; | 1060 | break; |
| 971 | } | 1061 | } |
| 972 | } | 1062 | } |
| 973 | 1063 | ||
| 974 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1064 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
| 975 | goto out_set_cpu; | 1065 | goto out; |
| 976 | 1066 | ||
| 977 | /* | 1067 | /* |
| 978 | * Check for affine wakeup and passive balancing possibilities. | 1068 | * Check for affine wakeup and passive balancing possibilities. |
| 979 | */ | 1069 | */ |
| 980 | if (this_sd) { | 1070 | if (!this_sd) |
| 981 | int idx = this_sd->wake_idx; | 1071 | goto out; |
| 982 | unsigned int imbalance; | ||
| 983 | unsigned long load, this_load; | ||
| 984 | |||
| 985 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
| 986 | |||
| 987 | load = source_load(cpu, idx); | ||
| 988 | this_load = target_load(this_cpu, idx); | ||
| 989 | |||
| 990 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
| 991 | |||
| 992 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
| 993 | unsigned long tl = this_load; | ||
| 994 | unsigned long tl_per_task; | ||
| 995 | |||
| 996 | /* | ||
| 997 | * Attract cache-cold tasks on sync wakeups: | ||
| 998 | */ | ||
| 999 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
| 1000 | goto out_set_cpu; | ||
| 1001 | |||
| 1002 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
| 1003 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1004 | |||
| 1005 | /* | ||
| 1006 | * If sync wakeup then subtract the (maximum possible) | ||
| 1007 | * effect of the currently running task from the load | ||
| 1008 | * of the current CPU: | ||
| 1009 | */ | ||
| 1010 | if (sync) | ||
| 1011 | tl -= current->se.load.weight; | ||
| 1012 | |||
| 1013 | if ((tl <= load && | ||
| 1014 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
| 1015 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
| 1016 | /* | ||
| 1017 | * This domain has SD_WAKE_AFFINE and | ||
| 1018 | * p is cache cold in this domain, and | ||
| 1019 | * there is no bad imbalance. | ||
| 1020 | */ | ||
| 1021 | schedstat_inc(this_sd, ttwu_move_affine); | ||
| 1022 | schedstat_inc(p, se.nr_wakeups_affine); | ||
| 1023 | goto out_set_cpu; | ||
| 1024 | } | ||
| 1025 | } | ||
| 1026 | 1072 | ||
| 1027 | /* | 1073 | idx = this_sd->wake_idx; |
| 1028 | * Start passive balancing when half the imbalance_pct | 1074 | |
| 1029 | * limit is reached. | 1075 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
| 1030 | */ | 1076 | |
| 1031 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1077 | load = source_load(prev_cpu, idx); |
| 1032 | if (imbalance*this_load <= 100*load) { | 1078 | this_load = target_load(this_cpu, idx); |
| 1033 | schedstat_inc(this_sd, ttwu_move_balance); | 1079 | |
| 1034 | schedstat_inc(p, se.nr_wakeups_passive); | 1080 | if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, |
| 1035 | goto out_set_cpu; | 1081 | load, this_load, imbalance)) |
| 1036 | } | 1082 | return this_cpu; |
| 1083 | |||
| 1084 | if (prev_cpu == this_cpu) | ||
| 1085 | goto out; | ||
| 1086 | |||
| 1087 | /* | ||
| 1088 | * Start passive balancing when half the imbalance_pct | ||
| 1089 | * limit is reached. | ||
| 1090 | */ | ||
| 1091 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
| 1092 | if (imbalance*this_load <= 100*load) { | ||
| 1093 | schedstat_inc(this_sd, ttwu_move_balance); | ||
| 1094 | schedstat_inc(p, se.nr_wakeups_passive); | ||
| 1095 | return this_cpu; | ||
| 1037 | } | 1096 | } |
| 1038 | } | 1097 | } |
| 1039 | 1098 | ||
| 1040 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | 1099 | out: |
| 1041 | out_set_cpu: | ||
| 1042 | return wake_idle(new_cpu, p); | 1100 | return wake_idle(new_cpu, p); |
| 1043 | } | 1101 | } |
| 1044 | #endif /* CONFIG_SMP */ | 1102 | #endif /* CONFIG_SMP */ |
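The extracted wake_affine() above decides in two steps: a fast path that pulls the woken task to the waking CPU when the wakeup is sync, the current task is in the fair class, and both tasks' avg_overlap is below sysctl_sched_migration_cost; and the existing load-based test, where a sync wakeup first subtracts the waker's weight from this CPU's load before checking (tl <= load && tl + target_load(prev_cpu) <= tl_per_task) || 100*(tl + p->se.load.weight) <= imbalance*load. A user-space sketch of the load inequality alone; every load figure and the imbalance percentage below are hypothetical stand-ins for the kernel's weighted per-CPU loads:

#include <stdio.h>

/*
 * Model of wake_affine()'s load test: may a task that last ran on prev_cpu
 * be woken on this_cpu instead?
 */
static int affine_wakeup_ok(unsigned long this_load,    /* load on this_cpu      */
                            unsigned long prev_load,    /* load on prev_cpu      */
                            unsigned long prev_target,  /* target_load(prev_cpu) */
                            unsigned long tl_per_task,  /* avg load per task     */
                            unsigned long p_weight,     /* woken task's weight   */
                            unsigned int imbalance_pct, /* 100 + (pct - 100) / 2 */
                            unsigned long waker_weight, /* subtracted when sync  */
                            int sync)
{
        unsigned long tl = this_load;

        /* a sync waker is about to sleep: discount its own weight */
        if (sync)
                tl -= waker_weight;

        if ((tl <= prev_load && tl + prev_target <= tl_per_task) ||
            100 * (tl + p_weight) <= imbalance_pct * prev_load)
                return 1;

        return 0;
}

int main(void)
{
        /* near-idle this_cpu, busy prev_cpu: affine wakeup allowed */
        printf("%d\n", affine_wakeup_ok(1024, 3072, 3072, 2048, 1024, 112, 1024, 1));
        /* loaded this_cpu, idle prev_cpu: keep the task where it was */
        printf("%d\n", affine_wakeup_ok(4096, 0, 0, 2048, 1024, 112, 1024, 0));
        return 0;
}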
| @@ -1060,6 +1118,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1060 | resched_task(curr); | 1118 | resched_task(curr); |
| 1061 | return; | 1119 | return; |
| 1062 | } | 1120 | } |
| 1121 | |||
| 1122 | se->last_wakeup = se->sum_exec_runtime; | ||
| 1123 | if (unlikely(se == pse)) | ||
| 1124 | return; | ||
| 1125 | |||
| 1126 | cfs_rq_of(pse)->next = pse; | ||
| 1127 | |||
| 1063 | /* | 1128 | /* |
| 1064 | * Batch tasks do not preempt (their preemption is driven by | 1129 | * Batch tasks do not preempt (their preemption is driven by |
| 1065 | * the tick): | 1130 | * the tick): |
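The check_preempt_wakeup() change above ties the earlier pieces together: the waking task stamps se->last_wakeup with its current sum_exec_runtime and marks the woken entity as cfs_rq->next (the buddy that pick_next() may favour); when the waker later dequeues to sleep, update_avg_stats() turns sum_exec_runtime - last_wakeup into an avg_overlap sample, which wake_affine() consumes on the next sync wakeup. A toy sketch of that timeline, using a made-up struct with only the fields this change touches:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* toy "scheduler entity": just the fields involved in this change */
struct toy_se {
        u64 sum_exec_runtime;
        u64 last_wakeup;
        u64 avg_overlap;
};

/* wakeup-preemption check: the waker stamps its runtime when it wakes someone */
static void on_wakeup(struct toy_se *waker)
{
        waker->last_wakeup = waker->sum_exec_runtime;
}

/* dequeue-to-sleep: fold (runtime since that wakeup) into avg_overlap */
static void on_sleep(struct toy_se *se)
{
        if (!se->last_wakeup)
                return;

        u64 sample = se->sum_exec_runtime - se->last_wakeup;
        se->avg_overlap += ((int64_t)(sample - se->avg_overlap)) >> 3; /* 1/8 EMA */
        se->last_wakeup = 0;
}

int main(void)
{
        struct toy_se waker = { 0, 0, 0 };

        waker.sum_exec_runtime = 5000;
        on_wakeup(&waker);              /* wakes another task here        */
        waker.sum_exec_runtime = 5600;  /* runs 600ns more, then sleeps   */
        on_sleep(&waker);

        printf("avg_overlap after one sample: %llu\n",
               (unsigned long long)waker.avg_overlap);
        return 0;
}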
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776b..7f60097d443a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data) | |||
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | if (!list_empty(&watchdog_list)) { | 143 | if (!list_empty(&watchdog_list)) { |
| 144 | /* Cycle through CPUs to check if the CPUs stay synchronized to | 144 | __mod_timer(&watchdog_timer, |
| 145 | * each other. */ | 145 | watchdog_timer.expires + WATCHDOG_INTERVAL); |
| 146 | int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); | ||
| 147 | if (next_cpu >= NR_CPUS) | ||
| 148 | next_cpu = first_cpu(cpu_online_map); | ||
| 149 | watchdog_timer.expires += WATCHDOG_INTERVAL; | ||
| 150 | add_timer_on(&watchdog_timer, next_cpu); | ||
| 151 | } | 146 | } |
| 152 | spin_unlock(&watchdog_lock); | 147 | spin_unlock(&watchdog_lock); |
| 153 | } | 148 | } |
| @@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
| 169 | if (!started && watchdog) { | 164 | if (!started && watchdog) { |
| 170 | watchdog_last = watchdog->read(); | 165 | watchdog_last = watchdog->read(); |
| 171 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 166 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
| 172 | add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); | 167 | add_timer(&watchdog_timer); |
| 173 | } | 168 | } |
| 174 | } else { | 169 | } else { |
| 175 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | 170 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) |
| @@ -179,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
| 179 | if (watchdog) | 174 | if (watchdog) |
| 180 | del_timer(&watchdog_timer); | 175 | del_timer(&watchdog_timer); |
| 181 | watchdog = cs; | 176 | watchdog = cs; |
| 182 | init_timer_deferrable(&watchdog_timer); | 177 | init_timer(&watchdog_timer); |
| 183 | watchdog_timer.function = clocksource_watchdog; | 178 | watchdog_timer.function = clocksource_watchdog; |
| 184 | 179 | ||
| 185 | /* Reset watchdog cycles */ | 180 | /* Reset watchdog cycles */ |
| @@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
| 190 | watchdog_last = watchdog->read(); | 185 | watchdog_last = watchdog->read(); |
| 191 | watchdog_timer.expires = | 186 | watchdog_timer.expires = |
| 192 | jiffies + WATCHDOG_INTERVAL; | 187 | jiffies + WATCHDOG_INTERVAL; |
| 193 | add_timer_on(&watchdog_timer, | 188 | add_timer(&watchdog_timer); |
| 194 | first_cpu(cpu_online_map)); | ||
| 195 | } | 189 | } |
| 196 | } | 190 | } |
| 197 | } | 191 | } |
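The clocksource watchdog above no longer bounces its timer across CPUs with add_timer_on(); it re-arms on whatever CPU runs it, advancing the expiry by WATCHDOG_INTERVAL from the previous expiry rather than from "now", which keeps the cadence stable even if a run is late, and init_timer() replaces init_timer_deferrable() so the watchdog is an ordinary timer again rather than one that may be deferred while the CPU idles. A user-space sketch of the fixed-cadence re-arm pattern, with plain integers standing in for jiffies:

#include <stdio.h>

#define WATCHDOG_INTERVAL 100   /* stand-in for the kernel's jiffies interval */

/*
 * Re-arm from the previous expiry, not from the current time: a late run
 * does not push later expiries out, so the average period stays constant.
 */
static unsigned long rearm(unsigned long prev_expires)
{
        return prev_expires + WATCHDOG_INTERVAL;
}

int main(void)
{
        unsigned long expires = WATCHDOG_INTERVAL;

        for (int run = 0; run < 3; run++) {
                unsigned long now = expires + 7;     /* fired 7 ticks late */
                expires = rearm(expires);
                printf("run %d at t=%lu, next expiry %lu\n", run, now, expires);
        }
        return 0;
}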
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 671af612b768..a3fa587c350c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -191,8 +191,12 @@ static void change_clocksource(void) | |||
| 191 | 191 | ||
| 192 | tick_clock_notify(); | 192 | tick_clock_notify(); |
| 193 | 193 | ||
| 194 | /* | ||
| 195 | * We're holding xtime lock and waking up klogd would deadlock | ||
| 196 | * us on enqueue. So no printing! | ||
| 194 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 197 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", |
| 195 | clock->name); | 198 | clock->name); |
| 199 | */ | ||
| 196 | } | 200 | } |
| 197 | #else | 201 | #else |
| 198 | static inline void change_clocksource(void) { } | 202 | static inline void change_clocksource(void) { } |
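The timekeeping hunk above comments out the clocksource-installed printk because, as the in-tree comment says, change_clocksource() holds the xtime lock and waking klogd from printk() would deadlock on enqueue. The general pattern is to avoid calling into a subsystem that may take the same lock and instead defer the message until the lock is dropped. A minimal user-space sketch of that deferred-logging pattern, with made-up names and a pthread mutex standing in for the kernel lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static char pending_msg[128];

/* update state under the lock, but only *record* what should be logged */
static void install_clock(const char *name)
{
        pthread_mutex_lock(&state_lock);
        /* ... switch over to the new clock here ... */
        snprintf(pending_msg, sizeof(pending_msg),
                 "Time: %s clocksource has been installed.\n", name);
        pthread_mutex_unlock(&state_lock);

        /* log only after the lock has been released */
        fputs(pending_msg, stdout);
}

int main(void)
{
        install_clock("acpi_pm");
        return 0;
}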
diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88b..b024106daa70 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 451 | spin_lock_irqsave(&base->lock, flags); | 451 | spin_lock_irqsave(&base->lock, flags); |
| 452 | timer_set_base(timer, base); | 452 | timer_set_base(timer, base); |
| 453 | internal_add_timer(base, timer); | 453 | internal_add_timer(base, timer); |
| 454 | /* | ||
| 455 | * Check whether the other CPU is idle and needs to be | ||
| 456 | * triggered to reevaluate the timer wheel when nohz is | ||
| 457 | * active. We are protected against the other CPU fiddling | ||
| 458 | * with the timer by holding the timer base lock. This also | ||
| 459 | * makes sure that a CPU on the way to idle can not evaluate | ||
| 460 | * the timer wheel. | ||
| 461 | */ | ||
| 462 | wake_up_idle_cpu(cpu); | ||
| 454 | spin_unlock_irqrestore(&base->lock, flags); | 463 | spin_unlock_irqrestore(&base->lock, flags); |
| 455 | } | 464 | } |
| 456 | 465 | ||
| 457 | |||
| 458 | /** | 466 | /** |
| 459 | * mod_timer - modify a timer's timeout | 467 | * mod_timer - modify a timer's timeout |
| 460 | * @timer: the timer to be modified | 468 | * @timer: the timer to be modified |
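add_timer_on() above now kicks the target CPU with wake_up_idle_cpu() while still holding the timer base lock, so a nohz-idle CPU re-evaluates its timer wheel and, as the added comment explains, a CPU on its way to idle cannot evaluate the wheel and miss the freshly queued timer. A pthread analogue of that "enqueue for another worker and wake it under the same lock" pattern; the types and names below are made up for the sketch:

#include <pthread.h>
#include <stdio.h>

/* toy per-"CPU" timer base: one pending flag guarded by a lock and a condvar */
struct toy_base {
        pthread_mutex_t lock;
        pthread_cond_t  wake;
        int             pending;
};

/* analogue of add_timer_on(): enqueue, then wake the idle owner under the lock */
static void toy_add_timer_on(struct toy_base *base)
{
        pthread_mutex_lock(&base->lock);
        base->pending = 1;                  /* internal_add_timer()         */
        pthread_cond_signal(&base->wake);   /* wake_up_idle_cpu() analogue  */
        pthread_mutex_unlock(&base->lock);
}

/* analogue of the remote CPU idling until it has a timer to service */
static void *idle_cpu(void *arg)
{
        struct toy_base *base = arg;

        pthread_mutex_lock(&base->lock);
        while (!base->pending)
                pthread_cond_wait(&base->wake, &base->lock);
        pthread_mutex_unlock(&base->lock);

        puts("remote cpu: woke up, re-evaluating timer wheel");
        return NULL;
}

int main(void)
{
        struct toy_base base = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
        };
        pthread_t cpu;

        pthread_create(&cpu, NULL, idle_cpu, &base);
        toy_add_timer_on(&base);
        pthread_join(cpu, NULL);
        return 0;
}

Because the flag is set and the wakeup is issued under the same lock the sleeper checks, the remote side either sees the pending work before blocking or is woken after blocking; there is no window in which it can go idle and miss the timer.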
