path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c                 |  19
-rw-r--r--  kernel/debug/kdb/kdb_main.c     |  91
-rw-r--r--  kernel/debug/kdb/kdb_private.h  |   1
-rw-r--r--  kernel/events/core.c            |  11
-rw-r--r--  kernel/exit.c                   |  19
-rw-r--r--  kernel/fork.c                   |  11
-rw-r--r--  kernel/hrtimer.c                |  53
-rw-r--r--  kernel/irq/chip.c               |   8
-rw-r--r--  kernel/irq/internals.h          |   3
-rw-r--r--  kernel/irq/manage.c             |  39
-rw-r--r--  kernel/irq/migration.c          |  13
-rw-r--r--  kernel/panic.c                  |   6
-rw-r--r--  kernel/pid_namespace.c          |  20
-rw-r--r--  kernel/power/hibernate.c        |   8
-rw-r--r--  kernel/power/user.c             |   2
-rw-r--r--  kernel/printk.c                 | 727
-rw-r--r--  kernel/rcutree.c                |  17
-rw-r--r--  kernel/rcutree.h                |  15
-rw-r--r--  kernel/rcutree_plugin.h         | 179
-rw-r--r--  kernel/sched/core.c             | 525
-rw-r--r--  kernel/sched/fair.c             |  71
-rw-r--r--  kernel/sched/idle_task.c        |   1
-rw-r--r--  kernel/sched/rt.c               |  53
-rw-r--r--  kernel/sched/sched.h            |   4
-rw-r--r--  kernel/smpboot.c                |  17
-rw-r--r--  kernel/sys.c                    |  64
-rw-r--r--  kernel/time/clockevents.c       |   3
-rw-r--r--  kernel/time/ntp.c               |   8
-rw-r--r--  kernel/time/tick-sched.c        |  28
-rw-r--r--  kernel/time/timekeeping.c       |  66
-rw-r--r--  kernel/trace/ring_buffer.c      |   6
-rw-r--r--  kernel/trace/trace.c            |   2
-rw-r--r--  kernel/watchdog.c               |  19
33 files changed, 1494 insertions, 615 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f3527d6184a..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+static int css_unbias_refcnt(int refcnt)
+{
+	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
+}
+
 /* the current nr of refs, always >= 0 whether @css is deactivated or not */
 static int css_refcnt(struct cgroup_subsys_state *css)
 {
 	int v = atomic_read(&css->refcnt);
 
-	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+	return css_unbias_refcnt(v);
 }
 
 /* convenient tests for these bits */
@@ -3878,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work)
 {
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, dput_work);
+	struct dentry *dentry = css->cgroup->dentry;
+	struct super_block *sb = dentry->d_sb;
 
-	dput(css->cgroup->dentry);
+	atomic_inc(&sb->s_active);
+	dput(dentry);
+	deactivate_super(sb);
 }
 
 static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4971,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
 void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	int v;
 
 	rcu_read_lock();
-	atomic_dec(&css->refcnt);
-	switch (css_refcnt(css)) {
+	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
+
+	switch (v) {
 	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv)
  */
 static int kdb_dmesg(int argc, const char **argv)
 {
-	char *syslog_data[4], *start, *end, c = '\0', *p;
-	int diag, logging, logsize, lines = 0, adjust = 0, n;
+	int diag;
+	int logging;
+	int lines = 0;
+	int adjust = 0;
+	int n = 0;
+	int skip = 0;
+	struct kmsg_dumper dumper = { .active = 1 };
+	size_t len;
+	char buf[201];
 
 	if (argc > 2)
 		return KDB_ARGCOUNT;
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv)
 		kdb_set(2, setargs);
 	}
 
-	/* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
-	 * logical start, end+1. */
-	kdb_syslog_data(syslog_data);
-	if (syslog_data[2] == syslog_data[3])
-		return 0;
-	logsize = syslog_data[1] - syslog_data[0];
-	start = syslog_data[2];
-	end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
-	for (n = 0, p = start; p < end; ++p) {
-		c = *KDB_WRAP(p);
-		if (c == '\n')
-			++n;
-	}
-	if (c != '\n')
-		++n;
+	kmsg_dump_rewind_nolock(&dumper);
+	while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+		n++;
+
 	if (lines < 0) {
 		if (adjust >= n)
 			kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv)
 		else if (adjust - lines >= n)
 			kdb_printf("buffer only contains %d lines, last %d "
 				   "lines printed\n", n, n - adjust);
-		if (adjust) {
-			for (; start < end && adjust; ++start) {
-				if (*KDB_WRAP(start) == '\n')
-					--adjust;
-			}
-			if (start < end)
-				++start;
-		}
-		for (p = start; p < end && lines; ++p) {
-			if (*KDB_WRAP(p) == '\n')
-				++lines;
-		}
-		end = p;
+		skip = adjust;
+		lines = abs(lines);
 	} else if (lines > 0) {
-		int skip = n - (adjust + lines);
+		skip = n - lines - adjust;
+		lines = abs(lines);
 		if (adjust >= n) {
 			kdb_printf("buffer only contains %d lines, "
 				   "nothing printed\n", n);
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv)
 			kdb_printf("buffer only contains %d lines, first "
 				   "%d lines printed\n", n, lines);
 		}
-		for (; start < end && skip; ++start) {
-			if (*KDB_WRAP(start) == '\n')
-				--skip;
-		}
-		for (p = start; p < end && lines; ++p) {
-			if (*KDB_WRAP(p) == '\n')
-				--lines;
-		}
-		end = p;
+	} else {
+		lines = n;
 	}
-	/* Do a line at a time (max 200 chars) to reduce protocol overhead */
-	c = '\n';
-	while (start != end) {
-		char buf[201];
-		p = buf;
-		if (KDB_FLAG(CMD_INTERRUPT))
-			return 0;
-		while (start < end && (c = *KDB_WRAP(start)) &&
-		       (p - buf) < sizeof(buf)-1) {
-			++start;
-			*p++ = c;
-			if (c == '\n')
-				break;
+
+	if (skip >= n || skip < 0)
+		return 0;
+
+	kmsg_dump_rewind_nolock(&dumper);
+	while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+		if (skip) {
+			skip--;
+			continue;
 		}
-		*p = '\0';
-		kdb_printf("%s", buf);
+		if (!lines--)
+			break;
+
+		kdb_printf("%.*s\n", (int)len - 1, buf);
 	}
-	if (c != '\n')
-		kdb_printf("\n");
 
 	return 0;
 }
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
 extern int kdb_grep_leading;
 extern int kdb_grep_trailing;
 extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
 extern unsigned long kdb_task_state_string(const char *);
 extern char kdb_task_state_char (const struct task_struct *);
 extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
 	return !event->cgrp || event->cgrp == cpuctx->cgrp;
 }
 
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
 {
-	css_get(&event->cgrp->css);
+	return css_tryget(&event->cgrp->css);
 }
 
 static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	event->cgrp = cgrp;
 
 	/* must be done before we fput() the file */
-	perf_get_cgroup(event);
+	if (!perf_tryget_cgroup(event)) {
+		event->cgrp = NULL;
+		ret = -ENOENT;
+		goto out;
+	}
 
 	/*
 	 * all events in a group must monitor
@@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event,
 	event = event->group_leader;
 
 	perf_event_for_each_child(event, func);
-	func(event);
 	list_for_each_entry(sibling, &event->sibling_list, group_entry)
 		perf_event_for_each_child(sibling, func);
 	mutex_unlock(&ctx->mutex);
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
+		/*
+		 * If we are the last child process in a pid namespace to be
+		 * reaped, notify the reaper sleeping zap_pid_ns_processes().
+		 */
+		if (IS_ENABLED(CONFIG_PID_NS)) {
+			struct task_struct *parent = p->real_parent;
+
+			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
+			    list_empty(&parent->children) &&
+			    (parent->flags & PF_EXITING))
+				wake_up_process(parent);
+		}
 	}
 	list_del_rcu(&p->thread_group);
 }
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
 	mm_release(tsk, mm);
 	if (!mm)
 		return;
+	sync_mm_rss(mm);
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
-		/*
-		 * We can not clear ->child_reaper or leave it alone.
-		 * There may by stealth EXIT_DEAD tasks on ->children,
-		 * forget_original_parent() must move them somewhere.
-		 */
-		pid_ns->child_reaper = init_pid_ns.child_reaper;
 	} else if (father->signal->has_child_subreaper) {
 		struct task_struct *reaper;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	}
 
 	err = arch_dup_task_struct(tsk, orig);
-	if (err)
-		goto out;
 
+	/*
+	 * We defer looking at err, because we will need this setup
+	 * for the clean up path to work correctly.
+	 */
 	tsk->stack = ti;
-
 	setup_thread_stack(tsk, orig);
+
+	if (err)
+		goto out;
+
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
 	stackend = end_of_stack(tsk);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 	return 0;
 }
 
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+
+	return ktime_get_update_offsets(offs_real, offs_boot);
+}
+
 /*
  * Retrigger next event is called after clock was set
  *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
-	struct timespec realtime_offset, xtim, wtm, sleep;
 
 	if (!hrtimer_hres_active())
 		return;
 
-	/* Optimized out for !HIGH_RES */
-	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
-	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
-	/* Adjust CLOCK_REALTIME offset */
 	raw_spin_lock(&base->lock);
-	base->clock_base[HRTIMER_BASE_REALTIME].offset =
-		timespec_to_ktime(realtime_offset);
-	base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
-		timespec_to_ktime(sleep);
-
+	hrtimer_update_base(base);
 	hrtimer_force_reprogram(base, 0);
 	raw_spin_unlock(&base->lock);
 }
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
 		base->clock_base[i].resolution = KTIME_HIGH_RES;
 
 	tick_setup_sched_timer();
-
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
 	local_irq_restore(flags);
 	return 1;
 }
 
+/*
+ * Called from timekeeping code to reprogramm the hrtimer interrupt
+ * device. If called from the timer interrupt context we defer it to
+ * softirq context.
+ */
+void clock_was_set_delayed(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+	cpu_base->clock_was_set = 1;
+	__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+
 #else
 
 static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	cpu_base->nr_events++;
 	dev->next_event.tv64 = KTIME_MAX;
 
-	entry_time = now = ktime_get();
+	raw_spin_lock(&cpu_base->lock);
+	entry_time = now = hrtimer_update_base(cpu_base);
 retry:
 	expires_next.tv64 = KTIME_MAX;
-
-	raw_spin_lock(&cpu_base->lock);
 	/*
 	 * We set expires_next to KTIME_MAX here with cpu_base->lock
 	 * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
 	 * We need to prevent that we loop forever in the hrtimer
 	 * interrupt routine. We give it 3 attempts to avoid
	 * overreacting on some spurious event.
+	 *
+	 * Acquire base lock for updating the offsets and retrieving
+	 * the current time.
 	 */
-	now = ktime_get();
+	raw_spin_lock(&cpu_base->lock);
+	now = hrtimer_update_base(cpu_base);
 	cpu_base->nr_retries++;
 	if (++retries < 3)
 		goto retry;
@@ -1343,6 +1356,7 @@ retry:
 	 */
 	cpu_base->nr_hangs++;
 	cpu_base->hang_detected = 1;
+	raw_spin_unlock(&cpu_base->lock);
 	delta = ktime_sub(now, entry_time);
 	if (delta.tv64 > cpu_base->max_hang_time.tv64)
 		cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+	if (cpu_base->clock_was_set) {
+		cpu_base->clock_was_set = 0;
+		clock_was_set();
+	}
+
 	hrtimer_peek_ahead_timers();
 }
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	action = desc->action;
-	if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
+	if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+		desc->istate |= IRQS_PENDING;
 		goto out_unlock;
+	}
 
 	irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
 	raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+		desc->istate |= IRQS_PENDING;
 		goto out_unlock;
+	}
 
 	handle_irq_event(desc);
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
+extern int irq_do_set_affinity(struct irq_data *data,
+			       const struct cpumask *dest, bool force);
+
 /* Inline functions for support of irq chips on slow busses */
 static inline void chip_bus_lock(struct irq_desc *desc)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -142,6 +142,25 @@ static inline void
 irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
 #endif
 
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			bool force)
+{
+	struct irq_desc *desc = irq_data_to_desc(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+	int ret;
+
+	ret = chip->irq_set_affinity(data, mask, false);
+	switch (ret) {
+	case IRQ_SET_MASK_OK:
+		cpumask_copy(data->affinity, mask);
+	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_set_thread_affinity(desc);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
 {
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
 		return -EINVAL;
 
 	if (irq_can_move_pcntxt(data)) {
-		ret = chip->irq_set_affinity(data, mask, false);
-		switch (ret) {
-		case IRQ_SET_MASK_OK:
-			cpumask_copy(data->affinity, mask);
-		case IRQ_SET_MASK_OK_NOCOPY:
-			irq_set_thread_affinity(desc);
-			ret = 0;
-		}
+		ret = irq_do_set_affinity(data, mask, false);
 	} else {
 		irqd_set_move_pending(data);
 		irq_copy_pending(desc, mask);
@@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 static int
 setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
-	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct cpumask *set = irq_default_affinity;
-	int ret, node = desc->irq_data.node;
+	int node = desc->irq_data.node;
 
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
@@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 		if (cpumask_intersects(mask, nodemask))
 			cpumask_and(mask, mask, nodemask);
 	}
-	ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
-	switch (ret) {
-	case IRQ_SET_MASK_OK:
-		cpumask_copy(desc->irq_data.affinity, mask);
-	case IRQ_SET_MASK_OK_NOCOPY:
-		irq_set_thread_affinity(desc);
-	}
+	irq_do_set_affinity(&desc->irq_data, mask, false);
 	return 0;
 }
 #else
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-		   < nr_cpu_ids)) {
-		int ret = chip->irq_set_affinity(&desc->irq_data,
-						 desc->pending_mask, false);
-		switch (ret) {
-		case IRQ_SET_MASK_OK:
-			cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
-		case IRQ_SET_MASK_OK_NOCOPY:
-			irq_set_thread_affinity(desc);
-		}
-	}
+	if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+		irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
 
 	cpumask_clear(desc->pending_mask);
 }
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
 
-int panic_on_oops;
+int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
 static unsigned long tainted_mask;
 static int pause_on_oops;
 static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
 	 */
 	crash_kexec(NULL);
 
-	kmsg_dump(KMSG_DUMP_PANIC);
-
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
 	 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
 	 */
 	smp_send_stop();
 
+	kmsg_dump(KMSG_DUMP_PANIC);
+
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	}
 	read_unlock(&tasklist_lock);
 
+	/* Firstly reap the EXIT_ZOMBIE children we may have. */
 	do {
 		clear_thread_flag(TIF_SIGPENDING);
 		rc = sys_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);
 
+	/*
+	 * sys_wait4() above can't reap the TASK_DEAD children.
+	 * Make sure they all go away, see __unhash_process().
+	 */
+	for (;;) {
+		bool need_wait = false;
+
+		read_lock(&tasklist_lock);
+		if (!list_empty(&current->children)) {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			need_wait = true;
+		}
+		read_unlock(&tasklist_lock);
+
+		if (!need_wait)
+			break;
+		schedule();
+	}
+
 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..238025f5472e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -27,7 +27,6 @@
 #include <linux/syscore_ops.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
-#include <scsi/scsi_scan.h>
 
 #include "power.h"
 
@@ -748,13 +747,6 @@ static int software_resume(void)
 		async_synchronize_full();
 	}
 
-	/*
-	 * We can't depend on SCSI devices being available after loading
-	 * one of their modules until scsi_complete_async_scans() is
-	 * called and the resume device usually is a SCSI one.
-	 */
-	scsi_complete_async_scans();
-
 	swsusp_resume_device = name_to_dev_t(resume_file);
 	if (!swsusp_resume_device) {
 		error = -ENODEV;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
-#include <scsi/scsi_scan.h>
 
 #include <asm/uaccess.h>
 
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	 * appear.
 	 */
 	wait_for_device_probe();
-	scsi_complete_async_scans();
 
 	data->swap = -1;
 	data->mode = O_WRONLY;
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..ac4bc9e79465 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,21 @@ static int console_may_schedule;
  * separated by ',', and find the message after the ';' character.
  */
 
+enum log_flags {
+	LOG_NOCONS	= 1,	/* already flushed, do not print to console */
+	LOG_NEWLINE	= 2,	/* text ended with a newline */
+	LOG_PREFIX	= 4,	/* text started with a prefix */
+	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
+};
+
 struct log {
 	u64 ts_nsec;		/* timestamp in nanoseconds */
 	u16 len;		/* length of entire record */
 	u16 text_len;		/* length of text buffer */
 	u16 dict_len;		/* length of dictionary buffer */
-	u16 level;		/* syslog level + facility */
+	u8 facility;		/* syslog facility */
+	u8 flags:5;		/* internal record flags */
+	u8 level:3;		/* syslog level */
 };
 
 /*
@@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
 static u64 syslog_seq;
 static u32 syslog_idx;
+static enum log_flags syslog_prev;
+static size_t syslog_partial;
 
 /* index and sequence number of the first record stored in the buffer */
 static u64 log_first_seq;
@@ -227,10 +238,10 @@ static u32 clear_idx;
 #define LOG_LINE_MAX 1024
 
 /* record buffer */
-#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 #define LOG_ALIGN 4
 #else
-#define LOG_ALIGN 8
+#define LOG_ALIGN __alignof__(struct log)
 #endif
 #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
 static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +297,7 @@ static u32 log_next(u32 idx)
 
 /* insert record into the buffer, discard old ones, update heads */
 static void log_store(int facility, int level,
+		      enum log_flags flags, u64 ts_nsec,
 		      const char *dict, u16 dict_len,
 		      const char *text, u16 text_len)
 {
@@ -329,8 +341,13 @@ static void log_store(int facility, int level,
 	msg->text_len = text_len;
 	memcpy(log_dict(msg), dict, dict_len);
 	msg->dict_len = dict_len;
-	msg->level = (facility << 3) | (level & 7);
-	msg->ts_nsec = local_clock();
+	msg->facility = facility;
+	msg->level = level & 7;
+	msg->flags = flags & 0x1f;
+	if (ts_nsec > 0)
+		msg->ts_nsec = ts_nsec;
+	else
+		msg->ts_nsec = local_clock();
 	memset(log_dict(msg) + dict_len, 0, pad_len);
 	msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
 
@@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (!user)
 		return -EBADF;
 
-	mutex_lock(&user->lock);
-	raw_spin_lock(&logbuf_lock);
+	ret = mutex_lock_interruptible(&user->lock);
+	if (ret)
+		return ret;
+	raw_spin_lock_irq(&logbuf_lock);
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
-			raw_spin_unlock(&logbuf_lock);
+			raw_spin_unlock_irq(&logbuf_lock);
 			goto out;
 		}
 
-		raw_spin_unlock(&logbuf_lock);
+		raw_spin_unlock_irq(&logbuf_lock);
 		ret = wait_event_interruptible(log_wait,
 					       user->seq != log_next_seq);
 		if (ret)
 			goto out;
-		raw_spin_lock(&logbuf_lock);
+		raw_spin_lock_irq(&logbuf_lock);
 	}
 
@@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		user->idx = log_first_idx;
 		user->seq = log_first_seq;
 		ret = -EPIPE;
-		raw_spin_unlock(&logbuf_lock);
+		raw_spin_unlock_irq(&logbuf_lock);
 		goto out;
 	}
 
@@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	ts_usec = msg->ts_nsec;
 	do_div(ts_usec, 1000);
 	len = sprintf(user->buf, "%u,%llu,%llu;",
-		      msg->level, user->seq, ts_usec);
+		      (msg->facility << 3) | msg->level, user->seq, ts_usec);
 
 	/* escape non-printable characters */
 	for (i = 0; i < msg->text_len; i++) {
 		unsigned char c = log_text(msg)[i];
 
-		if (c < ' ' || c >= 128)
+		if (c < ' ' || c >= 127 || c == '\\')
 			len += sprintf(user->buf + len, "\\x%02x", c);
 		else
 			user->buf[len++] = c;
@@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			continue;
 		}
 
-		if (c < ' ' || c >= 128) {
+		if (c < ' ' || c >= 127 || c == '\\') {
 			len += sprintf(user->buf + len, "\\x%02x", c);
 			continue;
 		}
@@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 
 	user->idx = log_next(user->idx);
 	user->seq++;
-	raw_spin_unlock(&logbuf_lock);
+	raw_spin_unlock_irq(&logbuf_lock);
 
 	if (len > count) {
 		ret = -EINVAL;
@@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 	if (offset)
 		return -ESPIPE;
 
-	raw_spin_lock(&logbuf_lock);
+	raw_spin_lock_irq(&logbuf_lock);
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
@@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 	default:
 		ret = -EINVAL;
 	}
-	raw_spin_unlock(&logbuf_lock);
+	raw_spin_unlock_irq(&logbuf_lock);
 	return ret;
 }
 
@@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &log_wait, wait);
 
-	raw_spin_lock(&logbuf_lock);
+	raw_spin_lock_irq(&logbuf_lock);
 	if (user->seq < log_next_seq) {
 		/* return error when data has vanished underneath us */
 		if (user->seq < log_first_seq)
 			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
 		ret = POLLIN|POLLRDNORM;
 	}
-	raw_spin_unlock(&logbuf_lock);
+	raw_spin_unlock_irq(&logbuf_lock);
 
 	return ret;
 }
@@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 
 	mutex_init(&user->lock);
 
-	raw_spin_lock(&logbuf_lock);
+	raw_spin_lock_irq(&logbuf_lock);
 	user->idx = log_first_idx;
 	user->seq = log_first_seq;
-	raw_spin_unlock(&logbuf_lock);
+	raw_spin_unlock_irq(&logbuf_lock);
 
 	file->private_data = user;
 	return 0;
591 return 0; 610 return 0;
@@ -785,44 +804,64 @@ static bool printk_time;
785#endif 804#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 805module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787 806
807static size_t print_time(u64 ts, char *buf)
808{
809 unsigned long rem_nsec;
810
811 if (!printk_time)
812 return 0;
813
814 if (!buf)
815 return 15;
816
817 rem_nsec = do_div(ts, 1000000000);
818 return sprintf(buf, "[%5lu.%06lu] ",
819 (unsigned long)ts, rem_nsec / 1000);
820}
821
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 822static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{ 823{
790 size_t len = 0; 824 size_t len = 0;
825 unsigned int prefix = (msg->facility << 3) | msg->level;
791 826
792 if (syslog) { 827 if (syslog) {
793 if (buf) { 828 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level); 829 len += sprintf(buf, "<%u>", prefix);
795 } else { 830 } else {
796 len += 3; 831 len += 3;
797 if (msg->level > 9) 832 if (prefix > 999)
798 len++; 833 len += 3;
799 if (msg->level > 99) 834 else if (prefix > 99)
835 len += 2;
836 else if (prefix > 9)
800 len++; 837 len++;
801 } 838 }
802 } 839 }
803 840
804 if (printk_time) { 841 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len; 842 return len;
817} 843}
818 844
819static size_t msg_print_text(const struct log *msg, bool syslog, 845static size_t msg_print_text(const struct log *msg, enum log_flags prev,
820 char *buf, size_t size) 846 bool syslog, char *buf, size_t size)
821{ 847{
822 const char *text = log_text(msg); 848 const char *text = log_text(msg);
823 size_t text_size = msg->text_len; 849 size_t text_size = msg->text_len;
850 bool prefix = true;
851 bool newline = true;
824 size_t len = 0; 852 size_t len = 0;
825 853
854 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
855 prefix = false;
856
857 if (msg->flags & LOG_CONT) {
858 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
859 prefix = false;
860
861 if (!(msg->flags & LOG_NEWLINE))
862 newline = false;
863 }
864
826 do { 865 do {
827 const char *next = memchr(text, '\n', text_size); 866 const char *next = memchr(text, '\n', text_size);
828 size_t text_len; 867 size_t text_len;
@@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
840 text_len + 1>= size - len) 879 text_len + 1>= size - len)
841 break; 880 break;
842 881
843 len += print_prefix(msg, syslog, buf + len); 882 if (prefix)
883 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len); 884 memcpy(buf + len, text, text_len);
845 len += text_len; 885 len += text_len;
846 buf[len++] = '\n'; 886 if (next || newline)
887 buf[len++] = '\n';
847 } else { 888 } else {
848 /* SYSLOG_ACTION_* buffer size only calculation */ 889 /* SYSLOG_ACTION_* buffer size only calculation */
849 len += print_prefix(msg, syslog, NULL); 890 if (prefix)
850 len += text_len + 1; 891 len += print_prefix(msg, syslog, NULL);
892 len += text_len;
893 if (next || newline)
894 len++;
851 } 895 }
852 896
897 prefix = true;
853 text = next; 898 text = next;
854 } while (text); 899 } while (text);
855 900
@@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size)
860{ 905{
861 char *text; 906 char *text;
862 struct log *msg; 907 struct log *msg;
863 int len; 908 int len = 0;
864 909
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text) 911 if (!text)
867 return -ENOMEM; 912 return -ENOMEM;
868 913
869 raw_spin_lock_irq(&logbuf_lock); 914 while (size > 0) {
870 if (syslog_seq < log_first_seq) { 915 size_t n;
871 /* messages are gone, move to first one */ 916 size_t skip;
872 syslog_seq = log_first_seq;
873 syslog_idx = log_first_idx;
874 }
875 msg = log_from_idx(syslog_idx);
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX);
877 syslog_idx = log_next(syslog_idx);
878 syslog_seq++;
879 raw_spin_unlock_irq(&logbuf_lock);
880 917
881 if (len > 0 && copy_to_user(buf, text, len)) 918 raw_spin_lock_irq(&logbuf_lock);
882 len = -EFAULT; 919 if (syslog_seq < log_first_seq) {
920 /* messages are gone, move to first one */
921 syslog_seq = log_first_seq;
922 syslog_idx = log_first_idx;
923 syslog_prev = 0;
924 syslog_partial = 0;
925 }
926 if (syslog_seq == log_next_seq) {
927 raw_spin_unlock_irq(&logbuf_lock);
928 break;
929 }
930
931 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
934 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx);
937 syslog_seq++;
938 syslog_prev = msg->flags;
939 n -= syslog_partial;
940 syslog_partial = 0;
941 } else if (!len){
942 /* partial read(), remember position */
943 n = size;
944 syslog_partial += n;
945 } else
946 n = 0;
947 raw_spin_unlock_irq(&logbuf_lock);
948
949 if (!n)
950 break;
951
952 if (copy_to_user(buf, text + skip, n)) {
953 if (!len)
954 len = -EFAULT;
955 break;
956 }
957
958 len += n;
959 size -= n;
960 buf += n;
961 }
883 962
884 kfree(text); 963 kfree(text);
885 return len; 964 return len;
@@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		u64 next_seq;
 		u64 seq;
 		u32 idx;
+		enum log_flags prev;
 
 		if (clear_seq < log_first_seq) {
 			/* messages are gone, move to first available one */
@@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		/*
 		 * Find first record that fits, including all following records,
 		 * into the user-provided buffer for this dump.
 		 */
 		seq = clear_seq;
 		idx = clear_idx;
+		prev = 0;
 		while (seq < log_next_seq) {
 			struct log *msg = log_from_idx(idx);
 
-			len += msg_print_text(msg, true, NULL, 0);
+			len += msg_print_text(msg, prev, true, NULL, 0);
 			idx = log_next(idx);
 			seq++;
 		}
+
+		/* move first record forward until length fits into the buffer */
 		seq = clear_seq;
 		idx = clear_idx;
+		prev = 0;
 		while (len > size && seq < log_next_seq) {
 			struct log *msg = log_from_idx(idx);
 
-			len -= msg_print_text(msg, true, NULL, 0);
+			len -= msg_print_text(msg, prev, true, NULL, 0);
 			idx = log_next(idx);
 			seq++;
 		}
 
-		/* last message in this dump */
+		/* last message fitting into this dump */
 		next_seq = log_next_seq;
 
 		len = 0;
+		prev = 0;
 		while (len >= 0 && seq < next_seq) {
 			struct log *msg = log_from_idx(idx);
 			int textlen;
 
-			textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
+			textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
 			if (textlen < 0) {
 				len = textlen;
 				break;
 			}
 			idx = log_next(idx);
 			seq++;
+			prev = msg->flags;
 
 			raw_spin_unlock_irq(&logbuf_lock);
 			if (copy_to_user(buf + len, text, textlen))
@@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 				/* messages are gone, move to next one */
 				seq = log_first_seq;
 				idx = log_first_idx;
+				prev = 0;
 			}
 		}
 	}
@@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	/* Clear ring buffer */
 	case SYSLOG_ACTION_CLEAR:
 		syslog_print_all(NULL, 0, true);
+		break;
 	/* Disable logging to console */
 	case SYSLOG_ACTION_CONSOLE_OFF:
 		if (saved_console_loglevel == -1)
@@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
 			syslog_idx = log_first_idx;
+			syslog_prev = 0;
+			syslog_partial = 0;
 		}
 		if (from_file) {
 			/*
@@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			 */
 			error = log_next_idx - syslog_idx;
 		} else {
-			u64 seq;
-			u32 idx;
+			u64 seq = syslog_seq;
+			u32 idx = syslog_idx;
+			enum log_flags prev = syslog_prev;
 
 			error = 0;
-			seq = syslog_seq;
-			idx = syslog_idx;
 			while (seq < log_next_seq) {
 				struct log *msg = log_from_idx(idx);
 
-				error += msg_print_text(msg, true, NULL, 0);
+				error += msg_print_text(msg, prev, true, NULL, 0);
 				idx = log_next(idx);
 				seq++;
+				prev = msg->flags;
 			}
+			error -= syslog_partial;
 		}
 		raw_spin_unlock_irq(&logbuf_lock);
 		break;
@@ -1101,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 	return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
 }
 
-#ifdef CONFIG_KGDB_KDB
-/* kdb dmesg command needs access to the syslog buffer. do_syslog()
- * uses locks so it cannot be used during debugging. Just tell kdb
- * where the start and end of the physical and logical logs are. This
- * is equivalent to do_syslog(3).
- */
-void kdb_syslog_data(char *syslog_data[4])
-{
-	syslog_data[0] = log_buf;
-	syslog_data[1] = log_buf + log_buf_len;
-	syslog_data[2] = log_buf + log_first_idx;
-	syslog_data[3] = log_buf + log_next_idx;
-}
-#endif /* CONFIG_KGDB_KDB */
-
 static bool __read_mostly ignore_loglevel;
 
 static int __init ignore_loglevel_setup(char *str)
@@ -1259,22 +1335,98 @@ static inline void printk_delay(void)
 	}
 }
 
+/*
+ * Continuation lines are buffered, and not committed to the record buffer
+ * until the line is complete, or a race forces it. The line fragments
+ * though, are printed immediately to the consoles to ensure everything has
+ * reached the console in case of a kernel crash.
+ */
+static struct cont {
+	char buf[LOG_LINE_MAX];
+	size_t len;			/* length == 0 means unused buffer */
+	size_t cons;			/* bytes written to console */
+	struct task_struct *owner;	/* task of first print*/
+	u64 ts_nsec;			/* time of first print */
+	u8 level;			/* log level of first message */
+	u8 facility;			/* log level of first message */
+	bool flushed:1;			/* buffer sealed and committed */
+} cont;
+
+static void cont_flush(void)
+{
+	if (cont.flushed)
+		return;
+	if (cont.len == 0)
+		return;
+
+	log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
+		  NULL, 0, cont.buf, cont.len);
+
+	cont.flushed = true;
+}
+
+static bool cont_add(int facility, int level, const char *text, size_t len)
+{
+	if (cont.len && cont.flushed)
+		return false;
+
+	if (cont.len + len > sizeof(cont.buf)) {
+		cont_flush();
+		return false;
+	}
+
+	if (!cont.len) {
+		cont.facility = facility;
+		cont.level = level;
+		cont.owner = current;
+		cont.ts_nsec = local_clock();
+		cont.cons = 0;
+		cont.flushed = false;
+	}
+
+	memcpy(cont.buf + cont.len, text, len);
+	cont.len += len;
+	return true;
+}
+
+static size_t cont_print_text(char *text, size_t size)
+{
+	size_t textlen = 0;
+	size_t len;
+
+	if (cont.cons == 0) {
+		textlen += print_time(cont.ts_nsec, text);
+		size -= textlen;
+	}
+
+	len = cont.len - cont.cons;
+	if (len > 0) {
+		if (len+1 > size)
+			len = size-1;
+		memcpy(text + textlen, cont.buf + cont.cons, len);
+		textlen += len;
+		cont.cons = cont.len;
+	}
+
+	if (cont.flushed) {
+		text[textlen++] = '\n';
+		/* got everything, release buffer */
+		cont.len = 0;
+	}
+	return textlen;
+}
+
 asmlinkage int vprintk_emit(int facility, int level,
 			    const char *dict, size_t dictlen,
 			    const char *fmt, va_list args)
 {
 	static int recursion_bug;
-	static char cont_buf[LOG_LINE_MAX];
-	static size_t cont_len;
-	static int cont_level;
-	static struct task_struct *cont_task;
 	static char textbuf[LOG_LINE_MAX];
 	char *text = textbuf;
 	size_t text_len;
+	enum log_flags lflags = 0;
 	unsigned long flags;
 	int this_cpu;
-	bool newline = false;
-	bool prefix = false;
 	int printed_len = 0;
 
 	boot_delay_msec();
@@ -1313,7 +1465,8 @@ asmlinkage int vprintk_emit(int facility, int level,
 		recursion_bug = 0;
 		printed_len += strlen(recursion_msg);
 		/* emit KERN_CRIT message */
-		log_store(0, 2, NULL, 0, recursion_msg, printed_len);
+		log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+			  NULL, 0, recursion_msg, printed_len);
 	}
 
 	/*
@@ -1325,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 	/* mark and strip a trailing newline */
 	if (text_len && text[text_len-1] == '\n') {
 		text_len--;
-		newline = true;
+		lflags |= LOG_NEWLINE;
 	}
 
 	/* strip syslog prefix and extract log level or control flags */
@@ -1335,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 			if (level == -1)
 				level = text[1] - '0';
 		case 'd':	/* KERN_DEFAULT */
-			prefix = true;
+			lflags |= LOG_PREFIX;
 		case 'c':	/* KERN_CONT */
 			text += 3;
 			text_len -= 3;
@@ -1345,61 +1498,41 @@ asmlinkage int vprintk_emit(int facility, int level,
 	if (level == -1)
 		level = default_message_loglevel;
 
-	if (dict) {
-		prefix = true;
-		newline = true;
-	}
-
-	if (!newline) {
-		if (cont_len && (prefix || cont_task != current)) {
-			/*
-			 * Flush earlier buffer, which is either from a
-			 * different thread, or when we got a new prefix.
-			 */
-			log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
-			cont_len = 0;
-		}
+	if (dict)
+		lflags |= LOG_PREFIX|LOG_NEWLINE;
 
-		if (!cont_len) {
-			cont_level = level;
-			cont_task = current;
-		}
+	if (!(lflags & LOG_NEWLINE)) {
+		/*
+		 * Flush the conflicting buffer. An earlier newline was missing,
+		 * or another task also prints continuation lines.
+		 */
+		if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
+			cont_flush();
 
-		/* buffer or append to earlier buffer from the same thread */
-		if (cont_len + text_len > sizeof(cont_buf))
-			text_len = sizeof(cont_buf) - cont_len;
-		memcpy(cont_buf + cont_len, text, text_len);
-		cont_len += text_len;
+		/* buffer line if possible, otherwise store it right away */
+		if (!cont_add(facility, level, text, text_len))
+			log_store(facility, level, lflags | LOG_CONT, 0,
+				  dict, dictlen, text, text_len);
 	} else {
-		if (cont_len && cont_task == current) {
-			if (prefix) {
-				/*
-				 * New prefix from the same thread; flush. We
-				 * either got no earlier newline, or we race
-				 * with an interrupt.
-				 */
-				log_store(facility, cont_level,
-					  NULL, 0, cont_buf, cont_len);
-				cont_len = 0;
-			}
+		bool stored = false;
 
-			/* append to the earlier buffer and flush */
-			if (cont_len + text_len > sizeof(cont_buf))
-				text_len = sizeof(cont_buf) - cont_len;
-			memcpy(cont_buf + cont_len, text, text_len);
-			cont_len += text_len;
-			log_store(facility, cont_level,
-				  NULL, 0, cont_buf, cont_len);
-			cont_len = 0;
-			cont_task = NULL;
-			printed_len = cont_len;
-		} else {
-			/* ordinary single and terminated line */
-			log_store(facility, level,
-				  dict, dictlen, text, text_len);
-			printed_len = text_len;
+		/*
+		 * If an earlier newline was missing and it was the same task,
+		 * either merge it with the current buffer and flush, or if
+		 * there was a race with interrupts (prefix == true) then just
+		 * flush it out and store this line separately.
+		 */
+		if (cont.len && cont.owner == current) {
+			if (!(lflags & LOG_PREFIX))
+				stored = cont_add(facility, level, text, text_len);
+			cont_flush();
 		}
+
+		if (!stored)
+			log_store(facility, level, lflags, 0,
+				  dict, dictlen, text, text_len);
 	}
+	printed_len += text_len;
 
 	/*
 	 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1619,18 @@ EXPORT_SYMBOL(printk);
 #else
 
 #define LOG_LINE_MAX 0
+static struct cont {
+	size_t len;
+	size_t cons;
+	u8 level;
+	bool flushed:1;
+} cont;
 static struct log *log_from_idx(u32 idx) { return NULL; }
 static u32 log_next(u32 idx) { return 0; }
 static void call_console_drivers(int level, const char *text, size_t len) {}
-static size_t msg_print_text(const struct log *msg, bool syslog,
-			     char *buf, size_t size) { return 0; }
+static size_t msg_print_text(const struct log *msg, enum log_flags prev,
+			     bool syslog, char *buf, size_t size) { return 0; }
+static size_t cont_print_text(char *text, size_t size) { return 0; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -1765,6 +1905,7 @@ void wake_up_klogd(void)
1765/* the next printk record to write to the console */ 1905/* the next printk record to write to the console */
1766static u64 console_seq; 1906static u64 console_seq;
1767static u32 console_idx; 1907static u32 console_idx;
1908static enum log_flags console_prev;
1768 1909
1769/** 1910/**
1770 * console_unlock - unlock the console system 1911 * console_unlock - unlock the console system
@@ -1782,6 +1923,7 @@ static u32 console_idx;
1782 */ 1923 */
1783void console_unlock(void) 1924void console_unlock(void)
1784{ 1925{
1926 static char text[LOG_LINE_MAX];
1785 static u64 seen_seq; 1927 static u64 seen_seq;
1786 unsigned long flags; 1928 unsigned long flags;
1787 bool wake_klogd = false; 1929 bool wake_klogd = false;
@@ -1794,10 +1936,23 @@ void console_unlock(void)
1794 1936
1795 console_may_schedule = 0; 1937 console_may_schedule = 0;
1796 1938
1939 /* flush buffered message fragment immediately to console */
1940 raw_spin_lock_irqsave(&logbuf_lock, flags);
1941 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1942 size_t len;
1943
1944 len = cont_print_text(text, sizeof(text));
1945 raw_spin_unlock(&logbuf_lock);
1946 stop_critical_timings();
1947 call_console_drivers(cont.level, text, len);
1948 start_critical_timings();
1949 local_irq_restore(flags);
1950 } else
1951 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1952
1797again: 1953again:
1798 for (;;) { 1954 for (;;) {
1799 struct log *msg; 1955 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len; 1956 size_t len;
1802 int level; 1957 int level;
1803 1958
@@ -1811,18 +1966,35 @@ again:
1811 /* messages are gone, move to first one */ 1966 /* messages are gone, move to first one */
1812 console_seq = log_first_seq; 1967 console_seq = log_first_seq;
1813 console_idx = log_first_idx; 1968 console_idx = log_first_idx;
1969 console_prev = 0;
1814 } 1970 }
1815 1971skip:
1816 if (console_seq == log_next_seq) 1972 if (console_seq == log_next_seq)
1817 break; 1973 break;
1818 1974
1819 msg = log_from_idx(console_idx); 1975 msg = log_from_idx(console_idx);
1820 level = msg->level & 7; 1976 if (msg->flags & LOG_NOCONS) {
1821 1977 /*
1822 len = msg_print_text(msg, false, text, sizeof(text)); 1978 * Skip record we have buffered and already printed
1979 * directly to the console when we received it.
1980 */
1981 console_idx = log_next(console_idx);
1982 console_seq++;
1983 /*
1984 * We will get here again when we register a new
1985 * CON_PRINTBUFFER console. Clear the flag so we
1986 * will properly dump everything later.
1987 */
1988 msg->flags &= ~LOG_NOCONS;
1989 goto skip;
1990 }
1823 1991
1992 level = msg->level;
1993 len = msg_print_text(msg, console_prev, false,
1994 text, sizeof(text));
1824 console_idx = log_next(console_idx); 1995 console_idx = log_next(console_idx);
1825 console_seq++; 1996 console_seq++;
1997 console_prev = msg->flags;
1826 raw_spin_unlock(&logbuf_lock); 1998 raw_spin_unlock(&logbuf_lock);
1827 1999
1828 stop_critical_timings(); /* don't trace print latency */ 2000 stop_critical_timings(); /* don't trace print latency */
@@ -2085,6 +2257,7 @@ void register_console(struct console *newcon)
2085 raw_spin_lock_irqsave(&logbuf_lock, flags); 2257 raw_spin_lock_irqsave(&logbuf_lock, flags);
2086 console_seq = syslog_seq; 2258 console_seq = syslog_seq;
2087 console_idx = syslog_idx; 2259 console_idx = syslog_idx;
2260 console_prev = syslog_prev;
2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2261 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2089 /* 2262 /*
2090 * We're about to replay the log buffer. Only do this to the 2263 * We're about to replay the log buffer. Only do this to the
@@ -2300,48 +2473,256 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2300 * kmsg_dump - dump kernel log to kernel message dumpers. 2473 * kmsg_dump - dump kernel log to kernel message dumpers.
2301 * @reason: the reason (oops, panic etc) for dumping 2474 * @reason: the reason (oops, panic etc) for dumping
2302 * 2475 *
2303 * Iterate through each of the dump devices and call the oops/panic 2476 * Call each of the registered dumpers' dump() callbacks, which can
2304 * callbacks with the log buffer. 2477 * retrieve the kmsg records with kmsg_dump_get_line() or
2478 * kmsg_dump_get_buffer().
2305 */ 2479 */
2306void kmsg_dump(enum kmsg_dump_reason reason) 2480void kmsg_dump(enum kmsg_dump_reason reason)
2307{ 2481{
2308 u64 idx;
2309 struct kmsg_dumper *dumper; 2482 struct kmsg_dumper *dumper;
2310 const char *s1, *s2;
2311 unsigned long l1, l2;
2312 unsigned long flags; 2483 unsigned long flags;
2313 2484
2314 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 2485 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
2315 return; 2486 return;
2316 2487
2317 /* Theoretically, the log could move on after we do this, but 2488 rcu_read_lock();
2318 there's not a lot we can do about that. The new messages 2489 list_for_each_entry_rcu(dumper, &dump_list, list) {
2319 will overwrite the start of what we dump. */ 2490 if (dumper->max_reason && reason > dumper->max_reason)
2491 continue;
2492
2493 /* initialize iterator with data about the stored records */
2494 dumper->active = true;
2495
2496 raw_spin_lock_irqsave(&logbuf_lock, flags);
2497 dumper->cur_seq = clear_seq;
2498 dumper->cur_idx = clear_idx;
2499 dumper->next_seq = log_next_seq;
2500 dumper->next_idx = log_next_idx;
2501 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2502
2503 /* invoke dumper which will iterate over records */
2504 dumper->dump(dumper, reason);
2505
2506 /* reset iterator */
2507 dumper->active = false;
2508 }
2509 rcu_read_unlock();
2510}
2511
2512/**
2513 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2514 * @dumper: registered kmsg dumper
2515 * @syslog: include the "<4>" prefixes
2516 * @line: buffer to copy the line to
2517 * @size: maximum size of the buffer
2518 * @len: length of line placed into buffer
2519 *
2520 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2521 * record, and copy one record into the provided buffer.
2522 *
2523 * Consecutive calls will return the next available record moving
2524 * towards the end of the buffer with the youngest messages.
2525 *
2526 * A return value of FALSE indicates that there are no more records to
2527 * read.
2528 *
2529 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2530 */
2531bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2532 char *line, size_t size, size_t *len)
2533{
2534 struct log *msg;
2535 size_t l = 0;
2536 bool ret = false;
2537
2538 if (!dumper->active)
2539 goto out;
2540
2541 if (dumper->cur_seq < log_first_seq) {
2542 /* messages are gone, move to first available one */
2543 dumper->cur_seq = log_first_seq;
2544 dumper->cur_idx = log_first_idx;
2545 }
2546
2547 /* last entry */
2548 if (dumper->cur_seq >= log_next_seq)
2549 goto out;
2550
2551 msg = log_from_idx(dumper->cur_idx);
2552 l = msg_print_text(msg, 0, syslog, line, size);
2553
2554 dumper->cur_idx = log_next(dumper->cur_idx);
2555 dumper->cur_seq++;
2556 ret = true;
2557out:
2558 if (len)
2559 *len = l;
2560 return ret;
2561}
2562
2563/**
2564 * kmsg_dump_get_line - retrieve one kmsg log line
2565 * @dumper: registered kmsg dumper
2566 * @syslog: include the "<4>" prefixes
2567 * @line: buffer to copy the line to
2568 * @size: maximum size of the buffer
2569 * @len: length of line placed into buffer
2570 *
2571 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2572 * record, and copy one record into the provided buffer.
2573 *
2574 * Consecutive calls will return the next available record moving
2575 * towards the end of the buffer with the youngest messages.
2576 *
2577 * A return value of FALSE indicates that there are no more records to
2578 * read.
2579 */
2580bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2581 char *line, size_t size, size_t *len)
2582{
2583 unsigned long flags;
2584 bool ret;
2585
2586 raw_spin_lock_irqsave(&logbuf_lock, flags);
2587 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2588 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2589
2590 return ret;
2591}
2592EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2593
2594/**
2595 * kmsg_dump_get_buffer - copy kmsg log lines
2596 * @dumper: registered kmsg dumper
2597 * @syslog: include the "<4>" prefixes
2598 * @buf: buffer to copy the line to
2599 * @size: maximum size of the buffer
2600 * @len: length of line placed into buffer
2601 *
2602 * Start at the end of the kmsg buffer and fill the provided buffer
2603 * with as many of the *youngest* kmsg records as fit into it.
2604 * If the buffer is large enough, all available kmsg records will be
2605 * copied with a single call.
2606 *
2607 * Consecutive calls will fill the buffer with the next block of
2608 * available older records, not including the earlier retrieved ones.
2609 *
2610 * A return value of FALSE indicates that there are no more records to
2611 * read.
2612 */
2613bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2614 char *buf, size_t size, size_t *len)
2615{
2616 unsigned long flags;
2617 u64 seq;
2618 u32 idx;
2619 u64 next_seq;
2620 u32 next_idx;
2621 enum log_flags prev;
2622 size_t l = 0;
2623 bool ret = false;
2624
2625 if (!dumper->active)
2626 goto out;
2320 2627
2321 raw_spin_lock_irqsave(&logbuf_lock, flags); 2628 raw_spin_lock_irqsave(&logbuf_lock, flags);
2322 if (syslog_seq < log_first_seq) 2629 if (dumper->cur_seq < log_first_seq) {
2323 idx = syslog_idx; 2630 /* messages are gone, move to first available one */
2324 else 2631 dumper->cur_seq = log_first_seq;
2325 idx = log_first_idx; 2632 dumper->cur_idx = log_first_idx;
2633 }
2634
2635 /* last entry */
2636 if (dumper->cur_seq >= dumper->next_seq) {
2637 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2638 goto out;
2639 }
2326 2640
2327 if (idx > log_next_idx) { 2641 /* calculate length of entire buffer */
2328 s1 = log_buf; 2642 seq = dumper->cur_seq;
2329 l1 = log_next_idx; 2643 idx = dumper->cur_idx;
2644 prev = 0;
2645 while (seq < dumper->next_seq) {
2646 struct log *msg = log_from_idx(idx);
2647
2648 l += msg_print_text(msg, prev, true, NULL, 0);
2649 idx = log_next(idx);
2650 seq++;
2651 prev = msg->flags;
2652 }
2330 2653
2331 s2 = log_buf + idx; 2654 /* move first record forward until length fits into the buffer */
2332 l2 = log_buf_len - idx; 2655 seq = dumper->cur_seq;
2333 } else { 2656 idx = dumper->cur_idx;
2334 s1 = ""; 2657 prev = 0;
2335 l1 = 0; 2658 while (l > size && seq < dumper->next_seq) {
2659 struct log *msg = log_from_idx(idx);
2660
2661 l -= msg_print_text(msg, prev, true, NULL, 0);
2662 idx = log_next(idx);
2663 seq++;
2664 prev = msg->flags;
2665 }
2666
2667 /* last message in next iteration */
2668 next_seq = seq;
2669 next_idx = idx;
2670
2671 l = 0;
2672 prev = 0;
2673 while (seq < dumper->next_seq) {
2674 struct log *msg = log_from_idx(idx);
2336 2675
2337 s2 = log_buf + idx; 2676 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2338 l2 = log_next_idx - idx; 2677 idx = log_next(idx);
2678 seq++;
2679 prev = msg->flags;
2339 } 2680 }
2681
2682 dumper->next_seq = next_seq;
2683 dumper->next_idx = next_idx;
2684 ret = true;
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2685 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2686out:
2687 if (len)
2688 *len = l;
2689 return ret;
2690}
2691EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2341 2692
2342 rcu_read_lock(); 2693/**
2343 list_for_each_entry_rcu(dumper, &dump_list, list) 2694 * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
2344 dumper->dump(dumper, reason, s1, l1, s2, l2); 2695 * @dumper: registered kmsg dumper
2345 rcu_read_unlock(); 2696 *
2697 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2698 * kmsg_dump_get_buffer() can be called again and used multiple
2699 * times within the same dumper.dump() callback.
2700 *
2701 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2702 */
2703void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2704{
2705 dumper->cur_seq = clear_seq;
2706 dumper->cur_idx = clear_idx;
2707 dumper->next_seq = log_next_seq;
2708 dumper->next_idx = log_next_idx;
2709}
2710
2711/**
2712 * kmsg_dump_rewind - reset the iterator
2713 * @dumper: registered kmsg dumper
2714 *
2715 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2716 * kmsg_dump_get_buffer() can be called again and used multiple
2717 * times within the same dumper.dump() callback.
2718 */
2719void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2720{
2721 unsigned long flags;
2722
2723 raw_spin_lock_irqsave(&logbuf_lock, flags);
2724 kmsg_dump_rewind_nolock(dumper);
2725 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2346} 2726}
2727EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2347#endif 2728#endif
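[Editor's note] A hypothetical consumer of the reworked dumper API, for illustration only and not part of the patch. It assumes kmsg_dump_register() and <linux/kmsg_dump.h> as they exist outside this hunk; the example_ names are invented. The callback simply walks all records oldest-first with the new iterator, and max_reason is the field honoured by kmsg_dump() above.

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason)
{
	static char line[1024];
	size_t len;

	/* records come back oldest-first, one terminated line per call */
	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) {
		/* hand 'line'/'len' to the backing store here */
	}
}

static struct kmsg_dumper example_dumper = {
	.dump		= example_dump,
	.max_reason	= KMSG_DUMP_OOPS,	/* skip restart/halt/poweroff dumps */
};

static int __init example_dumper_init(void)
{
	return kmsg_dump_register(&example_dumper);
}
module_init(example_dumper_init);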
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d0..4b97bba7396e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu)
201{ 201{
202 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
203 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
204 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 205 trace_rcu_utilization("End context switch");
205} 206}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 207EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1397,6 +1398,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1397 rdp->qlen_lazy += rsp->qlen_lazy; 1398 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen; 1399 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen; 1400 rdp->n_cbs_adopted += rsp->qlen;
1401 if (rsp->qlen_lazy != rsp->qlen)
1402 rcu_idle_count_callbacks_posted();
1400 rsp->qlen_lazy = 0; 1403 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0; 1404 rsp->qlen = 0;
1402 1405
@@ -1528,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1528{ 1531{
1529 unsigned long flags; 1532 unsigned long flags;
1530 struct rcu_head *next, *list, **tail; 1533 struct rcu_head *next, *list, **tail;
1531 int bl, count, count_lazy; 1534 int bl, count, count_lazy, i;
1532 1535
1533 /* If no callbacks are ready, just return.*/ 1536 /* If no callbacks are ready, just return.*/
1534 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1537 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1551,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1551 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1554 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1552 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1555 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1553 tail = rdp->nxttail[RCU_DONE_TAIL]; 1556 tail = rdp->nxttail[RCU_DONE_TAIL];
1554 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1557 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
1555 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1558 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1556 rdp->nxttail[count] = &rdp->nxtlist; 1559 rdp->nxttail[i] = &rdp->nxtlist;
1557 local_irq_restore(flags); 1560 local_irq_restore(flags);
1558 1561
1559 /* Invoke callbacks. */ 1562 /* Invoke callbacks. */
@@ -1581,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1581 if (list != NULL) { 1584 if (list != NULL) {
1582 *tail = rdp->nxtlist; 1585 *tail = rdp->nxtlist;
1583 rdp->nxtlist = list; 1586 rdp->nxtlist = list;
1584 for (count = 0; count < RCU_NEXT_SIZE; count++) 1587 for (i = 0; i < RCU_NEXT_SIZE; i++)
1585 if (&rdp->nxtlist == rdp->nxttail[count]) 1588 if (&rdp->nxtlist == rdp->nxttail[i])
1586 rdp->nxttail[count] = tail; 1589 rdp->nxttail[i] = tail;
1587 else 1590 else
1588 break; 1591 break;
1589 } 1592 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf5..19b61ac1079f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
84 /* Process level is worth LLONG_MAX/2. */ 84 /* Process level is worth LLONG_MAX/2. */
85 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 85 int dynticks_nmi_nesting; /* Track NMI nesting level. */
86 atomic_t dynticks; /* Even value for idle, else odd. */ 86 atomic_t dynticks; /* Even value for idle, else odd. */
87#ifdef CONFIG_RCU_FAST_NO_HZ
88 int dyntick_drain; /* Prepare-for-idle state variable. */
89 unsigned long dyntick_holdoff;
90 /* No retries for the jiffy of failure. */
91 struct timer_list idle_gp_timer;
92 /* Wake up CPU sleeping with callbacks. */
93 unsigned long idle_gp_timer_expires;
94 /* When to wake up CPU (for repost). */
95 bool idle_first_pass; /* First pass of attempt to go idle? */
96 unsigned long nonlazy_posted;
97 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
87}; 101};
88 102
89/* RCU's kthread states for tracing. */ 103/* RCU's kthread states for tracing. */
@@ -430,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
430/* Forward declarations for rcutree_plugin.h */ 444/* Forward declarations for rcutree_plugin.h */
431static void rcu_bootup_announce(void); 445static void rcu_bootup_announce(void);
432long rcu_batches_completed(void); 446long rcu_batches_completed(void);
447static void rcu_preempt_note_context_switch(int cpu);
433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 448static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
434#ifdef CONFIG_HOTPLUG_CPU 449#ifdef CONFIG_HOTPLUG_CPU
435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 450static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d9869..3e4899459f3d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156void rcu_preempt_note_context_switch(void) 156static void rcu_preempt_note_context_switch(int cpu)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = __this_cpu_ptr(rcu_preempt_state.rda); 167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(smp_processor_id()); 231 rcu_preempt_qs(cpu);
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void)
1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1003 1003
1004/* 1004/*
1005 * Because preemptible RCU does not exist, we never have to check for
1006 * CPUs being in quiescent states.
1007 */
1008static void rcu_preempt_note_context_switch(int cpu)
1009{
1010}
1011
1012/*
1005 * Because preemptible RCU does not exist, there are never any preempted 1013 * Because preemptible RCU does not exist, there are never any preempted
1006 * RCU readers. 1014 * RCU readers.
1007 */ 1015 */
@@ -1886,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1886 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs 1894 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1887 * any flavor of RCU. 1895 * any flavor of RCU.
1888 */ 1896 */
1889int rcu_needs_cpu(int cpu) 1897int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1890{ 1898{
1899 *delta_jiffies = ULONG_MAX;
1891 return rcu_cpu_has_callbacks(cpu); 1900 return rcu_cpu_has_callbacks(cpu);
1892} 1901}
1893 1902
@@ -1962,41 +1971,6 @@ static void rcu_idle_count_callbacks_posted(void)
1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1971#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1972#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1964 1973
1965/* Loop counter for rcu_prepare_for_idle(). */
1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1979
1980/*
1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1982 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1983 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1984 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1985 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1986 * it is better to incur scheduling-clock interrupts than to spin
1987 * continuously for the same time duration!
1988 */
1989int rcu_needs_cpu(int cpu)
1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1993 /* If no callbacks, RCU doesn't need the CPU. */
1994 if (!rcu_cpu_has_callbacks(cpu))
1995 return 0;
1996 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
1997 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
1998}
1999
2000/* 1974/*
2001 * Does the specified flavor of RCU have non-lazy callbacks pending on 1975 * Does the specified flavor of RCU have non-lazy callbacks pending on
2002 * the specified CPU? Both RCU flavor and CPU are specified by the 1976 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2040,6 +2014,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2040} 2014}
2041 2015
2042/* 2016/*
2017 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
2018 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
2019 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
2020 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2021 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2022 * it is better to incur scheduling-clock interrupts than to spin
2023 * continuously for the same time duration!
2024 *
2025 * The delta_jiffies argument is used to store the time when RCU is
2026 * going to need the CPU again if it still has callbacks. The reason
2027 * for this is that rcu_prepare_for_idle() might need to post a timer,
2028 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
2029 * the wakeup time for this CPU. This means that RCU's timer can be
2030 * delayed until the wakeup time, which defeats the purpose of posting
2031 * a timer.
2032 */
2033int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2034{
2035 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2036
2037 /* Flag a new idle sojourn to the idle-entry state machine. */
2038 rdtp->idle_first_pass = 1;
2039 /* If no callbacks, RCU doesn't need the CPU. */
2040 if (!rcu_cpu_has_callbacks(cpu)) {
2041 *delta_jiffies = ULONG_MAX;
2042 return 0;
2043 }
2044 if (rdtp->dyntick_holdoff == jiffies) {
2045 /* RCU recently tried and failed, so don't try again. */
2046 *delta_jiffies = 1;
2047 return 1;
2048 }
2049 /* Set up for the possibility that RCU will post a timer. */
2050 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2051 *delta_jiffies = RCU_IDLE_GP_DELAY;
2052 else
2053 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
2054 return 0;
2055}
2056
2057/*
2043 * Handler for smp_call_function_single(). The only point of this 2058 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing. 2059 * handler is to wake the CPU up, so the handler does only tracing.
2045 */ 2060 */
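[Editor's note] Illustration only, not part of the patch: how an idle/nohz path might consume the new delta_jiffies hint returned by rcu_needs_cpu() above. The function name and next_event variable are invented for this sketch; the actual consumer is the tick-sched code changed elsewhere in this series.

static unsigned long example_limit_sleep(int cpu, unsigned long next_event)
{
	unsigned long rcu_delta;

	/*
	 * Ask RCU when it will need this CPU again.  ULONG_MAX means "no
	 * constraint"; a small value means RCU still needs the tick or is
	 * about to post a timer that soon.
	 */
	rcu_needs_cpu(cpu, &rcu_delta);
	if (rcu_delta != ULONG_MAX &&
	    time_after(next_event, jiffies + rcu_delta))
		next_event = jiffies + rcu_delta;

	return next_event;
}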
@@ -2075,21 +2090,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2075 */ 2090 */
2076static void rcu_prepare_for_idle_init(int cpu) 2091static void rcu_prepare_for_idle_init(int cpu)
2077{ 2092{
2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2093 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), 2094
2080 rcu_idle_gp_timer_func, cpu); 2095 rdtp->dyntick_holdoff = jiffies - 1;
2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; 2096 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
2082 per_cpu(rcu_idle_first_pass, cpu) = 1; 2097 rdtp->idle_gp_timer_expires = jiffies - 1;
2098 rdtp->idle_first_pass = 1;
2083} 2099}
2084 2100
2085/* 2101/*
2086 * Clean up for exit from idle. Because we are exiting from idle, there 2102 * Clean up for exit from idle. Because we are exiting from idle, there
2087 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will 2103 * is no longer any point to ->idle_gp_timer, so cancel it. This will
2088 * do nothing if this timer is not active, so just cancel it unconditionally. 2104 * do nothing if this timer is not active, so just cancel it unconditionally.
2089 */ 2105 */
2090static void rcu_cleanup_after_idle(int cpu) 2106static void rcu_cleanup_after_idle(int cpu)
2091{ 2107{
2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); 2108 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2109
2110 del_timer(&rdtp->idle_gp_timer);
2093 trace_rcu_prep_idle("Cleanup after idle"); 2111 trace_rcu_prep_idle("Cleanup after idle");
2094} 2112}
2095 2113
@@ -2108,42 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu)
2108 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2126 * Because it is not legal to invoke rcu_process_callbacks() with irqs
2109 * disabled, we do one pass of force_quiescent_state(), then do a 2127 * disabled, we do one pass of force_quiescent_state(), then do a
2110 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2128 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
2111 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2129 * later. The ->dyntick_drain field controls the sequencing.
2112 * 2130 *
2113 * The caller must have disabled interrupts. 2131 * The caller must have disabled interrupts.
2114 */ 2132 */
2115static void rcu_prepare_for_idle(int cpu) 2133static void rcu_prepare_for_idle(int cpu)
2116{ 2134{
2117 struct timer_list *tp; 2135 struct timer_list *tp;
2136 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2118 2137
2119 /* 2138 /*
2120 * If this is an idle re-entry, for example, due to use of 2139 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 2140 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the 2141 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks. 2142 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks 2143 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
2125 * pending. 2144 * pending.
2126 */ 2145 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) && 2146 if (!rdtp->idle_first_pass &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) == 2147 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) { 2148 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2149 tp = &rdtp->idle_gp_timer;
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2150 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2133 } 2151 }
2134 return; 2152 return;
2135 } 2153 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0; 2154 rdtp->idle_first_pass = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2155 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139 2156
2140 /* 2157 /*
2141 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2158 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2142 * Also reset state to avoid prejudicing later attempts. 2159 * Also reset state to avoid prejudicing later attempts.
2143 */ 2160 */
2144 if (!rcu_cpu_has_callbacks(cpu)) { 2161 if (!rcu_cpu_has_callbacks(cpu)) {
2145 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2162 rdtp->dyntick_holdoff = jiffies - 1;
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2163 rdtp->dyntick_drain = 0;
2147 trace_rcu_prep_idle("No callbacks"); 2164 trace_rcu_prep_idle("No callbacks");
2148 return; 2165 return;
2149 } 2166 }
@@ -2152,36 +2169,37 @@ static void rcu_prepare_for_idle(int cpu)
2152 * If in holdoff mode, just return. We will presumably have 2169 * If in holdoff mode, just return. We will presumably have
2153 * refrained from disabling the scheduling-clock tick. 2170 * refrained from disabling the scheduling-clock tick.
2154 */ 2171 */
2155 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2172 if (rdtp->dyntick_holdoff == jiffies) {
2156 trace_rcu_prep_idle("In holdoff"); 2173 trace_rcu_prep_idle("In holdoff");
2157 return; 2174 return;
2158 } 2175 }
2159 2176
2160 /* Check and update the rcu_dyntick_drain sequencing. */ 2177 /* Check and update the ->dyntick_drain sequencing. */
2161 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2178 if (rdtp->dyntick_drain <= 0) {
2162 /* First time through, initialize the counter. */ 2179 /* First time through, initialize the counter. */
2163 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2180 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
2164 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2181 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
2165 !rcu_pending(cpu) && 2182 !rcu_pending(cpu) &&
2166 !local_softirq_pending()) { 2183 !local_softirq_pending()) {
2167 /* Can we go dyntick-idle despite still having callbacks? */ 2184 /* Can we go dyntick-idle despite still having callbacks? */
2168 trace_rcu_prep_idle("Dyntick with callbacks"); 2185 rdtp->dyntick_drain = 0;
2169 per_cpu(rcu_dyntick_drain, cpu) = 0; 2186 rdtp->dyntick_holdoff = jiffies;
2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2187 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2171 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2188 trace_rcu_prep_idle("Dyntick with callbacks");
2172 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2189 rdtp->idle_gp_timer_expires =
2173 jiffies + RCU_IDLE_GP_DELAY; 2190 jiffies + RCU_IDLE_GP_DELAY;
2174 else 2191 } else {
2175 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2192 rdtp->idle_gp_timer_expires =
2176 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2193 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2194 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2195 }
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2196 tp = &rdtp->idle_gp_timer;
2180 per_cpu(rcu_nonlazy_posted, cpu); 2197 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2198 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
2181 return; /* Nothing more to do immediately. */ 2199 return; /* Nothing more to do immediately. */
2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2200 } else if (--(rdtp->dyntick_drain) <= 0) {
2183 /* We have hit the limit, so time to give up. */ 2201 /* We have hit the limit, so time to give up. */
2184 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2202 rdtp->dyntick_holdoff = jiffies;
2185 trace_rcu_prep_idle("Begin holdoff"); 2203 trace_rcu_prep_idle("Begin holdoff");
2186 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2204 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2187 return; 2205 return;
@@ -2227,7 +2245,7 @@ static void rcu_prepare_for_idle(int cpu)
2227 */ 2245 */
2228static void rcu_idle_count_callbacks_posted(void) 2246static void rcu_idle_count_callbacks_posted(void)
2229{ 2247{
2230 __this_cpu_add(rcu_nonlazy_posted, 1); 2248 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2231} 2249}
2232 2250
2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2251#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
@@ -2238,11 +2256,12 @@ static void rcu_idle_count_callbacks_posted(void)
2238 2256
2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2257static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2240{ 2258{
2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); 2259 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2260 struct timer_list *tltp = &rdtp->idle_gp_timer;
2242 2261
2243 sprintf(cp, "drain=%d %c timer=%lu", 2262 sprintf(cp, "drain=%d %c timer=%lu",
2244 per_cpu(rcu_dyntick_drain, cpu), 2263 rdtp->dyntick_drain,
2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2264 rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
2246 timer_pending(tltp) ? tltp->expires - jiffies : -1); 2265 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2247} 2266}
2248 2267
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
142#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
143 #name , 143 #name ,
144 144
145static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
146#include "features.h" 146#include "features.h"
147 NULL
148}; 147};
149 148
150#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2082#endif 2081#endif
2083 2082
2084 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2085 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2087 2085
2088 barrier(); 2086 barrier();
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)
2162} 2160}
2163 2161
2164 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns into the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * a serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't lose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra headaches, see the comment below.
2208 */
2209
2165/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2166static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2167static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2168unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2169EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2170 2230
2171static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2172{ 2232{
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2183 return delta; 2243 return delta;
2184} 2244}
2185 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2186static unsigned long 2249static unsigned long
2187calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2188{ 2251{
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2194 2257
2195#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2196/* 2259/*
2197 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumulating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2198 * 2298 *
2199 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2200 */ 2300 */
2201static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2303
2304static inline int calc_load_write_idx(void)
2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2202 2320
2203void calc_load_account_idle(struct rq *this_rq) 2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2204{ 2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2205 long delta; 2332 long delta;
2206 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2207 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2208 if (delta) 2339 if (delta) {
2209 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2210} 2343}
2211 2344
2212static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2213{ 2346{
2214 long delta = 0; 2347 struct rq *this_rq = this_rq();
2215 2348
2216 /* 2349 /*
2217 * Its got a race, we don't care... 2350 * If we're still before the sample window, we're done.
2218 */ 2351 */
2219 if (atomic_long_read(&calc_load_tasks_idle)) 2352 if (time_before(jiffies, this_rq->calc_load_update))
2220 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2353 return;
2354
2355 /*
2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2359 */
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2221 2372
2222 return delta; 2373 return delta;
2223} 2374}
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)
2303{ 2454{
2304 long delta, active, n; 2455 long delta, active, n;
2305 2456
2306 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2307 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2308 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2309 * missed the tick driven calc_load_account_active() update 2460 */
2310 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2311 */ 2462 n = 1 + (delta / LOAD_FREQ);
2312 delta = calc_load_fold_idle();
2313 if (delta)
2314 atomic_long_add(delta, &calc_load_tasks);
2315
2316 /*
2317 * It could be the one fold was all it took, we done!
2318 */
2319 if (time_before(jiffies, calc_load_update + 10))
2320 return;
2321
2322 /*
2323 * Catch-up, fold however many we are behind still
2324 */
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2327 2463
2328 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2329 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2330 2466
2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2334 2470
2335 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2336} 2472 }
2337#else
2338void calc_load_account_idle(struct rq *this_rq)
2339{
2340}
2341 2473
2342static inline long calc_load_fold_idle(void) 2474 /*
2343{ 2475 * Flip the idle index...
2344 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2345} 2483}
2484#else /* !CONFIG_NO_HZ */
2346 2485
2347static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2348{ 2487static inline void calc_global_nohz(void) { }
2349}
2350#endif
2351 2488
2352/** 2489#endif /* CONFIG_NO_HZ */
2353 * get_avenrun - get the load average array
2354 * @loads: pointer to dest load array
2355 * @offset: offset to add
2356 * @shift: shift count to shift the result left
2357 *
2358 * These values are estimates at best, so no need for locking.
2359 */
2360void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2361{
2362 loads[0] = (avenrun[0] + offset) << shift;
2363 loads[1] = (avenrun[1] + offset) << shift;
2364 loads[2] = (avenrun[2] + offset) << shift;
2365}
2366 2490
2367/* 2491/*
2368 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
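[Editor's note] Illustration only of the "catch-up" branch in calc_global_nohz() above. calc_load_n() itself is not shown in this hunk; in the tree it decays n missed periods at once using a fixed-point power, which the naive loop below reproduces step by step (up to fixed-point rounding). The demo_ names are invented and the constants are those of the calc_load() sketch earlier.

static unsigned long demo_calc_load_n(unsigned long load, unsigned long exp,
				      unsigned long active, unsigned int n)
{
	/* fold a1 = (a0 * e + a * (1 - e)) once per missed LOAD_FREQ window */
	while (n--)
		load = (load * exp + active * (DEMO_FIXED_1 - exp)) >> DEMO_FSHIFT;
	return load;
}

/*
 * If the whole machine was nohz-idle across two extra LOAD_FREQ windows,
 * delta = jiffies - calc_load_update - 10 lies in [2*LOAD_FREQ, 3*LOAD_FREQ),
 * so n = 1 + delta / LOAD_FREQ == 3 and three samples are aged in one go
 * with the idle-adjusted 'active' count.
 */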
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2370 */ 2494 */
2371void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2372{ 2496{
2373 long active; 2497 long active, delta;
2374 2498
2375 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2376 return; 2500 return;
2377 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2378 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2379 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2380 2511
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2385 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2386 2517
2387 /* 2518 /*
2388 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
2391 * This avoids loosing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */ 2520 */
2395 calc_global_nohz(); 2521 calc_global_nohz();
2396} 2522}
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2407 return; 2533 return;
2408 2534
2409 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2410 delta += calc_load_fold_idle();
2411 if (delta) 2536 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2413 2538
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2415} 2540}
2416 2541
2417/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2418 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2419 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2420 * 2549 *
@@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2517 sched_avg_update(this_rq); 2646 sched_avg_update(this_rq);
2518} 2647}
2519 2648
2649#ifdef CONFIG_NO_HZ
2650/*
2651 * There is no sane way to deal with nohz on smp when using jiffies because the
2652 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2653 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2654 *
2655 * Therefore we cannot use the delta approach from the regular tick since that
2656 * would seriously skew the load calculation. However we'll make do for those
2657 * updates happening while idle (nohz_idle_balance) or coming out of idle
2658 * (tick_nohz_idle_exit).
2659 *
2660 * This means we might still be one tick off for nohz periods.
2661 */
2662
2520/* 2663/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the 2664 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance. 2665 * idle balance.
2523 */ 2666 */
2524void update_idle_cpu_load(struct rq *this_rq) 2667void update_idle_cpu_load(struct rq *this_rq)
2525{ 2668{
2526 unsigned long curr_jiffies = jiffies; 2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2527 unsigned long load = this_rq->load.weight; 2670 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates; 2671 unsigned long pending_updates;
2529 2672
2530 /* 2673 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing.. 2674 * bail if there's load or we're actually up-to-date.
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughtly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */ 2675 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick) 2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return; 2677 return;
@@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq)
2547} 2683}
2548 2684
2549/* 2685/*
2686 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2687 */
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701 /*
2702 * We were idle, this means load 0, the current load might be
2703 * !0 due to remote wakeups and the sort.
2704 */
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif /* CONFIG_NO_HZ */
2710
2711/*
2550 * Called from scheduler_tick() 2712 * Called from scheduler_tick()
2551 */ 2713 */
2552static void update_cpu_load_active(struct rq *this_rq) 2714static void update_cpu_load_active(struct rq *this_rq)
2553{ 2715{
2554 /* 2716 /*
2555 * See the mess in update_idle_cpu_load(). 2717 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2556 */ 2718 */
2557 this_rq->last_load_update_tick = jiffies; 2719 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4982 p->sched_class->set_cpus_allowed(p, new_mask); 5144 p->sched_class->set_cpus_allowed(p, new_mask);
4983 5145
4984 cpumask_copy(&p->cpus_allowed, new_mask); 5146 cpumask_copy(&p->cpus_allowed, new_mask);
4985 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5147 p->nr_cpus_allowed = cpumask_weight(new_mask);
4986} 5148}
4987 5149
4988/* 5150/*
@@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5524 5686
5525#ifdef CONFIG_SCHED_DEBUG 5687#ifdef CONFIG_SCHED_DEBUG
5526 5688
5527static __read_mostly int sched_domain_debug_enabled; 5689static __read_mostly int sched_debug_enabled;
5528 5690
5529static int __init sched_domain_debug_setup(char *str) 5691static int __init sched_debug_setup(char *str)
5530{ 5692{
5531 sched_domain_debug_enabled = 1; 5693 sched_debug_enabled = 1;
5532 5694
5533 return 0; 5695 return 0;
5534} 5696}
5535early_param("sched_debug", sched_domain_debug_setup); 5697early_param("sched_debug", sched_debug_setup);
5698
5699static inline bool sched_debug(void)
5700{
5701 return sched_debug_enabled;
5702}
5536 5703
5537static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5704static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5538 struct cpumask *groupmask) 5705 struct cpumask *groupmask)
@@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 break; 5739 break;
5573 } 5740 }
5574 5741
5575 if (!group->sgp->power) { 5742 /*
5743 * Even though we initialize ->power to something semi-sane,
5744 * we leave power_orig unset. This allows us to detect if
5745 * domain iteration is still funny without causing /0 traps.
5746 */
5747 if (!group->sgp->power_orig) {
5576 printk(KERN_CONT "\n"); 5748 printk(KERN_CONT "\n");
5577 printk(KERN_ERR "ERROR: domain->cpu_power not " 5749 printk(KERN_ERR "ERROR: domain->cpu_power not "
5578 "set\n"); 5750 "set\n");
@@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5620{ 5792{
5621 int level = 0; 5793 int level = 0;
5622 5794
5623 if (!sched_domain_debug_enabled) 5795 if (!sched_debug_enabled)
5624 return; 5796 return;
5625 5797
5626 if (!sd) { 5798 if (!sd) {
@@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5641} 5813}
5642#else /* !CONFIG_SCHED_DEBUG */ 5814#else /* !CONFIG_SCHED_DEBUG */
5643# define sched_domain_debug(sd, cpu) do { } while (0) 5815# define sched_domain_debug(sd, cpu) do { } while (0)
5816static inline bool sched_debug(void)
5817{
5818 return false;
5819}
5644#endif /* CONFIG_SCHED_DEBUG */ 5820#endif /* CONFIG_SCHED_DEBUG */
5645 5821
5646static int sd_degenerate(struct sched_domain *sd) 5822static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6138,44 @@ struct sched_domain_topology_level {
5962 struct sd_data data; 6138 struct sd_data data;
5963}; 6139};
5964 6140
6141/*
6142 * Build an iteration mask that can exclude certain CPUs from the upwards
6143 * domain traversal.
6144 *
6145 * Asymmetric node setups can result in situations where the domain tree is of
6146 * unequal depth, make sure to skip domains that already cover the entire
6147 * range.
6148 *
6149 * In that case build_sched_domains() will have terminated the iteration early
6150 * and our sibling sd spans will be empty. Domains should always include the
6151 * cpu they're built on, so check that.
6152 *
6153 */
6154static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6155{
6156 const struct cpumask *span = sched_domain_span(sd);
6157 struct sd_data *sdd = sd->private;
6158 struct sched_domain *sibling;
6159 int i;
6160
6161 for_each_cpu(i, span) {
6162 sibling = *per_cpu_ptr(sdd->sd, i);
6163 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6164 continue;
6165
6166 cpumask_set_cpu(i, sched_group_mask(sg));
6167 }
6168}
6169
6170/*
6171 * Return the canonical balance cpu for this group, this is the first cpu
6172 * of this group that's also in the iteration mask.
6173 */
6174int group_balance_cpu(struct sched_group *sg)
6175{
6176 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6177}
6178
5965static int 6179static int
5966build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6180build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5967{ 6181{
@@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5980 if (cpumask_test_cpu(i, covered)) 6194 if (cpumask_test_cpu(i, covered))
5981 continue; 6195 continue;
5982 6196
6197 child = *per_cpu_ptr(sdd->sd, i);
6198
6199 /* See the comment near build_group_mask(). */
6200 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6201 continue;
6202
5983 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6203 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5984 GFP_KERNEL, cpu_to_node(cpu)); 6204 GFP_KERNEL, cpu_to_node(cpu));
5985 6205
@@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5987 goto fail; 6207 goto fail;
5988 6208
5989 sg_span = sched_group_cpus(sg); 6209 sg_span = sched_group_cpus(sg);
5990
5991 child = *per_cpu_ptr(sdd->sd, i);
5992 if (child->child) { 6210 if (child->child) {
5993 child = child->child; 6211 child = child->child;
5994 cpumask_copy(sg_span, sched_domain_span(child)); 6212 cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5997 6215
5998 cpumask_or(covered, covered, sg_span); 6216 cpumask_or(covered, covered, sg_span);
5999 6217
6000 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6218 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6001 atomic_inc(&sg->sgp->ref); 6219 if (atomic_inc_return(&sg->sgp->ref) == 1)
6220 build_group_mask(sd, sg);
6002 6221
6003 if (cpumask_test_cpu(cpu, sg_span)) 6222 /*
6223 * Initialize sgp->power such that even if we mess up the
6224 * domains and no possible iteration will get us here, we won't
6225 * die on a /0 trap.
6226 */
6227 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6228
6229 /*
6230 * Make sure the first group of this domain contains the
6231 * canonical balance cpu. Otherwise the sched_domain iteration
6232 * breaks. See update_sg_lb_stats().
6233 */
6234 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6235 group_balance_cpu(sg) == cpu)
6004 groups = sg; 6236 groups = sg;
6005 6237
6006 if (!first) 6238 if (!first)
@@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6074 6306
6075 cpumask_clear(sched_group_cpus(sg)); 6307 cpumask_clear(sched_group_cpus(sg));
6076 sg->sgp->power = 0; 6308 sg->sgp->power = 0;
6309 cpumask_setall(sched_group_mask(sg));
6077 6310
6078 for_each_cpu(j, span) { 6311 for_each_cpu(j, span) {
6079 if (get_group(j, sdd, NULL) != group) 6312 if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6115 sg = sg->next; 6348 sg = sg->next;
6116 } while (sg != sd->groups); 6349 } while (sg != sd->groups);
6117 6350
6118 if (cpu != group_first_cpu(sg)) 6351 if (cpu != group_balance_cpu(sg))
6119 return; 6352 return;
6120 6353
6121 update_group_power(sd, cpu); 6354 update_group_power(sd, cpu);
@@ -6165,11 +6398,8 @@ int sched_domain_level_max;
6165 6398
6166static int __init setup_relax_domain_level(char *str) 6399static int __init setup_relax_domain_level(char *str)
6167{ 6400{
6168 unsigned long val; 6401 if (kstrtoint(str, 0, &default_relax_domain_level))
6169 6402 pr_warn("Unable to set relax_domain_level\n");
6170 val = simple_strtoul(str, NULL, 0);
6171 if (val < sched_domain_level_max)
6172 default_relax_domain_level = val;
6173 6403
6174 return 1; 6404 return 1;
6175} 6405}
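
Note: the old setup_relax_domain_level() parsed the value with simple_strtoul() and only accepted it when it was below sched_domain_level_max, which has not been computed yet when early params run, so the option effectively never took effect; the replacement simply parses with kstrtoint() and warns on bad input. A minimal user-space sketch of kstrtoint-style strict parsing (parse_int_strict and the example value are made up for illustration; the real helper lives in lib/kstrtox.c):

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Strict string-to-int: the whole string must be a number, no overflow.
     * Returns 0 on success, -EINVAL/-ERANGE otherwise, like kstrtoint(). */
    static int parse_int_strict(const char *s, int base, int *res)
    {
            char *end;
            long val;

            errno = 0;
            val = strtol(s, &end, base);
            if (end == s || *end != '\0')
                    return -EINVAL;         /* empty string or trailing junk */
            if (errno == ERANGE || val < INT_MIN || val > INT_MAX)
                    return -ERANGE;
            *res = (int)val;
            return 0;
    }

    int main(void)
    {
            int level;

            if (parse_int_strict("2", 0, &level))
                    fprintf(stderr, "Unable to set relax_domain_level\n");
            else
                    printf("relax_domain_level = %d\n", level);
            return 0;
    }
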
@@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
6279#ifdef CONFIG_NUMA 6509#ifdef CONFIG_NUMA
6280 6510
6281static int sched_domains_numa_levels; 6511static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance; 6512static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks; 6513static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level; 6514static int sched_domains_curr_level;
6286 6515
6287static inline int sd_local_flags(int level) 6516static inline int sd_local_flags(int level)
6288{ 6517{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6518 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6290 return 0; 6519 return 0;
6291 6520
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6521 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6573 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345} 6574}
6346 6575
6576static void sched_numa_warn(const char *str)
6577{
6578 static int done = false;
6579 int i,j;
6580
6581 if (done)
6582 return;
6583
6584 done = true;
6585
6586 printk(KERN_WARNING "ERROR: %s\n\n", str);
6587
6588 for (i = 0; i < nr_node_ids; i++) {
6589 printk(KERN_WARNING " ");
6590 for (j = 0; j < nr_node_ids; j++)
6591 printk(KERN_CONT "%02d ", node_distance(i,j));
6592 printk(KERN_CONT "\n");
6593 }
6594 printk(KERN_WARNING "\n");
6595}
6596
6597static bool find_numa_distance(int distance)
6598{
6599 int i;
6600
6601 if (distance == node_distance(0, 0))
6602 return true;
6603
6604 for (i = 0; i < sched_domains_numa_levels; i++) {
6605 if (sched_domains_numa_distance[i] == distance)
6606 return true;
6607 }
6608
6609 return false;
6610}
6611
6347static void sched_init_numa(void) 6612static void sched_init_numa(void)
6348{ 6613{
6349 int next_distance, curr_distance = node_distance(0, 0); 6614 int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6616,6 @@ static void sched_init_numa(void)
6351 int level = 0; 6616 int level = 0;
6352 int i, j, k; 6617 int i, j, k;
6353 6618
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6619 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance) 6620 if (!sched_domains_numa_distance)
6357 return; 6621 return;
@@ -6362,23 +6626,41 @@ static void sched_init_numa(void)
6362 * 6626 *
6363 * Assumes node_distance(0,j) includes all distances in 6627 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time. 6628 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */ 6629 */
6368 next_distance = curr_distance; 6630 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) { 6631 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) { 6632 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j); 6633 for (k = 0; k < nr_node_ids; k++) {
6372 if (distance > curr_distance && 6634 int distance = node_distance(i, k);
6373 (distance < next_distance || 6635
6374 next_distance == curr_distance)) 6636 if (distance > curr_distance &&
6375 next_distance = distance; 6637 (distance < next_distance ||
6638 next_distance == curr_distance))
6639 next_distance = distance;
6640
6641 /*
6642 * While not a strong assumption it would be nice to know
6643 * about cases where if node A is connected to B, B is not
6644 * equally connected to A.
6645 */
6646 if (sched_debug() && node_distance(k, i) != distance)
6647 sched_numa_warn("Node-distance not symmetric");
6648
6649 if (sched_debug() && i && !find_numa_distance(distance))
6650 sched_numa_warn("Node-0 not representative");
6651 }
6652 if (next_distance != curr_distance) {
6653 sched_domains_numa_distance[level++] = next_distance;
6654 sched_domains_numa_levels = level;
6655 curr_distance = next_distance;
6656 } else break;
6376 } 6657 }
6377 if (next_distance != curr_distance) { 6658
6378 sched_domains_numa_distance[level++] = next_distance; 6659 /*
6379 sched_domains_numa_levels = level; 6660 * In case of sched_debug() we verify the above assumption.
6380 curr_distance = next_distance; 6661 */
6381 } else break; 6662 if (!sched_debug())
6663 break;
6382 } 6664 }
6383 /* 6665 /*
6384 * 'level' contains the number of unique distances, excluding the 6666 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6685,7 @@ static void sched_init_numa(void)
6403 return; 6685 return;
6404 6686
6405 for (j = 0; j < nr_node_ids; j++) { 6687 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6688 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6407 if (!mask) 6689 if (!mask)
6408 return; 6690 return;
6409 6691
@@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6490 6772
6491 *per_cpu_ptr(sdd->sg, j) = sg; 6773 *per_cpu_ptr(sdd->sg, j) = sg;
6492 6774
6493 sgp = kzalloc_node(sizeof(struct sched_group_power), 6775 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6494 GFP_KERNEL, cpu_to_node(j)); 6776 GFP_KERNEL, cpu_to_node(j));
6495 if (!sgp) 6777 if (!sgp)
6496 return -ENOMEM; 6778 return -ENOMEM;
@@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6543 if (!sd) 6825 if (!sd)
6544 return child; 6826 return child;
6545 6827
6546 set_domain_attribute(sd, attr);
6547 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6828 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6548 if (child) { 6829 if (child) {
6549 sd->level = child->level + 1; 6830 sd->level = child->level + 1;
@@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6551 child->parent = sd; 6832 child->parent = sd;
6552 } 6833 }
6553 sd->child = child; 6834 sd->child = child;
6835 set_domain_attribute(sd, attr);
6554 6836
6555 return sd; 6837 return sd;
6556} 6838}
@@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6691 if (!doms_cur) 6973 if (!doms_cur)
6692 doms_cur = &fallback_doms; 6974 doms_cur = &fallback_doms;
6693 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6975 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6694 dattr_cur = NULL;
6695 err = build_sched_domains(doms_cur[0], NULL); 6976 err = build_sched_domains(doms_cur[0], NULL);
6696 register_sched_domain_sysctl(); 6977 register_sched_domain_sysctl();
6697 6978
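
Note: sched_init_numa() now derives its distance levels by scanning node distances from smallest to largest, and when sched_debug is enabled it additionally checks the whole node_distance(i, k) matrix, warning if the matrix is not symmetric or if row 0 does not contain every distance. A standalone sketch of the unique-distance scan over an invented 4-node SLIT-style table (the sketch always walks the full matrix and skips the debug warnings for brevity):

    #include <stdio.h>

    #define NR_NODES 4

    /* Hypothetical SLIT-style distance table: 10 = local, larger = farther. */
    static const int dist[NR_NODES][NR_NODES] = {
            { 10, 20, 20, 30 },
            { 20, 10, 30, 20 },
            { 20, 30, 10, 20 },
            { 30, 20, 20, 10 },
    };

    int main(void)
    {
            int levels[NR_NODES * NR_NODES];
            int nlevels = 0;
            int curr = dist[0][0];          /* local distance */

            for (;;) {
                    int next = curr;
                    int i, k;

                    /* smallest distance strictly greater than curr */
                    for (i = 0; i < NR_NODES; i++)
                            for (k = 0; k < NR_NODES; k++) {
                                    int d = dist[i][k];

                                    if (d > curr && (d < next || next == curr))
                                            next = d;
                            }
                    if (next == curr)
                            break;          /* no larger distance left */
                    levels[nlevels++] = next;
                    curr = next;
            }

            printf("%d NUMA levels beyond local:", nlevels);
            for (int i = 0; i < nlevels; i++)
                    printf(" %d", levels[i]);
            printf("\n");                   /* expected: 2 levels: 20 30 */
            return 0;
    }
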
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3503unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3504{ 3504{
3505 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3506 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507 3507
3508 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3514
3515 total = sched_avg_period() + (rq->clock - age_stamp);
3509 3516
3510 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3511 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3512 available = 0; 3519 available = 0;
3513 } else { 3520 } else {
3514 available = total - rq->rt_avg; 3521 available = total - avg;
3515 } 3522 }
3516 3523
3517 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3574 3581
3575 power = 0; 3582 power = 0;
3576 3583
3577 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3578 do { 3585 /*
3579 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3580 group = group->next; 3587 * span the current group.
3581 } while (group != child->groups); 3588 */
3582 3589
3583 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3584} 3606}
3585 3607
3586/* 3608/*
@@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3610 3632
3611/** 3633/**
3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3613 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3614 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3615 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3616 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
@@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3630 int i; 3652 int i;
3631 3653
3632 if (local_group) 3654 if (local_group)
3633 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3634 3656
3635 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3636 max_cpu_load = 0; 3658 max_cpu_load = 0;
@@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3645 3667
3646 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3647 if (local_group) { 3669 if (local_group) {
3648 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3649 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3650 balance_cpu = i; 3673 balance_cpu = i;
3651 } 3674 }
@@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3719 3742
3720/** 3743/**
3721 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3722 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3723 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3724 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3725 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3726 * @this_cpu: the current cpu
3727 * 3749 *
3728 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3729 * busiest group. 3751 * busiest group.
@@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3761 3783
3762/** 3784/**
3763 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3764 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3768 * @balance: Should we balance. 3788 * @balance: Should we balance.
3769 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3852 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
3853 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
3854 * 3874 *
3855 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
3856 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3858 * @imbalance: returns amount of imbalanced due to packing.
3859 */ 3877 */
3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3861{ 3879{
@@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3881 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3882 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
3883 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
3884 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3886 * @imbalance: Variable to store the imbalance.
3887 */ 3904 */
3888static inline 3905static inline
3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
@@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4026 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4027 * to restore balance. 4044 * to restore balance.
4028 * 4045 *
4029 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4030 * @this_cpu: The cpu for which load balancing is currently being performed.
4031 * @imbalance: Variable which stores amount of weighted load which should
4032 * be moved to restore balance/put a group to idle.
4033 * @idle: The idle status of this_cpu.
4034 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4035 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4036 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
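
Note: scale_rt_power() now snapshots rq->age_stamp and rq->rt_avg once with ACCESS_ONCE() before doing the underflow check, so the comparison and the later subtraction cannot observe two different values of a field that is updated concurrently without serialization. A minimal sketch of the read-once pattern (assumes GCC's typeof extension; the shared rt_avg variable and its caller are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    /* Classic kernel-style read-once: force a single load, no re-fetch. */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    static uint64_t rt_avg;         /* updated concurrently elsewhere */

    static uint64_t available_power(uint64_t total)
    {
            /* One load; the check and the subtraction use the same snapshot. */
            uint64_t avg = ACCESS_ONCE(rt_avg);

            if (total < avg)        /* keep (total - avg) from underflowing */
                    return 0;
            return total - avg;
    }

    int main(void)
    {
            rt_avg = 300;
            printf("%llu\n", (unsigned long long)available_power(1000)); /* 700 */
            printf("%llu\n", (unsigned long long)available_power(100));  /* 0 */
            return 0;
    }
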
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1817 * Only update if the process changes its state from whether it 1823 * Only update if the process changes its state from whether it
1818 * can migrate or not. 1824 * can migrate or not.
1819 */ 1825 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1821 return; 1827 return;
1822 1828
1823 rq = task_rq(p); 1829 rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1979 1985
1980static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1981{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1982 update_curr_rt(rq); 1990 update_curr_rt(rq);
1983 1991
1984 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
1997 2005
1998 /* 2006 /*
1999 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are the
2000 * on the queue: 2008 * only element on the queue
2001 */ 2009 */
2002 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2003 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2004 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2005 } 2016 }
2006} 2017}
2007 2018
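
Note: task_tick_rt() for SCHED_RR now walks every sched_rt_entity in the task's group hierarchy and requeues as soon as it finds a level with more than one queued entity; the test rt_se->run_list.prev != rt_se->run_list.next is true for a node on a circular doubly linked list exactly when at least one other node shares the list. A small sketch of that single-element test on a Linux-style list (list code reduced to the two helpers the example needs):

    #include <stdbool.h>
    #include <stdio.h>

    /* Minimal circular doubly linked list, in the style of <linux/list.h>. */
    struct list_head {
            struct list_head *next, *prev;
    };

    static void INIT_LIST_HEAD(struct list_head *h)
    {
            h->next = h;
            h->prev = h;
    }

    static void list_add_tail(struct list_head *new, struct list_head *head)
    {
            new->prev = head->prev;
            new->next = head;
            head->prev->next = new;
            head->prev = new;
    }

    /* True when @node shares its list with at least one other node. */
    static bool has_siblings(const struct list_head *node)
    {
            return node->prev != node->next;
    }

    int main(void)
    {
            struct list_head queue, a, b;

            INIT_LIST_HEAD(&queue);
            list_add_tail(&a, &queue);
            printf("alone : %d\n", has_siblings(&a));  /* 0: only element */

            list_add_tail(&b, &queue);
            printf("with b: %d\n", has_siblings(&a));  /* 1: requeue-worthy */
            return 0;
    }
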
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
@@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void)
940 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
941} 943}
942 944
943void calc_load_account_idle(struct rq *this_rq);
944
945#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
946 946
947/* 947/*
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e1a797e028a3..98f60c5caa1b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void)
31 per_cpu(idle_threads, smp_processor_id()) = current; 31 per_cpu(idle_threads, smp_processor_id()) = current;
32} 32}
33 33
34/**
35 * idle_init - Initialize the idle thread for a cpu
36 * @cpu: The cpu for which the idle thread should be initialized
37 *
38 * Creates the thread if it does not exist.
39 */
34static inline void idle_init(unsigned int cpu) 40static inline void idle_init(unsigned int cpu)
35{ 41{
36 struct task_struct *tsk = per_cpu(idle_threads, cpu); 42 struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu)
45} 51}
46 52
47/** 53/**
48 * idle_thread_init - Initialize the idle thread for a cpu 54 * idle_threads_init - Initialize idle threads for all cpus
49 * @cpu: The cpu for which the idle thread should be initialized
50 *
51 * Creates the thread if it does not exist.
52 */ 55 */
53void __init idle_threads_init(void) 56void __init idle_threads_init(void)
54{ 57{
55 unsigned int cpu; 58 unsigned int cpu, boot_cpu;
59
60 boot_cpu = smp_processor_id();
56 61
57 for_each_possible_cpu(cpu) { 62 for_each_possible_cpu(cpu) {
58 if (cpu != smp_processor_id()) 63 if (cpu != boot_cpu)
59 idle_init(cpu); 64 idle_init(cpu);
60 } 65 }
61} 66}
diff --git a/kernel/sys.c b/kernel/sys.c
index 9ff89cb9657a..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,27 +1786,12 @@ SYSCALL_DEFINE1(umask, int, mask)
1786} 1786}
1787 1787
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static bool vma_flags_mismatch(struct vm_area_struct *vma,
1790 unsigned long required,
1791 unsigned long banned)
1792{
1793 return (vma->vm_flags & required) != required ||
1794 (vma->vm_flags & banned);
1795}
1796
1797static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1798{ 1790{
1799 struct file *exe_file; 1791 struct file *exe_file;
1800 struct dentry *dentry; 1792 struct dentry *dentry;
1801 int err; 1793 int err;
1802 1794
1803 /*
1804 * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
1805 * remain. So perform a quick test first.
1806 */
1807 if (mm->num_exe_file_vmas)
1808 return -EBUSY;
1809
1810 exe_file = fget(fd); 1795 exe_file = fget(fd);
1811 if (!exe_file) 1796 if (!exe_file)
1812 return -EBADF; 1797 return -EBADF;
@@ -1827,17 +1812,35 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1827 if (err) 1812 if (err)
1828 goto exit; 1813 goto exit;
1829 1814
1815 down_write(&mm->mmap_sem);
1816
1817 /*
1818 * Forbid mm->exe_file change if old file still mapped.
1819 */
1820 err = -EBUSY;
1821 if (mm->exe_file) {
1822 struct vm_area_struct *vma;
1823
1824 for (vma = mm->mmap; vma; vma = vma->vm_next)
1825 if (vma->vm_file &&
1826 path_equal(&vma->vm_file->f_path,
1827 &mm->exe_file->f_path))
1828 goto exit_unlock;
1829 }
1830
1830 /* 1831 /*
1831 * The symlink can be changed only once, just to disallow arbitrary 1832 * The symlink can be changed only once, just to disallow arbitrary
1832 * transitions malicious software might bring in. This means one 1833 * transitions malicious software might bring in. This means one
1833 * could make a snapshot over all processes running and monitor 1834 * could make a snapshot over all processes running and monitor
1834 * /proc/pid/exe changes to notice unusual activity if needed. 1835 * /proc/pid/exe changes to notice unusual activity if needed.
1835 */ 1836 */
1836 down_write(&mm->mmap_sem); 1837 err = -EPERM;
1837 if (likely(!mm->exe_file)) 1838 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1838 set_mm_exe_file(mm, exe_file); 1839 goto exit_unlock;
1839 else 1840
1840 err = -EBUSY; 1841 err = 0;
1842 set_mm_exe_file(mm, exe_file);
1843exit_unlock:
1841 up_write(&mm->mmap_sem); 1844 up_write(&mm->mmap_sem);
1842 1845
1843exit: 1846exit:
@@ -1862,7 +1865,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1862 if (opt == PR_SET_MM_EXE_FILE) 1865 if (opt == PR_SET_MM_EXE_FILE)
1863 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1866 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1864 1867
1865 if (addr >= TASK_SIZE) 1868 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1866 return -EINVAL; 1869 return -EINVAL;
1867 1870
1868 error = -EINVAL; 1871 error = -EINVAL;
@@ -1924,12 +1927,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
1924 error = -EFAULT; 1927 error = -EFAULT;
1925 goto out; 1928 goto out;
1926 } 1929 }
1927#ifdef CONFIG_STACK_GROWSUP
1928 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
1929#else
1930 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
1931#endif
1932 goto out;
1933 if (opt == PR_SET_MM_START_STACK) 1930 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr; 1931 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START) 1932 else if (opt == PR_SET_MM_ARG_START)
@@ -1981,12 +1978,22 @@ out:
1981 up_read(&mm->mmap_sem); 1978 up_read(&mm->mmap_sem);
1982 return error; 1979 return error;
1983} 1980}
1981
1982static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1983{
1984 return put_user(me->clear_child_tid, tid_addr);
1985}
1986
1984#else /* CONFIG_CHECKPOINT_RESTORE */ 1987#else /* CONFIG_CHECKPOINT_RESTORE */
1985static int prctl_set_mm(int opt, unsigned long addr, 1988static int prctl_set_mm(int opt, unsigned long addr,
1986 unsigned long arg4, unsigned long arg5) 1989 unsigned long arg4, unsigned long arg5)
1987{ 1990{
1988 return -EINVAL; 1991 return -EINVAL;
1989} 1992}
1993static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1994{
1995 return -EINVAL;
1996}
1990#endif 1997#endif
1991 1998
1992SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1999SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2141,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2141 case PR_SET_MM: 2148 case PR_SET_MM:
2142 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2149 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2143 break; 2150 break;
2151 case PR_GET_TID_ADDRESS:
2152 error = prctl_get_tid_address(me, (int __user **)arg2);
2153 break;
2144 case PR_SET_CHILD_SUBREAPER: 2154 case PR_SET_CHILD_SUBREAPER:
2145 me->signal->is_child_subreaper = !!arg2; 2155 me->signal->is_child_subreaper = !!arg2;
2146 error = 0; 2156 error = 0;
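
Note: prctl_set_mm_exe_file() now takes mmap_sem, refuses the change while any VMA still maps the old exe file (comparing file paths with path_equal()), and lets the /proc/pid/exe symlink be switched at most once per mm by test-and-setting MMF_EXE_FILE_CHANGED. A minimal sketch of the one-shot guard using C11 atomics (the function, flag and paths are invented; the kernel uses test_and_set_bit() on mm->flags):

    #include <stdatomic.h>
    #include <stdio.h>

    /* One-shot latch: the first caller wins, every later caller is rejected. */
    static atomic_flag exe_file_changed = ATOMIC_FLAG_INIT;

    static int change_exe_file_once(const char *path)
    {
            if (atomic_flag_test_and_set(&exe_file_changed))
                    return -1;      /* already changed once: reject (-EPERM) */

            printf("exe symlink now points at %s\n", path);
            return 0;
    }

    int main(void)
    {
            printf("first : %d\n", change_exe_file_once("/opt/app/bin/app"));
            printf("second: %d\n", change_exe_file_once("/tmp/evil")); /* rejected */
            return 0;
    }
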
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
297} 297}
298EXPORT_SYMBOL_GPL(clockevents_register_device); 298EXPORT_SYMBOL_GPL(clockevents_register_device);
299 299
300static void clockevents_config(struct clock_event_device *dev, 300void clockevents_config(struct clock_event_device *dev, u32 freq)
301 u32 freq)
302{ 301{
303 u64 sec; 302 u64 sec;
304 303
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
409 time_state = TIME_DEL; 409 time_state = TIME_DEL;
410 break; 410 break;
411 case TIME_INS: 411 case TIME_INS:
412 if (secs % 86400 == 0) { 412 if (!(time_status & STA_INS))
413 time_state = TIME_OK;
414 else if (secs % 86400 == 0) {
413 leap = -1; 415 leap = -1;
414 time_state = TIME_OOP; 416 time_state = TIME_OOP;
415 time_tai++; 417 time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
418 } 420 }
419 break; 421 break;
420 case TIME_DEL: 422 case TIME_DEL:
421 if ((secs + 1) % 86400 == 0) { 423 if (!(time_status & STA_DEL))
424 time_state = TIME_OK;
425 else if ((secs + 1) % 86400 == 0) {
422 leap = 1; 426 leap = 1;
423 time_tai--; 427 time_tai--;
424 time_state = TIME_WAIT; 428 time_state = TIME_WAIT;
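
Note: second_overflow() now falls back to TIME_OK when the STA_INS or STA_DEL status bit has been cleared after the state machine was already armed, so a leap second that gets cancelled via adjtimex() no longer fires; the insertion itself still only triggers on the last second of a UTC day (secs % 86400 == 0). A reduced sketch of the TIME_INS branch (the trimmed-down enum and helper are illustrative, not the kernel's full NTP state machine):

    #include <stdio.h>

    #define STA_INS 0x0010                  /* "insert leap second" status bit */

    enum time_state { TIME_OK, TIME_INS, TIME_OOP };

    /* Returns the leap adjustment (-1 = repeat a second) and updates state. */
    static int second_overflow_ins(enum time_state *state,
                                   unsigned long secs, int status)
    {
            if (*state != TIME_INS)
                    return 0;
            if (!(status & STA_INS)) {      /* leap was cancelled meanwhile */
                    *state = TIME_OK;
                    return 0;
            }
            if (secs % 86400 == 0) {        /* last second of the UTC day */
                    *state = TIME_OOP;
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            enum time_state st = TIME_INS;

            /* 86400 * 15887 is midnight UTC of some day: the leap fires. */
            printf("leap = %d\n", second_overflow_ins(&st, 86400UL * 15887, STA_INS));

            st = TIME_INS;                  /* armed, but STA_INS since cleared */
            printf("leap = %d\n", second_overflow_ins(&st, 86400UL * 15887, 0));
            return 0;
    }
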
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..4a08472c3ca7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
274static void tick_nohz_stop_sched_tick(struct tick_sched *ts) 274static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
275{ 275{
276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
277 unsigned long rcu_delta_jiffies;
277 ktime_t last_update, expires, now; 278 ktime_t last_update, expires, now;
278 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
279 u64 time_delta; 280 u64 time_delta;
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
322 time_delta = timekeeping_max_deferment(); 323 time_delta = timekeeping_max_deferment();
323 } while (read_seqretry(&xtime_lock, seq)); 324 } while (read_seqretry(&xtime_lock, seq));
324 325
325 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 326 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
326 arch_needs_cpu(cpu)) { 327 arch_needs_cpu(cpu)) {
327 next_jiffies = last_jiffies + 1; 328 next_jiffies = last_jiffies + 1;
328 delta_jiffies = 1; 329 delta_jiffies = 1;
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
330 /* Get the next timer wheel timer */ 331 /* Get the next timer wheel timer */
331 next_jiffies = get_next_timer_interrupt(last_jiffies); 332 next_jiffies = get_next_timer_interrupt(last_jiffies);
332 delta_jiffies = next_jiffies - last_jiffies; 333 delta_jiffies = next_jiffies - last_jiffies;
334 if (rcu_delta_jiffies < delta_jiffies) {
335 next_jiffies = last_jiffies + rcu_delta_jiffies;
336 delta_jiffies = rcu_delta_jiffies;
337 }
333 } 338 }
334 /* 339 /*
335 * Do not stop the tick, if we are only one off 340 * Do not stop the tick, if we are only one off
@@ -401,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
401 */ 406 */
402 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
403 select_nohz_load_balancer(1); 408 select_nohz_load_balancer(1);
409 calc_load_enter_idle();
404 410
405 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 411 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
406 ts->tick_stopped = 1; 412 ts->tick_stopped = 1;
@@ -576,6 +582,7 @@ void tick_nohz_idle_exit(void)
576 /* Update jiffies first */ 582 /* Update jiffies first */
577 select_nohz_load_balancer(0); 583 select_nohz_load_balancer(0);
578 tick_do_update_jiffies64(now); 584 tick_do_update_jiffies64(now);
585 update_cpu_load_nohz();
579 586
580#ifndef CONFIG_VIRT_CPU_ACCOUNTING 587#ifndef CONFIG_VIRT_CPU_ACCOUNTING
581 /* 588 /*
@@ -591,6 +598,7 @@ void tick_nohz_idle_exit(void)
591 account_idle_ticks(ticks); 598 account_idle_ticks(ticks);
592#endif 599#endif
593 600
601 calc_load_exit_idle();
594 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog();
595 /* 603 /*
596 * Cancel the scheduled timer and restore the tick 604 * Cancel the scheduled timer and restore the tick
@@ -814,6 +822,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
814 return HRTIMER_RESTART; 822 return HRTIMER_RESTART;
815} 823}
816 824
825static int sched_skew_tick;
826
827static int __init skew_tick(char *str)
828{
829 get_option(&str, &sched_skew_tick);
830
831 return 0;
832}
833early_param("skew_tick", skew_tick);
834
817/** 835/**
818 * tick_setup_sched_timer - setup the tick emulation timer 836 * tick_setup_sched_timer - setup the tick emulation timer
819 */ 837 */
@@ -831,6 +849,14 @@ void tick_setup_sched_timer(void)
831 /* Get the next period (per cpu) */ 849 /* Get the next period (per cpu) */
832 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 850 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
833 851
852 /* Offset the tick to avert xtime_lock contention. */
853 if (sched_skew_tick) {
854 u64 offset = ktime_to_ns(tick_period) >> 1;
855 do_div(offset, num_possible_cpus());
856 offset *= smp_processor_id();
857 hrtimer_add_expires_ns(&ts->sched_timer, offset);
858 }
859
834 for (;;) { 860 for (;;) {
835 hrtimer_forward(&ts->sched_timer, now, tick_period); 861 hrtimer_forward(&ts->sched_timer, now, tick_period);
836 hrtimer_start_expires(&ts->sched_timer, 862 hrtimer_start_expires(&ts->sched_timer,
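
Note: the new skew_tick boot parameter staggers each CPU's periodic tick instead of firing them all at the same instant, spreading the timers across half a tick period: offset = (tick_period / 2 / num_possible_cpus()) * cpu. A small sketch of that arithmetic for a hypothetical HZ=1000, 8-CPU machine (all values invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const uint64_t tick_period_ns = 1000000;        /* HZ=1000 -> 1 ms tick */
            const unsigned int nr_cpus = 8;                 /* hypothetical box */
            uint64_t step = (tick_period_ns >> 1) / nr_cpus; /* spread over half a period */

            for (unsigned int cpu = 0; cpu < nr_cpus; cpu++)
                    printf("cpu%u tick offset: %llu ns\n",
                           cpu, (unsigned long long)(step * cpu));
            /* cpu0: 0, cpu1: 62500, ..., cpu7: 437500 -- all within one 1 ms period */
            return 0;
    }
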
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6e46cacf5969..3447cfaf11e7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,12 @@ struct timekeeper {
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ 70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time; 71 struct timespec raw_time;
72 72
73 /* Offset clock monotonic -> clock realtime */
74 ktime_t offs_real;
75
76 /* Offset clock monotonic -> clock boottime */
77 ktime_t offs_boot;
78
73 /* Seqlock for all timekeeper values */ 79 /* Seqlock for all timekeeper values */
74 seqlock_t lock; 80 seqlock_t lock;
75}; 81};
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void)
172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 178 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173} 179}
174 180
181static void update_rt_offset(void)
182{
183 struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic;
184
185 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
186 timekeeper.offs_real = timespec_to_ktime(tmp);
187}
188
175/* must hold write on timekeeper.lock */ 189/* must hold write on timekeeper.lock */
176static void timekeeping_update(bool clearntp) 190static void timekeeping_update(bool clearntp)
177{ 191{
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp)
179 timekeeper.ntp_error = 0; 193 timekeeper.ntp_error = 0;
180 ntp_clear(); 194 ntp_clear();
181 } 195 }
196 update_rt_offset();
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, 197 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult); 198 timekeeper.clock, timekeeper.mult);
184} 199}
@@ -604,6 +619,7 @@ void __init timekeeping_init(void)
604 } 619 }
605 set_normalized_timespec(&timekeeper.wall_to_monotonic, 620 set_normalized_timespec(&timekeeper.wall_to_monotonic,
606 -boot.tv_sec, -boot.tv_nsec); 621 -boot.tv_sec, -boot.tv_nsec);
622 update_rt_offset();
607 timekeeper.total_sleep_time.tv_sec = 0; 623 timekeeper.total_sleep_time.tv_sec = 0;
608 timekeeper.total_sleep_time.tv_nsec = 0; 624 timekeeper.total_sleep_time.tv_nsec = 0;
609 write_sequnlock_irqrestore(&timekeeper.lock, flags); 625 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +628,12 @@ void __init timekeeping_init(void)
612/* time in seconds when suspend began */ 628/* time in seconds when suspend began */
613static struct timespec timekeeping_suspend_time; 629static struct timespec timekeeping_suspend_time;
614 630
631static void update_sleep_time(struct timespec t)
632{
633 timekeeper.total_sleep_time = t;
634 timekeeper.offs_boot = timespec_to_ktime(t);
635}
636
615/** 637/**
616 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 638 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
617 * @delta: pointer to a timespec delta value 639 * @delta: pointer to a timespec delta value
@@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
630 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); 652 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
631 timekeeper.wall_to_monotonic = 653 timekeeper.wall_to_monotonic =
632 timespec_sub(timekeeper.wall_to_monotonic, *delta); 654 timespec_sub(timekeeper.wall_to_monotonic, *delta);
633 timekeeper.total_sleep_time = timespec_add( 655 update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta));
634 timekeeper.total_sleep_time, *delta);
635} 656}
636 657
637 658
@@ -696,6 +717,7 @@ static void timekeeping_resume(void)
696 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 717 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
697 timekeeper.ntp_error = 0; 718 timekeeper.ntp_error = 0;
698 timekeeping_suspended = 0; 719 timekeeping_suspended = 0;
720 timekeeping_update(false);
699 write_sequnlock_irqrestore(&timekeeper.lock, flags); 721 write_sequnlock_irqrestore(&timekeeper.lock, flags);
700 722
701 touch_softlockup_watchdog(); 723 touch_softlockup_watchdog();
@@ -962,6 +984,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
962 timekeeper.xtime.tv_sec++; 984 timekeeper.xtime.tv_sec++;
963 leap = second_overflow(timekeeper.xtime.tv_sec); 985 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap; 986 timekeeper.xtime.tv_sec += leap;
987 timekeeper.wall_to_monotonic.tv_sec -= leap;
988 if (leap)
989 clock_was_set_delayed();
965 } 990 }
966 991
967 /* Accumulate raw time */ 992 /* Accumulate raw time */
@@ -1077,6 +1102,9 @@ static void update_wall_time(void)
1077 timekeeper.xtime.tv_sec++; 1102 timekeeper.xtime.tv_sec++;
1078 leap = second_overflow(timekeeper.xtime.tv_sec); 1103 leap = second_overflow(timekeeper.xtime.tv_sec);
1079 timekeeper.xtime.tv_sec += leap; 1104 timekeeper.xtime.tv_sec += leap;
1105 timekeeper.wall_to_monotonic.tv_sec -= leap;
1106 if (leap)
1107 clock_was_set_delayed();
1080 } 1108 }
1081 1109
1082 timekeeping_update(false); 1110 timekeeping_update(false);
@@ -1244,6 +1272,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1244 } while (read_seqretry(&timekeeper.lock, seq)); 1272 } while (read_seqretry(&timekeeper.lock, seq));
1245} 1273}
1246 1274
1275#ifdef CONFIG_HIGH_RES_TIMERS
1276/**
1277 * ktime_get_update_offsets - hrtimer helper
1278 * @offs_real: pointer to storage for monotonic -> realtime offset
1279 * @offs_boot: pointer to storage for monotonic -> boottime offset
1280 *
1281 * Returns current monotonic time and updates the offsets
1282 * Called from hrtimer_interrupt() or retrigger_next_event()
1283 */
1284ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1285{
1286 ktime_t now;
1287 unsigned int seq;
1288 u64 secs, nsecs;
1289
1290 do {
1291 seq = read_seqbegin(&timekeeper.lock);
1292
1293 secs = timekeeper.xtime.tv_sec;
1294 nsecs = timekeeper.xtime.tv_nsec;
1295 nsecs += timekeeping_get_ns();
1296 /* If arch requires, add in gettimeoffset() */
1297 nsecs += arch_gettimeoffset();
1298
1299 *offs_real = timekeeper.offs_real;
1300 *offs_boot = timekeeper.offs_boot;
1301 } while (read_seqretry(&timekeeper.lock, seq));
1302
1303 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1304 now = ktime_sub(now, *offs_real);
1305 return now;
1306}
1307#endif
1308
1247/** 1309/**
1248 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1310 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1249 */ 1311 */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..f765465bffe4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1075 rb_init_page(bpage->page); 1075 rb_init_page(bpage->page);
1076 1076
1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1078 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1078 1079
1079 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1080 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1080 if (ret < 0) 1081 if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1346 * If something was added to this page, it was full 1347 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the 1348 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here. 1349 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is 1350 * Increment overrun to account for the lost events.
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */ 1351 */
1352 local_add(page_entries, &cpu_buffer->overrun);
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); 1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 } 1354 }
1355 1355
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 288488082224..a7fa0702be1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
371void tracing_off(void) 371void tracing_off(void)
372{ 372{
373 if (global_trace.buffer) 373 if (global_trace.buffer)
374 ring_buffer_record_on(global_trace.buffer); 374 ring_buffer_record_off(global_trace.buffer);
375 /* 375 /*
376 * This flag is only looked at when buffers haven't been 376 * This flag is only looked at when buffers haven't been
377 * allocated yet. We don't really care about the race 377 * allocated yet. We don't really care about the race
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e5e1d85b8c7c..4b1dfba70f7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -372,6 +372,13 @@ static int watchdog(void *unused)
372 372
373 373
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 374#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/*
376 * People like the simple clean cpu node info on boot.
377 * Reduce the watchdog noise by only printing messages
378 * that are different from what cpu0 displayed.
379 */
380static unsigned long cpu0_err;
381
375static int watchdog_nmi_enable(int cpu) 382static int watchdog_nmi_enable(int cpu)
376{ 383{
377 struct perf_event_attr *wd_attr; 384 struct perf_event_attr *wd_attr;
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu)
390 397
391 /* Try to register using hardware perf events */ 398 /* Try to register using hardware perf events */
392 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 399 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
400
401 /* save cpu0 error for future comparison */
402 if (cpu == 0 && IS_ERR(event))
403 cpu0_err = PTR_ERR(event);
404
393 if (!IS_ERR(event)) { 405 if (!IS_ERR(event)) {
394 pr_info("enabled, takes one hw-pmu counter.\n"); 406 /* only print for cpu0 or different than cpu0 */
407 if (cpu == 0 || cpu0_err)
408 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
395 goto out_save; 409 goto out_save;
396 } 410 }
397 411
412 /* skip displaying the same error again */
413 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
414 return PTR_ERR(event);
398 415
399 /* vary the KERN level based on the returned errno */ 416 /* vary the KERN level based on the returned errno */
400 if (PTR_ERR(event) == -EOPNOTSUPP) 417 if (PTR_ERR(event) == -EOPNOTSUPP)
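
Note: watchdog_nmi_enable() now records the perf_event_create_kernel_counter() error seen on cpu 0 and stays silent on every other CPU that fails the same way, so a machine without a usable PMU logs the problem once instead of once per CPU. A small sketch of that duplicate-suppression pattern (the report function and the -95/EOPNOTSUPP scenario are illustrative):

    #include <stdio.h>

    static long cpu0_err;   /* error observed on cpu 0, 0 if it succeeded */

    /* Print a probe result, but skip errors that merely repeat cpu0's. */
    static void report_probe(int cpu, long err)
    {
            if (cpu == 0)
                    cpu0_err = err;

            if (!err) {
                    /* announce success once (or after cpu0 failed oddly) */
                    if (cpu == 0 || cpu0_err)
                            printf("watchdog: enabled, consumes one hw-PMU counter\n");
                    return;
            }
            if (cpu > 0 && err == cpu0_err)
                    return;         /* same failure as cpu0: stay quiet */

            printf("watchdog: cpu%d probe failed with error %ld\n", cpu, err);
    }

    int main(void)
    {
            /* A box whose PMU probe fails everywhere with the same -95 (EOPNOTSUPP). */
            for (int cpu = 0; cpu < 4; cpu++)
                    report_probe(cpu, -95);
            return 0;
    }
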