author     Thomas Gleixner <tglx@linutronix.de>   2012-07-15 04:24:53 -0400
committer  Thomas Gleixner <tglx@linutronix.de>   2012-07-15 04:24:53 -0400
commit     e8b9dd7e2471b1274e3be719fcc385e0a710e46f
tree       030d7ce20e8f8767d9423f78c102aba089eec372 /kernel
parent     924412f66fd9d21212e560a93792b0b607d46c6e
parent     6b1859dba01c7d512b72d77e3fd7da8354235189
Merge branch 'timers/urgent' into timers/core
Reason: Update to upstream changes to avoid further conflicts.

Fixup a trivial merge conflict in kernel/time/tick-sched.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c  36
-rw-r--r--  kernel/events/core.c  10
-rw-r--r--  kernel/exit.c  19
-rw-r--r--  kernel/fork.c  11
-rw-r--r--  kernel/hrtimer.c  53
-rw-r--r--  kernel/panic.c  6
-rw-r--r--  kernel/pid_namespace.c  20
-rw-r--r--  kernel/printk.c  670
-rw-r--r--  kernel/rcutree.c  17
-rw-r--r--  kernel/rcutree.h  15
-rw-r--r--  kernel/rcutree_plugin.h  179
-rw-r--r--  kernel/relay.c  5
-rw-r--r--  kernel/sched/core.c  276
-rw-r--r--  kernel/sched/idle_task.c  1
-rw-r--r--  kernel/sched/sched.h  2
-rw-r--r--  kernel/sys.c  22
-rw-r--r--  kernel/time/ntp.c  8
-rw-r--r--  kernel/time/tick-sched.c  9
-rw-r--r--  kernel/time/timekeeping.c  63
-rw-r--r--  kernel/trace/ring_buffer.c  6
-rw-r--r--  kernel/trace/trace.c  8
-rw-r--r--  kernel/watchdog.c  19
22 files changed, 1061 insertions, 394 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 72fcd3069a90..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
255 255
256EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
257 257
258static int css_unbias_refcnt(int refcnt)
259{
260 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
261}
262
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */ 263/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css) 264static int css_refcnt(struct cgroup_subsys_state *css)
260{ 265{
261 int v = atomic_read(&css->refcnt); 266 int v = atomic_read(&css->refcnt);
262 267
263 return v >= 0 ? v : v - CSS_DEACT_BIAS; 268 return css_unbias_refcnt(v);
264} 269}
265 270
266/* convenient tests for these bits */ 271/* convenient tests for these bits */
@@ -896,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
896 mutex_unlock(&cgroup_mutex); 901 mutex_unlock(&cgroup_mutex);
897 902
898 /* 903 /*
899 * We want to drop the active superblock reference from the 904 * Drop the active superblock reference that we took when we
900 * cgroup creation after all the dentry refs are gone - 905 * created the cgroup
901 * kill_sb gets mighty unhappy otherwise. Mark
902 * dentry->d_fsdata with cgroup_diput() to tell
903 * cgroup_d_release() to call deactivate_super().
904 */ 906 */
905 dentry->d_fsdata = cgroup_diput; 907 deactivate_super(cgrp->root->sb);
906 908
907 /* 909 /*
908 * if we're getting rid of the cgroup, refcount should ensure 910 * if we're getting rid of the cgroup, refcount should ensure
@@ -928,13 +930,6 @@ static int cgroup_delete(const struct dentry *d)
928 return 1; 930 return 1;
929} 931}
930 932
931static void cgroup_d_release(struct dentry *dentry)
932{
933 /* did cgroup_diput() tell me to deactivate super? */
934 if (dentry->d_fsdata == cgroup_diput)
935 deactivate_super(dentry->d_sb);
936}
937
938static void remove_dir(struct dentry *d) 933static void remove_dir(struct dentry *d)
939{ 934{
940 struct dentry *parent = dget(d->d_parent); 935 struct dentry *parent = dget(d->d_parent);
@@ -1542,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1542 static const struct dentry_operations cgroup_dops = { 1537 static const struct dentry_operations cgroup_dops = {
1543 .d_iput = cgroup_diput, 1538 .d_iput = cgroup_diput,
1544 .d_delete = cgroup_delete, 1539 .d_delete = cgroup_delete,
1545 .d_release = cgroup_d_release,
1546 }; 1540 };
1547 1541
1548 struct inode *inode = 1542 struct inode *inode =
@@ -3889,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work)
3889{ 3883{
3890 struct cgroup_subsys_state *css = 3884 struct cgroup_subsys_state *css =
3891 container_of(work, struct cgroup_subsys_state, dput_work); 3885 container_of(work, struct cgroup_subsys_state, dput_work);
3886 struct dentry *dentry = css->cgroup->dentry;
3887 struct super_block *sb = dentry->d_sb;
3892 3888
3893 dput(css->cgroup->dentry); 3889 atomic_inc(&sb->s_active);
3890 dput(dentry);
3891 deactivate_super(sb);
3894} 3892}
3895 3893
3896static void init_cgroup_css(struct cgroup_subsys_state *css, 3894static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4982,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
4982void __css_put(struct cgroup_subsys_state *css) 4980void __css_put(struct cgroup_subsys_state *css)
4983{ 4981{
4984 struct cgroup *cgrp = css->cgroup; 4982 struct cgroup *cgrp = css->cgroup;
4983 int v;
4985 4984
4986 rcu_read_lock(); 4985 rcu_read_lock();
4987 atomic_dec(&css->refcnt); 4986 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4988 switch (css_refcnt(css)) { 4987
4988 switch (v) {
4989 case 1: 4989 case 1:
4990 if (notify_on_release(cgrp)) { 4990 if (notify_on_release(cgrp)) {
4991 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4991 set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b333..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
253 return !event->cgrp || event->cgrp == cpuctx->cgrp; 253 return !event->cgrp || event->cgrp == cpuctx->cgrp;
254} 254}
255 255
256static inline void perf_get_cgroup(struct perf_event *event) 256static inline bool perf_tryget_cgroup(struct perf_event *event)
257{ 257{
258 css_get(&event->cgrp->css); 258 return css_tryget(&event->cgrp->css);
259} 259}
260 260
261static inline void perf_put_cgroup(struct perf_event *event) 261static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
484 event->cgrp = cgrp; 484 event->cgrp = cgrp;
485 485
486 /* must be done before we fput() the file */ 486 /* must be done before we fput() the file */
487 perf_get_cgroup(event); 487 if (!perf_tryget_cgroup(event)) {
488 event->cgrp = NULL;
489 ret = -ENOENT;
490 goto out;
491 }
488 492
489 /* 493 /*
490 * all events in a group must monitor 494 * all events in a group must monitor
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
75 } 87 }
76 list_del_rcu(&p->thread_group); 88 list_del_rcu(&p->thread_group);
77} 89}
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
643 mm_release(tsk, mm); 655 mm_release(tsk, mm);
644 if (!mm) 656 if (!mm)
645 return; 657 return;
658 sync_mm_rss(mm);
646 /* 659 /*
647 * Serialize with any possible pending coredump. 660 * Serialize with any possible pending coredump.
648 * We must hold mmap_sem around checking core_state 661 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
719 732
720 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
721 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
722 /*
723 * We can not clear ->child_reaper or leave it alone.
724 * There may by stealth EXIT_DEAD tasks on ->children,
725 * forget_original_parent() must move them somewhere.
726 */
727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) { 735 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper; 736 struct task_struct *reaper;
730 737
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
304 } 304 }
305 305
306 err = arch_dup_task_struct(tsk, orig); 306 err = arch_dup_task_struct(tsk, orig);
307 if (err)
308 goto out;
309 307
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
310 tsk->stack = ti; 312 tsk->stack = ti;
311
312 setup_thread_stack(tsk, orig); 313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317
313 clear_user_return_notifier(tsk); 318 clear_user_return_notifier(tsk);
314 clear_tsk_need_resched(tsk); 319 clear_tsk_need_resched(tsk);
315 stackend = end_of_stack(tsk); 320 stackend = end_of_stack(tsk);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
660/* 668/*
661 * Retrigger next event is called after clock was set 669 * Retrigger next event is called after clock was set
662 * 670 *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
665static void retrigger_next_event(void *arg) 673static void retrigger_next_event(void *arg)
666{ 674{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669 676
670 if (!hrtimer_hres_active()) 677 if (!hrtimer_hres_active())
671 return; 678 return;
672 679
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock); 680 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset = 681 hrtimer_update_base(base);
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0); 682 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock); 683 raw_spin_unlock(&base->lock);
686} 684}
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
710 base->clock_base[i].resolution = KTIME_HIGH_RES; 708 base->clock_base[i].resolution = KTIME_HIGH_RES;
711 709
712 tick_setup_sched_timer(); 710 tick_setup_sched_timer();
713
714 /* "Retrigger" the interrupt to get things going */ 711 /* "Retrigger" the interrupt to get things going */
715 retrigger_next_event(NULL); 712 retrigger_next_event(NULL);
716 local_irq_restore(flags); 713 local_irq_restore(flags);
717 return 1; 714 return 1;
718} 715}
719 716
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
720#else 730#else
721 731
722static inline int hrtimer_hres_active(void) { return 0; } 732static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1250 cpu_base->nr_events++; 1260 cpu_base->nr_events++;
1251 dev->next_event.tv64 = KTIME_MAX; 1261 dev->next_event.tv64 = KTIME_MAX;
1252 1262
1253 entry_time = now = ktime_get(); 1263 raw_spin_lock(&cpu_base->lock);
1264 entry_time = now = hrtimer_update_base(cpu_base);
1254retry: 1265retry:
1255 expires_next.tv64 = KTIME_MAX; 1266 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1258 /* 1267 /*
1259 * We set expires_next to KTIME_MAX here with cpu_base->lock 1268 * We set expires_next to KTIME_MAX here with cpu_base->lock
1260 * held to prevent that a timer is enqueued in our queue via 1269 * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
1330 * We need to prevent that we loop forever in the hrtimer 1339 * We need to prevent that we loop forever in the hrtimer
1331 * interrupt routine. We give it 3 attempts to avoid 1340 * interrupt routine. We give it 3 attempts to avoid
1332 * overreacting on some spurious event. 1341 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1333 */ 1345 */
1334 now = ktime_get(); 1346 raw_spin_lock(&cpu_base->lock);
1347 now = hrtimer_update_base(cpu_base);
1335 cpu_base->nr_retries++; 1348 cpu_base->nr_retries++;
1336 if (++retries < 3) 1349 if (++retries < 3)
1337 goto retry; 1350 goto retry;
@@ -1343,6 +1356,7 @@ retry:
1343 */ 1356 */
1344 cpu_base->nr_hangs++; 1357 cpu_base->nr_hangs++;
1345 cpu_base->hang_detected = 1; 1358 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1346 delta = ktime_sub(now, entry_time); 1360 delta = ktime_sub(now, entry_time);
1347 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1361 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1348 cpu_base->max_hang_time = delta; 1362 cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
1395 1409
1396static void run_hrtimer_softirq(struct softirq_action *h) 1410static void run_hrtimer_softirq(struct softirq_action *h)
1397{ 1411{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1398 hrtimer_peek_ahead_timers(); 1419 hrtimer_peek_ahead_timers();
1399} 1420}
1400 1421
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30int panic_on_oops; 30int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
31static unsigned long tainted_mask; 31static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
108 */ 108 */
109 crash_kexec(NULL); 109 crash_kexec(NULL);
110 110
111 kmsg_dump(KMSG_DUMP_PANIC);
112
113 /* 111 /*
114 * Note smp_send_stop is the usual smp shutdown function, which 112 * Note smp_send_stop is the usual smp shutdown function, which
115 * unfortunately means it may not be hardened to work in a panic 113 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
117 */ 115 */
118 smp_send_stop(); 116 smp_send_stop();
119 117
118 kmsg_dump(KMSG_DUMP_PANIC);
119
120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
121 121
122 bust_spinlocks(0); 122 bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
184 } 184 }
185 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
186 186
187 /* Firstly reap the EXIT_ZOMBIE children we may have. */
187 do { 188 do {
188 clear_thread_flag(TIF_SIGPENDING); 189 clear_thread_flag(TIF_SIGPENDING);
189 rc = sys_wait4(-1, NULL, __WALL, NULL); 190 rc = sys_wait4(-1, NULL, __WALL, NULL);
190 } while (rc != -ECHILD); 191 } while (rc != -ECHILD);
191 192
193 /*
194 * sys_wait4() above can't reap the TASK_DEAD children.
195 * Make sure they all go away, see __unhash_process().
196 */
197 for (;;) {
198 bool need_wait = false;
199
200 read_lock(&tasklist_lock);
201 if (!list_empty(&current->children)) {
202 __set_current_state(TASK_UNINTERRUPTIBLE);
203 need_wait = true;
204 }
205 read_unlock(&tasklist_lock);
206
207 if (!need_wait)
208 break;
209 schedule();
210 }
211
192 if (pid_ns->reboot) 212 if (pid_ns->reboot)
193 current->signal->group_exit_code = pid_ns->reboot; 213 current->signal->group_exit_code = pid_ns->reboot;
194 214
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..177fa49357a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,21 @@ static int console_may_schedule;
193 * separated by ',', and find the message after the ';' character. 193 * separated by ',', and find the message after the ';' character.
194 */ 194 */
195 195
196enum log_flags {
197 LOG_NOCONS = 1, /* already flushed, do not print to console */
198 LOG_NEWLINE = 2, /* text ended with a newline */
199 LOG_PREFIX = 4, /* text started with a prefix */
200 LOG_CONT = 8, /* text is a fragment of a continuation line */
201};
202
196struct log { 203struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */ 204 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */ 205 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */ 206 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */ 207 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */ 208 u8 facility; /* syslog facility */
209 u8 flags:5; /* internal record flags */
210 u8 level:3; /* syslog level */
202}; 211};
203 212
204/* 213/*
@@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
210/* the next printk record to read by syslog(READ) or /proc/kmsg */ 219/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq; 220static u64 syslog_seq;
212static u32 syslog_idx; 221static u32 syslog_idx;
222static enum log_flags syslog_prev;
223static size_t syslog_partial;
213 224
214/* index and sequence number of the first record stored in the buffer */ 225/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq; 226static u64 log_first_seq;
@@ -227,10 +238,10 @@ static u32 clear_idx;
227#define LOG_LINE_MAX 1024 238#define LOG_LINE_MAX 1024
228 239
229/* record buffer */ 240/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4 242#define LOG_ALIGN 4
232#else 243#else
233#define LOG_ALIGN 8 244#define LOG_ALIGN __alignof__(struct log)
234#endif 245#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 246#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 247static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +297,7 @@ static u32 log_next(u32 idx)
286 297
287/* insert record into the buffer, discard old ones, update heads */ 298/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level, 299static void log_store(int facility, int level,
300 enum log_flags flags, u64 ts_nsec,
289 const char *dict, u16 dict_len, 301 const char *dict, u16 dict_len,
290 const char *text, u16 text_len) 302 const char *text, u16 text_len)
291{ 303{
@@ -329,8 +341,13 @@ static void log_store(int facility, int level,
329 msg->text_len = text_len; 341 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len); 342 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len; 343 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7); 344 msg->facility = facility;
333 msg->ts_nsec = local_clock(); 345 msg->level = level & 7;
346 msg->flags = flags & 0x1f;
347 if (ts_nsec > 0)
348 msg->ts_nsec = ts_nsec;
349 else
350 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len); 351 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 352 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336 353
@@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
414 if (!user) 431 if (!user)
415 return -EBADF; 432 return -EBADF;
416 433
417 mutex_lock(&user->lock); 434 ret = mutex_lock_interruptible(&user->lock);
418 raw_spin_lock(&logbuf_lock); 435 if (ret)
436 return ret;
437 raw_spin_lock_irq(&logbuf_lock);
419 while (user->seq == log_next_seq) { 438 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) { 439 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN; 440 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock); 441 raw_spin_unlock_irq(&logbuf_lock);
423 goto out; 442 goto out;
424 } 443 }
425 444
426 raw_spin_unlock(&logbuf_lock); 445 raw_spin_unlock_irq(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait, 446 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq); 447 user->seq != log_next_seq);
429 if (ret) 448 if (ret)
430 goto out; 449 goto out;
431 raw_spin_lock(&logbuf_lock); 450 raw_spin_lock_irq(&logbuf_lock);
432 } 451 }
433 452
434 if (user->seq < log_first_seq) { 453 if (user->seq < log_first_seq) {
@@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
436 user->idx = log_first_idx; 455 user->idx = log_first_idx;
437 user->seq = log_first_seq; 456 user->seq = log_first_seq;
438 ret = -EPIPE; 457 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock); 458 raw_spin_unlock_irq(&logbuf_lock);
440 goto out; 459 goto out;
441 } 460 }
442 461
@@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
444 ts_usec = msg->ts_nsec; 463 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000); 464 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;", 465 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec); 466 (msg->facility << 3) | msg->level, user->seq, ts_usec);
448 467
449 /* escape non-printable characters */ 468 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) { 469 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i]; 470 unsigned char c = log_text(msg)[i];
452 471
453 if (c < ' ' || c >= 128) 472 if (c < ' ' || c >= 127 || c == '\\')
454 len += sprintf(user->buf + len, "\\x%02x", c); 473 len += sprintf(user->buf + len, "\\x%02x", c);
455 else 474 else
456 user->buf[len++] = c; 475 user->buf[len++] = c;
@@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
474 continue; 493 continue;
475 } 494 }
476 495
477 if (c < ' ' || c >= 128) { 496 if (c < ' ' || c >= 127 || c == '\\') {
478 len += sprintf(user->buf + len, "\\x%02x", c); 497 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue; 498 continue;
480 } 499 }
@@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
486 505
487 user->idx = log_next(user->idx); 506 user->idx = log_next(user->idx);
488 user->seq++; 507 user->seq++;
489 raw_spin_unlock(&logbuf_lock); 508 raw_spin_unlock_irq(&logbuf_lock);
490 509
491 if (len > count) { 510 if (len > count) {
492 ret = -EINVAL; 511 ret = -EINVAL;
@@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
513 if (offset) 532 if (offset)
514 return -ESPIPE; 533 return -ESPIPE;
515 534
516 raw_spin_lock(&logbuf_lock); 535 raw_spin_lock_irq(&logbuf_lock);
517 switch (whence) { 536 switch (whence) {
518 case SEEK_SET: 537 case SEEK_SET:
519 /* the first record */ 538 /* the first record */
@@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
537 default: 556 default:
538 ret = -EINVAL; 557 ret = -EINVAL;
539 } 558 }
540 raw_spin_unlock(&logbuf_lock); 559 raw_spin_unlock_irq(&logbuf_lock);
541 return ret; 560 return ret;
542} 561}
543 562
@@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
551 570
552 poll_wait(file, &log_wait, wait); 571 poll_wait(file, &log_wait, wait);
553 572
554 raw_spin_lock(&logbuf_lock); 573 raw_spin_lock_irq(&logbuf_lock);
555 if (user->seq < log_next_seq) { 574 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */ 575 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq) 576 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 577 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
559 ret = POLLIN|POLLRDNORM; 578 ret = POLLIN|POLLRDNORM;
560 } 579 }
561 raw_spin_unlock(&logbuf_lock); 580 raw_spin_unlock_irq(&logbuf_lock);
562 581
563 return ret; 582 return ret;
564} 583}
@@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
582 601
583 mutex_init(&user->lock); 602 mutex_init(&user->lock);
584 603
585 raw_spin_lock(&logbuf_lock); 604 raw_spin_lock_irq(&logbuf_lock);
586 user->idx = log_first_idx; 605 user->idx = log_first_idx;
587 user->seq = log_first_seq; 606 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock); 607 raw_spin_unlock_irq(&logbuf_lock);
589 608
590 file->private_data = user; 609 file->private_data = user;
591 return 0; 610 return 0;
@@ -785,44 +804,64 @@ static bool printk_time;
785#endif 804#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 805module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787 806
807static size_t print_time(u64 ts, char *buf)
808{
809 unsigned long rem_nsec;
810
811 if (!printk_time)
812 return 0;
813
814 if (!buf)
815 return 15;
816
817 rem_nsec = do_div(ts, 1000000000);
818 return sprintf(buf, "[%5lu.%06lu] ",
819 (unsigned long)ts, rem_nsec / 1000);
820}
821
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 822static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{ 823{
790 size_t len = 0; 824 size_t len = 0;
825 unsigned int prefix = (msg->facility << 3) | msg->level;
791 826
792 if (syslog) { 827 if (syslog) {
793 if (buf) { 828 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level); 829 len += sprintf(buf, "<%u>", prefix);
795 } else { 830 } else {
796 len += 3; 831 len += 3;
797 if (msg->level > 9) 832 if (prefix > 999)
798 len++; 833 len += 3;
799 if (msg->level > 99) 834 else if (prefix > 99)
835 len += 2;
836 else if (prefix > 9)
800 len++; 837 len++;
801 } 838 }
802 } 839 }
803 840
804 if (printk_time) { 841 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len; 842 return len;
817} 843}
818 844
819static size_t msg_print_text(const struct log *msg, bool syslog, 845static size_t msg_print_text(const struct log *msg, enum log_flags prev,
820 char *buf, size_t size) 846 bool syslog, char *buf, size_t size)
821{ 847{
822 const char *text = log_text(msg); 848 const char *text = log_text(msg);
823 size_t text_size = msg->text_len; 849 size_t text_size = msg->text_len;
850 bool prefix = true;
851 bool newline = true;
824 size_t len = 0; 852 size_t len = 0;
825 853
854 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
855 prefix = false;
856
857 if (msg->flags & LOG_CONT) {
858 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
859 prefix = false;
860
861 if (!(msg->flags & LOG_NEWLINE))
862 newline = false;
863 }
864
826 do { 865 do {
827 const char *next = memchr(text, '\n', text_size); 866 const char *next = memchr(text, '\n', text_size);
828 size_t text_len; 867 size_t text_len;
@@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
840 text_len + 1>= size - len) 879 text_len + 1>= size - len)
841 break; 880 break;
842 881
843 len += print_prefix(msg, syslog, buf + len); 882 if (prefix)
883 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len); 884 memcpy(buf + len, text, text_len);
845 len += text_len; 885 len += text_len;
846 buf[len++] = '\n'; 886 if (next || newline)
887 buf[len++] = '\n';
847 } else { 888 } else {
848 /* SYSLOG_ACTION_* buffer size only calculation */ 889 /* SYSLOG_ACTION_* buffer size only calculation */
849 len += print_prefix(msg, syslog, NULL); 890 if (prefix)
850 len += text_len + 1; 891 len += print_prefix(msg, syslog, NULL);
892 len += text_len;
893 if (next || newline)
894 len++;
851 } 895 }
852 896
897 prefix = true;
853 text = next; 898 text = next;
854 } while (text); 899 } while (text);
855 900
@@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size)
860{ 905{
861 char *text; 906 char *text;
862 struct log *msg; 907 struct log *msg;
863 int len; 908 int len = 0;
864 909
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text) 911 if (!text)
867 return -ENOMEM; 912 return -ENOMEM;
868 913
869 raw_spin_lock_irq(&logbuf_lock); 914 while (size > 0) {
870 if (syslog_seq < log_first_seq) { 915 size_t n;
871 /* messages are gone, move to first one */ 916 size_t skip;
872 syslog_seq = log_first_seq; 917
873 syslog_idx = log_first_idx; 918 raw_spin_lock_irq(&logbuf_lock);
874 } 919 if (syslog_seq < log_first_seq) {
875 msg = log_from_idx(syslog_idx); 920 /* messages are gone, move to first one */
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX); 921 syslog_seq = log_first_seq;
877 syslog_idx = log_next(syslog_idx); 922 syslog_idx = log_first_idx;
878 syslog_seq++; 923 syslog_prev = 0;
879 raw_spin_unlock_irq(&logbuf_lock); 924 syslog_partial = 0;
925 }
926 if (syslog_seq == log_next_seq) {
927 raw_spin_unlock_irq(&logbuf_lock);
928 break;
929 }
930
931 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
934 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx);
937 syslog_seq++;
938 syslog_prev = msg->flags;
939 n -= syslog_partial;
940 syslog_partial = 0;
941 } else if (!len){
942 /* partial read(), remember position */
943 n = size;
944 syslog_partial += n;
945 } else
946 n = 0;
947 raw_spin_unlock_irq(&logbuf_lock);
948
949 if (!n)
950 break;
951
952 if (copy_to_user(buf, text + skip, n)) {
953 if (!len)
954 len = -EFAULT;
955 break;
956 }
880 957
881 if (len > 0 && copy_to_user(buf, text, len)) 958 len += n;
882 len = -EFAULT; 959 size -= n;
960 buf += n;
961 }
883 962
884 kfree(text); 963 kfree(text);
885 return len; 964 return len;
@@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
899 u64 next_seq; 978 u64 next_seq;
900 u64 seq; 979 u64 seq;
901 u32 idx; 980 u32 idx;
981 enum log_flags prev;
902 982
903 if (clear_seq < log_first_seq) { 983 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */ 984 /* messages are gone, move to first available one */
@@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
909 /* 989 /*
910 * Find first record that fits, including all following records, 990 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump. 991 * into the user-provided buffer for this dump.
912 */ 992 */
913 seq = clear_seq; 993 seq = clear_seq;
914 idx = clear_idx; 994 idx = clear_idx;
995 prev = 0;
915 while (seq < log_next_seq) { 996 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx); 997 struct log *msg = log_from_idx(idx);
917 998
918 len += msg_print_text(msg, true, NULL, 0); 999 len += msg_print_text(msg, prev, true, NULL, 0);
919 idx = log_next(idx); 1000 idx = log_next(idx);
920 seq++; 1001 seq++;
921 } 1002 }
1003
1004 /* move first record forward until length fits into the buffer */
922 seq = clear_seq; 1005 seq = clear_seq;
923 idx = clear_idx; 1006 idx = clear_idx;
1007 prev = 0;
924 while (len > size && seq < log_next_seq) { 1008 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx); 1009 struct log *msg = log_from_idx(idx);
926 1010
927 len -= msg_print_text(msg, true, NULL, 0); 1011 len -= msg_print_text(msg, prev, true, NULL, 0);
928 idx = log_next(idx); 1012 idx = log_next(idx);
929 seq++; 1013 seq++;
930 } 1014 }
931 1015
932 /* last message in this dump */ 1016 /* last message fitting into this dump */
933 next_seq = log_next_seq; 1017 next_seq = log_next_seq;
934 1018
935 len = 0; 1019 len = 0;
1020 prev = 0;
936 while (len >= 0 && seq < next_seq) { 1021 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx); 1022 struct log *msg = log_from_idx(idx);
938 int textlen; 1023 int textlen;
939 1024
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); 1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
941 if (textlen < 0) { 1026 if (textlen < 0) {
942 len = textlen; 1027 len = textlen;
943 break; 1028 break;
944 } 1029 }
945 idx = log_next(idx); 1030 idx = log_next(idx);
946 seq++; 1031 seq++;
1032 prev = msg->flags;
947 1033
948 raw_spin_unlock_irq(&logbuf_lock); 1034 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen)) 1035 if (copy_to_user(buf + len, text, textlen))
@@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
956 /* messages are gone, move to next one */ 1042 /* messages are gone, move to next one */
957 seq = log_first_seq; 1043 seq = log_first_seq;
958 idx = log_first_idx; 1044 idx = log_first_idx;
1045 prev = 0;
959 } 1046 }
960 } 1047 }
961 } 1048 }
@@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1027 /* Clear ring buffer */ 1114 /* Clear ring buffer */
1028 case SYSLOG_ACTION_CLEAR: 1115 case SYSLOG_ACTION_CLEAR:
1029 syslog_print_all(NULL, 0, true); 1116 syslog_print_all(NULL, 0, true);
1117 break;
1030 /* Disable logging to console */ 1118 /* Disable logging to console */
1031 case SYSLOG_ACTION_CONSOLE_OFF: 1119 case SYSLOG_ACTION_CONSOLE_OFF:
1032 if (saved_console_loglevel == -1) 1120 if (saved_console_loglevel == -1)
@@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1059 /* messages are gone, move to first one */ 1147 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq; 1148 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx; 1149 syslog_idx = log_first_idx;
1150 syslog_prev = 0;
1151 syslog_partial = 0;
1062 } 1152 }
1063 if (from_file) { 1153 if (from_file) {
1064 /* 1154 /*
@@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1068 */ 1158 */
1069 error = log_next_idx - syslog_idx; 1159 error = log_next_idx - syslog_idx;
1070 } else { 1160 } else {
1071 u64 seq; 1161 u64 seq = syslog_seq;
1072 u32 idx; 1162 u32 idx = syslog_idx;
1163 enum log_flags prev = syslog_prev;
1073 1164
1074 error = 0; 1165 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) { 1166 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx); 1167 struct log *msg = log_from_idx(idx);
1079 1168
1080 error += msg_print_text(msg, true, NULL, 0); 1169 error += msg_print_text(msg, prev, true, NULL, 0);
1081 idx = log_next(idx); 1170 idx = log_next(idx);
1082 seq++; 1171 seq++;
1172 prev = msg->flags;
1083 } 1173 }
1174 error -= syslog_partial;
1084 } 1175 }
1085 raw_spin_unlock_irq(&logbuf_lock); 1176 raw_spin_unlock_irq(&logbuf_lock);
1086 break; 1177 break;
@@ -1259,22 +1350,98 @@ static inline void printk_delay(void)
1259 } 1350 }
1260} 1351}
1261 1352
1353/*
1354 * Continuation lines are buffered, and not committed to the record buffer
1355 * until the line is complete, or a race forces it. The line fragments
1356 * though, are printed immediately to the consoles to ensure everything has
1357 * reached the console in case of a kernel crash.
1358 */
1359static struct cont {
1360 char buf[LOG_LINE_MAX];
1361 size_t len; /* length == 0 means unused buffer */
1362 size_t cons; /* bytes written to console */
1363 struct task_struct *owner; /* task of first print*/
1364 u64 ts_nsec; /* time of first print */
1365 u8 level; /* log level of first message */
1366 u8 facility; /* log level of first message */
1367 bool flushed:1; /* buffer sealed and committed */
1368} cont;
1369
1370static void cont_flush(void)
1371{
1372 if (cont.flushed)
1373 return;
1374 if (cont.len == 0)
1375 return;
1376
1377 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
1378 NULL, 0, cont.buf, cont.len);
1379
1380 cont.flushed = true;
1381}
1382
1383static bool cont_add(int facility, int level, const char *text, size_t len)
1384{
1385 if (cont.len && cont.flushed)
1386 return false;
1387
1388 if (cont.len + len > sizeof(cont.buf)) {
1389 cont_flush();
1390 return false;
1391 }
1392
1393 if (!cont.len) {
1394 cont.facility = facility;
1395 cont.level = level;
1396 cont.owner = current;
1397 cont.ts_nsec = local_clock();
1398 cont.cons = 0;
1399 cont.flushed = false;
1400 }
1401
1402 memcpy(cont.buf + cont.len, text, len);
1403 cont.len += len;
1404 return true;
1405}
1406
1407static size_t cont_print_text(char *text, size_t size)
1408{
1409 size_t textlen = 0;
1410 size_t len;
1411
1412 if (cont.cons == 0) {
1413 textlen += print_time(cont.ts_nsec, text);
1414 size -= textlen;
1415 }
1416
1417 len = cont.len - cont.cons;
1418 if (len > 0) {
1419 if (len+1 > size)
1420 len = size-1;
1421 memcpy(text + textlen, cont.buf + cont.cons, len);
1422 textlen += len;
1423 cont.cons = cont.len;
1424 }
1425
1426 if (cont.flushed) {
1427 text[textlen++] = '\n';
1428 /* got everything, release buffer */
1429 cont.len = 0;
1430 }
1431 return textlen;
1432}
1433
1262asmlinkage int vprintk_emit(int facility, int level, 1434asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen, 1435 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args) 1436 const char *fmt, va_list args)
1265{ 1437{
1266 static int recursion_bug; 1438 static int recursion_bug;
1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX]; 1439 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf; 1440 char *text = textbuf;
1273 size_t text_len; 1441 size_t text_len;
1442 enum log_flags lflags = 0;
1274 unsigned long flags; 1443 unsigned long flags;
1275 int this_cpu; 1444 int this_cpu;
1276 bool newline = false;
1277 bool prefix = false;
1278 int printed_len = 0; 1445 int printed_len = 0;
1279 1446
1280 boot_delay_msec(); 1447 boot_delay_msec();
@@ -1313,7 +1480,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1313 recursion_bug = 0; 1480 recursion_bug = 0;
1314 printed_len += strlen(recursion_msg); 1481 printed_len += strlen(recursion_msg);
1315 /* emit KERN_CRIT message */ 1482 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len); 1483 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1484 NULL, 0, recursion_msg, printed_len);
1317 } 1485 }
1318 1486
1319 /* 1487 /*
@@ -1325,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1325 /* mark and strip a trailing newline */ 1493 /* mark and strip a trailing newline */
1326 if (text_len && text[text_len-1] == '\n') { 1494 if (text_len && text[text_len-1] == '\n') {
1327 text_len--; 1495 text_len--;
1328 newline = true; 1496 lflags |= LOG_NEWLINE;
1329 } 1497 }
1330 1498
1331 /* strip syslog prefix and extract log level or control flags */ 1499 /* strip syslog prefix and extract log level or control flags */
@@ -1335,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1335 if (level == -1) 1503 if (level == -1)
1336 level = text[1] - '0'; 1504 level = text[1] - '0';
1337 case 'd': /* KERN_DEFAULT */ 1505 case 'd': /* KERN_DEFAULT */
1338 prefix = true; 1506 lflags |= LOG_PREFIX;
1339 case 'c': /* KERN_CONT */ 1507 case 'c': /* KERN_CONT */
1340 text += 3; 1508 text += 3;
1341 text_len -= 3; 1509 text_len -= 3;
@@ -1345,61 +1513,41 @@ asmlinkage int vprintk_emit(int facility, int level,
1345 if (level == -1) 1513 if (level == -1)
1346 level = default_message_loglevel; 1514 level = default_message_loglevel;
1347 1515
1348 if (dict) { 1516 if (dict)
1349 prefix = true; 1517 lflags |= LOG_PREFIX|LOG_NEWLINE;
1350 newline = true;
1351 }
1352 1518
1353 if (!newline) { 1519 if (!(lflags & LOG_NEWLINE)) {
1354 if (cont_len && (prefix || cont_task != current)) { 1520 /*
1355 /* 1521 * Flush the conflicting buffer. An earlier newline was missing,
1356 * Flush earlier buffer, which is either from a 1522 * or another task also prints continuation lines.
1357 * different thread, or when we got a new prefix. 1523 */
1358 */ 1524 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); 1525 cont_flush();
1360 cont_len = 0;
1361 }
1362
1363 if (!cont_len) {
1364 cont_level = level;
1365 cont_task = current;
1366 }
1367 1526
1368 /* buffer or append to earlier buffer from the same thread */ 1527 /* buffer line if possible, otherwise store it right away */
1369 if (cont_len + text_len > sizeof(cont_buf)) 1528 if (!cont_add(facility, level, text, text_len))
1370 text_len = sizeof(cont_buf) - cont_len; 1529 log_store(facility, level, lflags | LOG_CONT, 0,
1371 memcpy(cont_buf + cont_len, text, text_len); 1530 dict, dictlen, text, text_len);
1372 cont_len += text_len;
1373 } else { 1531 } else {
1374 if (cont_len && cont_task == current) { 1532 bool stored = false;
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385 1533
1386 /* append to the earlier buffer and flush */ 1534 /*
1387 if (cont_len + text_len > sizeof(cont_buf)) 1535 * If an earlier newline was missing and it was the same task,
1388 text_len = sizeof(cont_buf) - cont_len; 1536 * either merge it with the current buffer and flush, or if
1389 memcpy(cont_buf + cont_len, text, text_len); 1537 * there was a race with interrupts (prefix == true) then just
1390 cont_len += text_len; 1538 * flush it out and store this line separately.
1391 log_store(facility, cont_level, 1539 */
1392 NULL, 0, cont_buf, cont_len); 1540 if (cont.len && cont.owner == current) {
1393 cont_len = 0; 1541 if (!(lflags & LOG_PREFIX))
1394 cont_task = NULL; 1542 stored = cont_add(facility, level, text, text_len);
1395 printed_len = cont_len; 1543 cont_flush();
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 } 1544 }
1545
1546 if (!stored)
1547 log_store(facility, level, lflags, 0,
1548 dict, dictlen, text, text_len);
1402 } 1549 }
1550 printed_len += text_len;
1403 1551
1404 /* 1552 /*
1405 * Try to acquire and then immediately release the console semaphore. 1553 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1634,18 @@ EXPORT_SYMBOL(printk);
1486#else 1634#else
1487 1635
1488#define LOG_LINE_MAX 0 1636#define LOG_LINE_MAX 0
1637static struct cont {
1638 size_t len;
1639 size_t cons;
1640 u8 level;
1641 bool flushed:1;
1642} cont;
1489static struct log *log_from_idx(u32 idx) { return NULL; } 1643static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; } 1644static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {} 1645static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog, 1646static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1493 char *buf, size_t size) { return 0; } 1647 bool syslog, char *buf, size_t size) { return 0; }
1648static size_t cont_print_text(char *text, size_t size) { return 0; }
1494 1649
1495#endif /* CONFIG_PRINTK */ 1650#endif /* CONFIG_PRINTK */
1496 1651
@@ -1765,6 +1920,7 @@ void wake_up_klogd(void)
1765/* the next printk record to write to the console */ 1920/* the next printk record to write to the console */
1766static u64 console_seq; 1921static u64 console_seq;
1767static u32 console_idx; 1922static u32 console_idx;
1923static enum log_flags console_prev;
1768 1924
1769/** 1925/**
1770 * console_unlock - unlock the console system 1926 * console_unlock - unlock the console system
@@ -1782,6 +1938,7 @@ static u32 console_idx;
1782 */ 1938 */
1783void console_unlock(void) 1939void console_unlock(void)
1784{ 1940{
1941 static char text[LOG_LINE_MAX];
1785 static u64 seen_seq; 1942 static u64 seen_seq;
1786 unsigned long flags; 1943 unsigned long flags;
1787 bool wake_klogd = false; 1944 bool wake_klogd = false;
@@ -1794,10 +1951,23 @@ void console_unlock(void)
1794 1951
1795 console_may_schedule = 0; 1952 console_may_schedule = 0;
1796 1953
1954 /* flush buffered message fragment immediately to console */
1955 raw_spin_lock_irqsave(&logbuf_lock, flags);
1956 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1957 size_t len;
1958
1959 len = cont_print_text(text, sizeof(text));
1960 raw_spin_unlock(&logbuf_lock);
1961 stop_critical_timings();
1962 call_console_drivers(cont.level, text, len);
1963 start_critical_timings();
1964 local_irq_restore(flags);
1965 } else
1966 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1967
1797again: 1968again:
1798 for (;;) { 1969 for (;;) {
1799 struct log *msg; 1970 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len; 1971 size_t len;
1802 int level; 1972 int level;
1803 1973
@@ -1811,18 +1981,35 @@ again:
1811 /* messages are gone, move to first one */ 1981 /* messages are gone, move to first one */
1812 console_seq = log_first_seq; 1982 console_seq = log_first_seq;
1813 console_idx = log_first_idx; 1983 console_idx = log_first_idx;
1984 console_prev = 0;
1814 } 1985 }
1815 1986skip:
1816 if (console_seq == log_next_seq) 1987 if (console_seq == log_next_seq)
1817 break; 1988 break;
1818 1989
1819 msg = log_from_idx(console_idx); 1990 msg = log_from_idx(console_idx);
1820 level = msg->level & 7; 1991 if (msg->flags & LOG_NOCONS) {
1821 1992 /*
1822 len = msg_print_text(msg, false, text, sizeof(text)); 1993 * Skip record we have buffered and already printed
1994 * directly to the console when we received it.
1995 */
1996 console_idx = log_next(console_idx);
1997 console_seq++;
1998 /*
1999 * We will get here again when we register a new
2000 * CON_PRINTBUFFER console. Clear the flag so we
2001 * will properly dump everything later.
2002 */
2003 msg->flags &= ~LOG_NOCONS;
2004 goto skip;
2005 }
1823 2006
2007 level = msg->level;
2008 len = msg_print_text(msg, console_prev, false,
2009 text, sizeof(text));
1824 console_idx = log_next(console_idx); 2010 console_idx = log_next(console_idx);
1825 console_seq++; 2011 console_seq++;
2012 console_prev = msg->flags;
1826 raw_spin_unlock(&logbuf_lock); 2013 raw_spin_unlock(&logbuf_lock);
1827 2014
1828 stop_critical_timings(); /* don't trace print latency */ 2015 stop_critical_timings(); /* don't trace print latency */
@@ -2085,6 +2272,7 @@ void register_console(struct console *newcon)
2085 raw_spin_lock_irqsave(&logbuf_lock, flags); 2272 raw_spin_lock_irqsave(&logbuf_lock, flags);
2086 console_seq = syslog_seq; 2273 console_seq = syslog_seq;
2087 console_idx = syslog_idx; 2274 console_idx = syslog_idx;
2275 console_prev = syslog_prev;
2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2276 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2089 /* 2277 /*
2090 * We're about to replay the log buffer. Only do this to the 2278 * We're about to replay the log buffer. Only do this to the
@@ -2300,48 +2488,214 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2300 * kmsg_dump - dump kernel log to kernel message dumpers. 2488 * kmsg_dump - dump kernel log to kernel message dumpers.
2301 * @reason: the reason (oops, panic etc) for dumping 2489 * @reason: the reason (oops, panic etc) for dumping
2302 * 2490 *
2303 * Iterate through each of the dump devices and call the oops/panic 2491 * Call each of the registered dumper's dump() callback, which can
2304 * callbacks with the log buffer. 2492 * retrieve the kmsg records with kmsg_dump_get_line() or
2493 * kmsg_dump_get_buffer().
2305 */ 2494 */
2306void kmsg_dump(enum kmsg_dump_reason reason) 2495void kmsg_dump(enum kmsg_dump_reason reason)
2307{ 2496{
2308 u64 idx;
2309 struct kmsg_dumper *dumper; 2497 struct kmsg_dumper *dumper;
2310 const char *s1, *s2;
2311 unsigned long l1, l2;
2312 unsigned long flags; 2498 unsigned long flags;
2313 2499
2314 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 2500 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
2315 return; 2501 return;
2316 2502
2317 /* Theoretically, the log could move on after we do this, but 2503 rcu_read_lock();
2318 there's not a lot we can do about that. The new messages 2504 list_for_each_entry_rcu(dumper, &dump_list, list) {
2319 will overwrite the start of what we dump. */ 2505 if (dumper->max_reason && reason > dumper->max_reason)
2506 continue;
2507
2508 /* initialize iterator with data about the stored records */
2509 dumper->active = true;
2510
2511 raw_spin_lock_irqsave(&logbuf_lock, flags);
2512 dumper->cur_seq = clear_seq;
2513 dumper->cur_idx = clear_idx;
2514 dumper->next_seq = log_next_seq;
2515 dumper->next_idx = log_next_idx;
2516 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2517
2518 /* invoke dumper which will iterate over records */
2519 dumper->dump(dumper, reason);
2520
2521 /* reset iterator */
2522 dumper->active = false;
2523 }
2524 rcu_read_unlock();
2525}
2526
2527/**
2528 * kmsg_dump_get_line - retrieve one kmsg log line
2529 * @dumper: registered kmsg dumper
2530 * @syslog: include the "<4>" prefixes
2531 * @line: buffer to copy the line to
2532 * @size: maximum size of the buffer
2533 * @len: length of line placed into buffer
2534 *
2535 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2536 * record, and copy one record into the provided buffer.
2537 *
2538 * Consecutive calls will return the next available record moving
2539 * towards the end of the buffer with the youngest messages.
2540 *
2541 * A return value of FALSE indicates that there are no more records to
2542 * read.
2543 */
2544bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2545 char *line, size_t size, size_t *len)
2546{
2547 unsigned long flags;
2548 struct log *msg;
2549 size_t l = 0;
2550 bool ret = false;
2551
2552 if (!dumper->active)
2553 goto out;
2320 2554
2321 raw_spin_lock_irqsave(&logbuf_lock, flags); 2555 raw_spin_lock_irqsave(&logbuf_lock, flags);
2322 if (syslog_seq < log_first_seq) 2556 if (dumper->cur_seq < log_first_seq) {
2323 idx = syslog_idx; 2557 /* messages are gone, move to first available one */
2324 else 2558 dumper->cur_seq = log_first_seq;
2325 idx = log_first_idx; 2559 dumper->cur_idx = log_first_idx;
2560 }
2326 2561
2327 if (idx > log_next_idx) { 2562 /* last entry */
2328 s1 = log_buf; 2563 if (dumper->cur_seq >= log_next_seq) {
2329 l1 = log_next_idx; 2564 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2565 goto out;
2566 }
2330 2567
2331 s2 = log_buf + idx; 2568 msg = log_from_idx(dumper->cur_idx);
2332 l2 = log_buf_len - idx; 2569 l = msg_print_text(msg, 0, syslog, line, size);
2333 } else { 2570
2334 s1 = ""; 2571 dumper->cur_idx = log_next(dumper->cur_idx);
2335 l1 = 0; 2572 dumper->cur_seq++;
2573 ret = true;
2574 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2575out:
2576 if (len)
2577 *len = l;
2578 return ret;
2579}
2580EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2581
2582/**
2583 * kmsg_dump_get_buffer - copy kmsg log lines
2584 * @dumper: registered kmsg dumper
2585 * @syslog: include the "<4>" prefixes
2586 * @buf: buffer to copy the line to
2587 * @size: maximum size of the buffer
2588 * @len: length of line placed into buffer
2589 *
2590 * Start at the end of the kmsg buffer and fill the provided buffer
2591 * with as many of the the *youngest* kmsg records that fit into it.
2592 * If the buffer is large enough, all available kmsg records will be
2593 * copied with a single call.
2594 *
2595 * Consecutive calls will fill the buffer with the next block of
2596 * available older records, not including the earlier retrieved ones.
2597 *
2598 * A return value of FALSE indicates that there are no more records to
2599 * read.
2600 */
2601bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2602 char *buf, size_t size, size_t *len)
2603{
2604 unsigned long flags;
2605 u64 seq;
2606 u32 idx;
2607 u64 next_seq;
2608 u32 next_idx;
2609 enum log_flags prev;
2610 size_t l = 0;
2611 bool ret = false;
2612
2613 if (!dumper->active)
2614 goto out;
2615
2616 raw_spin_lock_irqsave(&logbuf_lock, flags);
2617 if (dumper->cur_seq < log_first_seq) {
2618 /* messages are gone, move to first available one */
2619 dumper->cur_seq = log_first_seq;
2620 dumper->cur_idx = log_first_idx;
2621 }
2622
2623 /* last entry */
2624 if (dumper->cur_seq >= dumper->next_seq) {
2625 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2626 goto out;
2627 }
2628
2629 /* calculate length of entire buffer */
2630 seq = dumper->cur_seq;
2631 idx = dumper->cur_idx;
2632 prev = 0;
2633 while (seq < dumper->next_seq) {
2634 struct log *msg = log_from_idx(idx);
2635
2636 l += msg_print_text(msg, prev, true, NULL, 0);
2637 idx = log_next(idx);
2638 seq++;
2639 prev = msg->flags;
2640 }
2336 2641
2337 s2 = log_buf + idx; 2642 /* move first record forward until length fits into the buffer */
2338 l2 = log_next_idx - idx; 2643 seq = dumper->cur_seq;
2644 idx = dumper->cur_idx;
2645 prev = 0;
2646 while (l > size && seq < dumper->next_seq) {
2647 struct log *msg = log_from_idx(idx);
2648
2649 l -= msg_print_text(msg, prev, true, NULL, 0);
2650 idx = log_next(idx);
2651 seq++;
2652 prev = msg->flags;
2339 } 2653 }
2654
2655 /* last message in next interation */
2656 next_seq = seq;
2657 next_idx = idx;
2658
2659 l = 0;
2660 prev = 0;
2661 while (seq < dumper->next_seq) {
2662 struct log *msg = log_from_idx(idx);
2663
2664 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2665 idx = log_next(idx);
2666 seq++;
2667 prev = msg->flags;
2668 }
2669
2670 dumper->next_seq = next_seq;
2671 dumper->next_idx = next_idx;
2672 ret = true;
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2673 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2674out:
2675 if (len)
2676 *len = l;
2677 return ret;
2678}
2679EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2341 2680
2342 rcu_read_lock(); 2681/**
2343 list_for_each_entry_rcu(dumper, &dump_list, list) 2682 * kmsg_dump_rewind - reset the interator
2344 dumper->dump(dumper, reason, s1, l1, s2, l2); 2683 * @dumper: registered kmsg dumper
2345 rcu_read_unlock(); 2684 *
2685 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2686 * kmsg_dump_get_buffer() can be called again and used multiple
2687 * times within the same dumper.dump() callback.
2688 */
2689void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2690{
2691 unsigned long flags;
2692
2693 raw_spin_lock_irqsave(&logbuf_lock, flags);
2694 dumper->cur_seq = clear_seq;
2695 dumper->cur_idx = clear_idx;
2696 dumper->next_seq = log_next_seq;
2697 dumper->next_idx = log_next_idx;
2698 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2346} 2699}
2700EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2347#endif 2701#endif
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d0..4b97bba7396e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu)
201{ 201{
202 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
203 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
204 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 205 trace_rcu_utilization("End context switch");
205} 206}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 207EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1397,6 +1398,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1397 rdp->qlen_lazy += rsp->qlen_lazy; 1398 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen; 1399 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen; 1400 rdp->n_cbs_adopted += rsp->qlen;
1401 if (rsp->qlen_lazy != rsp->qlen)
1402 rcu_idle_count_callbacks_posted();
1400 rsp->qlen_lazy = 0; 1403 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0; 1404 rsp->qlen = 0;
1402 1405
@@ -1528,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1528{ 1531{
1529 unsigned long flags; 1532 unsigned long flags;
1530 struct rcu_head *next, *list, **tail; 1533 struct rcu_head *next, *list, **tail;
1531 int bl, count, count_lazy; 1534 int bl, count, count_lazy, i;
1532 1535
1533 /* If no callbacks are ready, just return.*/ 1536 /* If no callbacks are ready, just return.*/
1534 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1537 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1551,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1551 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1554 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1552 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1555 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1553 tail = rdp->nxttail[RCU_DONE_TAIL]; 1556 tail = rdp->nxttail[RCU_DONE_TAIL];
1554 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1557 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
1555 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1558 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1556 rdp->nxttail[count] = &rdp->nxtlist; 1559 rdp->nxttail[i] = &rdp->nxtlist;
1557 local_irq_restore(flags); 1560 local_irq_restore(flags);
1558 1561
1559 /* Invoke callbacks. */ 1562 /* Invoke callbacks. */
@@ -1581,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1581 if (list != NULL) { 1584 if (list != NULL) {
1582 *tail = rdp->nxtlist; 1585 *tail = rdp->nxtlist;
1583 rdp->nxtlist = list; 1586 rdp->nxtlist = list;
1584 for (count = 0; count < RCU_NEXT_SIZE; count++) 1587 for (i = 0; i < RCU_NEXT_SIZE; i++)
1585 if (&rdp->nxtlist == rdp->nxttail[count]) 1588 if (&rdp->nxtlist == rdp->nxttail[i])
1586 rdp->nxttail[count] = tail; 1589 rdp->nxttail[i] = tail;
1587 else 1590 else
1588 break; 1591 break;
1589 } 1592 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf5..19b61ac1079f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
84 /* Process level is worth LLONG_MAX/2. */ 84 /* Process level is worth LLONG_MAX/2. */
85 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 85 int dynticks_nmi_nesting; /* Track NMI nesting level. */
86 atomic_t dynticks; /* Even value for idle, else odd. */ 86 atomic_t dynticks; /* Even value for idle, else odd. */
87#ifdef CONFIG_RCU_FAST_NO_HZ
88 int dyntick_drain; /* Prepare-for-idle state variable. */
89 unsigned long dyntick_holdoff;
90 /* No retries for the jiffy of failure. */
91 struct timer_list idle_gp_timer;
92 /* Wake up CPU sleeping with callbacks. */
93 unsigned long idle_gp_timer_expires;
94 /* When to wake up CPU (for repost). */
95 bool idle_first_pass; /* First pass of attempt to go idle? */
96 unsigned long nonlazy_posted;
97 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
87}; 101};
88 102
89/* RCU's kthread states for tracing. */ 103/* RCU's kthread states for tracing. */
@@ -430,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
430/* Forward declarations for rcutree_plugin.h */ 444/* Forward declarations for rcutree_plugin.h */
431static void rcu_bootup_announce(void); 445static void rcu_bootup_announce(void);
432long rcu_batches_completed(void); 446long rcu_batches_completed(void);
447static void rcu_preempt_note_context_switch(int cpu);
433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 448static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
434#ifdef CONFIG_HOTPLUG_CPU 449#ifdef CONFIG_HOTPLUG_CPU
435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 450static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d9869..3e4899459f3d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156void rcu_preempt_note_context_switch(void) 156static void rcu_preempt_note_context_switch(int cpu)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = __this_cpu_ptr(rcu_preempt_state.rda); 167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(smp_processor_id()); 231 rcu_preempt_qs(cpu);
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void)
1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1003 1003
1004/* 1004/*
1005 * Because preemptible RCU does not exist, we never have to check for
1006 * CPUs being in quiescent states.
1007 */
1008static void rcu_preempt_note_context_switch(int cpu)
1009{
1010}
1011
1012/*
1005 * Because preemptible RCU does not exist, there are never any preempted 1013 * Because preemptible RCU does not exist, there are never any preempted
1006 * RCU readers. 1014 * RCU readers.
1007 */ 1015 */
@@ -1886,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1886 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs 1894 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1887 * any flavor of RCU. 1895 * any flavor of RCU.
1888 */ 1896 */
1889int rcu_needs_cpu(int cpu) 1897int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1890{ 1898{
1899 *delta_jiffies = ULONG_MAX;
1891 return rcu_cpu_has_callbacks(cpu); 1900 return rcu_cpu_has_callbacks(cpu);
1892} 1901}
1893 1902
@@ -1962,41 +1971,6 @@ static void rcu_idle_count_callbacks_posted(void)
1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1971#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1972#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1964 1973
1965/* Loop counter for rcu_prepare_for_idle(). */
1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1979
1980/*
1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1982 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1983 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1984 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1985 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1986 * it is better to incur scheduling-clock interrupts than to spin
1987 * continuously for the same time duration!
1988 */
1989int rcu_needs_cpu(int cpu)
1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1993 /* If no callbacks, RCU doesn't need the CPU. */
1994 if (!rcu_cpu_has_callbacks(cpu))
1995 return 0;
1996 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
1997 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
1998}
1999
2000/* 1974/*
2001 * Does the specified flavor of RCU have non-lazy callbacks pending on 1975 * Does the specified flavor of RCU have non-lazy callbacks pending on
2002 * the specified CPU? Both RCU flavor and CPU are specified by the 1976 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2040,6 +2014,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2040} 2014}
2041 2015
2042/* 2016/*
2017 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
2018 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
2019 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
2020 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2021 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2022 * it is better to incur scheduling-clock interrupts than to spin
2023 * continuously for the same time duration!
2024 *
2025 * The delta_jiffies argument is used to store the time when RCU is
2026 * going to need the CPU again if it still has callbacks. The reason
2027 * for this is that rcu_prepare_for_idle() might need to post a timer,
2028 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
2029 * the wakeup time for this CPU. This means that RCU's timer can be
2030 * delayed until the wakeup time, which defeats the purpose of posting
2031 * a timer.
2032 */
2033int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2034{
2035 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2036
2037 /* Flag a new idle sojourn to the idle-entry state machine. */
2038 rdtp->idle_first_pass = 1;
2039 /* If no callbacks, RCU doesn't need the CPU. */
2040 if (!rcu_cpu_has_callbacks(cpu)) {
2041 *delta_jiffies = ULONG_MAX;
2042 return 0;
2043 }
2044 if (rdtp->dyntick_holdoff == jiffies) {
2045 /* RCU recently tried and failed, so don't try again. */
2046 *delta_jiffies = 1;
2047 return 1;
2048 }
2049 /* Set up for the possibility that RCU will post a timer. */
2050 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2051 *delta_jiffies = RCU_IDLE_GP_DELAY;
2052 else
2053 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
2054 return 0;
2055}
2056
2057/*
2043 * Handler for smp_call_function_single(). The only point of this 2058 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing. 2059 * handler is to wake the CPU up, so the handler does only tracing.
2045 */ 2060 */
@@ -2075,21 +2090,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2075 */ 2090 */
2076static void rcu_prepare_for_idle_init(int cpu) 2091static void rcu_prepare_for_idle_init(int cpu)
2077{ 2092{
2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2093 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), 2094
2080 rcu_idle_gp_timer_func, cpu); 2095 rdtp->dyntick_holdoff = jiffies - 1;
2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; 2096 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
2082 per_cpu(rcu_idle_first_pass, cpu) = 1; 2097 rdtp->idle_gp_timer_expires = jiffies - 1;
2098 rdtp->idle_first_pass = 1;
2083} 2099}
2084 2100
2085/* 2101/*
2086 * Clean up for exit from idle. Because we are exiting from idle, there 2102 * Clean up for exit from idle. Because we are exiting from idle, there
2087 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will 2103 * is no longer any point to ->idle_gp_timer, so cancel it. This will
2088 * do nothing if this timer is not active, so just cancel it unconditionally. 2104 * do nothing if this timer is not active, so just cancel it unconditionally.
2089 */ 2105 */
2090static void rcu_cleanup_after_idle(int cpu) 2106static void rcu_cleanup_after_idle(int cpu)
2091{ 2107{
2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); 2108 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2109
2110 del_timer(&rdtp->idle_gp_timer);
2093 trace_rcu_prep_idle("Cleanup after idle"); 2111 trace_rcu_prep_idle("Cleanup after idle");
2094} 2112}
2095 2113
@@ -2108,42 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu)
2108 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2126 * Because it is not legal to invoke rcu_process_callbacks() with irqs
2109 * disabled, we do one pass of force_quiescent_state(), then do a 2127 * disabled, we do one pass of force_quiescent_state(), then do a
2110 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2128 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
2111 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2129 * later. The ->dyntick_drain field controls the sequencing.
2112 * 2130 *
2113 * The caller must have disabled interrupts. 2131 * The caller must have disabled interrupts.
2114 */ 2132 */
2115static void rcu_prepare_for_idle(int cpu) 2133static void rcu_prepare_for_idle(int cpu)
2116{ 2134{
2117 struct timer_list *tp; 2135 struct timer_list *tp;
2136 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2118 2137
2119 /* 2138 /*
2120 * If this is an idle re-entry, for example, due to use of 2139 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 2140 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the 2141 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks. 2142 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks 2143 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
2125 * pending. 2144 * pending.
2126 */ 2145 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) && 2146 if (!rdtp->idle_first_pass &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) == 2147 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) { 2148 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2149 tp = &rdtp->idle_gp_timer;
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2150 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2133 } 2151 }
2134 return; 2152 return;
2135 } 2153 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0; 2154 rdtp->idle_first_pass = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2155 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139 2156
2140 /* 2157 /*
2141 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2158 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2142 * Also reset state to avoid prejudicing later attempts. 2159 * Also reset state to avoid prejudicing later attempts.
2143 */ 2160 */
2144 if (!rcu_cpu_has_callbacks(cpu)) { 2161 if (!rcu_cpu_has_callbacks(cpu)) {
2145 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2162 rdtp->dyntick_holdoff = jiffies - 1;
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2163 rdtp->dyntick_drain = 0;
2147 trace_rcu_prep_idle("No callbacks"); 2164 trace_rcu_prep_idle("No callbacks");
2148 return; 2165 return;
2149 } 2166 }
@@ -2152,36 +2169,37 @@ static void rcu_prepare_for_idle(int cpu)
2152 * If in holdoff mode, just return. We will presumably have 2169 * If in holdoff mode, just return. We will presumably have
2153 * refrained from disabling the scheduling-clock tick. 2170 * refrained from disabling the scheduling-clock tick.
2154 */ 2171 */
2155 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2172 if (rdtp->dyntick_holdoff == jiffies) {
2156 trace_rcu_prep_idle("In holdoff"); 2173 trace_rcu_prep_idle("In holdoff");
2157 return; 2174 return;
2158 } 2175 }
2159 2176
2160 /* Check and update the rcu_dyntick_drain sequencing. */ 2177 /* Check and update the ->dyntick_drain sequencing. */
2161 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2178 if (rdtp->dyntick_drain <= 0) {
2162 /* First time through, initialize the counter. */ 2179 /* First time through, initialize the counter. */
2163 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2180 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
2164 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2181 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
2165 !rcu_pending(cpu) && 2182 !rcu_pending(cpu) &&
2166 !local_softirq_pending()) { 2183 !local_softirq_pending()) {
2167 /* Can we go dyntick-idle despite still having callbacks? */ 2184 /* Can we go dyntick-idle despite still having callbacks? */
2168 trace_rcu_prep_idle("Dyntick with callbacks"); 2185 rdtp->dyntick_drain = 0;
2169 per_cpu(rcu_dyntick_drain, cpu) = 0; 2186 rdtp->dyntick_holdoff = jiffies;
2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2187 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2171 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2188 trace_rcu_prep_idle("Dyntick with callbacks");
2172 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2189 rdtp->idle_gp_timer_expires =
2173 jiffies + RCU_IDLE_GP_DELAY; 2190 jiffies + RCU_IDLE_GP_DELAY;
2174 else 2191 } else {
2175 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2192 rdtp->idle_gp_timer_expires =
2176 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2193 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2194 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2195 }
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2196 tp = &rdtp->idle_gp_timer;
2180 per_cpu(rcu_nonlazy_posted, cpu); 2197 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2198 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
2181 return; /* Nothing more to do immediately. */ 2199 return; /* Nothing more to do immediately. */
2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2200 } else if (--(rdtp->dyntick_drain) <= 0) {
2183 /* We have hit the limit, so time to give up. */ 2201 /* We have hit the limit, so time to give up. */
2184 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2202 rdtp->dyntick_holdoff = jiffies;
2185 trace_rcu_prep_idle("Begin holdoff"); 2203 trace_rcu_prep_idle("Begin holdoff");
2186 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2204 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2187 return; 2205 return;
@@ -2227,7 +2245,7 @@ static void rcu_prepare_for_idle(int cpu)
2227 */ 2245 */
2228static void rcu_idle_count_callbacks_posted(void) 2246static void rcu_idle_count_callbacks_posted(void)
2229{ 2247{
2230 __this_cpu_add(rcu_nonlazy_posted, 1); 2248 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2231} 2249}
2232 2250
2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2251#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
@@ -2238,11 +2256,12 @@ static void rcu_idle_count_callbacks_posted(void)
2238 2256
2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2257static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2240{ 2258{
2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); 2259 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2260 struct timer_list *tltp = &rdtp->idle_gp_timer;
2242 2261
2243 sprintf(cp, "drain=%d %c timer=%lu", 2262 sprintf(cp, "drain=%d %c timer=%lu",
2244 per_cpu(rcu_dyntick_drain, cpu), 2263 rdtp->dyntick_drain,
2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2264 rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
2246 timer_pending(tltp) ? tltp->expires - jiffies : -1); 2265 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2247} 2266}
2248 2267
diff --git a/kernel/relay.c b/kernel/relay.c
index ab56a1764d4d..e8cd2027abbd 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1235 struct splice_pipe_desc spd = { 1235 struct splice_pipe_desc spd = {
1236 .pages = pages, 1236 .pages = pages,
1237 .nr_pages = 0, 1237 .nr_pages = 0,
1238 .nr_pages_max = PIPE_DEF_BUFFERS,
1238 .partial = partial, 1239 .partial = partial,
1239 .flags = flags, 1240 .flags = flags,
1240 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1302 ret += padding; 1303 ret += padding;
1303 1304
1304out: 1305out:
1305 splice_shrink_spd(pipe, &spd); 1306 splice_shrink_spd(&spd);
1306 return ret; 1307 return ret;
1307} 1308}
1308 1309
1309static ssize_t relay_file_splice_read(struct file *in, 1310static ssize_t relay_file_splice_read(struct file *in,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2081#endif
2082 2082
2083 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2085 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2086 2085
2087 barrier(); 2086 barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
2161} 2160}
2162 2161
2163 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns in the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
 2198 * again, being late doesn't lose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2208 */
2209
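
The fixed-point update behind the avenrun[] formula above is easy to poke at outside the kernel. The following stand-alone user-space sketch re-implements one step of the decaying average using the familiar FSHIFT/FIXED_1/EXP_1 constants; it is an illustration of the arithmetic only, not the scheduler code itself, and the three-task sample fed into it is arbitrary.

#include <stdio.h>

#define FSHIFT   11                     /* bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
#define EXP_1    1884                   /* ~ FIXED_1 / exp(5sec/1min) */

/* One LOAD_FREQ window of the decaying average: a1 = a0*e + active*(1 - e). */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long load1 = 0;
        unsigned long nr_active = 3 * FIXED_1;  /* pretend 3 tasks are runnable */
        int i;

        /* Feed the same sample for a minute's worth of 5-second windows. */
        for (i = 0; i < 12; i++) {
                load1 = calc_load(load1, EXP_1, nr_active);
                printf("window %2d: load1 = %lu.%02lu\n", i + 1,
                       load1 >> FSHIFT,
                       ((load1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}

After these twelve windows (one minute) load1 has closed roughly 1 - 1/e of the gap to 3.00, which is the point of the exponential form: the average chases nr_active with a one-minute time constant instead of jumping to it.
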
2164/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2165static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2166static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2167unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2168EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2169 2230
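
As a consumer-side illustration of the offset and shift parameters documented above, here is a small user-space sketch of the conventional /proc/loadavg-style formatting. The LOAD_INT/LOAD_FRAC helpers and the sample values are defined locally for the example; only the FIXED_1/200 rounding offset mirrors how such readers typically call get_avenrun().

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)

/* Conventional helpers for turning a fixed-point sample into "X.YY". */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avnrun[3] = { 1536, 1024, 512 }; /* made-up avenrun values */
        unsigned long loads[3];
        int i;

        /* Stands in for get_avenrun(avnrun, FIXED_1/200, 0): the offset rounds
         * the second decimal digit instead of truncating it, the shift is 0. */
        for (i = 0; i < 3; i++)
                loads[i] = (avnrun[i] + FIXED_1 / 200) << 0;

        printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
               LOAD_INT(loads[0]), LOAD_FRAC(loads[0]),
               LOAD_INT(loads[1]), LOAD_FRAC(loads[1]),
               LOAD_INT(loads[2]), LOAD_FRAC(loads[2]));
        return 0;
}

With the made-up inputs this prints "0.75 0.50 0.25".
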
2170static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2171{ 2232{
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2182 return delta; 2243 return delta;
2183} 2244}
2184 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2185static unsigned long 2249static unsigned long
2186calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2187{ 2251{
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2193 2257
2194#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2195/* 2259/*
2196 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
 2287 * accumulating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2197 * 2298 *
2198 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2199 */ 2300 */
2200static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2201 2303
2202void calc_load_account_idle(struct rq *this_rq) 2304static inline int calc_load_write_idx(void)
2203{ 2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2204 long delta; 2332 long delta;
2205 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2206 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2207 if (delta) 2339 if (delta) {
2208 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2209} 2343}
2210 2344
2211static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2212{ 2346{
2213 long delta = 0; 2347 struct rq *this_rq = this_rq();
2348
2349 /*
2350 * If we're still before the sample window, we're done.
2351 */
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2214 2354
2215 /* 2355 /*
2216 * Its got a race, we don't care... 2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2217 */ 2359 */
2218 if (atomic_long_read(&calc_load_tasks_idle)) 2360 this_rq->calc_load_update = calc_load_update;
2219 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2220 2372
2221 return delta; 2373 return delta;
2222} 2374}
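
The read/write index shift sketched in the timeline above can be easier to see in a stand-alone toy model. The sketch below mirrors the slot-selection logic only; window_open stands in for the !time_before(jiffies, calc_load_update) test, everything is single-threaded, and none of it is kernel code.

#include <stdio.h>
#include <stdbool.h>

static long idle_delta[2];      /* toy counterpart of calc_load_idle[] */
static int fold_idx;            /* toy counterpart of calc_load_idx */
static bool window_open;        /* has the current LOAD_FREQ window started? */

static int write_idx(void)
{
        int idx = fold_idx;

        if (window_open)        /* late idler: stay out of the slot being folded */
                idx++;
        return idx & 1;
}

static int read_idx(void)
{
        return fold_idx & 1;
}

int main(void)
{
        idle_delta[write_idx()] += 2;   /* a CPU goes idle before the window */
        window_open = true;
        idle_delta[write_idx()] += 5;   /* another goes idle after it opened */

        printf("fold now: %ld, deferred to next window: %ld\n",
               idle_delta[read_idx()], idle_delta[read_idx() ^ 1]);

        fold_idx++;                     /* flip, as calc_global_nohz() does */
        return 0;
}

The delta contributed after the window opened lands in the other slot, so the fold in progress only sees the old idle contribution, which is exactly the separation the comment above is after.
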
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
2302{ 2454{
2303 long delta, active, n; 2455 long delta, active, n;
2304 2456
2305 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2306 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2307 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2308 * missed the tick driven calc_load_account_active() update 2460 */
2309 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2310 */ 2462 n = 1 + (delta / LOAD_FREQ);
2311 delta = calc_load_fold_idle();
2312 if (delta)
2313 atomic_long_add(delta, &calc_load_tasks);
2314
2315 /*
2316 * It could be the one fold was all it took, we done!
2317 */
2318 if (time_before(jiffies, calc_load_update + 10))
2319 return;
2320
2321 /*
2322 * Catch-up, fold however many we are behind still
2323 */
2324 delta = jiffies - calc_load_update - 10;
2325 n = 1 + (delta / LOAD_FREQ);
2326 2463
2327 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2328 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2329 2466
2330 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2331 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2332 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2333 2470
2334 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2335} 2472 }
2336#else
2337void calc_load_account_idle(struct rq *this_rq)
2338{
2339}
2340 2473
2341static inline long calc_load_fold_idle(void) 2474 /*
2342{ 2475 * Flip the idle index...
2343 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2344} 2483}
2484#else /* !CONFIG_NO_HZ */
2345 2485
2346static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2347{ 2487static inline void calc_global_nohz(void) { }
2348}
2349#endif
2350 2488
2351/** 2489#endif /* CONFIG_NO_HZ */
2352 * get_avenrun - get the load average array
2353 * @loads: pointer to dest load array
2354 * @offset: offset to add
2355 * @shift: shift count to shift the result left
2356 *
2357 * These values are estimates at best, so no need for locking.
2358 */
2359void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2360{
2361 loads[0] = (avenrun[0] + offset) << shift;
2362 loads[1] = (avenrun[1] + offset) << shift;
2363 loads[2] = (avenrun[2] + offset) << shift;
2364}
2365 2490
2366/* 2491/*
2367 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2369 */ 2494 */
2370void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2371{ 2496{
2372 long active; 2497 long active, delta;
2373 2498
2374 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2375 return; 2500 return;
2376 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2377 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2378 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2379 2511
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2384 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2385 2517
2386 /* 2518 /*
2387 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2388 * folding in the nohz state and ageing the entire idle period.
2389 *
2390 * This avoids loosing a sample when we go idle between
2391 * calc_load_account_active() (10 ticks ago) and now and thus
2392 * under-accounting.
2393 */ 2520 */
2394 calc_global_nohz(); 2521 calc_global_nohz();
2395} 2522}
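
Spelled out, the bulk catch-up referenced in the comment above ("catch up in bulk") ages the averages over all of the missed windows in one step. With delta = jiffies - calc_load_update - 10 and n = 1 + delta / LOAD_FREQ, each average is advanced as

        avenrun[i] = avenrun[i] * exp_i^n  +  nr_active * (1 - exp_i^n)

which is the ordinary per-window update applied n times, so waking after several quiet LOAD_FREQ intervals decays the averages as if every intermediate sample had seen the current nr_active.
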
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2406 return; 2533 return;
2407 2534
2408 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2409 delta += calc_load_fold_idle();
2410 if (delta) 2536 if (delta)
2411 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2412 2538
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2414} 2540}
2415 2541
2416/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2417 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2418 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2419 * 2549 *
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33d..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
943} 943}
944 944
945void calc_load_account_idle(struct rq *this_rq);
946
947#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
948 946
949/* 947/*
diff --git a/kernel/sys.c b/kernel/sys.c
index f0ec44dcd415..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1790{
1791 struct vm_area_struct *vma;
1792 struct file *exe_file; 1791 struct file *exe_file;
1793 struct dentry *dentry; 1792 struct dentry *dentry;
1794 int err; 1793 int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1816 down_write(&mm->mmap_sem); 1815 down_write(&mm->mmap_sem);
1817 1816
1818 /* 1817 /*
1819 * Forbid mm->exe_file change if there are mapped other files. 1818 * Forbid mm->exe_file change if old file still mapped.
1820 */ 1819 */
1821 err = -EBUSY; 1820 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1821 if (mm->exe_file) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path, 1822 struct vm_area_struct *vma;
1824 &exe_file->f_path)) 1823
1825 goto exit_unlock; 1824 for (vma = mm->mmap; vma; vma = vma->vm_next)
1825 if (vma->vm_file &&
1826 path_equal(&vma->vm_file->f_path,
1827 &mm->exe_file->f_path))
1828 goto exit_unlock;
1826 } 1829 }
1827 1830
1828 /* 1831 /*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1838 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1836 goto exit_unlock; 1839 goto exit_unlock;
1837 1840
1841 err = 0;
1838 set_mm_exe_file(mm, exe_file); 1842 set_mm_exe_file(mm, exe_file);
1839exit_unlock: 1843exit_unlock:
1840 up_write(&mm->mmap_sem); 1844 up_write(&mm->mmap_sem);
@@ -2127,9 +2131,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2127 else 2131 else
2128 return -EINVAL; 2132 return -EINVAL;
2129 break; 2133 break;
2130 case PR_GET_TID_ADDRESS:
2131 error = prctl_get_tid_address(me, (int __user **)arg2);
2132 break;
2133 default: 2134 default:
2134 return -EINVAL; 2135 return -EINVAL;
2135 } 2136 }
@@ -2147,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2147 case PR_SET_MM: 2148 case PR_SET_MM:
2148 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2149 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2149 break; 2150 break;
2151 case PR_GET_TID_ADDRESS:
2152 error = prctl_get_tid_address(me, (int __user **)arg2);
2153 break;
2150 case PR_SET_CHILD_SUBREAPER: 2154 case PR_SET_CHILD_SUBREAPER:
2151 me->signal->is_child_subreaper = !!arg2; 2155 me->signal->is_child_subreaper = !!arg2;
2152 error = 0; 2156 error = 0;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
409 time_state = TIME_DEL; 409 time_state = TIME_DEL;
410 break; 410 break;
411 case TIME_INS: 411 case TIME_INS:
412 if (secs % 86400 == 0) { 412 if (!(time_status & STA_INS))
413 time_state = TIME_OK;
414 else if (secs % 86400 == 0) {
413 leap = -1; 415 leap = -1;
414 time_state = TIME_OOP; 416 time_state = TIME_OOP;
415 time_tai++; 417 time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
418 } 420 }
419 break; 421 break;
420 case TIME_DEL: 422 case TIME_DEL:
421 if ((secs + 1) % 86400 == 0) { 423 if (!(time_status & STA_DEL))
424 time_state = TIME_OK;
425 else if ((secs + 1) % 86400 == 0) {
422 leap = 1; 426 leap = 1;
423 time_tai--; 427 time_tai--;
424 time_state = TIME_WAIT; 428 time_state = TIME_WAIT;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 60c9c60e9108..41be02250e08 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -276,10 +276,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
276{ 276{
277 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 277 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
278 ktime_t last_update, expires, ret = { .tv64 = 0 }; 278 ktime_t last_update, expires, ret = { .tv64 = 0 };
279 unsigned long rcu_delta_jiffies;
279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 280 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
280 u64 time_delta; 281 u64 time_delta;
281 282
282
283 /* Read jiffies and the time when jiffies were updated last */ 283 /* Read jiffies and the time when jiffies were updated last */
284 do { 284 do {
285 seq = read_seqbegin(&xtime_lock); 285 seq = read_seqbegin(&xtime_lock);
@@ -288,7 +288,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
288 time_delta = timekeeping_max_deferment(); 288 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 289 } while (read_seqretry(&xtime_lock, seq));
290 290
291 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
292 arch_needs_cpu(cpu)) { 292 arch_needs_cpu(cpu)) {
293 next_jiffies = last_jiffies + 1; 293 next_jiffies = last_jiffies + 1;
294 delta_jiffies = 1; 294 delta_jiffies = 1;
@@ -296,6 +296,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
296 /* Get the next timer wheel timer */ 296 /* Get the next timer wheel timer */
297 next_jiffies = get_next_timer_interrupt(last_jiffies); 297 next_jiffies = get_next_timer_interrupt(last_jiffies);
298 delta_jiffies = next_jiffies - last_jiffies; 298 delta_jiffies = next_jiffies - last_jiffies;
299 if (rcu_delta_jiffies < delta_jiffies) {
300 next_jiffies = last_jiffies + rcu_delta_jiffies;
301 delta_jiffies = rcu_delta_jiffies;
302 }
299 } 303 }
300 /* 304 /*
301 * Do not stop the tick, if we are only one off 305 * Do not stop the tick, if we are only one off
@@ -369,6 +373,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
369 */ 373 */
370 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
371 select_nohz_load_balancer(1); 375 select_nohz_load_balancer(1);
376 calc_load_enter_idle();
372 377
373 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
374 ts->tick_stopped = 1; 379 ts->tick_stopped = 1;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8a..269b1fe5f2ae 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,12 @@ struct timekeeper {
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ 70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time; 71 struct timespec raw_time;
72 72
73 /* Offset clock monotonic -> clock realtime */
74 ktime_t offs_real;
75
76 /* Offset clock monotonic -> clock boottime */
77 ktime_t offs_boot;
78
73 /* Seqlock for all timekeeper values */ 79 /* Seqlock for all timekeeper values */
74 seqlock_t lock; 80 seqlock_t lock;
75}; 81};
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void)
172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 178 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173} 179}
174 180
181static void update_rt_offset(void)
182{
183 struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic;
184
185 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
186 timekeeper.offs_real = timespec_to_ktime(tmp);
187}
188
175/* must hold write on timekeeper.lock */ 189/* must hold write on timekeeper.lock */
176static void timekeeping_update(bool clearntp) 190static void timekeeping_update(bool clearntp)
177{ 191{
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp)
179 timekeeper.ntp_error = 0; 193 timekeeper.ntp_error = 0;
180 ntp_clear(); 194 ntp_clear();
181 } 195 }
196 update_rt_offset();
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, 197 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult); 198 timekeeper.clock, timekeeper.mult);
184} 199}
@@ -604,6 +619,7 @@ void __init timekeeping_init(void)
604 } 619 }
605 set_normalized_timespec(&timekeeper.wall_to_monotonic, 620 set_normalized_timespec(&timekeeper.wall_to_monotonic,
606 -boot.tv_sec, -boot.tv_nsec); 621 -boot.tv_sec, -boot.tv_nsec);
622 update_rt_offset();
607 timekeeper.total_sleep_time.tv_sec = 0; 623 timekeeper.total_sleep_time.tv_sec = 0;
608 timekeeper.total_sleep_time.tv_nsec = 0; 624 timekeeper.total_sleep_time.tv_nsec = 0;
609 write_sequnlock_irqrestore(&timekeeper.lock, flags); 625 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +628,12 @@ void __init timekeeping_init(void)
612/* time in seconds when suspend began */ 628/* time in seconds when suspend began */
613static struct timespec timekeeping_suspend_time; 629static struct timespec timekeeping_suspend_time;
614 630
631static void update_sleep_time(struct timespec t)
632{
633 timekeeper.total_sleep_time = t;
634 timekeeper.offs_boot = timespec_to_ktime(t);
635}
636
615/** 637/**
616 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 638 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
617 * @delta: pointer to a timespec delta value 639 * @delta: pointer to a timespec delta value
@@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
630 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); 652 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
631 timekeeper.wall_to_monotonic = 653 timekeeper.wall_to_monotonic =
632 timespec_sub(timekeeper.wall_to_monotonic, *delta); 654 timespec_sub(timekeeper.wall_to_monotonic, *delta);
633 timekeeper.total_sleep_time = timespec_add( 655 update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta));
634 timekeeper.total_sleep_time, *delta);
635} 656}
636 657
637 658
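
Restated in equation form, the two cached offsets introduced above encode the invariants that update_rt_offset() and update_sleep_time() maintain (this is a restatement, not new code):

        offs_real = -wall_to_monotonic   =>  CLOCK_REALTIME = CLOCK_MONOTONIC + offs_real
        offs_boot = total_sleep_time     =>  CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot

so any code that already holds a monotonic timestamp plus these two ktime_t values can derive the other clocks without touching the timekeeper lock again.
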
@@ -963,6 +984,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
963 leap = second_overflow(timekeeper.xtime.tv_sec); 984 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap; 985 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap; 986 timekeeper.wall_to_monotonic.tv_sec -= leap;
987 if (leap)
988 clock_was_set_delayed();
966 } 989 }
967 990
968 /* Accumulate raw time */ 991 /* Accumulate raw time */
@@ -1079,6 +1102,8 @@ static void update_wall_time(void)
1079 leap = second_overflow(timekeeper.xtime.tv_sec); 1102 leap = second_overflow(timekeeper.xtime.tv_sec);
1080 timekeeper.xtime.tv_sec += leap; 1103 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap; 1104 timekeeper.wall_to_monotonic.tv_sec -= leap;
1105 if (leap)
1106 clock_was_set_delayed();
1082 } 1107 }
1083 1108
1084 timekeeping_update(false); 1109 timekeeping_update(false);
@@ -1246,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1246 } while (read_seqretry(&timekeeper.lock, seq)); 1271 } while (read_seqretry(&timekeeper.lock, seq));
1247} 1272}
1248 1273
1274#ifdef CONFIG_HIGH_RES_TIMERS
1275/**
1276 * ktime_get_update_offsets - hrtimer helper
1277 * @offs_real: pointer to storage for monotonic -> realtime offset
1278 * @offs_boot: pointer to storage for monotonic -> boottime offset
1279 *
1280 * Returns current monotonic time and updates the offsets
 1281 * Called from hrtimer_interrupt() or retrigger_next_event()
1282 */
1283ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1284{
1285 ktime_t now;
1286 unsigned int seq;
1287 u64 secs, nsecs;
1288
1289 do {
1290 seq = read_seqbegin(&timekeeper.lock);
1291
1292 secs = timekeeper.xtime.tv_sec;
1293 nsecs = timekeeper.xtime.tv_nsec;
1294 nsecs += timekeeping_get_ns();
1295 /* If arch requires, add in gettimeoffset() */
1296 nsecs += arch_gettimeoffset();
1297
1298 *offs_real = timekeeper.offs_real;
1299 *offs_boot = timekeeper.offs_boot;
1300 } while (read_seqretry(&timekeeper.lock, seq));
1301
1302 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1303 now = ktime_sub(now, *offs_real);
1304 return now;
1305}
1306#endif
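
To show the intended consumer side, here is a hedged sketch of how hrtimer code could use the new helper to refresh its per-clock base offsets in one consistent read. The hrtimer_cpu_base layout, the clock_base[].offset field and the HRTIMER_BASE_* indices are assumptions for the sake of the example; the real wiring lives in kernel/hrtimer.c, which this merge also touches but which is not shown here.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/* Sketch only: fetch the monotonic "now" and both offsets under one
 * seqlock-consistent read, caching the offsets in the clock bases. */
static ktime_t refresh_clock_base_offsets(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t *offs_real = &cpu_base->clock_base[HRTIMER_BASE_REALTIME].offset;
        ktime_t *offs_boot = &cpu_base->clock_base[HRTIMER_BASE_BOOTTIME].offset;

        return ktime_get_update_offsets(offs_real, offs_boot);
}

Because the time and the offsets come out of a single read_seqbegin/read_seqretry pass inside the helper, the caller cannot pair a realtime offset from before a leap-second or settimeofday() update with a monotonic timestamp taken after it.
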
1307
1249/** 1308/**
1250 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1309 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1251 */ 1310 */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..f765465bffe4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1075 rb_init_page(bpage->page); 1075 rb_init_page(bpage->page);
1076 1076
1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1078 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1078 1079
1079 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1080 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1080 if (ret < 0) 1081 if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1346 * If something was added to this page, it was full 1347 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the 1348 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here. 1349 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is 1350 * Increment overrun to account for the lost events.
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */ 1351 */
1352 local_add(page_entries, &cpu_buffer->overrun);
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); 1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 } 1354 }
1355 1355
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 68032c6177db..a7fa0702be1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
371void tracing_off(void) 371void tracing_off(void)
372{ 372{
373 if (global_trace.buffer) 373 if (global_trace.buffer)
374 ring_buffer_record_on(global_trace.buffer); 374 ring_buffer_record_off(global_trace.buffer);
375 /* 375 /*
376 * This flag is only looked at when buffers haven't been 376 * This flag is only looked at when buffers haven't been
377 * allocated yet. We don't really care about the race 377 * allocated yet. We don't really care about the race
@@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3609 .pages = pages_def, 3609 .pages = pages_def,
3610 .partial = partial_def, 3610 .partial = partial_def,
3611 .nr_pages = 0, /* This gets updated below. */ 3611 .nr_pages = 0, /* This gets updated below. */
3612 .nr_pages_max = PIPE_DEF_BUFFERS,
3612 .flags = flags, 3613 .flags = flags,
3613 .ops = &tracing_pipe_buf_ops, 3614 .ops = &tracing_pipe_buf_ops,
3614 .spd_release = tracing_spd_release_pipe, 3615 .spd_release = tracing_spd_release_pipe,
@@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3680 3681
3681 ret = splice_to_pipe(pipe, &spd); 3682 ret = splice_to_pipe(pipe, &spd);
3682out: 3683out:
3683 splice_shrink_spd(pipe, &spd); 3684 splice_shrink_spd(&spd);
3684 return ret; 3685 return ret;
3685 3686
3686out_err: 3687out_err:
@@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4231 struct splice_pipe_desc spd = { 4232 struct splice_pipe_desc spd = {
4232 .pages = pages_def, 4233 .pages = pages_def,
4233 .partial = partial_def, 4234 .partial = partial_def,
4235 .nr_pages_max = PIPE_DEF_BUFFERS,
4234 .flags = flags, 4236 .flags = flags,
4235 .ops = &buffer_pipe_buf_ops, 4237 .ops = &buffer_pipe_buf_ops,
4236 .spd_release = buffer_spd_release, 4238 .spd_release = buffer_spd_release,
@@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4318 } 4320 }
4319 4321
4320 ret = splice_to_pipe(pipe, &spd); 4322 ret = splice_to_pipe(pipe, &spd);
4321 splice_shrink_spd(pipe, &spd); 4323 splice_shrink_spd(&spd);
4322out: 4324out:
4323 return ret; 4325 return ret;
4324} 4326}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e5e1d85b8c7c..4b1dfba70f7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -372,6 +372,13 @@ static int watchdog(void *unused)
372 372
373 373
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 374#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/*
376 * People like the simple clean cpu node info on boot.
377 * Reduce the watchdog noise by only printing messages
378 * that are different from what cpu0 displayed.
379 */
380static unsigned long cpu0_err;
381
375static int watchdog_nmi_enable(int cpu) 382static int watchdog_nmi_enable(int cpu)
376{ 383{
377 struct perf_event_attr *wd_attr; 384 struct perf_event_attr *wd_attr;
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu)
390 397
391 /* Try to register using hardware perf events */ 398 /* Try to register using hardware perf events */
392 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 399 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
400
 401 /* save cpu0 error for future comparison */
402 if (cpu == 0 && IS_ERR(event))
403 cpu0_err = PTR_ERR(event);
404
393 if (!IS_ERR(event)) { 405 if (!IS_ERR(event)) {
394 pr_info("enabled, takes one hw-pmu counter.\n"); 406 /* only print for cpu0 or different than cpu0 */
407 if (cpu == 0 || cpu0_err)
408 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
395 goto out_save; 409 goto out_save;
396 } 410 }
397 411
412 /* skip displaying the same error again */
413 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
414 return PTR_ERR(event);
398 415
399 /* vary the KERN level based on the returned errno */ 416 /* vary the KERN level based on the returned errno */
400 if (PTR_ERR(event) == -EOPNOTSUPP) 417 if (PTR_ERR(event) == -EOPNOTSUPP)