author    | Thomas Gleixner <tglx@linutronix.de> | 2012-07-15 04:24:53 -0400
committer | Thomas Gleixner <tglx@linutronix.de> | 2012-07-15 04:24:53 -0400
commit    | e8b9dd7e2471b1274e3be719fcc385e0a710e46f (patch)
tree      | 030d7ce20e8f8767d9423f78c102aba089eec372 /kernel
parent    | 924412f66fd9d21212e560a93792b0b607d46c6e (diff)
parent    | 6b1859dba01c7d512b72d77e3fd7da8354235189 (diff)
Merge branch 'timers/urgent' into timers/core
Reason: Update to upstream changes to avoid further conflicts.
Fixup a trivial merge conflict in kernel/time/tick-sched.c
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c            |  36
-rw-r--r-- | kernel/events/core.c       |  10
-rw-r--r-- | kernel/exit.c              |  19
-rw-r--r-- | kernel/fork.c              |  11
-rw-r--r-- | kernel/hrtimer.c           |  53
-rw-r--r-- | kernel/panic.c             |   6
-rw-r--r-- | kernel/pid_namespace.c     |  20
-rw-r--r-- | kernel/printk.c            | 670
-rw-r--r-- | kernel/rcutree.c           |  17
-rw-r--r-- | kernel/rcutree.h           |  15
-rw-r--r-- | kernel/rcutree_plugin.h    | 179
-rw-r--r-- | kernel/relay.c             |   5
-rw-r--r-- | kernel/sched/core.c        | 276
-rw-r--r-- | kernel/sched/idle_task.c   |   1
-rw-r--r-- | kernel/sched/sched.h       |   2
-rw-r--r-- | kernel/sys.c               |  22
-rw-r--r-- | kernel/time/ntp.c          |   8
-rw-r--r-- | kernel/time/tick-sched.c   |   9
-rw-r--r-- | kernel/time/timekeeping.c  |  63
-rw-r--r-- | kernel/trace/ring_buffer.c |   6
-rw-r--r-- | kernel/trace/trace.c       |   8
-rw-r--r-- | kernel/watchdog.c          |  19
22 files changed, 1061 insertions(+), 394 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 72fcd3069a90..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) | |||
255 | 255 | ||
256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
257 | 257 | ||
258 | static int css_unbias_refcnt(int refcnt) | ||
259 | { | ||
260 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | ||
261 | } | ||
262 | |||
258 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | 263 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ |
259 | static int css_refcnt(struct cgroup_subsys_state *css) | 264 | static int css_refcnt(struct cgroup_subsys_state *css) |
260 | { | 265 | { |
261 | int v = atomic_read(&css->refcnt); | 266 | int v = atomic_read(&css->refcnt); |
262 | 267 | ||
263 | return v >= 0 ? v : v - CSS_DEACT_BIAS; | 268 | return css_unbias_refcnt(v); |
264 | } | 269 | } |
265 | 270 | ||
266 | /* convenient tests for these bits */ | 271 | /* convenient tests for these bits */ |
@@ -896,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
896 | mutex_unlock(&cgroup_mutex); | 901 | mutex_unlock(&cgroup_mutex); |
897 | 902 | ||
898 | /* | 903 | /* |
899 | * We want to drop the active superblock reference from the | 904 | * Drop the active superblock reference that we took when we |
900 | * cgroup creation after all the dentry refs are gone - | 905 | * created the cgroup |
901 | * kill_sb gets mighty unhappy otherwise. Mark | ||
902 | * dentry->d_fsdata with cgroup_diput() to tell | ||
903 | * cgroup_d_release() to call deactivate_super(). | ||
904 | */ | 906 | */ |
905 | dentry->d_fsdata = cgroup_diput; | 907 | deactivate_super(cgrp->root->sb); |
906 | 908 | ||
907 | /* | 909 | /* |
908 | * if we're getting rid of the cgroup, refcount should ensure | 910 | * if we're getting rid of the cgroup, refcount should ensure |
@@ -928,13 +930,6 @@ static int cgroup_delete(const struct dentry *d) | |||
928 | return 1; | 930 | return 1; |
929 | } | 931 | } |
930 | 932 | ||
931 | static void cgroup_d_release(struct dentry *dentry) | ||
932 | { | ||
933 | /* did cgroup_diput() tell me to deactivate super? */ | ||
934 | if (dentry->d_fsdata == cgroup_diput) | ||
935 | deactivate_super(dentry->d_sb); | ||
936 | } | ||
937 | |||
938 | static void remove_dir(struct dentry *d) | 933 | static void remove_dir(struct dentry *d) |
939 | { | 934 | { |
940 | struct dentry *parent = dget(d->d_parent); | 935 | struct dentry *parent = dget(d->d_parent); |
@@ -1542,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
1542 | static const struct dentry_operations cgroup_dops = { | 1537 | static const struct dentry_operations cgroup_dops = { |
1543 | .d_iput = cgroup_diput, | 1538 | .d_iput = cgroup_diput, |
1544 | .d_delete = cgroup_delete, | 1539 | .d_delete = cgroup_delete, |
1545 | .d_release = cgroup_d_release, | ||
1546 | }; | 1540 | }; |
1547 | 1541 | ||
1548 | struct inode *inode = | 1542 | struct inode *inode = |
@@ -3889,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work) | |||
3889 | { | 3883 | { |
3890 | struct cgroup_subsys_state *css = | 3884 | struct cgroup_subsys_state *css = |
3891 | container_of(work, struct cgroup_subsys_state, dput_work); | 3885 | container_of(work, struct cgroup_subsys_state, dput_work); |
3886 | struct dentry *dentry = css->cgroup->dentry; | ||
3887 | struct super_block *sb = dentry->d_sb; | ||
3892 | 3888 | ||
3893 | dput(css->cgroup->dentry); | 3889 | atomic_inc(&sb->s_active); |
3890 | dput(dentry); | ||
3891 | deactivate_super(sb); | ||
3894 | } | 3892 | } |
3895 | 3893 | ||
3896 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3894 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
@@ -4982,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); | |||
4982 | void __css_put(struct cgroup_subsys_state *css) | 4980 | void __css_put(struct cgroup_subsys_state *css) |
4983 | { | 4981 | { |
4984 | struct cgroup *cgrp = css->cgroup; | 4982 | struct cgroup *cgrp = css->cgroup; |
4983 | int v; | ||
4985 | 4984 | ||
4986 | rcu_read_lock(); | 4985 | rcu_read_lock(); |
4987 | atomic_dec(&css->refcnt); | 4986 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); |
4988 | switch (css_refcnt(css)) { | 4987 | |
4988 | switch (v) { | ||
4989 | case 1: | 4989 | case 1: |
4990 | if (notify_on_release(cgrp)) { | 4990 | if (notify_on_release(cgrp)) { |
4991 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4991 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b333..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) | |||
253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | 253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; |
254 | } | 254 | } |
255 | 255 | ||
256 | static inline void perf_get_cgroup(struct perf_event *event) | 256 | static inline bool perf_tryget_cgroup(struct perf_event *event) |
257 | { | 257 | { |
258 | css_get(&event->cgrp->css); | 258 | return css_tryget(&event->cgrp->css); |
259 | } | 259 | } |
260 | 260 | ||
261 | static inline void perf_put_cgroup(struct perf_event *event) | 261 | static inline void perf_put_cgroup(struct perf_event *event) |
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
484 | event->cgrp = cgrp; | 484 | event->cgrp = cgrp; |
485 | 485 | ||
486 | /* must be done before we fput() the file */ | 486 | /* must be done before we fput() the file */ |
487 | perf_get_cgroup(event); | 487 | if (!perf_tryget_cgroup(event)) { |
488 | event->cgrp = NULL; | ||
489 | ret = -ENOENT; | ||
490 | goto out; | ||
491 | } | ||
488 | 492 | ||
489 | /* | 493 | /* |
490 | * all events in a group must monitor | 494 | * all events in a group must monitor |
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | ||
76 | * If we are the last child process in a pid namespace to be | ||
77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). | ||
78 | */ | ||
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
80 | struct task_struct *parent = p->real_parent; | ||
81 | |||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
83 | list_empty(&parent->children) && | ||
84 | (parent->flags & PF_EXITING)) | ||
85 | wake_up_process(parent); | ||
86 | } | ||
75 | } | 87 | } |
76 | list_del_rcu(&p->thread_group); | 88 | list_del_rcu(&p->thread_group); |
77 | } | 89 | } |
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) | |||
643 | mm_release(tsk, mm); | 655 | mm_release(tsk, mm); |
644 | if (!mm) | 656 | if (!mm) |
645 | return; | 657 | return; |
658 | sync_mm_rss(mm); | ||
646 | /* | 659 | /* |
647 | * Serialize with any possible pending coredump. | 660 | * Serialize with any possible pending coredump. |
648 | * We must hold mmap_sem around checking core_state | 661 | * We must hold mmap_sem around checking core_state |
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
719 | 732 | ||
720 | zap_pid_ns_processes(pid_ns); | 733 | zap_pid_ns_processes(pid_ns); |
721 | write_lock_irq(&tasklist_lock); | 734 | write_lock_irq(&tasklist_lock); |
722 | /* | ||
723 | * We can not clear ->child_reaper or leave it alone. | ||
724 | * There may by stealth EXIT_DEAD tasks on ->children, | ||
725 | * forget_original_parent() must move them somewhere. | ||
726 | */ | ||
727 | pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
728 | } else if (father->signal->has_child_subreaper) { | 735 | } else if (father->signal->has_child_subreaper) { |
729 | struct task_struct *reaper; | 736 | struct task_struct *reaper; |
730 | 737 | ||
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
304 | } | 304 | } |
305 | 305 | ||
306 | err = arch_dup_task_struct(tsk, orig); | 306 | err = arch_dup_task_struct(tsk, orig); |
307 | if (err) | ||
308 | goto out; | ||
309 | 307 | ||
308 | /* | ||
309 | * We defer looking at err, because we will need this setup | ||
310 | * for the clean up path to work correctly. | ||
311 | */ | ||
310 | tsk->stack = ti; | 312 | tsk->stack = ti; |
311 | |||
312 | setup_thread_stack(tsk, orig); | 313 | setup_thread_stack(tsk, orig); |
314 | |||
315 | if (err) | ||
316 | goto out; | ||
317 | |||
313 | clear_user_return_notifier(tsk); | 318 | clear_user_return_notifier(tsk); |
314 | clear_tsk_need_resched(tsk); | 319 | clear_tsk_need_resched(tsk); |
315 | stackend = end_of_stack(tsk); | 320 | stackend = end_of_stack(tsk); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
657 | return 0; | 657 | return 0; |
658 | } | 658 | } |
659 | 659 | ||
660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | ||
661 | { | ||
662 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | ||
663 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | ||
664 | |||
665 | return ktime_get_update_offsets(offs_real, offs_boot); | ||
666 | } | ||
667 | |||
660 | /* | 668 | /* |
661 | * Retrigger next event is called after clock was set | 669 | * Retrigger next event is called after clock was set |
662 | * | 670 | * |
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
665 | static void retrigger_next_event(void *arg) | 673 | static void retrigger_next_event(void *arg) |
666 | { | 674 | { |
667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 675 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); |
668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
669 | 676 | ||
670 | if (!hrtimer_hres_active()) | 677 | if (!hrtimer_hres_active()) |
671 | return; | 678 | return; |
672 | 679 | ||
673 | /* Optimized out for !HIGH_RES */ | ||
674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
676 | |||
677 | /* Adjust CLOCK_REALTIME offset */ | ||
678 | raw_spin_lock(&base->lock); | 680 | raw_spin_lock(&base->lock); |
679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | 681 | hrtimer_update_base(base); |
680 | timespec_to_ktime(realtime_offset); | ||
681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
682 | timespec_to_ktime(sleep); | ||
683 | |||
684 | hrtimer_force_reprogram(base, 0); | 682 | hrtimer_force_reprogram(base, 0); |
685 | raw_spin_unlock(&base->lock); | 683 | raw_spin_unlock(&base->lock); |
686 | } | 684 | } |
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) | |||
710 | base->clock_base[i].resolution = KTIME_HIGH_RES; | 708 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
711 | 709 | ||
712 | tick_setup_sched_timer(); | 710 | tick_setup_sched_timer(); |
713 | |||
714 | /* "Retrigger" the interrupt to get things going */ | 711 | /* "Retrigger" the interrupt to get things going */ |
715 | retrigger_next_event(NULL); | 712 | retrigger_next_event(NULL); |
716 | local_irq_restore(flags); | 713 | local_irq_restore(flags); |
717 | return 1; | 714 | return 1; |
718 | } | 715 | } |
719 | 716 | ||
717 | /* | ||
718 | * Called from timekeeping code to reprogramm the hrtimer interrupt | ||
719 | * device. If called from the timer interrupt context we defer it to | ||
720 | * softirq context. | ||
721 | */ | ||
722 | void clock_was_set_delayed(void) | ||
723 | { | ||
724 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
725 | |||
726 | cpu_base->clock_was_set = 1; | ||
727 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
728 | } | ||
729 | |||
720 | #else | 730 | #else |
721 | 731 | ||
722 | static inline int hrtimer_hres_active(void) { return 0; } | 732 | static inline int hrtimer_hres_active(void) { return 0; } |
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1250 | cpu_base->nr_events++; | 1260 | cpu_base->nr_events++; |
1251 | dev->next_event.tv64 = KTIME_MAX; | 1261 | dev->next_event.tv64 = KTIME_MAX; |
1252 | 1262 | ||
1253 | entry_time = now = ktime_get(); | 1263 | raw_spin_lock(&cpu_base->lock); |
1264 | entry_time = now = hrtimer_update_base(cpu_base); | ||
1254 | retry: | 1265 | retry: |
1255 | expires_next.tv64 = KTIME_MAX; | 1266 | expires_next.tv64 = KTIME_MAX; |
1256 | |||
1257 | raw_spin_lock(&cpu_base->lock); | ||
1258 | /* | 1267 | /* |
1259 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1268 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
1260 | * held to prevent that a timer is enqueued in our queue via | 1269 | * held to prevent that a timer is enqueued in our queue via |
@@ -1330,8 +1339,12 @@ retry: | |||
1330 | * We need to prevent that we loop forever in the hrtimer | 1339 | * We need to prevent that we loop forever in the hrtimer |
1331 | * interrupt routine. We give it 3 attempts to avoid | 1340 | * interrupt routine. We give it 3 attempts to avoid |
1332 | * overreacting on some spurious event. | 1341 | * overreacting on some spurious event. |
1342 | * | ||
1343 | * Acquire base lock for updating the offsets and retrieving | ||
1344 | * the current time. | ||
1333 | */ | 1345 | */ |
1334 | now = ktime_get(); | 1346 | raw_spin_lock(&cpu_base->lock); |
1347 | now = hrtimer_update_base(cpu_base); | ||
1335 | cpu_base->nr_retries++; | 1348 | cpu_base->nr_retries++; |
1336 | if (++retries < 3) | 1349 | if (++retries < 3) |
1337 | goto retry; | 1350 | goto retry; |
@@ -1343,6 +1356,7 @@ retry: | |||
1343 | */ | 1356 | */ |
1344 | cpu_base->nr_hangs++; | 1357 | cpu_base->nr_hangs++; |
1345 | cpu_base->hang_detected = 1; | 1358 | cpu_base->hang_detected = 1; |
1359 | raw_spin_unlock(&cpu_base->lock); | ||
1346 | delta = ktime_sub(now, entry_time); | 1360 | delta = ktime_sub(now, entry_time); |
1347 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | 1361 | if (delta.tv64 > cpu_base->max_hang_time.tv64) |
1348 | cpu_base->max_hang_time = delta; | 1362 | cpu_base->max_hang_time = delta; |
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) | |||
1395 | 1409 | ||
1396 | static void run_hrtimer_softirq(struct softirq_action *h) | 1410 | static void run_hrtimer_softirq(struct softirq_action *h) |
1397 | { | 1411 | { |
1412 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1413 | |||
1414 | if (cpu_base->clock_was_set) { | ||
1415 | cpu_base->clock_was_set = 0; | ||
1416 | clock_was_set(); | ||
1417 | } | ||
1418 | |||
1398 | hrtimer_peek_ahead_timers(); | 1419 | hrtimer_peek_ahead_timers(); |
1399 | } | 1420 | } |
1400 | 1421 | ||
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@ | |||
27 | #define PANIC_TIMER_STEP 100 | 27 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 28 | #define PANIC_BLINK_SPD 18 |
29 | 29 | ||
30 | int panic_on_oops; | 30 | int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; |
31 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...) | |||
108 | */ | 108 | */ |
109 | crash_kexec(NULL); | 109 | crash_kexec(NULL); |
110 | 110 | ||
111 | kmsg_dump(KMSG_DUMP_PANIC); | ||
112 | |||
113 | /* | 111 | /* |
114 | * Note smp_send_stop is the usual smp shutdown function, which | 112 | * Note smp_send_stop is the usual smp shutdown function, which |
115 | * unfortunately means it may not be hardened to work in a panic | 113 | * unfortunately means it may not be hardened to work in a panic |
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...) | |||
117 | */ | 115 | */ |
118 | smp_send_stop(); | 116 | smp_send_stop(); |
119 | 117 | ||
118 | kmsg_dump(KMSG_DUMP_PANIC); | ||
119 | |||
120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
121 | 121 | ||
122 | bust_spinlocks(0); | 122 | bust_spinlocks(0); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
184 | } | 184 | } |
185 | read_unlock(&tasklist_lock); | 185 | read_unlock(&tasklist_lock); |
186 | 186 | ||
187 | /* Firstly reap the EXIT_ZOMBIE children we may have. */ | ||
187 | do { | 188 | do { |
188 | clear_thread_flag(TIF_SIGPENDING); | 189 | clear_thread_flag(TIF_SIGPENDING); |
189 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 190 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
190 | } while (rc != -ECHILD); | 191 | } while (rc != -ECHILD); |
191 | 192 | ||
193 | /* | ||
194 | * sys_wait4() above can't reap the TASK_DEAD children. | ||
195 | * Make sure they all go away, see __unhash_process(). | ||
196 | */ | ||
197 | for (;;) { | ||
198 | bool need_wait = false; | ||
199 | |||
200 | read_lock(&tasklist_lock); | ||
201 | if (!list_empty(¤t->children)) { | ||
202 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
203 | need_wait = true; | ||
204 | } | ||
205 | read_unlock(&tasklist_lock); | ||
206 | |||
207 | if (!need_wait) | ||
208 | break; | ||
209 | schedule(); | ||
210 | } | ||
211 | |||
192 | if (pid_ns->reboot) | 212 | if (pid_ns->reboot) |
193 | current->signal->group_exit_code = pid_ns->reboot; | 213 | current->signal->group_exit_code = pid_ns->reboot; |
194 | 214 | ||
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..177fa49357a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,21 @@ static int console_may_schedule; | |||
193 | * separated by ',', and find the message after the ';' character. | 193 | * separated by ',', and find the message after the ';' character. |
194 | */ | 194 | */ |
195 | 195 | ||
196 | enum log_flags { | ||
197 | LOG_NOCONS = 1, /* already flushed, do not print to console */ | ||
198 | LOG_NEWLINE = 2, /* text ended with a newline */ | ||
199 | LOG_PREFIX = 4, /* text started with a prefix */ | ||
200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | ||
201 | }; | ||
202 | |||
196 | struct log { | 203 | struct log { |
197 | u64 ts_nsec; /* timestamp in nanoseconds */ | 204 | u64 ts_nsec; /* timestamp in nanoseconds */ |
198 | u16 len; /* length of entire record */ | 205 | u16 len; /* length of entire record */ |
199 | u16 text_len; /* length of text buffer */ | 206 | u16 text_len; /* length of text buffer */ |
200 | u16 dict_len; /* length of dictionary buffer */ | 207 | u16 dict_len; /* length of dictionary buffer */ |
201 | u16 level; /* syslog level + facility */ | 208 | u8 facility; /* syslog facility */ |
209 | u8 flags:5; /* internal record flags */ | ||
210 | u8 level:3; /* syslog level */ | ||
202 | }; | 211 | }; |
203 | 212 | ||
204 | /* | 213 | /* |
@@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); | |||
210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | 219 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ |
211 | static u64 syslog_seq; | 220 | static u64 syslog_seq; |
212 | static u32 syslog_idx; | 221 | static u32 syslog_idx; |
222 | static enum log_flags syslog_prev; | ||
223 | static size_t syslog_partial; | ||
213 | 224 | ||
214 | /* index and sequence number of the first record stored in the buffer */ | 225 | /* index and sequence number of the first record stored in the buffer */ |
215 | static u64 log_first_seq; | 226 | static u64 log_first_seq; |
@@ -227,10 +238,10 @@ static u32 clear_idx; | |||
227 | #define LOG_LINE_MAX 1024 | 238 | #define LOG_LINE_MAX 1024 |
228 | 239 | ||
229 | /* record buffer */ | 240 | /* record buffer */ |
230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 241 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
231 | #define LOG_ALIGN 4 | 242 | #define LOG_ALIGN 4 |
232 | #else | 243 | #else |
233 | #define LOG_ALIGN 8 | 244 | #define LOG_ALIGN __alignof__(struct log) |
234 | #endif | 245 | #endif |
235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 246 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | 247 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); |
@@ -286,6 +297,7 @@ static u32 log_next(u32 idx) | |||
286 | 297 | ||
287 | /* insert record into the buffer, discard old ones, update heads */ | 298 | /* insert record into the buffer, discard old ones, update heads */ |
288 | static void log_store(int facility, int level, | 299 | static void log_store(int facility, int level, |
300 | enum log_flags flags, u64 ts_nsec, | ||
289 | const char *dict, u16 dict_len, | 301 | const char *dict, u16 dict_len, |
290 | const char *text, u16 text_len) | 302 | const char *text, u16 text_len) |
291 | { | 303 | { |
@@ -329,8 +341,13 @@ static void log_store(int facility, int level, | |||
329 | msg->text_len = text_len; | 341 | msg->text_len = text_len; |
330 | memcpy(log_dict(msg), dict, dict_len); | 342 | memcpy(log_dict(msg), dict, dict_len); |
331 | msg->dict_len = dict_len; | 343 | msg->dict_len = dict_len; |
332 | msg->level = (facility << 3) | (level & 7); | 344 | msg->facility = facility; |
333 | msg->ts_nsec = local_clock(); | 345 | msg->level = level & 7; |
346 | msg->flags = flags & 0x1f; | ||
347 | if (ts_nsec > 0) | ||
348 | msg->ts_nsec = ts_nsec; | ||
349 | else | ||
350 | msg->ts_nsec = local_clock(); | ||
334 | memset(log_dict(msg) + dict_len, 0, pad_len); | 351 | memset(log_dict(msg) + dict_len, 0, pad_len); |
335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | 352 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; |
336 | 353 | ||
@@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
414 | if (!user) | 431 | if (!user) |
415 | return -EBADF; | 432 | return -EBADF; |
416 | 433 | ||
417 | mutex_lock(&user->lock); | 434 | ret = mutex_lock_interruptible(&user->lock); |
418 | raw_spin_lock(&logbuf_lock); | 435 | if (ret) |
436 | return ret; | ||
437 | raw_spin_lock_irq(&logbuf_lock); | ||
419 | while (user->seq == log_next_seq) { | 438 | while (user->seq == log_next_seq) { |
420 | if (file->f_flags & O_NONBLOCK) { | 439 | if (file->f_flags & O_NONBLOCK) { |
421 | ret = -EAGAIN; | 440 | ret = -EAGAIN; |
422 | raw_spin_unlock(&logbuf_lock); | 441 | raw_spin_unlock_irq(&logbuf_lock); |
423 | goto out; | 442 | goto out; |
424 | } | 443 | } |
425 | 444 | ||
426 | raw_spin_unlock(&logbuf_lock); | 445 | raw_spin_unlock_irq(&logbuf_lock); |
427 | ret = wait_event_interruptible(log_wait, | 446 | ret = wait_event_interruptible(log_wait, |
428 | user->seq != log_next_seq); | 447 | user->seq != log_next_seq); |
429 | if (ret) | 448 | if (ret) |
430 | goto out; | 449 | goto out; |
431 | raw_spin_lock(&logbuf_lock); | 450 | raw_spin_lock_irq(&logbuf_lock); |
432 | } | 451 | } |
433 | 452 | ||
434 | if (user->seq < log_first_seq) { | 453 | if (user->seq < log_first_seq) { |
@@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
436 | user->idx = log_first_idx; | 455 | user->idx = log_first_idx; |
437 | user->seq = log_first_seq; | 456 | user->seq = log_first_seq; |
438 | ret = -EPIPE; | 457 | ret = -EPIPE; |
439 | raw_spin_unlock(&logbuf_lock); | 458 | raw_spin_unlock_irq(&logbuf_lock); |
440 | goto out; | 459 | goto out; |
441 | } | 460 | } |
442 | 461 | ||
@@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
444 | ts_usec = msg->ts_nsec; | 463 | ts_usec = msg->ts_nsec; |
445 | do_div(ts_usec, 1000); | 464 | do_div(ts_usec, 1000); |
446 | len = sprintf(user->buf, "%u,%llu,%llu;", | 465 | len = sprintf(user->buf, "%u,%llu,%llu;", |
447 | msg->level, user->seq, ts_usec); | 466 | (msg->facility << 3) | msg->level, user->seq, ts_usec); |
448 | 467 | ||
449 | /* escape non-printable characters */ | 468 | /* escape non-printable characters */ |
450 | for (i = 0; i < msg->text_len; i++) { | 469 | for (i = 0; i < msg->text_len; i++) { |
451 | unsigned char c = log_text(msg)[i]; | 470 | unsigned char c = log_text(msg)[i]; |
452 | 471 | ||
453 | if (c < ' ' || c >= 128) | 472 | if (c < ' ' || c >= 127 || c == '\\') |
454 | len += sprintf(user->buf + len, "\\x%02x", c); | 473 | len += sprintf(user->buf + len, "\\x%02x", c); |
455 | else | 474 | else |
456 | user->buf[len++] = c; | 475 | user->buf[len++] = c; |
@@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
474 | continue; | 493 | continue; |
475 | } | 494 | } |
476 | 495 | ||
477 | if (c < ' ' || c >= 128) { | 496 | if (c < ' ' || c >= 127 || c == '\\') { |
478 | len += sprintf(user->buf + len, "\\x%02x", c); | 497 | len += sprintf(user->buf + len, "\\x%02x", c); |
479 | continue; | 498 | continue; |
480 | } | 499 | } |
@@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
486 | 505 | ||
487 | user->idx = log_next(user->idx); | 506 | user->idx = log_next(user->idx); |
488 | user->seq++; | 507 | user->seq++; |
489 | raw_spin_unlock(&logbuf_lock); | 508 | raw_spin_unlock_irq(&logbuf_lock); |
490 | 509 | ||
491 | if (len > count) { | 510 | if (len > count) { |
492 | ret = -EINVAL; | 511 | ret = -EINVAL; |
@@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
513 | if (offset) | 532 | if (offset) |
514 | return -ESPIPE; | 533 | return -ESPIPE; |
515 | 534 | ||
516 | raw_spin_lock(&logbuf_lock); | 535 | raw_spin_lock_irq(&logbuf_lock); |
517 | switch (whence) { | 536 | switch (whence) { |
518 | case SEEK_SET: | 537 | case SEEK_SET: |
519 | /* the first record */ | 538 | /* the first record */ |
@@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
537 | default: | 556 | default: |
538 | ret = -EINVAL; | 557 | ret = -EINVAL; |
539 | } | 558 | } |
540 | raw_spin_unlock(&logbuf_lock); | 559 | raw_spin_unlock_irq(&logbuf_lock); |
541 | return ret; | 560 | return ret; |
542 | } | 561 | } |
543 | 562 | ||
@@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
551 | 570 | ||
552 | poll_wait(file, &log_wait, wait); | 571 | poll_wait(file, &log_wait, wait); |
553 | 572 | ||
554 | raw_spin_lock(&logbuf_lock); | 573 | raw_spin_lock_irq(&logbuf_lock); |
555 | if (user->seq < log_next_seq) { | 574 | if (user->seq < log_next_seq) { |
556 | /* return error when data has vanished underneath us */ | 575 | /* return error when data has vanished underneath us */ |
557 | if (user->seq < log_first_seq) | 576 | if (user->seq < log_first_seq) |
558 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | 577 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; |
559 | ret = POLLIN|POLLRDNORM; | 578 | ret = POLLIN|POLLRDNORM; |
560 | } | 579 | } |
561 | raw_spin_unlock(&logbuf_lock); | 580 | raw_spin_unlock_irq(&logbuf_lock); |
562 | 581 | ||
563 | return ret; | 582 | return ret; |
564 | } | 583 | } |
@@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) | |||
582 | 601 | ||
583 | mutex_init(&user->lock); | 602 | mutex_init(&user->lock); |
584 | 603 | ||
585 | raw_spin_lock(&logbuf_lock); | 604 | raw_spin_lock_irq(&logbuf_lock); |
586 | user->idx = log_first_idx; | 605 | user->idx = log_first_idx; |
587 | user->seq = log_first_seq; | 606 | user->seq = log_first_seq; |
588 | raw_spin_unlock(&logbuf_lock); | 607 | raw_spin_unlock_irq(&logbuf_lock); |
589 | 608 | ||
590 | file->private_data = user; | 609 | file->private_data = user; |
591 | return 0; | 610 | return 0; |
@@ -785,44 +804,64 @@ static bool printk_time; | |||
785 | #endif | 804 | #endif |
786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 805 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
787 | 806 | ||
807 | static size_t print_time(u64 ts, char *buf) | ||
808 | { | ||
809 | unsigned long rem_nsec; | ||
810 | |||
811 | if (!printk_time) | ||
812 | return 0; | ||
813 | |||
814 | if (!buf) | ||
815 | return 15; | ||
816 | |||
817 | rem_nsec = do_div(ts, 1000000000); | ||
818 | return sprintf(buf, "[%5lu.%06lu] ", | ||
819 | (unsigned long)ts, rem_nsec / 1000); | ||
820 | } | ||
821 | |||
788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | 822 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) |
789 | { | 823 | { |
790 | size_t len = 0; | 824 | size_t len = 0; |
825 | unsigned int prefix = (msg->facility << 3) | msg->level; | ||
791 | 826 | ||
792 | if (syslog) { | 827 | if (syslog) { |
793 | if (buf) { | 828 | if (buf) { |
794 | len += sprintf(buf, "<%u>", msg->level); | 829 | len += sprintf(buf, "<%u>", prefix); |
795 | } else { | 830 | } else { |
796 | len += 3; | 831 | len += 3; |
797 | if (msg->level > 9) | 832 | if (prefix > 999) |
798 | len++; | 833 | len += 3; |
799 | if (msg->level > 99) | 834 | else if (prefix > 99) |
835 | len += 2; | ||
836 | else if (prefix > 9) | ||
800 | len++; | 837 | len++; |
801 | } | 838 | } |
802 | } | 839 | } |
803 | 840 | ||
804 | if (printk_time) { | 841 | len += print_time(msg->ts_nsec, buf ? buf + len : NULL); |
805 | if (buf) { | ||
806 | unsigned long long ts = msg->ts_nsec; | ||
807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
808 | |||
809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
810 | (unsigned long) ts, rem_nsec / 1000); | ||
811 | } else { | ||
812 | len += 15; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return len; | 842 | return len; |
817 | } | 843 | } |
818 | 844 | ||
819 | static size_t msg_print_text(const struct log *msg, bool syslog, | 845 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
820 | char *buf, size_t size) | 846 | bool syslog, char *buf, size_t size) |
821 | { | 847 | { |
822 | const char *text = log_text(msg); | 848 | const char *text = log_text(msg); |
823 | size_t text_size = msg->text_len; | 849 | size_t text_size = msg->text_len; |
850 | bool prefix = true; | ||
851 | bool newline = true; | ||
824 | size_t len = 0; | 852 | size_t len = 0; |
825 | 853 | ||
854 | if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) | ||
855 | prefix = false; | ||
856 | |||
857 | if (msg->flags & LOG_CONT) { | ||
858 | if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) | ||
859 | prefix = false; | ||
860 | |||
861 | if (!(msg->flags & LOG_NEWLINE)) | ||
862 | newline = false; | ||
863 | } | ||
864 | |||
826 | do { | 865 | do { |
827 | const char *next = memchr(text, '\n', text_size); | 866 | const char *next = memchr(text, '\n', text_size); |
828 | size_t text_len; | 867 | size_t text_len; |
@@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, | |||
840 | text_len + 1>= size - len) | 879 | text_len + 1>= size - len) |
841 | break; | 880 | break; |
842 | 881 | ||
843 | len += print_prefix(msg, syslog, buf + len); | 882 | if (prefix) |
883 | len += print_prefix(msg, syslog, buf + len); | ||
844 | memcpy(buf + len, text, text_len); | 884 | memcpy(buf + len, text, text_len); |
845 | len += text_len; | 885 | len += text_len; |
846 | buf[len++] = '\n'; | 886 | if (next || newline) |
887 | buf[len++] = '\n'; | ||
847 | } else { | 888 | } else { |
848 | /* SYSLOG_ACTION_* buffer size only calculation */ | 889 | /* SYSLOG_ACTION_* buffer size only calculation */ |
849 | len += print_prefix(msg, syslog, NULL); | 890 | if (prefix) |
850 | len += text_len + 1; | 891 | len += print_prefix(msg, syslog, NULL); |
892 | len += text_len; | ||
893 | if (next || newline) | ||
894 | len++; | ||
851 | } | 895 | } |
852 | 896 | ||
897 | prefix = true; | ||
853 | text = next; | 898 | text = next; |
854 | } while (text); | 899 | } while (text); |
855 | 900 | ||
@@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size) | |||
860 | { | 905 | { |
861 | char *text; | 906 | char *text; |
862 | struct log *msg; | 907 | struct log *msg; |
863 | int len; | 908 | int len = 0; |
864 | 909 | ||
865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | 910 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); |
866 | if (!text) | 911 | if (!text) |
867 | return -ENOMEM; | 912 | return -ENOMEM; |
868 | 913 | ||
869 | raw_spin_lock_irq(&logbuf_lock); | 914 | while (size > 0) { |
870 | if (syslog_seq < log_first_seq) { | 915 | size_t n; |
871 | /* messages are gone, move to first one */ | 916 | size_t skip; |
872 | syslog_seq = log_first_seq; | 917 | |
873 | syslog_idx = log_first_idx; | 918 | raw_spin_lock_irq(&logbuf_lock); |
874 | } | 919 | if (syslog_seq < log_first_seq) { |
875 | msg = log_from_idx(syslog_idx); | 920 | /* messages are gone, move to first one */ |
876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | 921 | syslog_seq = log_first_seq; |
877 | syslog_idx = log_next(syslog_idx); | 922 | syslog_idx = log_first_idx; |
878 | syslog_seq++; | 923 | syslog_prev = 0; |
879 | raw_spin_unlock_irq(&logbuf_lock); | 924 | syslog_partial = 0; |
925 | } | ||
926 | if (syslog_seq == log_next_seq) { | ||
927 | raw_spin_unlock_irq(&logbuf_lock); | ||
928 | break; | ||
929 | } | ||
930 | |||
931 | skip = syslog_partial; | ||
932 | msg = log_from_idx(syslog_idx); | ||
933 | n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); | ||
934 | if (n - syslog_partial <= size) { | ||
935 | /* message fits into buffer, move forward */ | ||
936 | syslog_idx = log_next(syslog_idx); | ||
937 | syslog_seq++; | ||
938 | syslog_prev = msg->flags; | ||
939 | n -= syslog_partial; | ||
940 | syslog_partial = 0; | ||
941 | } else if (!len){ | ||
942 | /* partial read(), remember position */ | ||
943 | n = size; | ||
944 | syslog_partial += n; | ||
945 | } else | ||
946 | n = 0; | ||
947 | raw_spin_unlock_irq(&logbuf_lock); | ||
948 | |||
949 | if (!n) | ||
950 | break; | ||
951 | |||
952 | if (copy_to_user(buf, text + skip, n)) { | ||
953 | if (!len) | ||
954 | len = -EFAULT; | ||
955 | break; | ||
956 | } | ||
880 | 957 | ||
881 | if (len > 0 && copy_to_user(buf, text, len)) | 958 | len += n; |
882 | len = -EFAULT; | 959 | size -= n; |
960 | buf += n; | ||
961 | } | ||
883 | 962 | ||
884 | kfree(text); | 963 | kfree(text); |
885 | return len; | 964 | return len; |
@@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
899 | u64 next_seq; | 978 | u64 next_seq; |
900 | u64 seq; | 979 | u64 seq; |
901 | u32 idx; | 980 | u32 idx; |
981 | enum log_flags prev; | ||
902 | 982 | ||
903 | if (clear_seq < log_first_seq) { | 983 | if (clear_seq < log_first_seq) { |
904 | /* messages are gone, move to first available one */ | 984 | /* messages are gone, move to first available one */ |
@@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
909 | /* | 989 | /* |
910 | * Find first record that fits, including all following records, | 990 | * Find first record that fits, including all following records, |
911 | * into the user-provided buffer for this dump. | 991 | * into the user-provided buffer for this dump. |
912 | */ | 992 | */ |
913 | seq = clear_seq; | 993 | seq = clear_seq; |
914 | idx = clear_idx; | 994 | idx = clear_idx; |
995 | prev = 0; | ||
915 | while (seq < log_next_seq) { | 996 | while (seq < log_next_seq) { |
916 | struct log *msg = log_from_idx(idx); | 997 | struct log *msg = log_from_idx(idx); |
917 | 998 | ||
918 | len += msg_print_text(msg, true, NULL, 0); | 999 | len += msg_print_text(msg, prev, true, NULL, 0); |
919 | idx = log_next(idx); | 1000 | idx = log_next(idx); |
920 | seq++; | 1001 | seq++; |
921 | } | 1002 | } |
1003 | |||
1004 | /* move first record forward until length fits into the buffer */ | ||
922 | seq = clear_seq; | 1005 | seq = clear_seq; |
923 | idx = clear_idx; | 1006 | idx = clear_idx; |
1007 | prev = 0; | ||
924 | while (len > size && seq < log_next_seq) { | 1008 | while (len > size && seq < log_next_seq) { |
925 | struct log *msg = log_from_idx(idx); | 1009 | struct log *msg = log_from_idx(idx); |
926 | 1010 | ||
927 | len -= msg_print_text(msg, true, NULL, 0); | 1011 | len -= msg_print_text(msg, prev, true, NULL, 0); |
928 | idx = log_next(idx); | 1012 | idx = log_next(idx); |
929 | seq++; | 1013 | seq++; |
930 | } | 1014 | } |
931 | 1015 | ||
932 | /* last message in this dump */ | 1016 | /* last message fitting into this dump */ |
933 | next_seq = log_next_seq; | 1017 | next_seq = log_next_seq; |
934 | 1018 | ||
935 | len = 0; | 1019 | len = 0; |
1020 | prev = 0; | ||
936 | while (len >= 0 && seq < next_seq) { | 1021 | while (len >= 0 && seq < next_seq) { |
937 | struct log *msg = log_from_idx(idx); | 1022 | struct log *msg = log_from_idx(idx); |
938 | int textlen; | 1023 | int textlen; |
939 | 1024 | ||
940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | 1025 | textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); |
941 | if (textlen < 0) { | 1026 | if (textlen < 0) { |
942 | len = textlen; | 1027 | len = textlen; |
943 | break; | 1028 | break; |
944 | } | 1029 | } |
945 | idx = log_next(idx); | 1030 | idx = log_next(idx); |
946 | seq++; | 1031 | seq++; |
1032 | prev = msg->flags; | ||
947 | 1033 | ||
948 | raw_spin_unlock_irq(&logbuf_lock); | 1034 | raw_spin_unlock_irq(&logbuf_lock); |
949 | if (copy_to_user(buf + len, text, textlen)) | 1035 | if (copy_to_user(buf + len, text, textlen)) |
@@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
956 | /* messages are gone, move to next one */ | 1042 | /* messages are gone, move to next one */ |
957 | seq = log_first_seq; | 1043 | seq = log_first_seq; |
958 | idx = log_first_idx; | 1044 | idx = log_first_idx; |
1045 | prev = 0; | ||
959 | } | 1046 | } |
960 | } | 1047 | } |
961 | } | 1048 | } |
@@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1027 | /* Clear ring buffer */ | 1114 | /* Clear ring buffer */ |
1028 | case SYSLOG_ACTION_CLEAR: | 1115 | case SYSLOG_ACTION_CLEAR: |
1029 | syslog_print_all(NULL, 0, true); | 1116 | syslog_print_all(NULL, 0, true); |
1117 | break; | ||
1030 | /* Disable logging to console */ | 1118 | /* Disable logging to console */ |
1031 | case SYSLOG_ACTION_CONSOLE_OFF: | 1119 | case SYSLOG_ACTION_CONSOLE_OFF: |
1032 | if (saved_console_loglevel == -1) | 1120 | if (saved_console_loglevel == -1) |
@@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1059 | /* messages are gone, move to first one */ | 1147 | /* messages are gone, move to first one */ |
1060 | syslog_seq = log_first_seq; | 1148 | syslog_seq = log_first_seq; |
1061 | syslog_idx = log_first_idx; | 1149 | syslog_idx = log_first_idx; |
1150 | syslog_prev = 0; | ||
1151 | syslog_partial = 0; | ||
1062 | } | 1152 | } |
1063 | if (from_file) { | 1153 | if (from_file) { |
1064 | /* | 1154 | /* |
@@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1068 | */ | 1158 | */ |
1069 | error = log_next_idx - syslog_idx; | 1159 | error = log_next_idx - syslog_idx; |
1070 | } else { | 1160 | } else { |
1071 | u64 seq; | 1161 | u64 seq = syslog_seq; |
1072 | u32 idx; | 1162 | u32 idx = syslog_idx; |
1163 | enum log_flags prev = syslog_prev; | ||
1073 | 1164 | ||
1074 | error = 0; | 1165 | error = 0; |
1075 | seq = syslog_seq; | ||
1076 | idx = syslog_idx; | ||
1077 | while (seq < log_next_seq) { | 1166 | while (seq < log_next_seq) { |
1078 | struct log *msg = log_from_idx(idx); | 1167 | struct log *msg = log_from_idx(idx); |
1079 | 1168 | ||
1080 | error += msg_print_text(msg, true, NULL, 0); | 1169 | error += msg_print_text(msg, prev, true, NULL, 0); |
1081 | idx = log_next(idx); | 1170 | idx = log_next(idx); |
1082 | seq++; | 1171 | seq++; |
1172 | prev = msg->flags; | ||
1083 | } | 1173 | } |
1174 | error -= syslog_partial; | ||
1084 | } | 1175 | } |
1085 | raw_spin_unlock_irq(&logbuf_lock); | 1176 | raw_spin_unlock_irq(&logbuf_lock); |
1086 | break; | 1177 | break; |
@@ -1259,22 +1350,98 @@ static inline void printk_delay(void) | |||
1259 | } | 1350 | } |
1260 | } | 1351 | } |
1261 | 1352 | ||
1353 | /* | ||
1354 | * Continuation lines are buffered, and not committed to the record buffer | ||
1355 | * until the line is complete, or a race forces it. The line fragments | ||
1356 | * though, are printed immediately to the consoles to ensure everything has | ||
1357 | * reached the console in case of a kernel crash. | ||
1358 | */ | ||
1359 | static struct cont { | ||
1360 | char buf[LOG_LINE_MAX]; | ||
1361 | size_t len; /* length == 0 means unused buffer */ | ||
1362 | size_t cons; /* bytes written to console */ | ||
1363 | struct task_struct *owner; /* task of first print*/ | ||
1364 | u64 ts_nsec; /* time of first print */ | ||
1365 | u8 level; /* log level of first message */ | ||
1366 | u8 facility; /* log level of first message */ | ||
1367 | bool flushed:1; /* buffer sealed and committed */ | ||
1368 | } cont; | ||
1369 | |||
1370 | static void cont_flush(void) | ||
1371 | { | ||
1372 | if (cont.flushed) | ||
1373 | return; | ||
1374 | if (cont.len == 0) | ||
1375 | return; | ||
1376 | |||
1377 | log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, | ||
1378 | NULL, 0, cont.buf, cont.len); | ||
1379 | |||
1380 | cont.flushed = true; | ||
1381 | } | ||
1382 | |||
1383 | static bool cont_add(int facility, int level, const char *text, size_t len) | ||
1384 | { | ||
1385 | if (cont.len && cont.flushed) | ||
1386 | return false; | ||
1387 | |||
1388 | if (cont.len + len > sizeof(cont.buf)) { | ||
1389 | cont_flush(); | ||
1390 | return false; | ||
1391 | } | ||
1392 | |||
1393 | if (!cont.len) { | ||
1394 | cont.facility = facility; | ||
1395 | cont.level = level; | ||
1396 | cont.owner = current; | ||
1397 | cont.ts_nsec = local_clock(); | ||
1398 | cont.cons = 0; | ||
1399 | cont.flushed = false; | ||
1400 | } | ||
1401 | |||
1402 | memcpy(cont.buf + cont.len, text, len); | ||
1403 | cont.len += len; | ||
1404 | return true; | ||
1405 | } | ||
1406 | |||
1407 | static size_t cont_print_text(char *text, size_t size) | ||
1408 | { | ||
1409 | size_t textlen = 0; | ||
1410 | size_t len; | ||
1411 | |||
1412 | if (cont.cons == 0) { | ||
1413 | textlen += print_time(cont.ts_nsec, text); | ||
1414 | size -= textlen; | ||
1415 | } | ||
1416 | |||
1417 | len = cont.len - cont.cons; | ||
1418 | if (len > 0) { | ||
1419 | if (len+1 > size) | ||
1420 | len = size-1; | ||
1421 | memcpy(text + textlen, cont.buf + cont.cons, len); | ||
1422 | textlen += len; | ||
1423 | cont.cons = cont.len; | ||
1424 | } | ||
1425 | |||
1426 | if (cont.flushed) { | ||
1427 | text[textlen++] = '\n'; | ||
1428 | /* got everything, release buffer */ | ||
1429 | cont.len = 0; | ||
1430 | } | ||
1431 | return textlen; | ||
1432 | } | ||
1433 | |||
1262 | asmlinkage int vprintk_emit(int facility, int level, | 1434 | asmlinkage int vprintk_emit(int facility, int level, |
1263 | const char *dict, size_t dictlen, | 1435 | const char *dict, size_t dictlen, |
1264 | const char *fmt, va_list args) | 1436 | const char *fmt, va_list args) |
1265 | { | 1437 | { |
1266 | static int recursion_bug; | 1438 | static int recursion_bug; |
1267 | static char cont_buf[LOG_LINE_MAX]; | ||
1268 | static size_t cont_len; | ||
1269 | static int cont_level; | ||
1270 | static struct task_struct *cont_task; | ||
1271 | static char textbuf[LOG_LINE_MAX]; | 1439 | static char textbuf[LOG_LINE_MAX]; |
1272 | char *text = textbuf; | 1440 | char *text = textbuf; |
1273 | size_t text_len; | 1441 | size_t text_len; |
1442 | enum log_flags lflags = 0; | ||
1274 | unsigned long flags; | 1443 | unsigned long flags; |
1275 | int this_cpu; | 1444 | int this_cpu; |
1276 | bool newline = false; | ||
1277 | bool prefix = false; | ||
1278 | int printed_len = 0; | 1445 | int printed_len = 0; |
1279 | 1446 | ||
1280 | boot_delay_msec(); | 1447 | boot_delay_msec(); |
@@ -1313,7 +1480,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1313 | recursion_bug = 0; | 1480 | recursion_bug = 0; |
1314 | printed_len += strlen(recursion_msg); | 1481 | printed_len += strlen(recursion_msg); |
1315 | /* emit KERN_CRIT message */ | 1482 | /* emit KERN_CRIT message */ |
1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | 1483 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
1484 | NULL, 0, recursion_msg, printed_len); | ||
1317 | } | 1485 | } |
1318 | 1486 | ||
1319 | /* | 1487 | /* |
@@ -1325,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1325 | /* mark and strip a trailing newline */ | 1493 | /* mark and strip a trailing newline */ |
1326 | if (text_len && text[text_len-1] == '\n') { | 1494 | if (text_len && text[text_len-1] == '\n') { |
1327 | text_len--; | 1495 | text_len--; |
1328 | newline = true; | 1496 | lflags |= LOG_NEWLINE; |
1329 | } | 1497 | } |
1330 | 1498 | ||
1331 | /* strip syslog prefix and extract log level or control flags */ | 1499 | /* strip syslog prefix and extract log level or control flags */ |
@@ -1335,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1335 | if (level == -1) | 1503 | if (level == -1) |
1336 | level = text[1] - '0'; | 1504 | level = text[1] - '0'; |
1337 | case 'd': /* KERN_DEFAULT */ | 1505 | case 'd': /* KERN_DEFAULT */ |
1338 | prefix = true; | 1506 | lflags |= LOG_PREFIX; |
1339 | case 'c': /* KERN_CONT */ | 1507 | case 'c': /* KERN_CONT */ |
1340 | text += 3; | 1508 | text += 3; |
1341 | text_len -= 3; | 1509 | text_len -= 3; |
@@ -1345,61 +1513,41 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1345 | if (level == -1) | 1513 | if (level == -1) |
1346 | level = default_message_loglevel; | 1514 | level = default_message_loglevel; |
1347 | 1515 | ||
1348 | if (dict) { | 1516 | if (dict) |
1349 | prefix = true; | 1517 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
1350 | newline = true; | ||
1351 | } | ||
1352 | 1518 | ||
1353 | if (!newline) { | 1519 | if (!(lflags & LOG_NEWLINE)) { |
1354 | if (cont_len && (prefix || cont_task != current)) { | 1520 | /* |
1355 | /* | 1521 | * Flush the conflicting buffer. An earlier newline was missing, |
1356 | * Flush earlier buffer, which is either from a | 1522 | * or another task also prints continuation lines. |
1357 | * different thread, or when we got a new prefix. | 1523 | */ |
1358 | */ | 1524 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) |
1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | 1525 | cont_flush(); |
1360 | cont_len = 0; | ||
1361 | } | ||
1362 | |||
1363 | if (!cont_len) { | ||
1364 | cont_level = level; | ||
1365 | cont_task = current; | ||
1366 | } | ||
1367 | 1526 | ||
1368 | /* buffer or append to earlier buffer from the same thread */ | 1527 | /* buffer line if possible, otherwise store it right away */ |
1369 | if (cont_len + text_len > sizeof(cont_buf)) | 1528 | if (!cont_add(facility, level, text, text_len)) |
1370 | text_len = sizeof(cont_buf) - cont_len; | 1529 | log_store(facility, level, lflags | LOG_CONT, 0, |
1371 | memcpy(cont_buf + cont_len, text, text_len); | 1530 | dict, dictlen, text, text_len); |
1372 | cont_len += text_len; | ||
1373 | } else { | 1531 | } else { |
1374 | if (cont_len && cont_task == current) { | 1532 | bool stored = false; |
1375 | if (prefix) { | ||
1376 | /* | ||
1377 | * New prefix from the same thread; flush. We | ||
1378 | * either got no earlier newline, or we race | ||
1379 | * with an interrupt. | ||
1380 | */ | ||
1381 | log_store(facility, cont_level, | ||
1382 | NULL, 0, cont_buf, cont_len); | ||
1383 | cont_len = 0; | ||
1384 | } | ||
1385 | 1533 | ||
1386 | /* append to the earlier buffer and flush */ | 1534 | /* |
1387 | if (cont_len + text_len > sizeof(cont_buf)) | 1535 | * If an earlier newline was missing and it was the same task, |
1388 | text_len = sizeof(cont_buf) - cont_len; | 1536 | * either merge it with the current buffer and flush, or if |
1389 | memcpy(cont_buf + cont_len, text, text_len); | 1537 | * there was a race with interrupts (prefix == true) then just |
1390 | cont_len += text_len; | 1538 | * flush it out and store this line separately. |
1391 | log_store(facility, cont_level, | 1539 | */ |
1392 | NULL, 0, cont_buf, cont_len); | 1540 | if (cont.len && cont.owner == current) { |
1393 | cont_len = 0; | 1541 | if (!(lflags & LOG_PREFIX)) |
1394 | cont_task = NULL; | 1542 | stored = cont_add(facility, level, text, text_len); |
1395 | printed_len = cont_len; | 1543 | cont_flush(); |
1396 | } else { | ||
1397 | /* ordinary single and terminated line */ | ||
1398 | log_store(facility, level, | ||
1399 | dict, dictlen, text, text_len); | ||
1400 | printed_len = text_len; | ||
1401 | } | 1544 | } |
1545 | |||
1546 | if (!stored) | ||
1547 | log_store(facility, level, lflags, 0, | ||
1548 | dict, dictlen, text, text_len); | ||
1402 | } | 1549 | } |
1550 | printed_len += text_len; | ||
1403 | 1551 | ||
1404 | /* | 1552 | /* |
1405 | * Try to acquire and then immediately release the console semaphore. | 1553 | * Try to acquire and then immediately release the console semaphore. |
@@ -1486,11 +1634,18 @@ EXPORT_SYMBOL(printk); | |||
1486 | #else | 1634 | #else |
1487 | 1635 | ||
1488 | #define LOG_LINE_MAX 0 | 1636 | #define LOG_LINE_MAX 0 |
1637 | static struct cont { | ||
1638 | size_t len; | ||
1639 | size_t cons; | ||
1640 | u8 level; | ||
1641 | bool flushed:1; | ||
1642 | } cont; | ||
1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | 1643 | static struct log *log_from_idx(u32 idx) { return NULL; } |
1490 | static u32 log_next(u32 idx) { return 0; } | 1644 | static u32 log_next(u32 idx) { return 0; } |
1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | 1645 | static void call_console_drivers(int level, const char *text, size_t len) {} |
1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | 1646 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
1493 | char *buf, size_t size) { return 0; } | 1647 | bool syslog, char *buf, size_t size) { return 0; } |
1648 | static size_t cont_print_text(char *text, size_t size) { return 0; } | ||
1494 | 1649 | ||
1495 | #endif /* CONFIG_PRINTK */ | 1650 | #endif /* CONFIG_PRINTK */ |
1496 | 1651 | ||
@@ -1765,6 +1920,7 @@ void wake_up_klogd(void) | |||
1765 | /* the next printk record to write to the console */ | 1920 | /* the next printk record to write to the console */ |
1766 | static u64 console_seq; | 1921 | static u64 console_seq; |
1767 | static u32 console_idx; | 1922 | static u32 console_idx; |
1923 | static enum log_flags console_prev; | ||
1768 | 1924 | ||
1769 | /** | 1925 | /** |
1770 | * console_unlock - unlock the console system | 1926 | * console_unlock - unlock the console system |
@@ -1782,6 +1938,7 @@ static u32 console_idx; | |||
1782 | */ | 1938 | */ |
1783 | void console_unlock(void) | 1939 | void console_unlock(void) |
1784 | { | 1940 | { |
1941 | static char text[LOG_LINE_MAX]; | ||
1785 | static u64 seen_seq; | 1942 | static u64 seen_seq; |
1786 | unsigned long flags; | 1943 | unsigned long flags; |
1787 | bool wake_klogd = false; | 1944 | bool wake_klogd = false; |
@@ -1794,10 +1951,23 @@ void console_unlock(void) | |||
1794 | 1951 | ||
1795 | console_may_schedule = 0; | 1952 | console_may_schedule = 0; |
1796 | 1953 | ||
1954 | /* flush buffered message fragment immediately to console */ | ||
1955 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
1956 | if (cont.len && (cont.cons < cont.len || cont.flushed)) { | ||
1957 | size_t len; | ||
1958 | |||
1959 | len = cont_print_text(text, sizeof(text)); | ||
1960 | raw_spin_unlock(&logbuf_lock); | ||
1961 | stop_critical_timings(); | ||
1962 | call_console_drivers(cont.level, text, len); | ||
1963 | start_critical_timings(); | ||
1964 | local_irq_restore(flags); | ||
1965 | } else | ||
1966 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1967 | |||
1797 | again: | 1968 | again: |
1798 | for (;;) { | 1969 | for (;;) { |
1799 | struct log *msg; | 1970 | struct log *msg; |
1800 | static char text[LOG_LINE_MAX]; | ||
1801 | size_t len; | 1971 | size_t len; |
1802 | int level; | 1972 | int level; |
1803 | 1973 | ||
@@ -1811,18 +1981,35 @@ again: | |||
1811 | /* messages are gone, move to first one */ | 1981 | /* messages are gone, move to first one */ |
1812 | console_seq = log_first_seq; | 1982 | console_seq = log_first_seq; |
1813 | console_idx = log_first_idx; | 1983 | console_idx = log_first_idx; |
1984 | console_prev = 0; | ||
1814 | } | 1985 | } |
1815 | 1986 | skip: | |
1816 | if (console_seq == log_next_seq) | 1987 | if (console_seq == log_next_seq) |
1817 | break; | 1988 | break; |
1818 | 1989 | ||
1819 | msg = log_from_idx(console_idx); | 1990 | msg = log_from_idx(console_idx); |
1820 | level = msg->level & 7; | 1991 | if (msg->flags & LOG_NOCONS) { |
1821 | 1992 | /* | |
1822 | len = msg_print_text(msg, false, text, sizeof(text)); | 1993 | * Skip record we have buffered and already printed |
1994 | * directly to the console when we received it. | ||
1995 | */ | ||
1996 | console_idx = log_next(console_idx); | ||
1997 | console_seq++; | ||
1998 | /* | ||
1999 | * We will get here again when we register a new | ||
2000 | * CON_PRINTBUFFER console. Clear the flag so we | ||
2001 | * will properly dump everything later. | ||
2002 | */ | ||
2003 | msg->flags &= ~LOG_NOCONS; | ||
2004 | goto skip; | ||
2005 | } | ||
1823 | 2006 | ||
2007 | level = msg->level; | ||
2008 | len = msg_print_text(msg, console_prev, false, | ||
2009 | text, sizeof(text)); | ||
1824 | console_idx = log_next(console_idx); | 2010 | console_idx = log_next(console_idx); |
1825 | console_seq++; | 2011 | console_seq++; |
2012 | console_prev = msg->flags; | ||
1826 | raw_spin_unlock(&logbuf_lock); | 2013 | raw_spin_unlock(&logbuf_lock); |
1827 | 2014 | ||
1828 | stop_critical_timings(); /* don't trace print latency */ | 2015 | stop_critical_timings(); /* don't trace print latency */ |
@@ -2085,6 +2272,7 @@ void register_console(struct console *newcon) | |||
2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2272 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
2086 | console_seq = syslog_seq; | 2273 | console_seq = syslog_seq; |
2087 | console_idx = syslog_idx; | 2274 | console_idx = syslog_idx; |
2275 | console_prev = syslog_prev; | ||
2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2276 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2089 | /* | 2277 | /* |
2090 | * We're about to replay the log buffer. Only do this to the | 2278 | * We're about to replay the log buffer. Only do this to the |
@@ -2300,48 +2488,214 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | |||
2300 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2488 | * kmsg_dump - dump kernel log to kernel message dumpers. |
2301 | * @reason: the reason (oops, panic etc) for dumping | 2489 | * @reason: the reason (oops, panic etc) for dumping |
2302 | * | 2490 | * |
2303 | * Iterate through each of the dump devices and call the oops/panic | 2491 | * Call each registered dumper's dump() callback, which can |
2304 | * callbacks with the log buffer. | 2492 | * retrieve the kmsg records with kmsg_dump_get_line() or |
2493 | * kmsg_dump_get_buffer(). | ||
2305 | */ | 2494 | */ |
2306 | void kmsg_dump(enum kmsg_dump_reason reason) | 2495 | void kmsg_dump(enum kmsg_dump_reason reason) |
2307 | { | 2496 | { |
2308 | u64 idx; | ||
2309 | struct kmsg_dumper *dumper; | 2497 | struct kmsg_dumper *dumper; |
2310 | const char *s1, *s2; | ||
2311 | unsigned long l1, l2; | ||
2312 | unsigned long flags; | 2498 | unsigned long flags; |
2313 | 2499 | ||
2314 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) | 2500 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) |
2315 | return; | 2501 | return; |
2316 | 2502 | ||
2317 | /* Theoretically, the log could move on after we do this, but | 2503 | rcu_read_lock(); |
2318 | there's not a lot we can do about that. The new messages | 2504 | list_for_each_entry_rcu(dumper, &dump_list, list) { |
2319 | will overwrite the start of what we dump. */ | 2505 | if (dumper->max_reason && reason > dumper->max_reason) |
2506 | continue; | ||
2507 | |||
2508 | /* initialize iterator with data about the stored records */ | ||
2509 | dumper->active = true; | ||
2510 | |||
2511 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2512 | dumper->cur_seq = clear_seq; | ||
2513 | dumper->cur_idx = clear_idx; | ||
2514 | dumper->next_seq = log_next_seq; | ||
2515 | dumper->next_idx = log_next_idx; | ||
2516 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2517 | |||
2518 | /* invoke dumper which will iterate over records */ | ||
2519 | dumper->dump(dumper, reason); | ||
2520 | |||
2521 | /* reset iterator */ | ||
2522 | dumper->active = false; | ||
2523 | } | ||
2524 | rcu_read_unlock(); | ||
2525 | } | ||
2526 | |||
2527 | /** | ||
2528 | * kmsg_dump_get_line - retrieve one kmsg log line | ||
2529 | * @dumper: registered kmsg dumper | ||
2530 | * @syslog: include the "<4>" prefixes | ||
2531 | * @line: buffer to copy the line to | ||
2532 | * @size: maximum size of the buffer | ||
2533 | * @len: length of line placed into buffer | ||
2534 | * | ||
2535 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2536 | * record, and copy one record into the provided buffer. | ||
2537 | * | ||
2538 | * Consecutive calls will return the next available record moving | ||
2539 | * towards the end of the buffer with the youngest messages. | ||
2540 | * | ||
2541 | * A return value of FALSE indicates that there are no more records to | ||
2542 | * read. | ||
2543 | */ | ||
2544 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | ||
2545 | char *line, size_t size, size_t *len) | ||
2546 | { | ||
2547 | unsigned long flags; | ||
2548 | struct log *msg; | ||
2549 | size_t l = 0; | ||
2550 | bool ret = false; | ||
2551 | |||
2552 | if (!dumper->active) | ||
2553 | goto out; | ||
2320 | 2554 | ||
2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2555 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
2322 | if (syslog_seq < log_first_seq) | 2556 | if (dumper->cur_seq < log_first_seq) { |
2323 | idx = syslog_idx; | 2557 | /* messages are gone, move to first available one */ |
2324 | else | 2558 | dumper->cur_seq = log_first_seq; |
2325 | idx = log_first_idx; | 2559 | dumper->cur_idx = log_first_idx; |
2560 | } | ||
2326 | 2561 | ||
2327 | if (idx > log_next_idx) { | 2562 | /* last entry */ |
2328 | s1 = log_buf; | 2563 | if (dumper->cur_seq >= log_next_seq) { |
2329 | l1 = log_next_idx; | 2564 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2565 | goto out; | ||
2566 | } | ||
2330 | 2567 | ||
2331 | s2 = log_buf + idx; | 2568 | msg = log_from_idx(dumper->cur_idx); |
2332 | l2 = log_buf_len - idx; | 2569 | l = msg_print_text(msg, 0, syslog, line, size); |
2333 | } else { | 2570 | |
2334 | s1 = ""; | 2571 | dumper->cur_idx = log_next(dumper->cur_idx); |
2335 | l1 = 0; | 2572 | dumper->cur_seq++; |
2573 | ret = true; | ||
2574 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2575 | out: | ||
2576 | if (len) | ||
2577 | *len = l; | ||
2578 | return ret; | ||
2579 | } | ||
2580 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); | ||
2581 | |||
2582 | /** | ||
2583 | * kmsg_dump_get_buffer - copy kmsg log lines | ||
2584 | * @dumper: registered kmsg dumper | ||
2585 | * @syslog: include the "<4>" prefixes | ||
2586 | * @buf: buffer to copy the line to | ||
2587 | * @size: maximum size of the buffer | ||
2588 | * @len: length of line placed into buffer | ||
2589 | * | ||
2590 | * Start at the end of the kmsg buffer and fill the provided buffer | ||
2591 | * with as many of the *youngest* kmsg records as fit into it. | ||
2592 | * If the buffer is large enough, all available kmsg records will be | ||
2593 | * copied with a single call. | ||
2594 | * | ||
2595 | * Consecutive calls will fill the buffer with the next block of | ||
2596 | * available older records, not including those retrieved earlier. | ||
2597 | * | ||
2598 | * A return value of FALSE indicates that there are no more records to | ||
2599 | * read. | ||
2600 | */ | ||
2601 | bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | ||
2602 | char *buf, size_t size, size_t *len) | ||
2603 | { | ||
2604 | unsigned long flags; | ||
2605 | u64 seq; | ||
2606 | u32 idx; | ||
2607 | u64 next_seq; | ||
2608 | u32 next_idx; | ||
2609 | enum log_flags prev; | ||
2610 | size_t l = 0; | ||
2611 | bool ret = false; | ||
2612 | |||
2613 | if (!dumper->active) | ||
2614 | goto out; | ||
2615 | |||
2616 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2617 | if (dumper->cur_seq < log_first_seq) { | ||
2618 | /* messages are gone, move to first available one */ | ||
2619 | dumper->cur_seq = log_first_seq; | ||
2620 | dumper->cur_idx = log_first_idx; | ||
2621 | } | ||
2622 | |||
2623 | /* last entry */ | ||
2624 | if (dumper->cur_seq >= dumper->next_seq) { | ||
2625 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2626 | goto out; | ||
2627 | } | ||
2628 | |||
2629 | /* calculate length of entire buffer */ | ||
2630 | seq = dumper->cur_seq; | ||
2631 | idx = dumper->cur_idx; | ||
2632 | prev = 0; | ||
2633 | while (seq < dumper->next_seq) { | ||
2634 | struct log *msg = log_from_idx(idx); | ||
2635 | |||
2636 | l += msg_print_text(msg, prev, true, NULL, 0); | ||
2637 | idx = log_next(idx); | ||
2638 | seq++; | ||
2639 | prev = msg->flags; | ||
2640 | } | ||
2336 | 2641 | ||
2337 | s2 = log_buf + idx; | 2642 | /* move first record forward until length fits into the buffer */ |
2338 | l2 = log_next_idx - idx; | 2643 | seq = dumper->cur_seq; |
2644 | idx = dumper->cur_idx; | ||
2645 | prev = 0; | ||
2646 | while (l > size && seq < dumper->next_seq) { | ||
2647 | struct log *msg = log_from_idx(idx); | ||
2648 | |||
2649 | l -= msg_print_text(msg, prev, true, NULL, 0); | ||
2650 | idx = log_next(idx); | ||
2651 | seq++; | ||
2652 | prev = msg->flags; | ||
2339 | } | 2653 | } |
2654 | |||
2655 | /* last message in next iteration */ | ||
2656 | next_seq = seq; | ||
2657 | next_idx = idx; | ||
2658 | |||
2659 | l = 0; | ||
2660 | prev = 0; | ||
2661 | while (seq < dumper->next_seq) { | ||
2662 | struct log *msg = log_from_idx(idx); | ||
2663 | |||
2664 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); | ||
2665 | idx = log_next(idx); | ||
2666 | seq++; | ||
2667 | prev = msg->flags; | ||
2668 | } | ||
2669 | |||
2670 | dumper->next_seq = next_seq; | ||
2671 | dumper->next_idx = next_idx; | ||
2672 | ret = true; | ||
2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2673 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2674 | out: | ||
2675 | if (len) | ||
2676 | *len = l; | ||
2677 | return ret; | ||
2678 | } | ||
2679 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); | ||
2341 | 2680 | ||
2342 | rcu_read_lock(); | 2681 | /** |
2343 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2682 | * kmsg_dump_rewind - reset the iterator |
2344 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 2683 | * @dumper: registered kmsg dumper |
2345 | rcu_read_unlock(); | 2684 | * |
2685 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2686 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2687 | * times within the same dumper.dump() callback. | ||
2688 | */ | ||
2689 | void kmsg_dump_rewind(struct kmsg_dumper *dumper) | ||
2690 | { | ||
2691 | unsigned long flags; | ||
2692 | |||
2693 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2694 | dumper->cur_seq = clear_seq; | ||
2695 | dumper->cur_idx = clear_idx; | ||
2696 | dumper->next_seq = log_next_seq; | ||
2697 | dumper->next_idx = log_next_idx; | ||
2698 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2346 | } | 2699 | } |
2700 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | ||
2347 | #endif | 2701 | #endif |
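The rework above replaces the old dump(dumper, reason, s1, l1, s2, l2) contract with a pull model: kmsg_dump() only primes the per-dumper iterator, and the dumper fetches records itself. A minimal sketch of a dumper written against the new interface follows; the module name and the example_write_record() sink are made up for illustration, and only the kmsg_dump_* calls and the .max_reason check come from the code above.

```c
#include <linux/init.h>
#include <linux/kmsg_dump.h>
#include <linux/module.h>

/* Hypothetical sink standing in for whatever medium the dumper persists to. */
static void example_write_record(const char *line, size_t len)
{
	/* e.g. push to an SPI flash, a BMC, a serial side channel, ... */
}

static void example_do_dump(struct kmsg_dumper *dumper,
			    enum kmsg_dump_reason reason)
{
	static char line[1024];
	size_t len;

	/* Walk the records oldest to newest; kmsg_dump() primed the iterator. */
	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		example_write_record(line, len);

	/*
	 * kmsg_dump_rewind() would allow a second pass, for example one bulk
	 * kmsg_dump_get_buffer() call to grab the youngest records that fit
	 * into a fixed-size store.
	 */
}

static struct kmsg_dumper example_dumper = {
	.dump		= example_do_dump,
	.max_reason	= KMSG_DUMP_OOPS,	/* only react to panic and oops */
};

static int __init example_dumper_init(void)
{
	return kmsg_dump_register(&example_dumper);
}
module_init(example_dumper_init);
MODULE_LICENSE("GPL");
```

kmsg_dump_get_buffer() uses the same iterator but works backwards from the youngest records, which is why its length pre-pass above walks the range before copying.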
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) | |||
201 | { | 201 | { |
202 | trace_rcu_utilization("Start context switch"); | 202 | trace_rcu_utilization("Start context switch"); |
203 | rcu_sched_qs(cpu); | 203 | rcu_sched_qs(cpu); |
204 | rcu_preempt_note_context_switch(cpu); | ||
204 | trace_rcu_utilization("End context switch"); | 205 | trace_rcu_utilization("End context switch"); |
205 | } | 206 | } |
206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 207 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
@@ -1397,6 +1398,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
1397 | rdp->qlen_lazy += rsp->qlen_lazy; | 1398 | rdp->qlen_lazy += rsp->qlen_lazy; |
1398 | rdp->qlen += rsp->qlen; | 1399 | rdp->qlen += rsp->qlen; |
1399 | rdp->n_cbs_adopted += rsp->qlen; | 1400 | rdp->n_cbs_adopted += rsp->qlen; |
1401 | if (rsp->qlen_lazy != rsp->qlen) | ||
1402 | rcu_idle_count_callbacks_posted(); | ||
1400 | rsp->qlen_lazy = 0; | 1403 | rsp->qlen_lazy = 0; |
1401 | rsp->qlen = 0; | 1404 | rsp->qlen = 0; |
1402 | 1405 | ||
@@ -1528,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1528 | { | 1531 | { |
1529 | unsigned long flags; | 1532 | unsigned long flags; |
1530 | struct rcu_head *next, *list, **tail; | 1533 | struct rcu_head *next, *list, **tail; |
1531 | int bl, count, count_lazy; | 1534 | int bl, count, count_lazy, i; |
1532 | 1535 | ||
1533 | /* If no callbacks are ready, just return.*/ | 1536 | /* If no callbacks are ready, just return.*/ |
1534 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1537 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -1551,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1551 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1554 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1552 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1555 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1553 | tail = rdp->nxttail[RCU_DONE_TAIL]; | 1556 | tail = rdp->nxttail[RCU_DONE_TAIL]; |
1554 | for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) | 1557 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) |
1555 | if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) | 1558 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) |
1556 | rdp->nxttail[count] = &rdp->nxtlist; | 1559 | rdp->nxttail[i] = &rdp->nxtlist; |
1557 | local_irq_restore(flags); | 1560 | local_irq_restore(flags); |
1558 | 1561 | ||
1559 | /* Invoke callbacks. */ | 1562 | /* Invoke callbacks. */ |
@@ -1581,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1581 | if (list != NULL) { | 1584 | if (list != NULL) { |
1582 | *tail = rdp->nxtlist; | 1585 | *tail = rdp->nxtlist; |
1583 | rdp->nxtlist = list; | 1586 | rdp->nxtlist = list; |
1584 | for (count = 0; count < RCU_NEXT_SIZE; count++) | 1587 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1585 | if (&rdp->nxtlist == rdp->nxttail[count]) | 1588 | if (&rdp->nxtlist == rdp->nxttail[i]) |
1586 | rdp->nxttail[count] = tail; | 1589 | rdp->nxttail[i] = tail; |
1587 | else | 1590 | else |
1588 | break; | 1591 | break; |
1589 | } | 1592 | } |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -84,6 +84,20 @@ struct rcu_dynticks { | |||
84 | /* Process level is worth LLONG_MAX/2. */ | 84 | /* Process level is worth LLONG_MAX/2. */ |
85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
86 | atomic_t dynticks; /* Even value for idle, else odd. */ | 86 | atomic_t dynticks; /* Even value for idle, else odd. */ |
87 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
88 | int dyntick_drain; /* Prepare-for-idle state variable. */ | ||
89 | unsigned long dyntick_holdoff; | ||
90 | /* No retries for the jiffy of failure. */ | ||
91 | struct timer_list idle_gp_timer; | ||
92 | /* Wake up CPU sleeping with callbacks. */ | ||
93 | unsigned long idle_gp_timer_expires; | ||
94 | /* When to wake up CPU (for repost). */ | ||
95 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
96 | unsigned long nonlazy_posted; | ||
97 | /* # times non-lazy CBs posted to CPU. */ | ||
98 | unsigned long nonlazy_posted_snap; | ||
99 | /* idle-period nonlazy_posted snapshot. */ | ||
100 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
87 | }; | 101 | }; |
88 | 102 | ||
89 | /* RCU's kthread states for tracing. */ | 103 | /* RCU's kthread states for tracing. */ |
@@ -430,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
430 | /* Forward declarations for rcutree_plugin.h */ | 444 | /* Forward declarations for rcutree_plugin.h */ |
431 | static void rcu_bootup_announce(void); | 445 | static void rcu_bootup_announce(void); |
432 | long rcu_batches_completed(void); | 446 | long rcu_batches_completed(void); |
447 | static void rcu_preempt_note_context_switch(int cpu); | ||
433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 448 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
434 | #ifdef CONFIG_HOTPLUG_CPU | 449 | #ifdef CONFIG_HOTPLUG_CPU |
435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 450 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) | |||
153 | * | 153 | * |
154 | * Caller must disable preemption. | 154 | * Caller must disable preemption. |
155 | */ | 155 | */ |
156 | void rcu_preempt_note_context_switch(void) | 156 | static void rcu_preempt_note_context_switch(int cpu) |
157 | { | 157 | { |
158 | struct task_struct *t = current; | 158 | struct task_struct *t = current; |
159 | unsigned long flags; | 159 | unsigned long flags; |
@@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) | |||
164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
165 | 165 | ||
166 | /* Possibly blocking in an RCU read-side critical section. */ | 166 | /* Possibly blocking in an RCU read-side critical section. */ |
167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); | 167 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
168 | rnp = rdp->mynode; | 168 | rnp = rdp->mynode; |
169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 169 | raw_spin_lock_irqsave(&rnp->lock, flags); |
170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) | |||
228 | * means that we continue to block the current grace period. | 228 | * means that we continue to block the current grace period. |
229 | */ | 229 | */ |
230 | local_irq_save(flags); | 230 | local_irq_save(flags); |
231 | rcu_preempt_qs(smp_processor_id()); | 231 | rcu_preempt_qs(cpu); |
232 | local_irq_restore(flags); | 232 | local_irq_restore(flags); |
233 | } | 233 | } |
234 | 234 | ||
@@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void) | |||
1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
1003 | 1003 | ||
1004 | /* | 1004 | /* |
1005 | * Because preemptible RCU does not exist, we never have to check for | ||
1006 | * CPUs being in quiescent states. | ||
1007 | */ | ||
1008 | static void rcu_preempt_note_context_switch(int cpu) | ||
1009 | { | ||
1010 | } | ||
1011 | |||
1012 | /* | ||
1005 | * Because preemptible RCU does not exist, there are never any preempted | 1013 | * Because preemptible RCU does not exist, there are never any preempted |
1006 | * RCU readers. | 1014 | * RCU readers. |
1007 | */ | 1015 | */ |
@@ -1886,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1886 | * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs | 1894 | * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1887 | * any flavor of RCU. | 1895 | * any flavor of RCU. |
1888 | */ | 1896 | */ |
1889 | int rcu_needs_cpu(int cpu) | 1897 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
1890 | { | 1898 | { |
1899 | *delta_jiffies = ULONG_MAX; | ||
1891 | return rcu_cpu_has_callbacks(cpu); | 1900 | return rcu_cpu_has_callbacks(cpu); |
1892 | } | 1901 | } |
1893 | 1902 | ||
@@ -1962,41 +1971,6 @@ static void rcu_idle_count_callbacks_posted(void) | |||
1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1971 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1972 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1964 | 1973 | ||
1965 | /* Loop counter for rcu_prepare_for_idle(). */ | ||
1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ | ||
1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); | ||
1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ | ||
1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
1979 | |||
1980 | /* | ||
1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
1982 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
1983 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
1984 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
1985 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1986 | * it is better to incur scheduling-clock interrupts than to spin | ||
1987 | * continuously for the same time duration! | ||
1988 | */ | ||
1989 | int rcu_needs_cpu(int cpu) | ||
1990 | { | ||
1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
1993 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
1994 | if (!rcu_cpu_has_callbacks(cpu)) | ||
1995 | return 0; | ||
1996 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
1997 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
1998 | } | ||
1999 | |||
2000 | /* | 1974 | /* |
2001 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1975 | * Does the specified flavor of RCU have non-lazy callbacks pending on |
2002 | * the specified CPU? Both RCU flavor and CPU are specified by the | 1976 | * the specified CPU? Both RCU flavor and CPU are specified by the |
@@ -2040,6 +2014,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2040 | } | 2014 | } |
2041 | 2015 | ||
2042 | /* | 2016 | /* |
2017 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
2018 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
2019 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
2020 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
2021 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
2022 | * it is better to incur scheduling-clock interrupts than to spin | ||
2023 | * continuously for the same time duration! | ||
2024 | * | ||
2025 | * The delta_jiffies argument is used to store the time when RCU is | ||
2026 | * going to need the CPU again if it still has callbacks. The reason | ||
2027 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
2028 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
2029 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
2030 | * delayed until the wakeup time, which defeats the purpose of posting | ||
2031 | * a timer. | ||
2032 | */ | ||
2033 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
2034 | { | ||
2035 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2036 | |||
2037 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
2038 | rdtp->idle_first_pass = 1; | ||
2039 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
2040 | if (!rcu_cpu_has_callbacks(cpu)) { | ||
2041 | *delta_jiffies = ULONG_MAX; | ||
2042 | return 0; | ||
2043 | } | ||
2044 | if (rdtp->dyntick_holdoff == jiffies) { | ||
2045 | /* RCU recently tried and failed, so don't try again. */ | ||
2046 | *delta_jiffies = 1; | ||
2047 | return 1; | ||
2048 | } | ||
2049 | /* Set up for the possibility that RCU will post a timer. */ | ||
2050 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | ||
2051 | *delta_jiffies = RCU_IDLE_GP_DELAY; | ||
2052 | else | ||
2053 | *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; | ||
2054 | return 0; | ||
2055 | } | ||
2056 | |||
2057 | /* | ||
2043 | * Handler for smp_call_function_single(). The only point of this | 2058 | * Handler for smp_call_function_single(). The only point of this |
2044 | * handler is to wake the CPU up, so the handler does only tracing. | 2059 | * handler is to wake the CPU up, so the handler does only tracing. |
2045 | */ | 2060 | */ |
@@ -2075,21 +2090,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) | |||
2075 | */ | 2090 | */ |
2076 | static void rcu_prepare_for_idle_init(int cpu) | 2091 | static void rcu_prepare_for_idle_init(int cpu) |
2077 | { | 2092 | { |
2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2093 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), | 2094 | |
2080 | rcu_idle_gp_timer_func, cpu); | 2095 | rdtp->dyntick_holdoff = jiffies - 1; |
2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; | 2096 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); |
2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; | 2097 | rdtp->idle_gp_timer_expires = jiffies - 1; |
2098 | rdtp->idle_first_pass = 1; | ||
2083 | } | 2099 | } |
2084 | 2100 | ||
2085 | /* | 2101 | /* |
2086 | * Clean up for exit from idle. Because we are exiting from idle, there | 2102 | * Clean up for exit from idle. Because we are exiting from idle, there |
2087 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | 2103 | * is no longer any point to ->idle_gp_timer, so cancel it. This will |
2088 | * do nothing if this timer is not active, so just cancel it unconditionally. | 2104 | * do nothing if this timer is not active, so just cancel it unconditionally. |
2089 | */ | 2105 | */ |
2090 | static void rcu_cleanup_after_idle(int cpu) | 2106 | static void rcu_cleanup_after_idle(int cpu) |
2091 | { | 2107 | { |
2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); | 2108 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2109 | |||
2110 | del_timer(&rdtp->idle_gp_timer); | ||
2093 | trace_rcu_prep_idle("Cleanup after idle"); | 2111 | trace_rcu_prep_idle("Cleanup after idle"); |
2094 | } | 2112 | } |
2095 | 2113 | ||
@@ -2108,42 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2126 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
2109 | * disabled, we do one pass of force_quiescent_state(), then do a | 2127 | * disabled, we do one pass of force_quiescent_state(), then do a |
2110 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2128 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
2111 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2129 | * later. The ->dyntick_drain field controls the sequencing. |
2112 | * | 2130 | * |
2113 | * The caller must have disabled interrupts. | 2131 | * The caller must have disabled interrupts. |
2114 | */ | 2132 | */ |
2115 | static void rcu_prepare_for_idle(int cpu) | 2133 | static void rcu_prepare_for_idle(int cpu) |
2116 | { | 2134 | { |
2117 | struct timer_list *tp; | 2135 | struct timer_list *tp; |
2136 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2118 | 2137 | ||
2119 | /* | 2138 | /* |
2120 | * If this is an idle re-entry, for example, due to use of | 2139 | * If this is an idle re-entry, for example, due to use of |
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 2140 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle |
2122 | * loop, then don't take any state-machine actions, unless the | 2141 | * loop, then don't take any state-machine actions, unless the |
2123 | * momentary exit from idle queued additional non-lazy callbacks. | 2142 | * momentary exit from idle queued additional non-lazy callbacks. |
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | 2143 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks |
2125 | * pending. | 2144 | * pending. |
2126 | */ | 2145 | */ |
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | 2146 | if (!rdtp->idle_first_pass && |
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | 2147 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { |
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | 2148 | if (rcu_cpu_has_callbacks(cpu)) { |
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2149 | tp = &rdtp->idle_gp_timer; |
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2150 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
2133 | } | 2151 | } |
2134 | return; | 2152 | return; |
2135 | } | 2153 | } |
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | 2154 | rdtp->idle_first_pass = 0; |
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2155 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; |
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | 2156 | ||
2140 | /* | 2157 | /* |
2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2158 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2142 | * Also reset state to avoid prejudicing later attempts. | 2159 | * Also reset state to avoid prejudicing later attempts. |
2143 | */ | 2160 | */ |
2144 | if (!rcu_cpu_has_callbacks(cpu)) { | 2161 | if (!rcu_cpu_has_callbacks(cpu)) { |
2145 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2162 | rdtp->dyntick_holdoff = jiffies - 1; |
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2163 | rdtp->dyntick_drain = 0; |
2147 | trace_rcu_prep_idle("No callbacks"); | 2164 | trace_rcu_prep_idle("No callbacks"); |
2148 | return; | 2165 | return; |
2149 | } | 2166 | } |
@@ -2152,36 +2169,37 @@ static void rcu_prepare_for_idle(int cpu) | |||
2152 | * If in holdoff mode, just return. We will presumably have | 2169 | * If in holdoff mode, just return. We will presumably have |
2153 | * refrained from disabling the scheduling-clock tick. | 2170 | * refrained from disabling the scheduling-clock tick. |
2154 | */ | 2171 | */ |
2155 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2172 | if (rdtp->dyntick_holdoff == jiffies) { |
2156 | trace_rcu_prep_idle("In holdoff"); | 2173 | trace_rcu_prep_idle("In holdoff"); |
2157 | return; | 2174 | return; |
2158 | } | 2175 | } |
2159 | 2176 | ||
2160 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2177 | /* Check and update the ->dyntick_drain sequencing. */ |
2161 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2178 | if (rdtp->dyntick_drain <= 0) { |
2162 | /* First time through, initialize the counter. */ | 2179 | /* First time through, initialize the counter. */ |
2163 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2180 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; |
2164 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2181 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && |
2165 | !rcu_pending(cpu) && | 2182 | !rcu_pending(cpu) && |
2166 | !local_softirq_pending()) { | 2183 | !local_softirq_pending()) { |
2167 | /* Can we go dyntick-idle despite still having callbacks? */ | 2184 | /* Can we go dyntick-idle despite still having callbacks? */ |
2168 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2185 | rdtp->dyntick_drain = 0; |
2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2186 | rdtp->dyntick_holdoff = jiffies; |
2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2187 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2188 | trace_rcu_prep_idle("Dyntick with callbacks"); |
2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2189 | rdtp->idle_gp_timer_expires = |
2173 | jiffies + RCU_IDLE_GP_DELAY; | 2190 | jiffies + RCU_IDLE_GP_DELAY; |
2174 | else | 2191 | } else { |
2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2192 | rdtp->idle_gp_timer_expires = |
2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; | 2193 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2194 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); |
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2195 | } |
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2196 | tp = &rdtp->idle_gp_timer; |
2180 | per_cpu(rcu_nonlazy_posted, cpu); | 2197 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
2198 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
2181 | return; /* Nothing more to do immediately. */ | 2199 | return; /* Nothing more to do immediately. */ |
2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2200 | } else if (--(rdtp->dyntick_drain) <= 0) { |
2183 | /* We have hit the limit, so time to give up. */ | 2201 | /* We have hit the limit, so time to give up. */ |
2184 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2202 | rdtp->dyntick_holdoff = jiffies; |
2185 | trace_rcu_prep_idle("Begin holdoff"); | 2203 | trace_rcu_prep_idle("Begin holdoff"); |
2186 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2204 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
2187 | return; | 2205 | return; |
@@ -2227,7 +2245,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
2227 | */ | 2245 | */ |
2228 | static void rcu_idle_count_callbacks_posted(void) | 2246 | static void rcu_idle_count_callbacks_posted(void) |
2229 | { | 2247 | { |
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | 2248 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); |
2231 | } | 2249 | } |
2232 | 2250 | ||
2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2251 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
@@ -2238,11 +2256,12 @@ static void rcu_idle_count_callbacks_posted(void) | |||
2238 | 2256 | ||
2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2257 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2240 | { | 2258 | { |
2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); | 2259 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2260 | struct timer_list *tltp = &rdtp->idle_gp_timer; | ||
2242 | 2261 | ||
2243 | sprintf(cp, "drain=%d %c timer=%lu", | 2262 | sprintf(cp, "drain=%d %c timer=%lu", |
2244 | per_cpu(rcu_dyntick_drain, cpu), | 2263 | rdtp->dyntick_drain, |
2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2264 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', |
2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); | 2265 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2247 | } | 2266 | } |
2248 | 2267 | ||
diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1235 | struct splice_pipe_desc spd = { | 1235 | struct splice_pipe_desc spd = { |
1236 | .pages = pages, | 1236 | .pages = pages, |
1237 | .nr_pages = 0, | 1237 | .nr_pages = 0, |
1238 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
1238 | .partial = partial, | 1239 | .partial = partial, |
1239 | .flags = flags, | 1240 | .flags = flags, |
1240 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1302 | ret += padding; | 1303 | ret += padding; |
1303 | 1304 | ||
1304 | out: | 1305 | out: |
1305 | splice_shrink_spd(pipe, &spd); | 1306 | splice_shrink_spd(&spd); |
1306 | return ret; | 1307 | return ret; |
1307 | } | 1308 | } |
1308 | 1309 | ||
1309 | static ssize_t relay_file_splice_read(struct file *in, | 1310 | static ssize_t relay_file_splice_read(struct file *in, |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2081 | #endif | 2081 | #endif |
2082 | 2082 | ||
2083 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
2084 | rcu_switch_from(prev); | ||
2085 | switch_to(prev, next, prev); | 2084 | switch_to(prev, next, prev); |
2086 | 2085 | ||
2087 | barrier(); | 2086 | barrier(); |
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
2161 | } | 2160 | } |
2162 | 2161 | ||
2163 | 2162 | ||
2163 | /* | ||
2164 | * Global load-average calculations | ||
2165 | * | ||
2166 | * We take a distributed and async approach to calculating the global load-avg | ||
2167 | * in order to minimize overhead. | ||
2168 | * | ||
2169 | * The global load average is an exponentially decaying average of nr_running + | ||
2170 | * nr_uninterruptible. | ||
2171 | * | ||
2172 | * Once every LOAD_FREQ: | ||
2173 | * | ||
2174 | * nr_active = 0; | ||
2175 | * for_each_possible_cpu(cpu) | ||
2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2177 | * | ||
2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2179 | * | ||
2180 | * Due to a number of reasons the above turns into the mess below: | ||
2181 | * | ||
2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2183 | * serious number of cpus, therefore we need to take a distributed approach | ||
2184 | * to calculating nr_active. | ||
2185 | * | ||
2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2188 | * | ||
2189 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2191 | * to obtain the same result. See calc_load_fold_active(). | ||
2192 | * | ||
2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2195 | * cpu to have completed this task. | ||
2196 | * | ||
2197 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2199 | * | ||
2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2205 | * all cpus yields the correct result. | ||
2206 | * | ||
2207 | * This covers the NO_HZ=n code; for extra headaches, see the comment below. | ||
2208 | */ | ||
2209 | |||
2164 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
2165 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
2166 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
2167 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
2168 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
2215 | |||
2216 | /** | ||
2217 | * get_avenrun - get the load average array | ||
2218 | * @loads: pointer to dest load array | ||
2219 | * @offset: offset to add | ||
2220 | * @shift: shift count to shift the result left | ||
2221 | * | ||
2222 | * These values are estimates at best, so no need for locking. | ||
2223 | */ | ||
2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2225 | { | ||
2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
2229 | } | ||
2169 | 2230 | ||
2170 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
2171 | { | 2232 | { |
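A user-space sketch of the delta-folding identity described in the comment block above; nothing here is kernel API, it only demonstrates that summing per-cpu deltas since the last fold yields the same nr_active as walking every CPU.

```c
#include <stdio.h>

#define NCPUS 4

/* Per-"cpu" state: current nr_running + nr_uninterruptible, and its value at the last fold. */
static long nr_active[NCPUS];
static long calc_load_active[NCPUS];

/* Global accumulator, playing the role of calc_load_tasks. */
static long calc_load_tasks;

/* Same shape as calc_load_fold_active(): fold this cpu's delta since its last sample. */
static long fold_active(int cpu)
{
	long delta = nr_active[cpu] - calc_load_active[cpu];

	calc_load_active[cpu] = nr_active[cpu];
	return delta;
}

int main(void)
{
	int cpu;

	/* Some load shows up, spread across the cpus. */
	nr_active[0] = 3;
	nr_active[1] = 1;
	nr_active[3] = 2;

	/* Each cpu folds its own delta from its own tick, in any order... */
	for (cpu = 0; cpu < NCPUS; cpu++)
		calc_load_tasks += fold_active(cpu);

	/* ...and the accumulated total matches a for_each_possible_cpu() walk. */
	printf("nr_active = %ld\n", calc_load_tasks);	/* prints 6 */
	return 0;
}
```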
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2182 | return delta; | 2243 | return delta; |
2183 | } | 2244 | } |
2184 | 2245 | ||
2246 | /* | ||
2247 | * a1 = a0 * e + a * (1 - e) | ||
2248 | */ | ||
2185 | static unsigned long | 2249 | static unsigned long |
2186 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2187 | { | 2251 | { |
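The `a1 = a0 * e + a * (1 - e)` comment is evaluated in fixed point. A standalone check of one-minute averaging, reusing the FSHIFT/FIXED_1/EXP_1 constants the scheduler headers of this era define (quoted here for convenience; the calc_load() body below mirrors the formula in the comment, not necessarily the exact kernel source):

```c
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5s/1min) in fixed point */

/* One EMA step: a1 = a0 * e + a * (1 - e), all in fixed point. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;		/* 1-minute average, fixed point */
	unsigned long active = 2 * FIXED_1;	/* two tasks stay runnable */
	int i;

	/* One LOAD_FREQ (~5s) sample per iteration; the average creeps toward 2.00. */
	for (i = 1; i <= 12; i++) {
		avenrun0 = calc_load(avenrun0, EXP_1, active);
		printf("after %2d samples: %lu.%02lu\n", i,
		       avenrun0 >> FSHIFT,
		       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}
```

The integer/fraction split in the printf is the same trick /proc/loadavg uses to render avenrun[].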
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2193 | 2257 | ||
2194 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
2195 | /* | 2259 | /* |
2196 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
2261 | * | ||
2262 | * Since the above described distributed algorithm to compute the global | ||
2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2264 | * NO_HZ. | ||
2265 | * | ||
2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2268 | * when we read the global state. | ||
2269 | * | ||
2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2271 | * | ||
2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2273 | * contribution, causing under-accounting. | ||
2274 | * | ||
2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
2277 | * | ||
2278 | * The only trick is the slight shift in index flip for read vs write. | ||
2279 | * | ||
2280 | * 0s 5s 10s 15s | ||
2281 | * +10 +10 +10 +10 | ||
2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
2283 | * r:0 0 1 1 0 0 1 1 0 | ||
2284 | * w:0 1 1 0 0 1 1 0 0 | ||
2285 | * | ||
2286 | * This ensures we'll fold the old idle contribution in this window while | ||
2287 | * accumulating the new one. | ||
2288 | * | ||
2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2290 | * contribution, since we effectively move our sample point to a known | ||
2291 | * busy state. | ||
2292 | * | ||
2293 | * This is solved by pushing the window forward, and thus skipping the | ||
2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2295 | * was in effect at the time the window opened). This also solves the issue | ||
2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2297 | * LOAD_FREQ intervals. | ||
2197 | * | 2298 | * |
2198 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
2199 | */ | 2300 | */ |
2200 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
2302 | static int calc_load_idx; | ||
2201 | 2303 | ||
2202 | void calc_load_account_idle(struct rq *this_rq) | 2304 | static inline int calc_load_write_idx(void) |
2203 | { | 2305 | { |
2306 | int idx = calc_load_idx; | ||
2307 | |||
2308 | /* | ||
2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
2310 | * need to observe the new update time. | ||
2311 | */ | ||
2312 | smp_rmb(); | ||
2313 | |||
2314 | /* | ||
2315 | * If the folding window started, make sure we start writing in the | ||
2316 | * next idle-delta. | ||
2317 | */ | ||
2318 | if (!time_before(jiffies, calc_load_update)) | ||
2319 | idx++; | ||
2320 | |||
2321 | return idx & 1; | ||
2322 | } | ||
2323 | |||
2324 | static inline int calc_load_read_idx(void) | ||
2325 | { | ||
2326 | return calc_load_idx & 1; | ||
2327 | } | ||
2328 | |||
2329 | void calc_load_enter_idle(void) | ||
2330 | { | ||
2331 | struct rq *this_rq = this_rq(); | ||
2204 | long delta; | 2332 | long delta; |
2205 | 2333 | ||
2334 | /* | ||
2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2336 | * into the pending idle delta. | ||
2337 | */ | ||
2206 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
2207 | if (delta) | 2339 | if (delta) { |
2208 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2342 | } | ||
2209 | } | 2343 | } |
2210 | 2344 | ||
2211 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
2212 | { | 2346 | { |
2213 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
2348 | |||
2349 | /* | ||
2350 | * If we're still before the sample window, we're done. | ||
2351 | */ | ||
2352 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2353 | return; | ||
2214 | 2354 | ||
2215 | /* | 2355 | /* |
2216 | * Its got a race, we don't care... | 2356 | * We woke inside or after the sample window, which means we're already |
2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
2358 | * sync up for the next window. | ||
2217 | */ | 2359 | */ |
2218 | if (atomic_long_read(&calc_load_tasks_idle)) | 2360 | this_rq->calc_load_update = calc_load_update; |
2219 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) |
2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
2363 | } | ||
2364 | |||
2365 | static long calc_load_fold_idle(void) | ||
2366 | { | ||
2367 | int idx = calc_load_read_idx(); | ||
2368 | long delta = 0; | ||
2369 | |||
2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2220 | 2372 | ||
2221 | return delta; | 2373 | return delta; |
2222 | } | 2374 | } |
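To see why the write index runs one slot ahead of the read index during the 10-tick fold window, here is a user-space reduction of just the slot selection; jiffies, the window start and the flip are all simulated locally, and the kernel additionally uses time_before() to stay wrap-safe.

```c
#include <stdio.h>

static unsigned long jiffies;		/* simulated clock */
static unsigned long calc_load_update;	/* start of the current fold window */
static int calc_load_idx;

/* CPUs entering NO_HZ: once the window opens, park new deltas in the next slot. */
static int write_idx(void)
{
	int idx = calc_load_idx;

	if (jiffies >= calc_load_update)	/* kernel: !time_before(jiffies, ...) */
		idx++;
	return idx & 1;
}

/* calc_global_load(): drain the slot that was current before the window opened. */
static int read_idx(void)
{
	return calc_load_idx & 1;
}

int main(void)
{
	calc_load_update = 100;

	jiffies = 95;	/* before the window: both sides use slot 0 */
	printf("t=%lu write=%d read=%d\n", jiffies, write_idx(), read_idx());

	jiffies = 103;	/* inside the window: new idle goes to slot 1, the fold reads slot 0 */
	printf("t=%lu write=%d read=%d\n", jiffies, write_idx(), read_idx());

	calc_load_idx++;		/* calc_global_nohz() flips after folding... */
	calc_load_update += 500;	/* ...and pushes the window one LOAD_FREQ ahead */
	jiffies = 115;			/* both sides agree on slot 1 until the next window */
	printf("t=%lu write=%d read=%d\n", jiffies, write_idx(), read_idx());
	return 0;
}
```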
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void) | |||
2302 | { | 2454 | { |
2303 | long delta, active, n; | 2455 | long delta, active, n; |
2304 | 2456 | ||
2305 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
2306 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
2307 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
2308 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
2309 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
2310 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
2311 | delta = calc_load_fold_idle(); | ||
2312 | if (delta) | ||
2313 | atomic_long_add(delta, &calc_load_tasks); | ||
2314 | |||
2315 | /* | ||
2316 | * It could be the one fold was all it took, we done! | ||
2317 | */ | ||
2318 | if (time_before(jiffies, calc_load_update + 10)) | ||
2319 | return; | ||
2320 | |||
2321 | /* | ||
2322 | * Catch-up, fold however many we are behind still | ||
2323 | */ | ||
2324 | delta = jiffies - calc_load_update - 10; | ||
2325 | n = 1 + (delta / LOAD_FREQ); | ||
2326 | 2463 | ||
2327 | active = atomic_long_read(&calc_load_tasks); | 2464 | active = atomic_long_read(&calc_load_tasks); |
2328 | active = active > 0 ? active * FIXED_1 : 0; | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
2329 | 2466 | ||
2330 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2331 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2332 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
2333 | 2470 | ||
2334 | calc_load_update += n * LOAD_FREQ; | 2471 | calc_load_update += n * LOAD_FREQ; |
2335 | } | 2472 | } |
2336 | #else | ||
2337 | void calc_load_account_idle(struct rq *this_rq) | ||
2338 | { | ||
2339 | } | ||
2340 | 2473 | ||
2341 | static inline long calc_load_fold_idle(void) | 2474 | /* |
2342 | { | 2475 | * Flip the idle index... |
2343 | return 0; | 2476 | * |
2477 | * Make sure we first write the new time then flip the index, so that | ||
2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
2479 | * index; this avoids a double flip messing things up. | ||
2480 | */ | ||
2481 | smp_wmb(); | ||
2482 | calc_load_idx++; | ||
2344 | } | 2483 | } |
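The smp_wmb() above pairs with the smp_rmb() in calc_load_write_idx(): publish the new window start before the new index, so any CPU that observes the bumped index also observes the bumped calc_load_update. A rough user-space analogue using C11 fences (names local to this sketch; it illustrates the ordering argument only, not the kernel primitives themselves):

```c
#include <stdatomic.h>

static _Atomic unsigned long load_update;	/* stands in for calc_load_update */
static _Atomic int load_idx;			/* stands in for calc_load_idx */

/* calc_global_nohz() side: write the new window start, then flip the index. */
static void flip_window(unsigned long next_update)
{
	atomic_store_explicit(&load_update, next_update, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* plays the role of smp_wmb() */
	atomic_fetch_add_explicit(&load_idx, 1, memory_order_relaxed);
}

/* calc_load_write_idx() side: read the index first, then the window start. */
static int pick_write_slot(unsigned long now)
{
	int idx = atomic_load_explicit(&load_idx, memory_order_relaxed);

	atomic_thread_fence(memory_order_acquire);	/* plays the role of smp_rmb() */
	if (now >= atomic_load_explicit(&load_update, memory_order_relaxed))
		idx++;		/* the window this index belongs to is already open */
	return idx & 1;
}

int main(void)
{
	flip_window(500);
	return pick_write_slot(400);	/* just exercises the pairing */
}
```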
2484 | #else /* !CONFIG_NO_HZ */ | ||
2345 | 2485 | ||
2346 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
2347 | { | 2487 | static inline void calc_global_nohz(void) { } |
2348 | } | ||
2349 | #endif | ||
2350 | 2488 | ||
2351 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
2352 | * get_avenrun - get the load average array | ||
2353 | * @loads: pointer to dest load array | ||
2354 | * @offset: offset to add | ||
2355 | * @shift: shift count to shift the result left | ||
2356 | * | ||
2357 | * These values are estimates at best, so no need for locking. | ||
2358 | */ | ||
2359 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2360 | { | ||
2361 | loads[0] = (avenrun[0] + offset) << shift; | ||
2362 | loads[1] = (avenrun[1] + offset) << shift; | ||
2363 | loads[2] = (avenrun[2] + offset) << shift; | ||
2364 | } | ||
2365 | 2490 | ||
2366 | /* | 2491 | /* |
2367 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
2369 | */ | 2494 | */ |
2370 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
2371 | { | 2496 | { |
2372 | long active; | 2497 | long active, delta; |
2373 | 2498 | ||
2374 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
2375 | return; | 2500 | return; |
2376 | 2501 | ||
2502 | /* | ||
2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2504 | */ | ||
2505 | delta = calc_load_fold_idle(); | ||
2506 | if (delta) | ||
2507 | atomic_long_add(delta, &calc_load_tasks); | ||
2508 | |||
2377 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
2378 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
2379 | 2511 | ||
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
2384 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
2385 | 2517 | ||
2386 | /* | 2518 | /* |
2387 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
2388 | * folding in the nohz state and ageing the entire idle period. | ||
2389 | * | ||
2390 | * This avoids loosing a sample when we go idle between | ||
2391 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2392 | * under-accounting. | ||
2393 | */ | 2520 | */ |
2394 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
2395 | } | 2522 | } |
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2406 | return; | 2533 | return; |
2407 | 2534 | ||
2408 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
2409 | delta += calc_load_fold_idle(); | ||
2410 | if (delta) | 2536 | if (delta) |
2411 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
2412 | 2538 | ||
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2414 | } | 2540 | } |
2415 | 2541 | ||
2416 | /* | 2542 | /* |
2543 | * End of global load-average stuff | ||
2544 | */ | ||
2545 | |||
2546 | /* | ||
2417 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
2418 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
2419 | * | 2549 | * |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 26 | { |
27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
28 | calc_load_account_idle(rq); | ||
29 | return rq->idle; | 28 | return rq->idle; |
30 | } | 29 | } |
31 | 30 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) | |||
942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
943 | } | 943 | } |
944 | 944 | ||
945 | void calc_load_account_idle(struct rq *this_rq); | ||
946 | |||
947 | #ifdef CONFIG_SCHED_HRTICK | 945 | #ifdef CONFIG_SCHED_HRTICK |
948 | 946 | ||
949 | /* | 947 | /* |
diff --git a/kernel/sys.c b/kernel/sys.c index f0ec44dcd415..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1790 | { | 1790 | { |
1791 | struct vm_area_struct *vma; | ||
1792 | struct file *exe_file; | 1791 | struct file *exe_file; |
1793 | struct dentry *dentry; | 1792 | struct dentry *dentry; |
1794 | int err; | 1793 | int err; |
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1816 | down_write(&mm->mmap_sem); | 1815 | down_write(&mm->mmap_sem); |
1817 | 1816 | ||
1818 | /* | 1817 | /* |
1819 | * Forbid mm->exe_file change if there are mapped other files. | 1818 | * Forbid mm->exe_file change if old file still mapped. |
1820 | */ | 1819 | */ |
1821 | err = -EBUSY; | 1820 | err = -EBUSY; |
1822 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 1821 | if (mm->exe_file) { |
1823 | if (vma->vm_file && !path_equal(&vma->vm_file->f_path, | 1822 | struct vm_area_struct *vma; |
1824 | &exe_file->f_path)) | 1823 | |
1825 | goto exit_unlock; | 1824 | for (vma = mm->mmap; vma; vma = vma->vm_next) |
1825 | if (vma->vm_file && | ||
1826 | path_equal(&vma->vm_file->f_path, | ||
1827 | &mm->exe_file->f_path)) | ||
1828 | goto exit_unlock; | ||
1826 | } | 1829 | } |
1827 | 1830 | ||
1828 | /* | 1831 | /* |
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1835 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | 1838 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
1836 | goto exit_unlock; | 1839 | goto exit_unlock; |
1837 | 1840 | ||
1841 | err = 0; | ||
1838 | set_mm_exe_file(mm, exe_file); | 1842 | set_mm_exe_file(mm, exe_file); |
1839 | exit_unlock: | 1843 | exit_unlock: |
1840 | up_write(&mm->mmap_sem); | 1844 | up_write(&mm->mmap_sem); |
@@ -2127,9 +2131,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2127 | else | 2131 | else |
2128 | return -EINVAL; | 2132 | return -EINVAL; |
2129 | break; | 2133 | break; |
2130 | case PR_GET_TID_ADDRESS: | ||
2131 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2132 | break; | ||
2133 | default: | 2134 | default: |
2134 | return -EINVAL; | 2135 | return -EINVAL; |
2135 | } | 2136 | } |
@@ -2147,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2147 | case PR_SET_MM: | 2148 | case PR_SET_MM: |
2148 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 2149 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
2149 | break; | 2150 | break; |
2151 | case PR_GET_TID_ADDRESS: | ||
2152 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2153 | break; | ||
2150 | case PR_SET_CHILD_SUBREAPER: | 2154 | case PR_SET_CHILD_SUBREAPER: |
2151 | me->signal->is_child_subreaper = !!arg2; | 2155 | me->signal->is_child_subreaper = !!arg2; |
2152 | error = 0; | 2156 | error = 0; |
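The sys.c hunks also relocate the PR_GET_TID_ADDRESS case into the top-level prctl() switch next to PR_SET_MM, so the option is dispatched to prctl_get_tid_address(). For reference, a user-space caller could look like the sketch below; the option only exists with CONFIG_CHECKPOINT_RESTORE, and the fallback define is an assumption to be checked against your uapi headers.

```c
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_TID_ADDRESS
#define PR_GET_TID_ADDRESS 40	/* value from this era's prctl.h; verify locally */
#endif

int main(void)
{
	int *tid_addr = NULL;

	/* Asks the kernel for this thread's clear_child_tid pointer. */
	if (prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0) != 0) {
		perror("prctl(PR_GET_TID_ADDRESS)");
		return 1;
	}
	printf("clear_child_tid = %p\n", (void *)tid_addr);
	return 0;
}
```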
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) | |||
409 | time_state = TIME_DEL; | 409 | time_state = TIME_DEL; |
410 | break; | 410 | break; |
411 | case TIME_INS: | 411 | case TIME_INS: |
412 | if (secs % 86400 == 0) { | 412 | if (!(time_status & STA_INS)) |
413 | time_state = TIME_OK; | ||
414 | else if (secs % 86400 == 0) { | ||
413 | leap = -1; | 415 | leap = -1; |
414 | time_state = TIME_OOP; | 416 | time_state = TIME_OOP; |
415 | time_tai++; | 417 | time_tai++; |
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) | |||
418 | } | 420 | } |
419 | break; | 421 | break; |
420 | case TIME_DEL: | 422 | case TIME_DEL: |
421 | if ((secs + 1) % 86400 == 0) { | 423 | if (!(time_status & STA_DEL)) |
424 | time_state = TIME_OK; | ||
425 | else if ((secs + 1) % 86400 == 0) { | ||
422 | leap = 1; | 426 | leap = 1; |
423 | time_tai--; | 427 | time_tai--; |
424 | time_state = TIME_WAIT; | 428 | time_state = TIME_WAIT; |
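The new STA_INS/STA_DEL guards mean second_overflow() only inserts or deletes a leap second while the corresponding adjtimex() status bit stays set. A user-space sketch of how that bit gets armed (standard adjtimex(2) usage, nothing kernel-internal; requires CAP_SYS_TIME):

```c
#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { 0 };
	int state;

	/*
	 * Arm a leap-second insertion for the next UTC midnight. ADJ_STATUS
	 * replaces the writable status bits, so a real tool would read the
	 * current status first and OR STA_INS into it.
	 */
	tx.modes = ADJ_STATUS;
	tx.status = STA_INS;
	state = adjtimex(&tx);
	if (state < 0) {
		perror("adjtimex");
		return 1;
	}

	/*
	 * With the check added above, clearing STA_INS again before midnight
	 * drops the state machine back to TIME_OK, cancelling the pending
	 * insertion instead of leaving it armed.
	 */
	printf("clock state: %d (TIME_INS is %d)\n", state, TIME_INS);
	return 0;
}
```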
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 60c9c60e9108..41be02250e08 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -276,10 +276,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
276 | { | 276 | { |
277 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 277 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
278 | ktime_t last_update, expires, ret = { .tv64 = 0 }; | 278 | ktime_t last_update, expires, ret = { .tv64 = 0 }; |
279 | unsigned long rcu_delta_jiffies; | ||
279 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 280 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
280 | u64 time_delta; | 281 | u64 time_delta; |
281 | 282 | ||
282 | |||
283 | /* Read jiffies and the time when jiffies were updated last */ | 283 | /* Read jiffies and the time when jiffies were updated last */ |
284 | do { | 284 | do { |
285 | seq = read_seqbegin(&xtime_lock); | 285 | seq = read_seqbegin(&xtime_lock); |
@@ -288,7 +288,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
288 | time_delta = timekeeping_max_deferment(); | 288 | time_delta = timekeeping_max_deferment(); |
289 | } while (read_seqretry(&xtime_lock, seq)); | 289 | } while (read_seqretry(&xtime_lock, seq)); |
290 | 290 | ||
291 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 291 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
292 | arch_needs_cpu(cpu)) { | 292 | arch_needs_cpu(cpu)) { |
293 | next_jiffies = last_jiffies + 1; | 293 | next_jiffies = last_jiffies + 1; |
294 | delta_jiffies = 1; | 294 | delta_jiffies = 1; |
@@ -296,6 +296,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
296 | /* Get the next timer wheel timer */ | 296 | /* Get the next timer wheel timer */ |
297 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 297 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
298 | delta_jiffies = next_jiffies - last_jiffies; | 298 | delta_jiffies = next_jiffies - last_jiffies; |
299 | if (rcu_delta_jiffies < delta_jiffies) { | ||
300 | next_jiffies = last_jiffies + rcu_delta_jiffies; | ||
301 | delta_jiffies = rcu_delta_jiffies; | ||
302 | } | ||
299 | } | 303 | } |
300 | /* | 304 | /* |
301 | * Do not stop the tick, if we are only one off | 305 | * Do not stop the tick, if we are only one off |
@@ -369,6 +373,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
369 | */ | 373 | */ |
370 | if (!ts->tick_stopped) { | 374 | if (!ts->tick_stopped) { |
371 | select_nohz_load_balancer(1); | 375 | select_nohz_load_balancer(1); |
376 | calc_load_enter_idle(); | ||
372 | 377 | ||
373 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
374 | ts->tick_stopped = 1; | 379 | ts->tick_stopped = 1; |
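
The rcu_needs_cpu() signature change above lets RCU report, via the new out parameter, how many jiffies the CPU may actually stay idle; the caller then shortens its planned sleep to the smaller of the timer-wheel estimate and the RCU limit. A standalone sketch of that clamping pattern (function name hypothetical, purely illustrative):

/*
 * Clamp a planned nohz sleep length to the limit a subsystem reported
 * through an out parameter, mirroring the rcu_delta_jiffies handling
 * in the hunk above.
 */
static unsigned long nohz_sleep_jiffies(unsigned long last_jiffies,
					unsigned long next_timer,
					unsigned long rcu_delta_jiffies)
{
	unsigned long delta_jiffies = next_timer - last_jiffies;

	if (rcu_delta_jiffies < delta_jiffies)
		delta_jiffies = rcu_delta_jiffies;

	return delta_jiffies;
}
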
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..269b1fe5f2ae 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -70,6 +70,12 @@ struct timekeeper { | |||
70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ |
71 | struct timespec raw_time; | 71 | struct timespec raw_time; |
72 | 72 | ||
73 | /* Offset clock monotonic -> clock realtime */ | ||
74 | ktime_t offs_real; | ||
75 | |||
76 | /* Offset clock monotonic -> clock boottime */ | ||
77 | ktime_t offs_boot; | ||
78 | |||
73 | /* Seqlock for all timekeeper values */ | 79 | /* Seqlock for all timekeeper values */ |
74 | seqlock_t lock; | 80 | seqlock_t lock; |
75 | }; | 81 | }; |
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 178 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
173 | } | 179 | } |
174 | 180 | ||
181 | static void update_rt_offset(void) | ||
182 | { | ||
183 | struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; | ||
184 | |||
185 | set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); | ||
186 | timekeeper.offs_real = timespec_to_ktime(tmp); | ||
187 | } | ||
188 | |||
175 | /* must hold write on timekeeper.lock */ | 189 | /* must hold write on timekeeper.lock */ |
176 | static void timekeeping_update(bool clearntp) | 190 | static void timekeeping_update(bool clearntp) |
177 | { | 191 | { |
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) | |||
179 | timekeeper.ntp_error = 0; | 193 | timekeeper.ntp_error = 0; |
180 | ntp_clear(); | 194 | ntp_clear(); |
181 | } | 195 | } |
196 | update_rt_offset(); | ||
182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | 197 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, |
183 | timekeeper.clock, timekeeper.mult); | 198 | timekeeper.clock, timekeeper.mult); |
184 | } | 199 | } |
@@ -604,6 +619,7 @@ void __init timekeeping_init(void) | |||
604 | } | 619 | } |
605 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 620 | set_normalized_timespec(&timekeeper.wall_to_monotonic, |
606 | -boot.tv_sec, -boot.tv_nsec); | 621 | -boot.tv_sec, -boot.tv_nsec); |
622 | update_rt_offset(); | ||
607 | timekeeper.total_sleep_time.tv_sec = 0; | 623 | timekeeper.total_sleep_time.tv_sec = 0; |
608 | timekeeper.total_sleep_time.tv_nsec = 0; | 624 | timekeeper.total_sleep_time.tv_nsec = 0; |
609 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 625 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
@@ -612,6 +628,12 @@ void __init timekeeping_init(void) | |||
612 | /* time in seconds when suspend began */ | 628 | /* time in seconds when suspend began */ |
613 | static struct timespec timekeeping_suspend_time; | 629 | static struct timespec timekeeping_suspend_time; |
614 | 630 | ||
631 | static void update_sleep_time(struct timespec t) | ||
632 | { | ||
633 | timekeeper.total_sleep_time = t; | ||
634 | timekeeper.offs_boot = timespec_to_ktime(t); | ||
635 | } | ||
636 | |||
615 | /** | 637 | /** |
616 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 638 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
617 | * @delta: pointer to a timespec delta value | 639 | * @delta: pointer to a timespec delta value |
@@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
630 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); | 652 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); |
631 | timekeeper.wall_to_monotonic = | 653 | timekeeper.wall_to_monotonic = |
632 | timespec_sub(timekeeper.wall_to_monotonic, *delta); | 654 | timespec_sub(timekeeper.wall_to_monotonic, *delta); |
633 | timekeeper.total_sleep_time = timespec_add( | 655 | update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); |
634 | timekeeper.total_sleep_time, *delta); | ||
635 | } | 656 | } |
636 | 657 | ||
637 | 658 | ||
@@ -963,6 +984,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
963 | leap = second_overflow(timekeeper.xtime.tv_sec); | 984 | leap = second_overflow(timekeeper.xtime.tv_sec); |
964 | timekeeper.xtime.tv_sec += leap; | 985 | timekeeper.xtime.tv_sec += leap; |
965 | timekeeper.wall_to_monotonic.tv_sec -= leap; | 986 | timekeeper.wall_to_monotonic.tv_sec -= leap; |
987 | if (leap) | ||
988 | clock_was_set_delayed(); | ||
966 | } | 989 | } |
967 | 990 | ||
968 | /* Accumulate raw time */ | 991 | /* Accumulate raw time */ |
@@ -1079,6 +1102,8 @@ static void update_wall_time(void) | |||
1079 | leap = second_overflow(timekeeper.xtime.tv_sec); | 1102 | leap = second_overflow(timekeeper.xtime.tv_sec); |
1080 | timekeeper.xtime.tv_sec += leap; | 1103 | timekeeper.xtime.tv_sec += leap; |
1081 | timekeeper.wall_to_monotonic.tv_sec -= leap; | 1104 | timekeeper.wall_to_monotonic.tv_sec -= leap; |
1105 | if (leap) | ||
1106 | clock_was_set_delayed(); | ||
1082 | } | 1107 | } |
1083 | 1108 | ||
1084 | timekeeping_update(false); | 1109 | timekeeping_update(false); |
@@ -1246,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1246 | } while (read_seqretry(&timekeeper.lock, seq)); | 1271 | } while (read_seqretry(&timekeeper.lock, seq)); |
1247 | } | 1272 | } |
1248 | 1273 | ||
1274 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1275 | /** | ||
1276 | * ktime_get_update_offsets - hrtimer helper | ||
1277 | * @offs_real: pointer to storage for monotonic -> realtime offset | ||
1278 | * @offs_boot: pointer to storage for monotonic -> boottime offset | ||
1279 | * | ||
1280 | * Returns current monotonic time and updates the offsets | ||
1281 | * Called from hrtimer_interrupt() or retrigger_next_event() | ||
1282 | */ | ||
1283 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | ||
1284 | { | ||
1285 | ktime_t now; | ||
1286 | unsigned int seq; | ||
1287 | u64 secs, nsecs; | ||
1288 | |||
1289 | do { | ||
1290 | seq = read_seqbegin(&timekeeper.lock); | ||
1291 | |||
1292 | secs = timekeeper.xtime.tv_sec; | ||
1293 | nsecs = timekeeper.xtime.tv_nsec; | ||
1294 | nsecs += timekeeping_get_ns(); | ||
1295 | /* If arch requires, add in gettimeoffset() */ | ||
1296 | nsecs += arch_gettimeoffset(); | ||
1297 | |||
1298 | *offs_real = timekeeper.offs_real; | ||
1299 | *offs_boot = timekeeper.offs_boot; | ||
1300 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
1301 | |||
1302 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
1303 | now = ktime_sub(now, *offs_real); | ||
1304 | return now; | ||
1305 | } | ||
1306 | #endif | ||
1307 | |||
1249 | /** | 1308 | /** |
1250 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | 1309 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format |
1251 | */ | 1310 | */ |
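
The new ktime_get_update_offsets() helper lets the hrtimer code read the monotonic base time and both clock offsets from a single seqlock-consistent snapshot, so a leap second cannot leave CLOCK_REALTIME timers expiring against a stale offset. A simplified consumer is sketched below; it is not the actual hrtimer_interrupt() code, and it assumes the declaration is visible through the hrtimer header:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/*
 * Derive all three clocks from one coherent timekeeper snapshot
 * instead of reading each clock separately (illustrative only).
 */
static void snapshot_clocks(ktime_t *mono, ktime_t *real, ktime_t *boot)
{
	ktime_t offs_real, offs_boot;

	*mono = ktime_get_update_offsets(&offs_real, &offs_boot);
	*real = ktime_add(*mono, offs_real);	/* CLOCK_REALTIME */
	*boot = ktime_add(*mono, offs_boot);	/* CLOCK_BOOTTIME */
}
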
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
1075 | rb_init_page(bpage->page); | 1075 | rb_init_page(bpage->page); |
1076 | 1076 | ||
1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1078 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1078 | 1079 | ||
1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); | 1080 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1080 | if (ret < 0) | 1081 | if (ret < 0) |
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | |||
1346 | * If something was added to this page, it was full | 1347 | * If something was added to this page, it was full |
1347 | * since it is not the tail page. So we deduct the | 1348 | * since it is not the tail page. So we deduct the |
1348 | * bytes consumed in ring buffer from here. | 1349 | * bytes consumed in ring buffer from here. |
1349 | * No need to update overruns, since this page is | 1350 | * Increment overrun to account for the lost events. |
1350 | * deleted from ring buffer and its entries are | ||
1351 | * already accounted for. | ||
1352 | */ | 1351 | */ |
1352 | local_add(page_entries, &cpu_buffer->overrun); | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | 1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); |
1354 | } | 1354 | } |
1355 | 1355 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..a7fa0702be1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
371 | void tracing_off(void) | 371 | void tracing_off(void) |
372 | { | 372 | { |
373 | if (global_trace.buffer) | 373 | if (global_trace.buffer) |
374 | ring_buffer_record_on(global_trace.buffer); | 374 | ring_buffer_record_off(global_trace.buffer); |
375 | /* | 375 | /* |
376 | * This flag is only looked at when buffers haven't been | 376 | * This flag is only looked at when buffers haven't been |
377 | * allocated yet. We don't really care about the race | 377 | * allocated yet. We don't really care about the race |
@@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3609 | .pages = pages_def, | 3609 | .pages = pages_def, |
3610 | .partial = partial_def, | 3610 | .partial = partial_def, |
3611 | .nr_pages = 0, /* This gets updated below. */ | 3611 | .nr_pages = 0, /* This gets updated below. */ |
3612 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
3612 | .flags = flags, | 3613 | .flags = flags, |
3613 | .ops = &tracing_pipe_buf_ops, | 3614 | .ops = &tracing_pipe_buf_ops, |
3614 | .spd_release = tracing_spd_release_pipe, | 3615 | .spd_release = tracing_spd_release_pipe, |
@@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3680 | 3681 | ||
3681 | ret = splice_to_pipe(pipe, &spd); | 3682 | ret = splice_to_pipe(pipe, &spd); |
3682 | out: | 3683 | out: |
3683 | splice_shrink_spd(pipe, &spd); | 3684 | splice_shrink_spd(&spd); |
3684 | return ret; | 3685 | return ret; |
3685 | 3686 | ||
3686 | out_err: | 3687 | out_err: |
@@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4231 | struct splice_pipe_desc spd = { | 4232 | struct splice_pipe_desc spd = { |
4232 | .pages = pages_def, | 4233 | .pages = pages_def, |
4233 | .partial = partial_def, | 4234 | .partial = partial_def, |
4235 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
4234 | .flags = flags, | 4236 | .flags = flags, |
4235 | .ops = &buffer_pipe_buf_ops, | 4237 | .ops = &buffer_pipe_buf_ops, |
4236 | .spd_release = buffer_spd_release, | 4238 | .spd_release = buffer_spd_release, |
@@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4318 | } | 4320 | } |
4319 | 4321 | ||
4320 | ret = splice_to_pipe(pipe, &spd); | 4322 | ret = splice_to_pipe(pipe, &spd); |
4321 | splice_shrink_spd(pipe, &spd); | 4323 | splice_shrink_spd(&spd); |
4322 | out: | 4324 | out: |
4323 | return ret; | 4325 | return ret; |
4324 | } | 4326 | } |
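
The tracing_off() fix above matters for the common debugging pattern of freezing the ring buffer at the point where something goes wrong; with the old code the call re-enabled recording instead. A small illustrative kernel-side snippet (check_state() and the failure condition are hypothetical):

#include <linux/kernel.h>
#include <linux/bug.h>

static void check_state(int state)
{
	if (WARN_ON_ONCE(state < 0)) {
		trace_printk("bad state %d, freezing trace buffer\n", state);
		tracing_off();	/* must stop recording, not restart it */
	}
}
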
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -372,6 +372,13 @@ static int watchdog(void *unused) | |||
372 | 372 | ||
373 | 373 | ||
374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
375 | /* | ||
376 | * People like the simple clean cpu node info on boot. | ||
377 | * Reduce the watchdog noise by only printing messages | ||
378 | * that are different from what cpu0 displayed. | ||
379 | */ | ||
380 | static unsigned long cpu0_err; | ||
381 | |||
375 | static int watchdog_nmi_enable(int cpu) | 382 | static int watchdog_nmi_enable(int cpu) |
376 | { | 383 | { |
377 | struct perf_event_attr *wd_attr; | 384 | struct perf_event_attr *wd_attr; |
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) | |||
390 | 397 | ||
391 | /* Try to register using hardware perf events */ | 398 | /* Try to register using hardware perf events */ |
392 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 399 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
400 | |||
401 | /* save cpu0 error for future comparison */ | ||
402 | if (cpu == 0 && IS_ERR(event)) | ||
403 | cpu0_err = PTR_ERR(event); | ||
404 | |||
393 | if (!IS_ERR(event)) { | 405 | if (!IS_ERR(event)) { |
394 | pr_info("enabled, takes one hw-pmu counter.\n"); | 406 | /* only print for cpu0, or when the result differs from cpu0's */ |
407 | if (cpu == 0 || cpu0_err) | ||
408 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
395 | goto out_save; | 409 | goto out_save; |
396 | } | 410 | } |
397 | 411 | ||
412 | /* skip displaying the same error again */ | ||
413 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
414 | return PTR_ERR(event); | ||
398 | 415 | ||
399 | /* vary the KERN level based on the returned errno */ | 416 | /* vary the KERN level based on the returned errno */ |
400 | if (PTR_ERR(event) == -EOPNOTSUPP) | 417 | if (PTR_ERR(event) == -EOPNOTSUPP) |