Diffstat (limited to 'kernel')
33 files changed, 1494 insertions, 615 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f3527d6184a..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) | |||
| 255 | 255 | ||
| 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
| 257 | 257 | ||
| 258 | static int css_unbias_refcnt(int refcnt) | ||
| 259 | { | ||
| 260 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | ||
| 261 | } | ||
| 262 | |||
| 258 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | 263 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ |
| 259 | static int css_refcnt(struct cgroup_subsys_state *css) | 264 | static int css_refcnt(struct cgroup_subsys_state *css) |
| 260 | { | 265 | { |
| 261 | int v = atomic_read(&css->refcnt); | 266 | int v = atomic_read(&css->refcnt); |
| 262 | 267 | ||
| 263 | return v >= 0 ? v : v - CSS_DEACT_BIAS; | 268 | return css_unbias_refcnt(v); |
| 264 | } | 269 | } |
| 265 | 270 | ||
| 266 | /* convenient tests for these bits */ | 271 | /* convenient tests for these bits */ |
| @@ -3878,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work) | |||
| 3878 | { | 3883 | { |
| 3879 | struct cgroup_subsys_state *css = | 3884 | struct cgroup_subsys_state *css = |
| 3880 | container_of(work, struct cgroup_subsys_state, dput_work); | 3885 | container_of(work, struct cgroup_subsys_state, dput_work); |
| 3886 | struct dentry *dentry = css->cgroup->dentry; | ||
| 3887 | struct super_block *sb = dentry->d_sb; | ||
| 3881 | 3888 | ||
| 3882 | dput(css->cgroup->dentry); | 3889 | atomic_inc(&sb->s_active); |
| 3890 | dput(dentry); | ||
| 3891 | deactivate_super(sb); | ||
| 3883 | } | 3892 | } |
| 3884 | 3893 | ||
| 3885 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3894 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
| @@ -4971,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); | |||
| 4971 | void __css_put(struct cgroup_subsys_state *css) | 4980 | void __css_put(struct cgroup_subsys_state *css) |
| 4972 | { | 4981 | { |
| 4973 | struct cgroup *cgrp = css->cgroup; | 4982 | struct cgroup *cgrp = css->cgroup; |
| 4983 | int v; | ||
| 4974 | 4984 | ||
| 4975 | rcu_read_lock(); | 4985 | rcu_read_lock(); |
| 4976 | atomic_dec(&css->refcnt); | 4986 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); |
| 4977 | switch (css_refcnt(css)) { | 4987 | |
| 4988 | switch (v) { | ||
| 4978 | case 1: | 4989 | case 1: |
| 4979 | if (notify_on_release(cgrp)) { | 4990 | if (notify_on_release(cgrp)) { |
| 4980 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4991 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
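
The cgroup.c hunks above introduce css_unbias_refcnt() and reuse it in __css_put(), which now reads the counter once via atomic_dec_return() instead of decrementing and then re-reading it. The encoding being factored out: a deactivated css has a large negative bias added to its refcount, so the raw atomic value can go negative while the logical reference count stays >= 0. A minimal stand-alone sketch of that idea (the bias value below is an illustrative assumption, not the kernel's constant):

#include <assert.h>
#include <stdio.h>

/* Large negative bias marks the counter as "deactivated" (assumed value). */
#define CSS_DEACT_BIAS  (-(1 << 30))

/* Recover the logical reference count from a possibly biased raw value. */
static int css_unbias_refcnt(int refcnt)
{
    return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
}

int main(void)
{
    int raw = 3;                          /* three references, css active */
    assert(css_unbias_refcnt(raw) == 3);

    raw += CSS_DEACT_BIAS;                /* deactivate: raw goes negative */
    assert(raw < 0);
    assert(css_unbias_refcnt(raw) == 3);  /* logical count unchanged */

    printf("logical refcount: %d\n", css_unbias_refcnt(raw));
    return 0;
}
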
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
| 15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
| 16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
| 17 | #include <linux/kmsg_dump.h> | ||
| 17 | #include <linux/reboot.h> | 18 | #include <linux/reboot.h> |
| 18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
| 19 | #include <linux/sysrq.h> | 20 | #include <linux/sysrq.h> |
| @@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) | |||
| 2040 | */ | 2041 | */ |
| 2041 | static int kdb_dmesg(int argc, const char **argv) | 2042 | static int kdb_dmesg(int argc, const char **argv) |
| 2042 | { | 2043 | { |
| 2043 | char *syslog_data[4], *start, *end, c = '\0', *p; | 2044 | int diag; |
| 2044 | int diag, logging, logsize, lines = 0, adjust = 0, n; | 2045 | int logging; |
| 2046 | int lines = 0; | ||
| 2047 | int adjust = 0; | ||
| 2048 | int n = 0; | ||
| 2049 | int skip = 0; | ||
| 2050 | struct kmsg_dumper dumper = { .active = 1 }; | ||
| 2051 | size_t len; | ||
| 2052 | char buf[201]; | ||
| 2045 | 2053 | ||
| 2046 | if (argc > 2) | 2054 | if (argc > 2) |
| 2047 | return KDB_ARGCOUNT; | 2055 | return KDB_ARGCOUNT; |
| @@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2064 | kdb_set(2, setargs); | 2072 | kdb_set(2, setargs); |
| 2065 | } | 2073 | } |
| 2066 | 2074 | ||
| 2067 | /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] | 2075 | kmsg_dump_rewind_nolock(&dumper); |
| 2068 | * logical start, end+1. */ | 2076 | while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) |
| 2069 | kdb_syslog_data(syslog_data); | 2077 | n++; |
| 2070 | if (syslog_data[2] == syslog_data[3]) | 2078 | |
| 2071 | return 0; | ||
| 2072 | logsize = syslog_data[1] - syslog_data[0]; | ||
| 2073 | start = syslog_data[2]; | ||
| 2074 | end = syslog_data[3]; | ||
| 2075 | #define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) | ||
| 2076 | for (n = 0, p = start; p < end; ++p) { | ||
| 2077 | c = *KDB_WRAP(p); | ||
| 2078 | if (c == '\n') | ||
| 2079 | ++n; | ||
| 2080 | } | ||
| 2081 | if (c != '\n') | ||
| 2082 | ++n; | ||
| 2083 | if (lines < 0) { | 2079 | if (lines < 0) { |
| 2084 | if (adjust >= n) | 2080 | if (adjust >= n) |
| 2085 | kdb_printf("buffer only contains %d lines, nothing " | 2081 | kdb_printf("buffer only contains %d lines, nothing " |
| @@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2087 | else if (adjust - lines >= n) | 2083 | else if (adjust - lines >= n) |
| 2088 | kdb_printf("buffer only contains %d lines, last %d " | 2084 | kdb_printf("buffer only contains %d lines, last %d " |
| 2089 | "lines printed\n", n, n - adjust); | 2085 | "lines printed\n", n, n - adjust); |
| 2090 | if (adjust) { | 2086 | skip = adjust; |
| 2091 | for (; start < end && adjust; ++start) { | 2087 | lines = abs(lines); |
| 2092 | if (*KDB_WRAP(start) == '\n') | ||
| 2093 | --adjust; | ||
| 2094 | } | ||
| 2095 | if (start < end) | ||
| 2096 | ++start; | ||
| 2097 | } | ||
| 2098 | for (p = start; p < end && lines; ++p) { | ||
| 2099 | if (*KDB_WRAP(p) == '\n') | ||
| 2100 | ++lines; | ||
| 2101 | } | ||
| 2102 | end = p; | ||
| 2103 | } else if (lines > 0) { | 2088 | } else if (lines > 0) { |
| 2104 | int skip = n - (adjust + lines); | 2089 | skip = n - lines - adjust; |
| 2090 | lines = abs(lines); | ||
| 2105 | if (adjust >= n) { | 2091 | if (adjust >= n) { |
| 2106 | kdb_printf("buffer only contains %d lines, " | 2092 | kdb_printf("buffer only contains %d lines, " |
| 2107 | "nothing printed\n", n); | 2093 | "nothing printed\n", n); |
| @@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2112 | kdb_printf("buffer only contains %d lines, first " | 2098 | kdb_printf("buffer only contains %d lines, first " |
| 2113 | "%d lines printed\n", n, lines); | 2099 | "%d lines printed\n", n, lines); |
| 2114 | } | 2100 | } |
| 2115 | for (; start < end && skip; ++start) { | 2101 | } else { |
| 2116 | if (*KDB_WRAP(start) == '\n') | 2102 | lines = n; |
| 2117 | --skip; | ||
| 2118 | } | ||
| 2119 | for (p = start; p < end && lines; ++p) { | ||
| 2120 | if (*KDB_WRAP(p) == '\n') | ||
| 2121 | --lines; | ||
| 2122 | } | ||
| 2123 | end = p; | ||
| 2124 | } | 2103 | } |
| 2125 | /* Do a line at a time (max 200 chars) to reduce protocol overhead */ | 2104 | |
| 2126 | c = '\n'; | 2105 | if (skip >= n || skip < 0) |
| 2127 | while (start != end) { | 2106 | return 0; |
| 2128 | char buf[201]; | 2107 | |
| 2129 | p = buf; | 2108 | kmsg_dump_rewind_nolock(&dumper); |
| 2130 | if (KDB_FLAG(CMD_INTERRUPT)) | 2109 | while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { |
| 2131 | return 0; | 2110 | if (skip) { |
| 2132 | while (start < end && (c = *KDB_WRAP(start)) && | 2111 | skip--; |
| 2133 | (p - buf) < sizeof(buf)-1) { | 2112 | continue; |
| 2134 | ++start; | ||
| 2135 | *p++ = c; | ||
| 2136 | if (c == '\n') | ||
| 2137 | break; | ||
| 2138 | } | 2113 | } |
| 2139 | *p = '\0'; | 2114 | if (!lines--) |
| 2140 | kdb_printf("%s", buf); | 2115 | break; |
| 2116 | |||
| 2117 | kdb_printf("%.*s\n", (int)len - 1, buf); | ||
| 2141 | } | 2118 | } |
| 2142 | if (c != '\n') | ||
| 2143 | kdb_printf("\n"); | ||
| 2144 | 2119 | ||
| 2145 | return 0; | 2120 | return 0; |
| 2146 | } | 2121 | } |
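
The rewritten kdb_dmesg() no longer walks raw log-buffer pointers handed out by kdb_syslog_data(); it iterates records through a kmsg_dumper, once to count lines (n) and once to print, skipping "skip" lines and stopping after "lines". The selection arithmetic is easy to lose in the diff, so here is a hedged user-space sketch of the same two-pass logic over a plain array (no kernel APIs, names are illustrative):

#include <stdio.h>
#include <stdlib.h>

/* Print "lines" entries of log[], offset by "adjust", mimicking kdb's
 * "dmesg [lines] [adjust]" selection (sketch, not kernel code). */
static void dmesg_tail(const char *log[], int n, int lines, int adjust)
{
    int skip;

    if (lines < 0)          /* negative: count from the oldest entry */
        skip = adjust;
    else if (lines > 0)     /* positive: the last "lines" entries */
        skip = n - lines - adjust;
    else                    /* zero: everything */
        skip = 0;

    lines = lines ? abs(lines) : n;
    if (skip < 0 || skip >= n)
        return;

    for (int i = skip; i < n && lines--; i++)
        printf("%s\n", log[i]);
}

int main(void)
{
    const char *log[] = { "one", "two", "three", "four", "five" };

    dmesg_tail(log, 5, 2, 0);   /* prints "four", "five" */
    dmesg_tail(log, 5, -2, 1);  /* prints "two", "three" */
    return 0;
}

A negative count prints from the head of the buffer, a positive count prints the tail, and adjust shifts the window, matching the semantics the hunk preserves.
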
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
| @@ -205,7 +205,6 @@ extern char kdb_grep_string[]; | |||
| 205 | extern int kdb_grep_leading; | 205 | extern int kdb_grep_leading; |
| 206 | extern int kdb_grep_trailing; | 206 | extern int kdb_grep_trailing; |
| 207 | extern char *kdb_cmds[]; | 207 | extern char *kdb_cmds[]; |
| 208 | extern void kdb_syslog_data(char *syslog_data[]); | ||
| 209 | extern unsigned long kdb_task_state_string(const char *); | 208 | extern unsigned long kdb_task_state_string(const char *); |
| 210 | extern char kdb_task_state_char (const struct task_struct *); | 209 | extern char kdb_task_state_char (const struct task_struct *); |
| 211 | extern unsigned long kdb_task_state(const struct task_struct *p, | 210 | extern unsigned long kdb_task_state(const struct task_struct *p, |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) | |||
| 253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | 253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; |
| 254 | } | 254 | } |
| 255 | 255 | ||
| 256 | static inline void perf_get_cgroup(struct perf_event *event) | 256 | static inline bool perf_tryget_cgroup(struct perf_event *event) |
| 257 | { | 257 | { |
| 258 | css_get(&event->cgrp->css); | 258 | return css_tryget(&event->cgrp->css); |
| 259 | } | 259 | } |
| 260 | 260 | ||
| 261 | static inline void perf_put_cgroup(struct perf_event *event) | 261 | static inline void perf_put_cgroup(struct perf_event *event) |
| @@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 484 | event->cgrp = cgrp; | 484 | event->cgrp = cgrp; |
| 485 | 485 | ||
| 486 | /* must be done before we fput() the file */ | 486 | /* must be done before we fput() the file */ |
| 487 | perf_get_cgroup(event); | 487 | if (!perf_tryget_cgroup(event)) { |
| 488 | event->cgrp = NULL; | ||
| 489 | ret = -ENOENT; | ||
| 490 | goto out; | ||
| 491 | } | ||
| 488 | 492 | ||
| 489 | /* | 493 | /* |
| 490 | * all events in a group must monitor | 494 | * all events in a group must monitor |
| @@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event, | |||
| 3181 | event = event->group_leader; | 3185 | event = event->group_leader; |
| 3182 | 3186 | ||
| 3183 | perf_event_for_each_child(event, func); | 3187 | perf_event_for_each_child(event, func); |
| 3184 | func(event); | ||
| 3185 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3188 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
| 3186 | perf_event_for_each_child(sibling, func); | 3189 | perf_event_for_each_child(sibling, func); |
| 3187 | mutex_unlock(&ctx->mutex); | 3190 | mutex_unlock(&ctx->mutex); |
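
perf_cgroup_connect() previously took an unconditional reference with css_get(); the hunk above switches to css_tryget() and unwinds event->cgrp when the cgroup is already being torn down, returning -ENOENT instead of pinning a dying css. The shape of that pattern, with a hypothetical refcounted object standing in for the css (sketch only):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical refcounted object: tryget refuses new references once
 * teardown has started (stands in for css_tryget() semantics). */
struct obj {
    int refs;
    bool dying;
};

static bool obj_tryget(struct obj *o)
{
    if (o->dying)
        return false;
    o->refs++;
    return true;
}

/* Take the reference before relying on the pointer, and undo the partial
 * setup if the tryget fails rather than assuming a plain get succeeded. */
static int connect(struct obj **slot, struct obj *candidate)
{
    *slot = candidate;
    if (!obj_tryget(candidate)) {
        *slot = NULL;
        return -1;      /* caller sees a clean failure, e.g. -ENOENT */
    }
    return 0;
}

int main(void)
{
    struct obj alive = { .refs = 1, .dying = false };
    struct obj dying = { .refs = 1, .dying = true };
    struct obj *slot;

    printf("alive: %d\n", connect(&slot, &alive));   /* 0 */
    printf("dying: %d\n", connect(&slot, &dying));   /* -1 */
    return 0;
}
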
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
| 72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
| 73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
| 74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
| 75 | /* | ||
| 76 | * If we are the last child process in a pid namespace to be | ||
| 77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). | ||
| 78 | */ | ||
| 79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
| 80 | struct task_struct *parent = p->real_parent; | ||
| 81 | |||
| 82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
| 83 | list_empty(&parent->children) && | ||
| 84 | (parent->flags & PF_EXITING)) | ||
| 85 | wake_up_process(parent); | ||
| 86 | } | ||
| 75 | } | 87 | } |
| 76 | list_del_rcu(&p->thread_group); | 88 | list_del_rcu(&p->thread_group); |
| 77 | } | 89 | } |
| @@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) | |||
| 643 | mm_release(tsk, mm); | 655 | mm_release(tsk, mm); |
| 644 | if (!mm) | 656 | if (!mm) |
| 645 | return; | 657 | return; |
| 658 | sync_mm_rss(mm); | ||
| 646 | /* | 659 | /* |
| 647 | * Serialize with any possible pending coredump. | 660 | * Serialize with any possible pending coredump. |
| 648 | * We must hold mmap_sem around checking core_state | 661 | * We must hold mmap_sem around checking core_state |
| @@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
| 719 | 732 | ||
| 720 | zap_pid_ns_processes(pid_ns); | 733 | zap_pid_ns_processes(pid_ns); |
| 721 | write_lock_irq(&tasklist_lock); | 734 | write_lock_irq(&tasklist_lock); |
| 722 | /* | ||
| 723 | * We can not clear ->child_reaper or leave it alone. | ||
| 724 | * There may by stealth EXIT_DEAD tasks on ->children, | ||
| 725 | * forget_original_parent() must move them somewhere. | ||
| 726 | */ | ||
| 727 | pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
| 728 | } else if (father->signal->has_child_subreaper) { | 735 | } else if (father->signal->has_child_subreaper) { |
| 729 | struct task_struct *reaper; | 736 | struct task_struct *reaper; |
| 730 | 737 | ||
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 304 | } | 304 | } |
| 305 | 305 | ||
| 306 | err = arch_dup_task_struct(tsk, orig); | 306 | err = arch_dup_task_struct(tsk, orig); |
| 307 | if (err) | ||
| 308 | goto out; | ||
| 309 | 307 | ||
| 308 | /* | ||
| 309 | * We defer looking at err, because we will need this setup | ||
| 310 | * for the clean up path to work correctly. | ||
| 311 | */ | ||
| 310 | tsk->stack = ti; | 312 | tsk->stack = ti; |
| 311 | |||
| 312 | setup_thread_stack(tsk, orig); | 313 | setup_thread_stack(tsk, orig); |
| 314 | |||
| 315 | if (err) | ||
| 316 | goto out; | ||
| 317 | |||
| 313 | clear_user_return_notifier(tsk); | 318 | clear_user_return_notifier(tsk); |
| 314 | clear_tsk_need_resched(tsk); | 319 | clear_tsk_need_resched(tsk); |
| 315 | stackend = end_of_stack(tsk); | 320 | stackend = end_of_stack(tsk); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
| @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 657 | return 0; | 657 | return 0; |
| 658 | } | 658 | } |
| 659 | 659 | ||
| 660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | ||
| 661 | { | ||
| 662 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | ||
| 663 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | ||
| 664 | |||
| 665 | return ktime_get_update_offsets(offs_real, offs_boot); | ||
| 666 | } | ||
| 667 | |||
| 660 | /* | 668 | /* |
| 661 | * Retrigger next event is called after clock was set | 669 | * Retrigger next event is called after clock was set |
| 662 | * | 670 | * |
| @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 665 | static void retrigger_next_event(void *arg) | 673 | static void retrigger_next_event(void *arg) |
| 666 | { | 674 | { |
| 667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 675 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); |
| 668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
| 669 | 676 | ||
| 670 | if (!hrtimer_hres_active()) | 677 | if (!hrtimer_hres_active()) |
| 671 | return; | 678 | return; |
| 672 | 679 | ||
| 673 | /* Optimized out for !HIGH_RES */ | ||
| 674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
| 675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
| 676 | |||
| 677 | /* Adjust CLOCK_REALTIME offset */ | ||
| 678 | raw_spin_lock(&base->lock); | 680 | raw_spin_lock(&base->lock); |
| 679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | 681 | hrtimer_update_base(base); |
| 680 | timespec_to_ktime(realtime_offset); | ||
| 681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
| 682 | timespec_to_ktime(sleep); | ||
| 683 | |||
| 684 | hrtimer_force_reprogram(base, 0); | 682 | hrtimer_force_reprogram(base, 0); |
| 685 | raw_spin_unlock(&base->lock); | 683 | raw_spin_unlock(&base->lock); |
| 686 | } | 684 | } |
| @@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) | |||
| 710 | base->clock_base[i].resolution = KTIME_HIGH_RES; | 708 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
| 711 | 709 | ||
| 712 | tick_setup_sched_timer(); | 710 | tick_setup_sched_timer(); |
| 713 | |||
| 714 | /* "Retrigger" the interrupt to get things going */ | 711 | /* "Retrigger" the interrupt to get things going */ |
| 715 | retrigger_next_event(NULL); | 712 | retrigger_next_event(NULL); |
| 716 | local_irq_restore(flags); | 713 | local_irq_restore(flags); |
| 717 | return 1; | 714 | return 1; |
| 718 | } | 715 | } |
| 719 | 716 | ||
| 717 | /* | ||
| 718 | * Called from timekeeping code to reprogram the hrtimer interrupt | ||
| 719 | * device. If called from the timer interrupt context we defer it to | ||
| 720 | * softirq context. | ||
| 721 | */ | ||
| 722 | void clock_was_set_delayed(void) | ||
| 723 | { | ||
| 724 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 725 | |||
| 726 | cpu_base->clock_was_set = 1; | ||
| 727 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
| 728 | } | ||
| 729 | |||
| 720 | #else | 730 | #else |
| 721 | 731 | ||
| 722 | static inline int hrtimer_hres_active(void) { return 0; } | 732 | static inline int hrtimer_hres_active(void) { return 0; } |
| @@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
| 1250 | cpu_base->nr_events++; | 1260 | cpu_base->nr_events++; |
| 1251 | dev->next_event.tv64 = KTIME_MAX; | 1261 | dev->next_event.tv64 = KTIME_MAX; |
| 1252 | 1262 | ||
| 1253 | entry_time = now = ktime_get(); | 1263 | raw_spin_lock(&cpu_base->lock); |
| 1264 | entry_time = now = hrtimer_update_base(cpu_base); | ||
| 1254 | retry: | 1265 | retry: |
| 1255 | expires_next.tv64 = KTIME_MAX; | 1266 | expires_next.tv64 = KTIME_MAX; |
| 1256 | |||
| 1257 | raw_spin_lock(&cpu_base->lock); | ||
| 1258 | /* | 1267 | /* |
| 1259 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1268 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
| 1260 | * held to prevent that a timer is enqueued in our queue via | 1269 | * held to prevent that a timer is enqueued in our queue via |
| @@ -1330,8 +1339,12 @@ retry: | |||
| 1330 | * We need to prevent that we loop forever in the hrtimer | 1339 | * We need to prevent that we loop forever in the hrtimer |
| 1331 | * interrupt routine. We give it 3 attempts to avoid | 1340 | * interrupt routine. We give it 3 attempts to avoid |
| 1332 | * overreacting on some spurious event. | 1341 | * overreacting on some spurious event. |
| 1342 | * | ||
| 1343 | * Acquire base lock for updating the offsets and retrieving | ||
| 1344 | * the current time. | ||
| 1333 | */ | 1345 | */ |
| 1334 | now = ktime_get(); | 1346 | raw_spin_lock(&cpu_base->lock); |
| 1347 | now = hrtimer_update_base(cpu_base); | ||
| 1335 | cpu_base->nr_retries++; | 1348 | cpu_base->nr_retries++; |
| 1336 | if (++retries < 3) | 1349 | if (++retries < 3) |
| 1337 | goto retry; | 1350 | goto retry; |
| @@ -1343,6 +1356,7 @@ retry: | |||
| 1343 | */ | 1356 | */ |
| 1344 | cpu_base->nr_hangs++; | 1357 | cpu_base->nr_hangs++; |
| 1345 | cpu_base->hang_detected = 1; | 1358 | cpu_base->hang_detected = 1; |
| 1359 | raw_spin_unlock(&cpu_base->lock); | ||
| 1346 | delta = ktime_sub(now, entry_time); | 1360 | delta = ktime_sub(now, entry_time); |
| 1347 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | 1361 | if (delta.tv64 > cpu_base->max_hang_time.tv64) |
| 1348 | cpu_base->max_hang_time = delta; | 1362 | cpu_base->max_hang_time = delta; |
| @@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) | |||
| 1395 | 1409 | ||
| 1396 | static void run_hrtimer_softirq(struct softirq_action *h) | 1410 | static void run_hrtimer_softirq(struct softirq_action *h) |
| 1397 | { | 1411 | { |
| 1412 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 1413 | |||
| 1414 | if (cpu_base->clock_was_set) { | ||
| 1415 | cpu_base->clock_was_set = 0; | ||
| 1416 | clock_was_set(); | ||
| 1417 | } | ||
| 1418 | |||
| 1398 | hrtimer_peek_ahead_timers(); | 1419 | hrtimer_peek_ahead_timers(); |
| 1399 | } | 1420 | } |
| 1400 | 1421 | ||
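
Two related ideas sit in the hrtimer.c hunks: hrtimer_update_base() folds the CLOCK_REALTIME/CLOCK_BOOTTIME offset update and the time read into a single call made under cpu_base->lock, and clock_was_set_delayed() lets timekeeping code request a full clock_was_set() without running it in timer-interrupt context, by setting a flag that run_hrtimer_softirq() later acts on. A stripped-down sketch of that "set a flag now, do the heavy work in a safer context later" deferral (simplified: in the kernel the flag is per-CPU and the deferred context is HRTIMER_SOFTIRQ):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool clock_was_set_pending;

static void clock_was_set(void)
{
    printf("reprogramming per-CPU hrtimer bases\n");   /* the heavy work */
}

/* Called from a context where the full update must not run (e.g. the
 * timer interrupt itself): only record that work is pending. */
static void clock_was_set_delayed(void)
{
    atomic_store(&clock_was_set_pending, true);
    /* the kernel also raises HRTIMER_SOFTIRQ here */
}

/* Runs later, in the deferred (softirq-like) context. */
static void run_deferred(void)
{
    if (atomic_exchange(&clock_was_set_pending, false))
        clock_was_set();
}

int main(void)
{
    clock_was_set_delayed();    /* cheap, interrupt-safe */
    run_deferred();             /* does the real reprogramming */
    return 0;
}
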
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
| @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) | |||
| 275 | kstat_incr_irqs_this_cpu(irq, desc); | 275 | kstat_incr_irqs_this_cpu(irq, desc); |
| 276 | 276 | ||
| 277 | action = desc->action; | 277 | action = desc->action; |
| 278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) | 278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { |
| 279 | desc->istate |= IRQS_PENDING; | ||
| 279 | goto out_unlock; | 280 | goto out_unlock; |
| 281 | } | ||
| 280 | 282 | ||
| 281 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 283 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
| 282 | raw_spin_unlock_irq(&desc->lock); | 284 | raw_spin_unlock_irq(&desc->lock); |
| @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
| 324 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 326 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 325 | kstat_incr_irqs_this_cpu(irq, desc); | 327 | kstat_incr_irqs_this_cpu(irq, desc); |
| 326 | 328 | ||
| 327 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 329 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
| 330 | desc->istate |= IRQS_PENDING; | ||
| 328 | goto out_unlock; | 331 | goto out_unlock; |
| 332 | } | ||
| 329 | 333 | ||
| 330 | handle_irq_event(desc); | 334 | handle_irq_event(desc); |
| 331 | 335 | ||
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
| @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | |||
| 101 | 101 | ||
| 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
| 103 | 103 | ||
| 104 | extern int irq_do_set_affinity(struct irq_data *data, | ||
| 105 | const struct cpumask *dest, bool force); | ||
| 106 | |||
| 104 | /* Inline functions for support of irq chips on slow busses */ | 107 | /* Inline functions for support of irq chips on slow busses */ |
| 105 | static inline void chip_bus_lock(struct irq_desc *desc) | 108 | static inline void chip_bus_lock(struct irq_desc *desc) |
| 106 | { | 109 | { |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
| @@ -142,6 +142,25 @@ static inline void | |||
| 142 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | 142 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } |
| 143 | #endif | 143 | #endif |
| 144 | 144 | ||
| 145 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | ||
| 146 | bool force) | ||
| 147 | { | ||
| 148 | struct irq_desc *desc = irq_data_to_desc(data); | ||
| 149 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
| 150 | int ret; | ||
| 151 | |||
| 152 | ret = chip->irq_set_affinity(data, mask, false); | ||
| 153 | switch (ret) { | ||
| 154 | case IRQ_SET_MASK_OK: | ||
| 155 | cpumask_copy(data->affinity, mask); | ||
| 156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 157 | irq_set_thread_affinity(desc); | ||
| 158 | ret = 0; | ||
| 159 | } | ||
| 160 | |||
| 161 | return ret; | ||
| 162 | } | ||
| 163 | |||
| 145 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | 164 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) |
| 146 | { | 165 | { |
| 147 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 166 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
| @@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | |||
| 152 | return -EINVAL; | 171 | return -EINVAL; |
| 153 | 172 | ||
| 154 | if (irq_can_move_pcntxt(data)) { | 173 | if (irq_can_move_pcntxt(data)) { |
| 155 | ret = chip->irq_set_affinity(data, mask, false); | 174 | ret = irq_do_set_affinity(data, mask, false); |
| 156 | switch (ret) { | ||
| 157 | case IRQ_SET_MASK_OK: | ||
| 158 | cpumask_copy(data->affinity, mask); | ||
| 159 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 160 | irq_set_thread_affinity(desc); | ||
| 161 | ret = 0; | ||
| 162 | } | ||
| 163 | } else { | 175 | } else { |
| 164 | irqd_set_move_pending(data); | 176 | irqd_set_move_pending(data); |
| 165 | irq_copy_pending(desc, mask); | 177 | irq_copy_pending(desc, mask); |
| @@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); | |||
| 283 | static int | 295 | static int |
| 284 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | 296 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) |
| 285 | { | 297 | { |
| 286 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
| 287 | struct cpumask *set = irq_default_affinity; | 298 | struct cpumask *set = irq_default_affinity; |
| 288 | int ret, node = desc->irq_data.node; | 299 | int node = desc->irq_data.node; |
| 289 | 300 | ||
| 290 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 301 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
| 291 | if (!irq_can_set_affinity(irq)) | 302 | if (!irq_can_set_affinity(irq)) |
| @@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
| 311 | if (cpumask_intersects(mask, nodemask)) | 322 | if (cpumask_intersects(mask, nodemask)) |
| 312 | cpumask_and(mask, mask, nodemask); | 323 | cpumask_and(mask, mask, nodemask); |
| 313 | } | 324 | } |
| 314 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); | 325 | irq_do_set_affinity(&desc->irq_data, mask, false); |
| 315 | switch (ret) { | ||
| 316 | case IRQ_SET_MASK_OK: | ||
| 317 | cpumask_copy(desc->irq_data.affinity, mask); | ||
| 318 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 319 | irq_set_thread_affinity(desc); | ||
| 320 | } | ||
| 321 | return 0; | 326 | return 0; |
| 322 | } | 327 | } |
| 323 | #else | 328 | #else |
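
The manage.c change pulls the "call chip->irq_set_affinity() and interpret its return code" sequence into one helper, irq_do_set_affinity(), so __irq_set_affinity_locked(), setup_affinity() and (in the kernel/irq/migration.c diff below) irq_move_masked_irq() stop carrying three copies of the same switch. Note the deliberate fall-through: IRQ_SET_MASK_OK copies the mask and then, like IRQ_SET_MASK_OK_NOCOPY, also updates the threaded-handler affinity. A self-contained sketch of that shape with made-up types:

#include <stdio.h>
#include <string.h>

/* Return codes modelled on IRQ_SET_MASK_OK / IRQ_SET_MASK_OK_NOCOPY. */
enum set_mask_ret { SET_MASK_OK, SET_MASK_OK_NOCOPY, SET_MASK_ERR = -1 };

struct fake_irq {
    char affinity[32];          /* stored affinity (stand-in for a cpumask) */
    enum set_mask_ret (*set_affinity)(struct fake_irq *, const char *mask);
};

/* One shared helper instead of three copies of the same switch.  The
 * fall-through is intentional: OK stores the mask and then, like
 * OK_NOCOPY, also updates the threaded-handler affinity. */
static int do_set_affinity(struct fake_irq *irq, const char *mask)
{
    int ret = irq->set_affinity(irq, mask);

    switch (ret) {
    case SET_MASK_OK:
        strncpy(irq->affinity, mask, sizeof(irq->affinity) - 1);
        /* fall through */
    case SET_MASK_OK_NOCOPY:
        printf("update thread affinity to %s\n", mask);
        ret = 0;
    }
    return ret;
}

static enum set_mask_ret chip_set_affinity(struct fake_irq *irq, const char *mask)
{
    (void)irq; (void)mask;
    return SET_MASK_OK;         /* pretend the chip accepted the mask */
}

int main(void)
{
    struct fake_irq irq = { .affinity = "", .set_affinity = chip_set_affinity };

    return do_set_affinity(&irq, "0-3");
}
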
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
| @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) | |||
| 42 | * For correct operation this depends on the caller | 42 | * For correct operation this depends on the caller |
| 43 | * masking the irqs. | 43 | * masking the irqs. |
| 44 | */ | 44 | */ |
| 45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) |
| 46 | < nr_cpu_ids)) { | 46 | irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); |
| 47 | int ret = chip->irq_set_affinity(&desc->irq_data, | ||
| 48 | desc->pending_mask, false); | ||
| 49 | switch (ret) { | ||
| 50 | case IRQ_SET_MASK_OK: | ||
| 51 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
| 52 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 53 | irq_set_thread_affinity(desc); | ||
| 54 | } | ||
| 55 | } | ||
| 56 | 47 | ||
| 57 | cpumask_clear(desc->pending_mask); | 48 | cpumask_clear(desc->pending_mask); |
| 58 | } | 49 | } |
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
| @@ -27,7 +27,7 @@ | |||
| 27 | #define PANIC_TIMER_STEP 100 | 27 | #define PANIC_TIMER_STEP 100 |
| 28 | #define PANIC_BLINK_SPD 18 | 28 | #define PANIC_BLINK_SPD 18 |
| 29 | 29 | ||
| 30 | int panic_on_oops; | 30 | int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; |
| 31 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
| 32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
| 33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
| @@ -108,8 +108,6 @@ void panic(const char *fmt, ...) | |||
| 108 | */ | 108 | */ |
| 109 | crash_kexec(NULL); | 109 | crash_kexec(NULL); |
| 110 | 110 | ||
| 111 | kmsg_dump(KMSG_DUMP_PANIC); | ||
| 112 | |||
| 113 | /* | 111 | /* |
| 114 | * Note smp_send_stop is the usual smp shutdown function, which | 112 | * Note smp_send_stop is the usual smp shutdown function, which |
| 115 | * unfortunately means it may not be hardened to work in a panic | 113 | * unfortunately means it may not be hardened to work in a panic |
| @@ -117,6 +115,8 @@ void panic(const char *fmt, ...) | |||
| 117 | */ | 115 | */ |
| 118 | smp_send_stop(); | 116 | smp_send_stop(); |
| 119 | 117 | ||
| 118 | kmsg_dump(KMSG_DUMP_PANIC); | ||
| 119 | |||
| 120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
| 121 | 121 | ||
| 122 | bust_spinlocks(0); | 122 | bust_spinlocks(0); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
| @@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 184 | } | 184 | } |
| 185 | read_unlock(&tasklist_lock); | 185 | read_unlock(&tasklist_lock); |
| 186 | 186 | ||
| 187 | /* Firstly reap the EXIT_ZOMBIE children we may have. */ | ||
| 187 | do { | 188 | do { |
| 188 | clear_thread_flag(TIF_SIGPENDING); | 189 | clear_thread_flag(TIF_SIGPENDING); |
| 189 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 190 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
| 190 | } while (rc != -ECHILD); | 191 | } while (rc != -ECHILD); |
| 191 | 192 | ||
| 193 | /* | ||
| 194 | * sys_wait4() above can't reap the TASK_DEAD children. | ||
| 195 | * Make sure they all go away, see __unhash_process(). | ||
| 196 | */ | ||
| 197 | for (;;) { | ||
| 198 | bool need_wait = false; | ||
| 199 | |||
| 200 | read_lock(&tasklist_lock); | ||
| 201 | if (!list_empty(¤t->children)) { | ||
| 202 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 203 | need_wait = true; | ||
| 204 | } | ||
| 205 | read_unlock(&tasklist_lock); | ||
| 206 | |||
| 207 | if (!need_wait) | ||
| 208 | break; | ||
| 209 | schedule(); | ||
| 210 | } | ||
| 211 | |||
| 192 | if (pid_ns->reboot) | 212 | if (pid_ns->reboot) |
| 193 | current->signal->group_exit_code = pid_ns->reboot; | 213 | current->signal->group_exit_code = pid_ns->reboot; |
| 194 | 214 | ||
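
zap_pid_ns_processes() can only reap EXIT_ZOMBIE children through sys_wait4(); the added loop makes the namespace's reaper block until the EXIT_DEAD stragglers have also gone through __unhash_process(), which now wakes it (see the kernel/exit.c hunk above). The kernel does this with __set_current_state()/schedule() and wake_up_process(); below is a user-space analogue of the same "sleep until the children list drains" loop, using a condition variable instead (sketch only, build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reaped = PTHREAD_COND_INITIALIZER;
static int nr_children = 3;

static void *child(void *arg)
{
    (void)arg;
    sleep(1);                           /* pretend to run and exit */
    pthread_mutex_lock(&lock);
    if (--nr_children == 0)
        pthread_cond_signal(&reaped);   /* like wake_up_process(parent) */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t[3];

    for (int i = 0; i < 3; i++)
        pthread_create(&t[i], NULL, child, NULL);

    pthread_mutex_lock(&lock);
    while (nr_children != 0)            /* re-check after every wakeup */
        pthread_cond_wait(&reaped, &lock);
    pthread_mutex_unlock(&lock);

    for (int i = 0; i < 3; i++)
        pthread_join(t[i], NULL);
    printf("all children reaped\n");
    return 0;
}
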
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..238025f5472e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
| @@ -27,7 +27,6 @@ | |||
| 27 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
| 28 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
| 29 | #include <linux/genhd.h> | 29 | #include <linux/genhd.h> |
| 30 | #include <scsi/scsi_scan.h> | ||
| 31 | 30 | ||
| 32 | #include "power.h" | 31 | #include "power.h" |
| 33 | 32 | ||
| @@ -748,13 +747,6 @@ static int software_resume(void) | |||
| 748 | async_synchronize_full(); | 747 | async_synchronize_full(); |
| 749 | } | 748 | } |
| 750 | 749 | ||
| 751 | /* | ||
| 752 | * We can't depend on SCSI devices being available after loading | ||
| 753 | * one of their modules until scsi_complete_async_scans() is | ||
| 754 | * called and the resume device usually is a SCSI one. | ||
| 755 | */ | ||
| 756 | scsi_complete_async_scans(); | ||
| 757 | |||
| 758 | swsusp_resume_device = name_to_dev_t(resume_file); | 750 | swsusp_resume_device = name_to_dev_t(resume_file); |
| 759 | if (!swsusp_resume_device) { | 751 | if (!swsusp_resume_device) { |
| 760 | error = -ENODEV; | 752 | error = -ENODEV; |
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
| @@ -24,7 +24,6 @@ | |||
| 24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
| 25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
| 26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
| 27 | #include <scsi/scsi_scan.h> | ||
| 28 | 27 | ||
| 29 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
| 30 | 29 | ||
| @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 84 | * appear. | 83 | * appear. |
| 85 | */ | 84 | */ |
| 86 | wait_for_device_probe(); | 85 | wait_for_device_probe(); |
| 87 | scsi_complete_async_scans(); | ||
| 88 | 86 | ||
| 89 | data->swap = -1; | 87 | data->swap = -1; |
| 90 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..ac4bc9e79465 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
| @@ -193,12 +193,21 @@ static int console_may_schedule; | |||
| 193 | * separated by ',', and find the message after the ';' character. | 193 | * separated by ',', and find the message after the ';' character. |
| 194 | */ | 194 | */ |
| 195 | 195 | ||
| 196 | enum log_flags { | ||
| 197 | LOG_NOCONS = 1, /* already flushed, do not print to console */ | ||
| 198 | LOG_NEWLINE = 2, /* text ended with a newline */ | ||
| 199 | LOG_PREFIX = 4, /* text started with a prefix */ | ||
| 200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | ||
| 201 | }; | ||
| 202 | |||
| 196 | struct log { | 203 | struct log { |
| 197 | u64 ts_nsec; /* timestamp in nanoseconds */ | 204 | u64 ts_nsec; /* timestamp in nanoseconds */ |
| 198 | u16 len; /* length of entire record */ | 205 | u16 len; /* length of entire record */ |
| 199 | u16 text_len; /* length of text buffer */ | 206 | u16 text_len; /* length of text buffer */ |
| 200 | u16 dict_len; /* length of dictionary buffer */ | 207 | u16 dict_len; /* length of dictionary buffer */ |
| 201 | u16 level; /* syslog level + facility */ | 208 | u8 facility; /* syslog facility */ |
| 209 | u8 flags:5; /* internal record flags */ | ||
| 210 | u8 level:3; /* syslog level */ | ||
| 202 | }; | 211 | }; |
| 203 | 212 | ||
| 204 | /* | 213 | /* |
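
The struct log change above stops storing a packed "facility << 3 | level" value and instead keeps the facility, a 3-bit level and 5 bits of record flags separately; readers such as devkmsg_read() and print_prefix() rebuild the classic syslog prefix on the way out. A small stand-alone model of that split and of the prefix reconstruction (field widths taken from the hunk, everything else illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct log_rec {
    uint8_t facility;       /* syslog facility */
    uint8_t flags   : 5;    /* internal record flags */
    uint8_t level   : 3;    /* syslog level, 0..7 */
};

/* The user-visible syslog prefix keeps the old packed encoding. */
static unsigned int syslog_prefix(const struct log_rec *msg)
{
    return (msg->facility << 3) | msg->level;
}

int main(void)
{
    struct log_rec msg = { .facility = 1 /* user */, .level = 6 /* info */ };

    assert(syslog_prefix(&msg) == 14);          /* <14> = user.info */
    printf("<%u>example message\n", syslog_prefix(&msg));
    return 0;
}
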
| @@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); | |||
| 210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | 219 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ |
| 211 | static u64 syslog_seq; | 220 | static u64 syslog_seq; |
| 212 | static u32 syslog_idx; | 221 | static u32 syslog_idx; |
| 222 | static enum log_flags syslog_prev; | ||
| 223 | static size_t syslog_partial; | ||
| 213 | 224 | ||
| 214 | /* index and sequence number of the first record stored in the buffer */ | 225 | /* index and sequence number of the first record stored in the buffer */ |
| 215 | static u64 log_first_seq; | 226 | static u64 log_first_seq; |
| @@ -227,10 +238,10 @@ static u32 clear_idx; | |||
| 227 | #define LOG_LINE_MAX 1024 | 238 | #define LOG_LINE_MAX 1024 |
| 228 | 239 | ||
| 229 | /* record buffer */ | 240 | /* record buffer */ |
| 230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 241 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
| 231 | #define LOG_ALIGN 4 | 242 | #define LOG_ALIGN 4 |
| 232 | #else | 243 | #else |
| 233 | #define LOG_ALIGN 8 | 244 | #define LOG_ALIGN __alignof__(struct log) |
| 234 | #endif | 245 | #endif |
| 235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 246 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
| 236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | 247 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); |
| @@ -286,6 +297,7 @@ static u32 log_next(u32 idx) | |||
| 286 | 297 | ||
| 287 | /* insert record into the buffer, discard old ones, update heads */ | 298 | /* insert record into the buffer, discard old ones, update heads */ |
| 288 | static void log_store(int facility, int level, | 299 | static void log_store(int facility, int level, |
| 300 | enum log_flags flags, u64 ts_nsec, | ||
| 289 | const char *dict, u16 dict_len, | 301 | const char *dict, u16 dict_len, |
| 290 | const char *text, u16 text_len) | 302 | const char *text, u16 text_len) |
| 291 | { | 303 | { |
| @@ -329,8 +341,13 @@ static void log_store(int facility, int level, | |||
| 329 | msg->text_len = text_len; | 341 | msg->text_len = text_len; |
| 330 | memcpy(log_dict(msg), dict, dict_len); | 342 | memcpy(log_dict(msg), dict, dict_len); |
| 331 | msg->dict_len = dict_len; | 343 | msg->dict_len = dict_len; |
| 332 | msg->level = (facility << 3) | (level & 7); | 344 | msg->facility = facility; |
| 333 | msg->ts_nsec = local_clock(); | 345 | msg->level = level & 7; |
| 346 | msg->flags = flags & 0x1f; | ||
| 347 | if (ts_nsec > 0) | ||
| 348 | msg->ts_nsec = ts_nsec; | ||
| 349 | else | ||
| 350 | msg->ts_nsec = local_clock(); | ||
| 334 | memset(log_dict(msg) + dict_len, 0, pad_len); | 351 | memset(log_dict(msg) + dict_len, 0, pad_len); |
| 335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | 352 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; |
| 336 | 353 | ||
| @@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 414 | if (!user) | 431 | if (!user) |
| 415 | return -EBADF; | 432 | return -EBADF; |
| 416 | 433 | ||
| 417 | mutex_lock(&user->lock); | 434 | ret = mutex_lock_interruptible(&user->lock); |
| 418 | raw_spin_lock(&logbuf_lock); | 435 | if (ret) |
| 436 | return ret; | ||
| 437 | raw_spin_lock_irq(&logbuf_lock); | ||
| 419 | while (user->seq == log_next_seq) { | 438 | while (user->seq == log_next_seq) { |
| 420 | if (file->f_flags & O_NONBLOCK) { | 439 | if (file->f_flags & O_NONBLOCK) { |
| 421 | ret = -EAGAIN; | 440 | ret = -EAGAIN; |
| 422 | raw_spin_unlock(&logbuf_lock); | 441 | raw_spin_unlock_irq(&logbuf_lock); |
| 423 | goto out; | 442 | goto out; |
| 424 | } | 443 | } |
| 425 | 444 | ||
| 426 | raw_spin_unlock(&logbuf_lock); | 445 | raw_spin_unlock_irq(&logbuf_lock); |
| 427 | ret = wait_event_interruptible(log_wait, | 446 | ret = wait_event_interruptible(log_wait, |
| 428 | user->seq != log_next_seq); | 447 | user->seq != log_next_seq); |
| 429 | if (ret) | 448 | if (ret) |
| 430 | goto out; | 449 | goto out; |
| 431 | raw_spin_lock(&logbuf_lock); | 450 | raw_spin_lock_irq(&logbuf_lock); |
| 432 | } | 451 | } |
| 433 | 452 | ||
| 434 | if (user->seq < log_first_seq) { | 453 | if (user->seq < log_first_seq) { |
| @@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 436 | user->idx = log_first_idx; | 455 | user->idx = log_first_idx; |
| 437 | user->seq = log_first_seq; | 456 | user->seq = log_first_seq; |
| 438 | ret = -EPIPE; | 457 | ret = -EPIPE; |
| 439 | raw_spin_unlock(&logbuf_lock); | 458 | raw_spin_unlock_irq(&logbuf_lock); |
| 440 | goto out; | 459 | goto out; |
| 441 | } | 460 | } |
| 442 | 461 | ||
| @@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 444 | ts_usec = msg->ts_nsec; | 463 | ts_usec = msg->ts_nsec; |
| 445 | do_div(ts_usec, 1000); | 464 | do_div(ts_usec, 1000); |
| 446 | len = sprintf(user->buf, "%u,%llu,%llu;", | 465 | len = sprintf(user->buf, "%u,%llu,%llu;", |
| 447 | msg->level, user->seq, ts_usec); | 466 | (msg->facility << 3) | msg->level, user->seq, ts_usec); |
| 448 | 467 | ||
| 449 | /* escape non-printable characters */ | 468 | /* escape non-printable characters */ |
| 450 | for (i = 0; i < msg->text_len; i++) { | 469 | for (i = 0; i < msg->text_len; i++) { |
| 451 | unsigned char c = log_text(msg)[i]; | 470 | unsigned char c = log_text(msg)[i]; |
| 452 | 471 | ||
| 453 | if (c < ' ' || c >= 128) | 472 | if (c < ' ' || c >= 127 || c == '\\') |
| 454 | len += sprintf(user->buf + len, "\\x%02x", c); | 473 | len += sprintf(user->buf + len, "\\x%02x", c); |
| 455 | else | 474 | else |
| 456 | user->buf[len++] = c; | 475 | user->buf[len++] = c; |
| @@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 474 | continue; | 493 | continue; |
| 475 | } | 494 | } |
| 476 | 495 | ||
| 477 | if (c < ' ' || c >= 128) { | 496 | if (c < ' ' || c >= 127 || c == '\\') { |
| 478 | len += sprintf(user->buf + len, "\\x%02x", c); | 497 | len += sprintf(user->buf + len, "\\x%02x", c); |
| 479 | continue; | 498 | continue; |
| 480 | } | 499 | } |
| @@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 486 | 505 | ||
| 487 | user->idx = log_next(user->idx); | 506 | user->idx = log_next(user->idx); |
| 488 | user->seq++; | 507 | user->seq++; |
| 489 | raw_spin_unlock(&logbuf_lock); | 508 | raw_spin_unlock_irq(&logbuf_lock); |
| 490 | 509 | ||
| 491 | if (len > count) { | 510 | if (len > count) { |
| 492 | ret = -EINVAL; | 511 | ret = -EINVAL; |
| @@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 513 | if (offset) | 532 | if (offset) |
| 514 | return -ESPIPE; | 533 | return -ESPIPE; |
| 515 | 534 | ||
| 516 | raw_spin_lock(&logbuf_lock); | 535 | raw_spin_lock_irq(&logbuf_lock); |
| 517 | switch (whence) { | 536 | switch (whence) { |
| 518 | case SEEK_SET: | 537 | case SEEK_SET: |
| 519 | /* the first record */ | 538 | /* the first record */ |
| @@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 537 | default: | 556 | default: |
| 538 | ret = -EINVAL; | 557 | ret = -EINVAL; |
| 539 | } | 558 | } |
| 540 | raw_spin_unlock(&logbuf_lock); | 559 | raw_spin_unlock_irq(&logbuf_lock); |
| 541 | return ret; | 560 | return ret; |
| 542 | } | 561 | } |
| 543 | 562 | ||
| @@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
| 551 | 570 | ||
| 552 | poll_wait(file, &log_wait, wait); | 571 | poll_wait(file, &log_wait, wait); |
| 553 | 572 | ||
| 554 | raw_spin_lock(&logbuf_lock); | 573 | raw_spin_lock_irq(&logbuf_lock); |
| 555 | if (user->seq < log_next_seq) { | 574 | if (user->seq < log_next_seq) { |
| 556 | /* return error when data has vanished underneath us */ | 575 | /* return error when data has vanished underneath us */ |
| 557 | if (user->seq < log_first_seq) | 576 | if (user->seq < log_first_seq) |
| 558 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | 577 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; |
| 559 | ret = POLLIN|POLLRDNORM; | 578 | ret = POLLIN|POLLRDNORM; |
| 560 | } | 579 | } |
| 561 | raw_spin_unlock(&logbuf_lock); | 580 | raw_spin_unlock_irq(&logbuf_lock); |
| 562 | 581 | ||
| 563 | return ret; | 582 | return ret; |
| 564 | } | 583 | } |
| @@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) | |||
| 582 | 601 | ||
| 583 | mutex_init(&user->lock); | 602 | mutex_init(&user->lock); |
| 584 | 603 | ||
| 585 | raw_spin_lock(&logbuf_lock); | 604 | raw_spin_lock_irq(&logbuf_lock); |
| 586 | user->idx = log_first_idx; | 605 | user->idx = log_first_idx; |
| 587 | user->seq = log_first_seq; | 606 | user->seq = log_first_seq; |
| 588 | raw_spin_unlock(&logbuf_lock); | 607 | raw_spin_unlock_irq(&logbuf_lock); |
| 589 | 608 | ||
| 590 | file->private_data = user; | 609 | file->private_data = user; |
| 591 | return 0; | 610 | return 0; |
| @@ -785,44 +804,64 @@ static bool printk_time; | |||
| 785 | #endif | 804 | #endif |
| 786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 805 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
| 787 | 806 | ||
| 807 | static size_t print_time(u64 ts, char *buf) | ||
| 808 | { | ||
| 809 | unsigned long rem_nsec; | ||
| 810 | |||
| 811 | if (!printk_time) | ||
| 812 | return 0; | ||
| 813 | |||
| 814 | if (!buf) | ||
| 815 | return 15; | ||
| 816 | |||
| 817 | rem_nsec = do_div(ts, 1000000000); | ||
| 818 | return sprintf(buf, "[%5lu.%06lu] ", | ||
| 819 | (unsigned long)ts, rem_nsec / 1000); | ||
| 820 | } | ||
| 821 | |||
| 788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | 822 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) |
| 789 | { | 823 | { |
| 790 | size_t len = 0; | 824 | size_t len = 0; |
| 825 | unsigned int prefix = (msg->facility << 3) | msg->level; | ||
| 791 | 826 | ||
| 792 | if (syslog) { | 827 | if (syslog) { |
| 793 | if (buf) { | 828 | if (buf) { |
| 794 | len += sprintf(buf, "<%u>", msg->level); | 829 | len += sprintf(buf, "<%u>", prefix); |
| 795 | } else { | 830 | } else { |
| 796 | len += 3; | 831 | len += 3; |
| 797 | if (msg->level > 9) | 832 | if (prefix > 999) |
| 798 | len++; | 833 | len += 3; |
| 799 | if (msg->level > 99) | 834 | else if (prefix > 99) |
| 835 | len += 2; | ||
| 836 | else if (prefix > 9) | ||
| 800 | len++; | 837 | len++; |
| 801 | } | 838 | } |
| 802 | } | 839 | } |
| 803 | 840 | ||
| 804 | if (printk_time) { | 841 | len += print_time(msg->ts_nsec, buf ? buf + len : NULL); |
| 805 | if (buf) { | ||
| 806 | unsigned long long ts = msg->ts_nsec; | ||
| 807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
| 808 | |||
| 809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
| 810 | (unsigned long) ts, rem_nsec / 1000); | ||
| 811 | } else { | ||
| 812 | len += 15; | ||
| 813 | } | ||
| 814 | } | ||
| 815 | |||
| 816 | return len; | 842 | return len; |
| 817 | } | 843 | } |
| 818 | 844 | ||
| 819 | static size_t msg_print_text(const struct log *msg, bool syslog, | 845 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
| 820 | char *buf, size_t size) | 846 | bool syslog, char *buf, size_t size) |
| 821 | { | 847 | { |
| 822 | const char *text = log_text(msg); | 848 | const char *text = log_text(msg); |
| 823 | size_t text_size = msg->text_len; | 849 | size_t text_size = msg->text_len; |
| 850 | bool prefix = true; | ||
| 851 | bool newline = true; | ||
| 824 | size_t len = 0; | 852 | size_t len = 0; |
| 825 | 853 | ||
| 854 | if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) | ||
| 855 | prefix = false; | ||
| 856 | |||
| 857 | if (msg->flags & LOG_CONT) { | ||
| 858 | if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) | ||
| 859 | prefix = false; | ||
| 860 | |||
| 861 | if (!(msg->flags & LOG_NEWLINE)) | ||
| 862 | newline = false; | ||
| 863 | } | ||
| 864 | |||
| 826 | do { | 865 | do { |
| 827 | const char *next = memchr(text, '\n', text_size); | 866 | const char *next = memchr(text, '\n', text_size); |
| 828 | size_t text_len; | 867 | size_t text_len; |
| @@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, | |||
| 840 | text_len + 1 >= size - len) | 879 | text_len + 1 >= size - len) |
| 841 | break; | 880 | break; |
| 842 | 881 | ||
| 843 | len += print_prefix(msg, syslog, buf + len); | 882 | if (prefix) |
| 883 | len += print_prefix(msg, syslog, buf + len); | ||
| 844 | memcpy(buf + len, text, text_len); | 884 | memcpy(buf + len, text, text_len); |
| 845 | len += text_len; | 885 | len += text_len; |
| 846 | buf[len++] = '\n'; | 886 | if (next || newline) |
| 887 | buf[len++] = '\n'; | ||
| 847 | } else { | 888 | } else { |
| 848 | /* SYSLOG_ACTION_* buffer size only calculation */ | 889 | /* SYSLOG_ACTION_* buffer size only calculation */ |
| 849 | len += print_prefix(msg, syslog, NULL); | 890 | if (prefix) |
| 850 | len += text_len + 1; | 891 | len += print_prefix(msg, syslog, NULL); |
| 892 | len += text_len; | ||
| 893 | if (next || newline) | ||
| 894 | len++; | ||
| 851 | } | 895 | } |
| 852 | 896 | ||
| 897 | prefix = true; | ||
| 853 | text = next; | 898 | text = next; |
| 854 | } while (text); | 899 | } while (text); |
| 855 | 900 | ||
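
print_time() and print_prefix() above serve double duty: with a buffer they format, and with a NULL buffer they only report how many bytes they would have written, which is what msg_print_text()'s size-only mode and the SYSLOG_ACTION_SIZE_UNREAD path rely on (print_time() simply answers 15 for the sizing pass). The same format-or-measure idiom in isolation (sketch; snprintf(NULL, 0, ...) is used here to get an exact length):

#include <stdio.h>

/* Dual-purpose helper modelled on print_prefix(): with a buffer it formats,
 * with NULL it only measures. */
static size_t emit_prefix(unsigned int prefix, char *buf)
{
    if (buf)
        return (size_t)sprintf(buf, "<%u>", prefix);
    return (size_t)snprintf(NULL, 0, "<%u>", prefix);
}

int main(void)
{
    char buf[32];
    size_t need = emit_prefix(14, NULL);    /* sizing pass */

    if (need < sizeof(buf)) {
        emit_prefix(14, buf);               /* formatting pass */
        printf("%zu bytes: %s\n", need, buf);
    }
    return 0;
}
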
| @@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size) | |||
| 860 | { | 905 | { |
| 861 | char *text; | 906 | char *text; |
| 862 | struct log *msg; | 907 | struct log *msg; |
| 863 | int len; | 908 | int len = 0; |
| 864 | 909 | ||
| 865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | 910 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); |
| 866 | if (!text) | 911 | if (!text) |
| 867 | return -ENOMEM; | 912 | return -ENOMEM; |
| 868 | 913 | ||
| 869 | raw_spin_lock_irq(&logbuf_lock); | 914 | while (size > 0) { |
| 870 | if (syslog_seq < log_first_seq) { | 915 | size_t n; |
| 871 | /* messages are gone, move to first one */ | 916 | size_t skip; |
| 872 | syslog_seq = log_first_seq; | ||
| 873 | syslog_idx = log_first_idx; | ||
| 874 | } | ||
| 875 | msg = log_from_idx(syslog_idx); | ||
| 876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
| 877 | syslog_idx = log_next(syslog_idx); | ||
| 878 | syslog_seq++; | ||
| 879 | raw_spin_unlock_irq(&logbuf_lock); | ||
| 880 | 917 | ||
| 881 | if (len > 0 && copy_to_user(buf, text, len)) | 918 | raw_spin_lock_irq(&logbuf_lock); |
| 882 | len = -EFAULT; | 919 | if (syslog_seq < log_first_seq) { |
| 920 | /* messages are gone, move to first one */ | ||
| 921 | syslog_seq = log_first_seq; | ||
| 922 | syslog_idx = log_first_idx; | ||
| 923 | syslog_prev = 0; | ||
| 924 | syslog_partial = 0; | ||
| 925 | } | ||
| 926 | if (syslog_seq == log_next_seq) { | ||
| 927 | raw_spin_unlock_irq(&logbuf_lock); | ||
| 928 | break; | ||
| 929 | } | ||
| 930 | |||
| 931 | skip = syslog_partial; | ||
| 932 | msg = log_from_idx(syslog_idx); | ||
| 933 | n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); | ||
| 934 | if (n - syslog_partial <= size) { | ||
| 935 | /* message fits into buffer, move forward */ | ||
| 936 | syslog_idx = log_next(syslog_idx); | ||
| 937 | syslog_seq++; | ||
| 938 | syslog_prev = msg->flags; | ||
| 939 | n -= syslog_partial; | ||
| 940 | syslog_partial = 0; | ||
| 941 | } else if (!len) { | ||
| 942 | /* partial read(), remember position */ | ||
| 943 | n = size; | ||
| 944 | syslog_partial += n; | ||
| 945 | } else | ||
| 946 | n = 0; | ||
| 947 | raw_spin_unlock_irq(&logbuf_lock); | ||
| 948 | |||
| 949 | if (!n) | ||
| 950 | break; | ||
| 951 | |||
| 952 | if (copy_to_user(buf, text + skip, n)) { | ||
| 953 | if (!len) | ||
| 954 | len = -EFAULT; | ||
| 955 | break; | ||
| 956 | } | ||
| 957 | |||
| 958 | len += n; | ||
| 959 | size -= n; | ||
| 960 | buf += n; | ||
| 961 | } | ||
| 883 | 962 | ||
| 884 | kfree(text); | 963 | kfree(text); |
| 885 | return len; | 964 | return len; |
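
syslog_print() now loops record by record and keeps syslog_partial, the number of bytes of the current record already handed to user space, so a read() with a small buffer can resume mid-record instead of dropping the tail. A user-space sketch of that resumable-reader bookkeeping (the kernel version additionally re-takes logbuf_lock for every record and only returns a partial chunk when nothing has been copied yet in the current call):

#include <stdio.h>
#include <string.h>

struct reader {
    const char **rec;       /* array of records */
    int nrec, idx;
    size_t partial;         /* bytes of rec[idx] already returned */
};

static size_t reader_read(struct reader *r, char *buf, size_t size)
{
    size_t copied = 0;

    while (size > 0 && r->idx < r->nrec) {
        const char *rec = r->rec[r->idx];
        size_t rest = strlen(rec) - r->partial;
        size_t n = rest <= size ? rest : size;

        memcpy(buf + copied, rec + r->partial, n);
        copied += n;
        size -= n;
        if (n == rest) {            /* record fully consumed, move on */
            r->idx++;
            r->partial = 0;
        } else {                    /* partial read, remember position */
            r->partial += n;
            break;
        }
    }
    return copied;
}

int main(void)
{
    const char *recs[] = { "first record\n", "second record\n" };
    struct reader r = { recs, 2, 0, 0 };
    char buf[8];
    size_t n;

    while ((n = reader_read(&r, buf, sizeof(buf))) > 0)
        printf("%.*s|", (int)n, buf);   /* '|' marks each read() call */
    putchar('\n');
    return 0;
}
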
| @@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 899 | u64 next_seq; | 978 | u64 next_seq; |
| 900 | u64 seq; | 979 | u64 seq; |
| 901 | u32 idx; | 980 | u32 idx; |
| 981 | enum log_flags prev; | ||
| 902 | 982 | ||
| 903 | if (clear_seq < log_first_seq) { | 983 | if (clear_seq < log_first_seq) { |
| 904 | /* messages are gone, move to first available one */ | 984 | /* messages are gone, move to first available one */ |
| @@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 909 | /* | 989 | /* |
| 910 | * Find first record that fits, including all following records, | 990 | * Find first record that fits, including all following records, |
| 911 | * into the user-provided buffer for this dump. | 991 | * into the user-provided buffer for this dump. |
| 912 | */ | 992 | */ |
| 913 | seq = clear_seq; | 993 | seq = clear_seq; |
| 914 | idx = clear_idx; | 994 | idx = clear_idx; |
| 995 | prev = 0; | ||
| 915 | while (seq < log_next_seq) { | 996 | while (seq < log_next_seq) { |
| 916 | struct log *msg = log_from_idx(idx); | 997 | struct log *msg = log_from_idx(idx); |
| 917 | 998 | ||
| 918 | len += msg_print_text(msg, true, NULL, 0); | 999 | len += msg_print_text(msg, prev, true, NULL, 0); |
| 919 | idx = log_next(idx); | 1000 | idx = log_next(idx); |
| 920 | seq++; | 1001 | seq++; |
| 921 | } | 1002 | } |
| 1003 | |||
| 1004 | /* move first record forward until length fits into the buffer */ | ||
| 922 | seq = clear_seq; | 1005 | seq = clear_seq; |
| 923 | idx = clear_idx; | 1006 | idx = clear_idx; |
| 1007 | prev = 0; | ||
| 924 | while (len > size && seq < log_next_seq) { | 1008 | while (len > size && seq < log_next_seq) { |
| 925 | struct log *msg = log_from_idx(idx); | 1009 | struct log *msg = log_from_idx(idx); |
| 926 | 1010 | ||
| 927 | len -= msg_print_text(msg, true, NULL, 0); | 1011 | len -= msg_print_text(msg, prev, true, NULL, 0); |
| 928 | idx = log_next(idx); | 1012 | idx = log_next(idx); |
| 929 | seq++; | 1013 | seq++; |
| 930 | } | 1014 | } |
| 931 | 1015 | ||
| 932 | /* last message in this dump */ | 1016 | /* last message fitting into this dump */ |
| 933 | next_seq = log_next_seq; | 1017 | next_seq = log_next_seq; |
| 934 | 1018 | ||
| 935 | len = 0; | 1019 | len = 0; |
| 1020 | prev = 0; | ||
| 936 | while (len >= 0 && seq < next_seq) { | 1021 | while (len >= 0 && seq < next_seq) { |
| 937 | struct log *msg = log_from_idx(idx); | 1022 | struct log *msg = log_from_idx(idx); |
| 938 | int textlen; | 1023 | int textlen; |
| 939 | 1024 | ||
| 940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | 1025 | textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); |
| 941 | if (textlen < 0) { | 1026 | if (textlen < 0) { |
| 942 | len = textlen; | 1027 | len = textlen; |
| 943 | break; | 1028 | break; |
| 944 | } | 1029 | } |
| 945 | idx = log_next(idx); | 1030 | idx = log_next(idx); |
| 946 | seq++; | 1031 | seq++; |
| 1032 | prev = msg->flags; | ||
| 947 | 1033 | ||
| 948 | raw_spin_unlock_irq(&logbuf_lock); | 1034 | raw_spin_unlock_irq(&logbuf_lock); |
| 949 | if (copy_to_user(buf + len, text, textlen)) | 1035 | if (copy_to_user(buf + len, text, textlen)) |
| @@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 956 | /* messages are gone, move to next one */ | 1042 | /* messages are gone, move to next one */ |
| 957 | seq = log_first_seq; | 1043 | seq = log_first_seq; |
| 958 | idx = log_first_idx; | 1044 | idx = log_first_idx; |
| 1045 | prev = 0; | ||
| 959 | } | 1046 | } |
| 960 | } | 1047 | } |
| 961 | } | 1048 | } |
| @@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1027 | /* Clear ring buffer */ | 1114 | /* Clear ring buffer */ |
| 1028 | case SYSLOG_ACTION_CLEAR: | 1115 | case SYSLOG_ACTION_CLEAR: |
| 1029 | syslog_print_all(NULL, 0, true); | 1116 | syslog_print_all(NULL, 0, true); |
| 1117 | break; | ||
| 1030 | /* Disable logging to console */ | 1118 | /* Disable logging to console */ |
| 1031 | case SYSLOG_ACTION_CONSOLE_OFF: | 1119 | case SYSLOG_ACTION_CONSOLE_OFF: |
| 1032 | if (saved_console_loglevel == -1) | 1120 | if (saved_console_loglevel == -1) |
| @@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1059 | /* messages are gone, move to first one */ | 1147 | /* messages are gone, move to first one */ |
| 1060 | syslog_seq = log_first_seq; | 1148 | syslog_seq = log_first_seq; |
| 1061 | syslog_idx = log_first_idx; | 1149 | syslog_idx = log_first_idx; |
| 1150 | syslog_prev = 0; | ||
| 1151 | syslog_partial = 0; | ||
| 1062 | } | 1152 | } |
| 1063 | if (from_file) { | 1153 | if (from_file) { |
| 1064 | /* | 1154 | /* |
| @@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1068 | */ | 1158 | */ |
| 1069 | error = log_next_idx - syslog_idx; | 1159 | error = log_next_idx - syslog_idx; |
| 1070 | } else { | 1160 | } else { |
| 1071 | u64 seq; | 1161 | u64 seq = syslog_seq; |
| 1072 | u32 idx; | 1162 | u32 idx = syslog_idx; |
| 1163 | enum log_flags prev = syslog_prev; | ||
| 1073 | 1164 | ||
| 1074 | error = 0; | 1165 | error = 0; |
| 1075 | seq = syslog_seq; | ||
| 1076 | idx = syslog_idx; | ||
| 1077 | while (seq < log_next_seq) { | 1166 | while (seq < log_next_seq) { |
| 1078 | struct log *msg = log_from_idx(idx); | 1167 | struct log *msg = log_from_idx(idx); |
| 1079 | 1168 | ||
| 1080 | error += msg_print_text(msg, true, NULL, 0); | 1169 | error += msg_print_text(msg, prev, true, NULL, 0); |
| 1081 | idx = log_next(idx); | 1170 | idx = log_next(idx); |
| 1082 | seq++; | 1171 | seq++; |
| 1172 | prev = msg->flags; | ||
| 1083 | } | 1173 | } |
| 1174 | error -= syslog_partial; | ||
| 1084 | } | 1175 | } |
| 1085 | raw_spin_unlock_irq(&logbuf_lock); | 1176 | raw_spin_unlock_irq(&logbuf_lock); |
| 1086 | break; | 1177 | break; |
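The SIZE_UNREAD accounting above, now corrected by syslog_partial, is what a klogctl() caller gets back. A small user-space probe is sketched below; it assumes the usual action numbers (9 for SIZE_UNREAD, 10 for SIZE_BUF) and enough privilege (CAP_SYSLOG, or dmesg_restrict=0).

    #include <stdio.h>
    #include <sys/klog.h>

    int main(void)
    {
        /* 9 = SYSLOG_ACTION_SIZE_UNREAD, 10 = SYSLOG_ACTION_SIZE_BUF */
        int unread = klogctl(9, NULL, 0);
        int bufsz  = klogctl(10, NULL, 0);

        if (unread < 0 || bufsz < 0) {
            perror("klogctl");
            return 1;
        }
        printf("%d unread bytes in a %d byte kernel log buffer\n",
               unread, bufsz);
        return 0;
    }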
| @@ -1101,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
| 1101 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1192 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
| 1102 | } | 1193 | } |
| 1103 | 1194 | ||
| 1104 | #ifdef CONFIG_KGDB_KDB | ||
| 1105 | /* kdb dmesg command needs access to the syslog buffer. do_syslog() | ||
| 1106 | * uses locks so it cannot be used during debugging. Just tell kdb | ||
| 1107 | * where the start and end of the physical and logical logs are. This | ||
| 1108 | * is equivalent to do_syslog(3). | ||
| 1109 | */ | ||
| 1110 | void kdb_syslog_data(char *syslog_data[4]) | ||
| 1111 | { | ||
| 1112 | syslog_data[0] = log_buf; | ||
| 1113 | syslog_data[1] = log_buf + log_buf_len; | ||
| 1114 | syslog_data[2] = log_buf + log_first_idx; | ||
| 1115 | syslog_data[3] = log_buf + log_next_idx; | ||
| 1116 | } | ||
| 1117 | #endif /* CONFIG_KGDB_KDB */ | ||
| 1118 | |||
| 1119 | static bool __read_mostly ignore_loglevel; | 1195 | static bool __read_mostly ignore_loglevel; |
| 1120 | 1196 | ||
| 1121 | static int __init ignore_loglevel_setup(char *str) | 1197 | static int __init ignore_loglevel_setup(char *str) |
| @@ -1259,22 +1335,98 @@ static inline void printk_delay(void) | |||
| 1259 | } | 1335 | } |
| 1260 | } | 1336 | } |
| 1261 | 1337 | ||
| 1338 | /* | ||
| 1339 | * Continuation lines are buffered, and not committed to the record buffer | ||
| 1340 | * until the line is complete, or a race forces it. The line fragments, | ||
| 1341 | * though, are printed immediately to the consoles to ensure everything has | ||
| 1342 | * reached the console in case of a kernel crash. | ||
| 1343 | */ | ||
| 1344 | static struct cont { | ||
| 1345 | char buf[LOG_LINE_MAX]; | ||
| 1346 | size_t len; /* length == 0 means unused buffer */ | ||
| 1347 | size_t cons; /* bytes written to console */ | ||
| 1348 | struct task_struct *owner; /* task of first print */ | ||
| 1349 | u64 ts_nsec; /* time of first print */ | ||
| 1350 | u8 level; /* log level of first message */ | ||
| 1351 | u8 facility; /* log facility of first message */ | ||
| 1352 | bool flushed:1; /* buffer sealed and committed */ | ||
| 1353 | } cont; | ||
| 1354 | |||
| 1355 | static void cont_flush(void) | ||
| 1356 | { | ||
| 1357 | if (cont.flushed) | ||
| 1358 | return; | ||
| 1359 | if (cont.len == 0) | ||
| 1360 | return; | ||
| 1361 | |||
| 1362 | log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, | ||
| 1363 | NULL, 0, cont.buf, cont.len); | ||
| 1364 | |||
| 1365 | cont.flushed = true; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | static bool cont_add(int facility, int level, const char *text, size_t len) | ||
| 1369 | { | ||
| 1370 | if (cont.len && cont.flushed) | ||
| 1371 | return false; | ||
| 1372 | |||
| 1373 | if (cont.len + len > sizeof(cont.buf)) { | ||
| 1374 | cont_flush(); | ||
| 1375 | return false; | ||
| 1376 | } | ||
| 1377 | |||
| 1378 | if (!cont.len) { | ||
| 1379 | cont.facility = facility; | ||
| 1380 | cont.level = level; | ||
| 1381 | cont.owner = current; | ||
| 1382 | cont.ts_nsec = local_clock(); | ||
| 1383 | cont.cons = 0; | ||
| 1384 | cont.flushed = false; | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | memcpy(cont.buf + cont.len, text, len); | ||
| 1388 | cont.len += len; | ||
| 1389 | return true; | ||
| 1390 | } | ||
| 1391 | |||
| 1392 | static size_t cont_print_text(char *text, size_t size) | ||
| 1393 | { | ||
| 1394 | size_t textlen = 0; | ||
| 1395 | size_t len; | ||
| 1396 | |||
| 1397 | if (cont.cons == 0) { | ||
| 1398 | textlen += print_time(cont.ts_nsec, text); | ||
| 1399 | size -= textlen; | ||
| 1400 | } | ||
| 1401 | |||
| 1402 | len = cont.len - cont.cons; | ||
| 1403 | if (len > 0) { | ||
| 1404 | if (len+1 > size) | ||
| 1405 | len = size-1; | ||
| 1406 | memcpy(text + textlen, cont.buf + cont.cons, len); | ||
| 1407 | textlen += len; | ||
| 1408 | cont.cons = cont.len; | ||
| 1409 | } | ||
| 1410 | |||
| 1411 | if (cont.flushed) { | ||
| 1412 | text[textlen++] = '\n'; | ||
| 1413 | /* got everything, release buffer */ | ||
| 1414 | cont.len = 0; | ||
| 1415 | } | ||
| 1416 | return textlen; | ||
| 1417 | } | ||
| 1418 | |||
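The cont machinery above merges fragments from the same task into one record and flushes as soon as a prefix or a different owner shows up; pr_cont()/KERN_CONT is how callers request this. A minimal user-space sketch of the same buffering rule follows; the buffer size, the names and the printf() stand-in for log_store() are illustrative only.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-in for the kernel's single 'cont' buffer. */
    static struct {
        char buf[256];
        size_t len;
        const void *owner;          /* whoever started the current line */
    } cont;

    static void cont_flush(void)
    {
        if (!cont.len)
            return;
        printf("record: %.*s\n", (int)cont.len, cont.buf); /* ~ log_store() */
        cont.len = 0;
    }

    /* Append a fragment; flush first if it is someone else's or won't fit. */
    static void cont_add(const void *owner, const char *text, int complete)
    {
        size_t len = strlen(text);

        if (cont.len && cont.owner != owner)
            cont_flush();
        if (cont.len + len > sizeof(cont.buf))
            cont_flush();
        cont.owner = owner;
        memcpy(cont.buf + cont.len, text, len);
        cont.len += len;
        if (complete)
            cont_flush();           /* trailing newline seen: commit */
    }

    int main(void)
    {
        int task_a, task_b;

        cont_add(&task_a, "loading ", 0);
        cont_add(&task_a, "done", 1);       /* same owner: one merged record */
        cont_add(&task_a, "partial ", 0);
        cont_add(&task_b, "interrupt!", 1); /* new owner: old fragment flushed */
        return 0;
    }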
| 1262 | asmlinkage int vprintk_emit(int facility, int level, | 1419 | asmlinkage int vprintk_emit(int facility, int level, |
| 1263 | const char *dict, size_t dictlen, | 1420 | const char *dict, size_t dictlen, |
| 1264 | const char *fmt, va_list args) | 1421 | const char *fmt, va_list args) |
| 1265 | { | 1422 | { |
| 1266 | static int recursion_bug; | 1423 | static int recursion_bug; |
| 1267 | static char cont_buf[LOG_LINE_MAX]; | ||
| 1268 | static size_t cont_len; | ||
| 1269 | static int cont_level; | ||
| 1270 | static struct task_struct *cont_task; | ||
| 1271 | static char textbuf[LOG_LINE_MAX]; | 1424 | static char textbuf[LOG_LINE_MAX]; |
| 1272 | char *text = textbuf; | 1425 | char *text = textbuf; |
| 1273 | size_t text_len; | 1426 | size_t text_len; |
| 1427 | enum log_flags lflags = 0; | ||
| 1274 | unsigned long flags; | 1428 | unsigned long flags; |
| 1275 | int this_cpu; | 1429 | int this_cpu; |
| 1276 | bool newline = false; | ||
| 1277 | bool prefix = false; | ||
| 1278 | int printed_len = 0; | 1430 | int printed_len = 0; |
| 1279 | 1431 | ||
| 1280 | boot_delay_msec(); | 1432 | boot_delay_msec(); |
| @@ -1313,7 +1465,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1313 | recursion_bug = 0; | 1465 | recursion_bug = 0; |
| 1314 | printed_len += strlen(recursion_msg); | 1466 | printed_len += strlen(recursion_msg); |
| 1315 | /* emit KERN_CRIT message */ | 1467 | /* emit KERN_CRIT message */ |
| 1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | 1468 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
| 1469 | NULL, 0, recursion_msg, printed_len); | ||
| 1317 | } | 1470 | } |
| 1318 | 1471 | ||
| 1319 | /* | 1472 | /* |
| @@ -1325,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1325 | /* mark and strip a trailing newline */ | 1478 | /* mark and strip a trailing newline */ |
| 1326 | if (text_len && text[text_len-1] == '\n') { | 1479 | if (text_len && text[text_len-1] == '\n') { |
| 1327 | text_len--; | 1480 | text_len--; |
| 1328 | newline = true; | 1481 | lflags |= LOG_NEWLINE; |
| 1329 | } | 1482 | } |
| 1330 | 1483 | ||
| 1331 | /* strip syslog prefix and extract log level or control flags */ | 1484 | /* strip syslog prefix and extract log level or control flags */ |
| @@ -1335,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1335 | if (level == -1) | 1488 | if (level == -1) |
| 1336 | level = text[1] - '0'; | 1489 | level = text[1] - '0'; |
| 1337 | case 'd': /* KERN_DEFAULT */ | 1490 | case 'd': /* KERN_DEFAULT */ |
| 1338 | prefix = true; | 1491 | lflags |= LOG_PREFIX; |
| 1339 | case 'c': /* KERN_CONT */ | 1492 | case 'c': /* KERN_CONT */ |
| 1340 | text += 3; | 1493 | text += 3; |
| 1341 | text_len -= 3; | 1494 | text_len -= 3; |
| @@ -1345,61 +1498,41 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1345 | if (level == -1) | 1498 | if (level == -1) |
| 1346 | level = default_message_loglevel; | 1499 | level = default_message_loglevel; |
| 1347 | 1500 | ||
| 1348 | if (dict) { | 1501 | if (dict) |
| 1349 | prefix = true; | 1502 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
| 1350 | newline = true; | ||
| 1351 | } | ||
| 1352 | |||
| 1353 | if (!newline) { | ||
| 1354 | if (cont_len && (prefix || cont_task != current)) { | ||
| 1355 | /* | ||
| 1356 | * Flush earlier buffer, which is either from a | ||
| 1357 | * different thread, or when we got a new prefix. | ||
| 1358 | */ | ||
| 1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | ||
| 1360 | cont_len = 0; | ||
| 1361 | } | ||
| 1362 | 1503 | ||
| 1363 | if (!cont_len) { | 1504 | if (!(lflags & LOG_NEWLINE)) { |
| 1364 | cont_level = level; | 1505 | /* |
| 1365 | cont_task = current; | 1506 | * Flush the conflicting buffer. An earlier newline was missing, |
| 1366 | } | 1507 | * or another task also prints continuation lines. |
| 1508 | */ | ||
| 1509 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
| 1510 | cont_flush(); | ||
| 1367 | 1511 | ||
| 1368 | /* buffer or append to earlier buffer from the same thread */ | 1512 | /* buffer line if possible, otherwise store it right away */ |
| 1369 | if (cont_len + text_len > sizeof(cont_buf)) | 1513 | if (!cont_add(facility, level, text, text_len)) |
| 1370 | text_len = sizeof(cont_buf) - cont_len; | 1514 | log_store(facility, level, lflags | LOG_CONT, 0, |
| 1371 | memcpy(cont_buf + cont_len, text, text_len); | 1515 | dict, dictlen, text, text_len); |
| 1372 | cont_len += text_len; | ||
| 1373 | } else { | 1516 | } else { |
| 1374 | if (cont_len && cont_task == current) { | 1517 | bool stored = false; |
| 1375 | if (prefix) { | ||
| 1376 | /* | ||
| 1377 | * New prefix from the same thread; flush. We | ||
| 1378 | * either got no earlier newline, or we race | ||
| 1379 | * with an interrupt. | ||
| 1380 | */ | ||
| 1381 | log_store(facility, cont_level, | ||
| 1382 | NULL, 0, cont_buf, cont_len); | ||
| 1383 | cont_len = 0; | ||
| 1384 | } | ||
| 1385 | 1518 | ||
| 1386 | /* append to the earlier buffer and flush */ | 1519 | /* |
| 1387 | if (cont_len + text_len > sizeof(cont_buf)) | 1520 | * If an earlier newline was missing and it was the same task, |
| 1388 | text_len = sizeof(cont_buf) - cont_len; | 1521 | * either merge it with the current buffer and flush, or if |
| 1389 | memcpy(cont_buf + cont_len, text, text_len); | 1522 | * there was a race with interrupts (prefix == true) then just |
| 1390 | cont_len += text_len; | 1523 | * flush it out and store this line separately. |
| 1391 | log_store(facility, cont_level, | 1524 | */ |
| 1392 | NULL, 0, cont_buf, cont_len); | 1525 | if (cont.len && cont.owner == current) { |
| 1393 | cont_len = 0; | 1526 | if (!(lflags & LOG_PREFIX)) |
| 1394 | cont_task = NULL; | 1527 | stored = cont_add(facility, level, text, text_len); |
| 1395 | printed_len = cont_len; | 1528 | cont_flush(); |
| 1396 | } else { | ||
| 1397 | /* ordinary single and terminated line */ | ||
| 1398 | log_store(facility, level, | ||
| 1399 | dict, dictlen, text, text_len); | ||
| 1400 | printed_len = text_len; | ||
| 1401 | } | 1529 | } |
| 1530 | |||
| 1531 | if (!stored) | ||
| 1532 | log_store(facility, level, lflags, 0, | ||
| 1533 | dict, dictlen, text, text_len); | ||
| 1402 | } | 1534 | } |
| 1535 | printed_len += text_len; | ||
| 1403 | 1536 | ||
| 1404 | /* | 1537 | /* |
| 1405 | * Try to acquire and then immediately release the console semaphore. | 1538 | * Try to acquire and then immediately release the console semaphore. |
| @@ -1486,11 +1619,18 @@ EXPORT_SYMBOL(printk); | |||
| 1486 | #else | 1619 | #else |
| 1487 | 1620 | ||
| 1488 | #define LOG_LINE_MAX 0 | 1621 | #define LOG_LINE_MAX 0 |
| 1622 | static struct cont { | ||
| 1623 | size_t len; | ||
| 1624 | size_t cons; | ||
| 1625 | u8 level; | ||
| 1626 | bool flushed:1; | ||
| 1627 | } cont; | ||
| 1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | 1628 | static struct log *log_from_idx(u32 idx) { return NULL; } |
| 1490 | static u32 log_next(u32 idx) { return 0; } | 1629 | static u32 log_next(u32 idx) { return 0; } |
| 1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | 1630 | static void call_console_drivers(int level, const char *text, size_t len) {} |
| 1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | 1631 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
| 1493 | char *buf, size_t size) { return 0; } | 1632 | bool syslog, char *buf, size_t size) { return 0; } |
| 1633 | static size_t cont_print_text(char *text, size_t size) { return 0; } | ||
| 1494 | 1634 | ||
| 1495 | #endif /* CONFIG_PRINTK */ | 1635 | #endif /* CONFIG_PRINTK */ |
| 1496 | 1636 | ||
| @@ -1765,6 +1905,7 @@ void wake_up_klogd(void) | |||
| 1765 | /* the next printk record to write to the console */ | 1905 | /* the next printk record to write to the console */ |
| 1766 | static u64 console_seq; | 1906 | static u64 console_seq; |
| 1767 | static u32 console_idx; | 1907 | static u32 console_idx; |
| 1908 | static enum log_flags console_prev; | ||
| 1768 | 1909 | ||
| 1769 | /** | 1910 | /** |
| 1770 | * console_unlock - unlock the console system | 1911 | * console_unlock - unlock the console system |
| @@ -1782,6 +1923,7 @@ static u32 console_idx; | |||
| 1782 | */ | 1923 | */ |
| 1783 | void console_unlock(void) | 1924 | void console_unlock(void) |
| 1784 | { | 1925 | { |
| 1926 | static char text[LOG_LINE_MAX]; | ||
| 1785 | static u64 seen_seq; | 1927 | static u64 seen_seq; |
| 1786 | unsigned long flags; | 1928 | unsigned long flags; |
| 1787 | bool wake_klogd = false; | 1929 | bool wake_klogd = false; |
| @@ -1794,10 +1936,23 @@ void console_unlock(void) | |||
| 1794 | 1936 | ||
| 1795 | console_may_schedule = 0; | 1937 | console_may_schedule = 0; |
| 1796 | 1938 | ||
| 1939 | /* flush buffered message fragment immediately to console */ | ||
| 1940 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
| 1941 | if (cont.len && (cont.cons < cont.len || cont.flushed)) { | ||
| 1942 | size_t len; | ||
| 1943 | |||
| 1944 | len = cont_print_text(text, sizeof(text)); | ||
| 1945 | raw_spin_unlock(&logbuf_lock); | ||
| 1946 | stop_critical_timings(); | ||
| 1947 | call_console_drivers(cont.level, text, len); | ||
| 1948 | start_critical_timings(); | ||
| 1949 | local_irq_restore(flags); | ||
| 1950 | } else | ||
| 1951 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 1952 | |||
| 1797 | again: | 1953 | again: |
| 1798 | for (;;) { | 1954 | for (;;) { |
| 1799 | struct log *msg; | 1955 | struct log *msg; |
| 1800 | static char text[LOG_LINE_MAX]; | ||
| 1801 | size_t len; | 1956 | size_t len; |
| 1802 | int level; | 1957 | int level; |
| 1803 | 1958 | ||
| @@ -1811,18 +1966,35 @@ again: | |||
| 1811 | /* messages are gone, move to first one */ | 1966 | /* messages are gone, move to first one */ |
| 1812 | console_seq = log_first_seq; | 1967 | console_seq = log_first_seq; |
| 1813 | console_idx = log_first_idx; | 1968 | console_idx = log_first_idx; |
| 1969 | console_prev = 0; | ||
| 1814 | } | 1970 | } |
| 1815 | 1971 | skip: | |
| 1816 | if (console_seq == log_next_seq) | 1972 | if (console_seq == log_next_seq) |
| 1817 | break; | 1973 | break; |
| 1818 | 1974 | ||
| 1819 | msg = log_from_idx(console_idx); | 1975 | msg = log_from_idx(console_idx); |
| 1820 | level = msg->level & 7; | 1976 | if (msg->flags & LOG_NOCONS) { |
| 1821 | 1977 | /* | |
| 1822 | len = msg_print_text(msg, false, text, sizeof(text)); | 1978 | * Skip record we have buffered and already printed |
| 1979 | * directly to the console when we received it. | ||
| 1980 | */ | ||
| 1981 | console_idx = log_next(console_idx); | ||
| 1982 | console_seq++; | ||
| 1983 | /* | ||
| 1984 | * We will get here again when we register a new | ||
| 1985 | * CON_PRINTBUFFER console. Clear the flag so we | ||
| 1986 | * will properly dump everything later. | ||
| 1987 | */ | ||
| 1988 | msg->flags &= ~LOG_NOCONS; | ||
| 1989 | goto skip; | ||
| 1990 | } | ||
| 1823 | 1991 | ||
| 1992 | level = msg->level; | ||
| 1993 | len = msg_print_text(msg, console_prev, false, | ||
| 1994 | text, sizeof(text)); | ||
| 1824 | console_idx = log_next(console_idx); | 1995 | console_idx = log_next(console_idx); |
| 1825 | console_seq++; | 1996 | console_seq++; |
| 1997 | console_prev = msg->flags; | ||
| 1826 | raw_spin_unlock(&logbuf_lock); | 1998 | raw_spin_unlock(&logbuf_lock); |
| 1827 | 1999 | ||
| 1828 | stop_critical_timings(); /* don't trace print latency */ | 2000 | stop_critical_timings(); /* don't trace print latency */ |
| @@ -2085,6 +2257,7 @@ void register_console(struct console *newcon) | |||
| 2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2257 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 2086 | console_seq = syslog_seq; | 2258 | console_seq = syslog_seq; |
| 2087 | console_idx = syslog_idx; | 2259 | console_idx = syslog_idx; |
| 2260 | console_prev = syslog_prev; | ||
| 2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2261 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 2089 | /* | 2262 | /* |
| 2090 | * We're about to replay the log buffer. Only do this to the | 2263 | * We're about to replay the log buffer. Only do this to the |
| @@ -2300,48 +2473,256 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | |||
| 2300 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2473 | * kmsg_dump - dump kernel log to kernel message dumpers. |
| 2301 | * @reason: the reason (oops, panic etc) for dumping | 2474 | * @reason: the reason (oops, panic etc) for dumping |
| 2302 | * | 2475 | * |
| 2303 | * Iterate through each of the dump devices and call the oops/panic | 2476 | * Call each of the registered dumper's dump() callback, which can |
| 2304 | * callbacks with the log buffer. | 2477 | * retrieve the kmsg records with kmsg_dump_get_line() or |
| 2478 | * kmsg_dump_get_buffer(). | ||
| 2305 | */ | 2479 | */ |
| 2306 | void kmsg_dump(enum kmsg_dump_reason reason) | 2480 | void kmsg_dump(enum kmsg_dump_reason reason) |
| 2307 | { | 2481 | { |
| 2308 | u64 idx; | ||
| 2309 | struct kmsg_dumper *dumper; | 2482 | struct kmsg_dumper *dumper; |
| 2310 | const char *s1, *s2; | ||
| 2311 | unsigned long l1, l2; | ||
| 2312 | unsigned long flags; | 2483 | unsigned long flags; |
| 2313 | 2484 | ||
| 2314 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) | 2485 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) |
| 2315 | return; | 2486 | return; |
| 2316 | 2487 | ||
| 2317 | /* Theoretically, the log could move on after we do this, but | 2488 | rcu_read_lock(); |
| 2318 | there's not a lot we can do about that. The new messages | 2489 | list_for_each_entry_rcu(dumper, &dump_list, list) { |
| 2319 | will overwrite the start of what we dump. */ | 2490 | if (dumper->max_reason && reason > dumper->max_reason) |
| 2491 | continue; | ||
| 2492 | |||
| 2493 | /* initialize iterator with data about the stored records */ | ||
| 2494 | dumper->active = true; | ||
| 2495 | |||
| 2496 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
| 2497 | dumper->cur_seq = clear_seq; | ||
| 2498 | dumper->cur_idx = clear_idx; | ||
| 2499 | dumper->next_seq = log_next_seq; | ||
| 2500 | dumper->next_idx = log_next_idx; | ||
| 2501 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2502 | |||
| 2503 | /* invoke dumper which will iterate over records */ | ||
| 2504 | dumper->dump(dumper, reason); | ||
| 2505 | |||
| 2506 | /* reset iterator */ | ||
| 2507 | dumper->active = false; | ||
| 2508 | } | ||
| 2509 | rcu_read_unlock(); | ||
| 2510 | } | ||
| 2511 | |||
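With this rework a dumper no longer receives raw s1/s2 buffer pointers; kmsg_dump() primes an iterator and the callback pulls records itself. The sketch below shows what a minimal dumper against this interface could look like; the module name, the storage array and its sizes are illustrative, not an existing driver.

    #include <linux/init.h>
    #include <linux/kmsg_dump.h>
    #include <linux/module.h>
    #include <linux/string.h>

    static char store[8192];    /* stands in for real persistent storage */
    static char line[512];
    static size_t used;

    /* New-style callback: iterate the records prepared by kmsg_dump(). */
    static void sketch_dump(struct kmsg_dumper *dumper,
                            enum kmsg_dump_reason reason)
    {
        size_t len;

        used = 0;
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) {
            if (used + len > sizeof(store))
                break;
            memcpy(store + used, line, len);
            used += len;
        }
    }

    static struct kmsg_dumper sketch_dumper = {
        .dump = sketch_dump,
    };

    static int __init sketch_init(void)
    {
        return kmsg_dump_register(&sketch_dumper);
    }
    module_init(sketch_init);

    static void __exit sketch_exit(void)
    {
        kmsg_dump_unregister(&sketch_dumper);
    }
    module_exit(sketch_exit);

    MODULE_LICENSE("GPL");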
| 2512 | /** | ||
| 2513 | * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) | ||
| 2514 | * @dumper: registered kmsg dumper | ||
| 2515 | * @syslog: include the "<4>" prefixes | ||
| 2516 | * @line: buffer to copy the line to | ||
| 2517 | * @size: maximum size of the buffer | ||
| 2518 | * @len: length of line placed into buffer | ||
| 2519 | * | ||
| 2520 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
| 2521 | * record, and copy one record into the provided buffer. | ||
| 2522 | * | ||
| 2523 | * Consecutive calls will return the next available record moving | ||
| 2524 | * towards the end of the buffer with the youngest messages. | ||
| 2525 | * | ||
| 2526 | * A return value of FALSE indicates that there are no more records to | ||
| 2527 | * read. | ||
| 2528 | * | ||
| 2529 | * The function is similar to kmsg_dump_get_line(), but grabs no locks. | ||
| 2530 | */ | ||
| 2531 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, | ||
| 2532 | char *line, size_t size, size_t *len) | ||
| 2533 | { | ||
| 2534 | struct log *msg; | ||
| 2535 | size_t l = 0; | ||
| 2536 | bool ret = false; | ||
| 2537 | |||
| 2538 | if (!dumper->active) | ||
| 2539 | goto out; | ||
| 2540 | |||
| 2541 | if (dumper->cur_seq < log_first_seq) { | ||
| 2542 | /* messages are gone, move to first available one */ | ||
| 2543 | dumper->cur_seq = log_first_seq; | ||
| 2544 | dumper->cur_idx = log_first_idx; | ||
| 2545 | } | ||
| 2546 | |||
| 2547 | /* last entry */ | ||
| 2548 | if (dumper->cur_seq >= log_next_seq) | ||
| 2549 | goto out; | ||
| 2550 | |||
| 2551 | msg = log_from_idx(dumper->cur_idx); | ||
| 2552 | l = msg_print_text(msg, 0, syslog, line, size); | ||
| 2553 | |||
| 2554 | dumper->cur_idx = log_next(dumper->cur_idx); | ||
| 2555 | dumper->cur_seq++; | ||
| 2556 | ret = true; | ||
| 2557 | out: | ||
| 2558 | if (len) | ||
| 2559 | *len = l; | ||
| 2560 | return ret; | ||
| 2561 | } | ||
| 2562 | |||
| 2563 | /** | ||
| 2564 | * kmsg_dump_get_line - retrieve one kmsg log line | ||
| 2565 | * @dumper: registered kmsg dumper | ||
| 2566 | * @syslog: include the "<4>" prefixes | ||
| 2567 | * @line: buffer to copy the line to | ||
| 2568 | * @size: maximum size of the buffer | ||
| 2569 | * @len: length of line placed into buffer | ||
| 2570 | * | ||
| 2571 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
| 2572 | * record, and copy one record into the provided buffer. | ||
| 2573 | * | ||
| 2574 | * Consecutive calls will return the next available record moving | ||
| 2575 | * towards the end of the buffer with the youngest messages. | ||
| 2576 | * | ||
| 2577 | * A return value of FALSE indicates that there are no more records to | ||
| 2578 | * read. | ||
| 2579 | */ | ||
| 2580 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | ||
| 2581 | char *line, size_t size, size_t *len) | ||
| 2582 | { | ||
| 2583 | unsigned long flags; | ||
| 2584 | bool ret; | ||
| 2585 | |||
| 2586 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
| 2587 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); | ||
| 2588 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2589 | |||
| 2590 | return ret; | ||
| 2591 | } | ||
| 2592 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); | ||
| 2593 | |||
| 2594 | /** | ||
| 2595 | * kmsg_dump_get_buffer - copy kmsg log lines | ||
| 2596 | * @dumper: registered kmsg dumper | ||
| 2597 | * @syslog: include the "<4>" prefixes | ||
| 2598 | * @buf: buffer to copy the line to | ||
| 2599 | * @size: maximum size of the buffer | ||
| 2600 | * @len: length of line placed into buffer | ||
| 2601 | * | ||
| 2602 | * Start at the end of the kmsg buffer and fill the provided buffer | ||
| 2603 | * with as many of the *youngest* kmsg records as fit into it. | ||
| 2604 | * If the buffer is large enough, all available kmsg records will be | ||
| 2605 | * copied with a single call. | ||
| 2606 | * | ||
| 2607 | * Consecutive calls will fill the buffer with the next block of | ||
| 2608 | * available older records, not including the earlier retrieved ones. | ||
| 2609 | * | ||
| 2610 | * A return value of FALSE indicates that there are no more records to | ||
| 2611 | * read. | ||
| 2612 | */ | ||
| 2613 | bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | ||
| 2614 | char *buf, size_t size, size_t *len) | ||
| 2615 | { | ||
| 2616 | unsigned long flags; | ||
| 2617 | u64 seq; | ||
| 2618 | u32 idx; | ||
| 2619 | u64 next_seq; | ||
| 2620 | u32 next_idx; | ||
| 2621 | enum log_flags prev; | ||
| 2622 | size_t l = 0; | ||
| 2623 | bool ret = false; | ||
| 2624 | |||
| 2625 | if (!dumper->active) | ||
| 2626 | goto out; | ||
| 2320 | 2627 | ||
| 2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2628 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 2322 | if (syslog_seq < log_first_seq) | 2629 | if (dumper->cur_seq < log_first_seq) { |
| 2323 | idx = syslog_idx; | 2630 | /* messages are gone, move to first available one */ |
| 2324 | else | 2631 | dumper->cur_seq = log_first_seq; |
| 2325 | idx = log_first_idx; | 2632 | dumper->cur_idx = log_first_idx; |
| 2633 | } | ||
| 2634 | |||
| 2635 | /* last entry */ | ||
| 2636 | if (dumper->cur_seq >= dumper->next_seq) { | ||
| 2637 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2638 | goto out; | ||
| 2639 | } | ||
| 2326 | 2640 | ||
| 2327 | if (idx > log_next_idx) { | 2641 | /* calculate length of entire buffer */ |
| 2328 | s1 = log_buf; | 2642 | seq = dumper->cur_seq; |
| 2329 | l1 = log_next_idx; | 2643 | idx = dumper->cur_idx; |
| 2644 | prev = 0; | ||
| 2645 | while (seq < dumper->next_seq) { | ||
| 2646 | struct log *msg = log_from_idx(idx); | ||
| 2647 | |||
| 2648 | l += msg_print_text(msg, prev, true, NULL, 0); | ||
| 2649 | idx = log_next(idx); | ||
| 2650 | seq++; | ||
| 2651 | prev = msg->flags; | ||
| 2652 | } | ||
| 2330 | 2653 | ||
| 2331 | s2 = log_buf + idx; | 2654 | /* move first record forward until length fits into the buffer */ |
| 2332 | l2 = log_buf_len - idx; | 2655 | seq = dumper->cur_seq; |
| 2333 | } else { | 2656 | idx = dumper->cur_idx; |
| 2334 | s1 = ""; | 2657 | prev = 0; |
| 2335 | l1 = 0; | 2658 | while (l > size && seq < dumper->next_seq) { |
| 2659 | struct log *msg = log_from_idx(idx); | ||
| 2660 | |||
| 2661 | l -= msg_print_text(msg, prev, true, NULL, 0); | ||
| 2662 | idx = log_next(idx); | ||
| 2663 | seq++; | ||
| 2664 | prev = msg->flags; | ||
| 2665 | } | ||
| 2666 | |||
| 2667 | /* last message in next iteration */ | ||
| 2668 | next_seq = seq; | ||
| 2669 | next_idx = idx; | ||
| 2670 | |||
| 2671 | l = 0; | ||
| 2672 | prev = 0; | ||
| 2673 | while (seq < dumper->next_seq) { | ||
| 2674 | struct log *msg = log_from_idx(idx); | ||
| 2336 | 2675 | ||
| 2337 | s2 = log_buf + idx; | 2676 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); |
| 2338 | l2 = log_next_idx - idx; | 2677 | idx = log_next(idx); |
| 2678 | seq++; | ||
| 2679 | prev = msg->flags; | ||
| 2339 | } | 2680 | } |
| 2681 | |||
| 2682 | dumper->next_seq = next_seq; | ||
| 2683 | dumper->next_idx = next_idx; | ||
| 2684 | ret = true; | ||
| 2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2685 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 2686 | out: | ||
| 2687 | if (len) | ||
| 2688 | *len = l; | ||
| 2689 | return ret; | ||
| 2690 | } | ||
| 2691 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); | ||
| 2341 | 2692 | ||
| 2342 | rcu_read_lock(); | 2693 | /** |
| 2343 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2694 | * kmsg_dump_rewind_nolock - reset the interator (unlocked version) |
| 2344 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 2695 | * @dumper: registered kmsg dumper |
| 2345 | rcu_read_unlock(); | 2696 | * |
| 2697 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
| 2698 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
| 2699 | * times within the same dumper.dump() callback. | ||
| 2700 | * | ||
| 2701 | * The function is similar to kmsg_dump_rewind(), but grabs no locks. | ||
| 2702 | */ | ||
| 2703 | void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) | ||
| 2704 | { | ||
| 2705 | dumper->cur_seq = clear_seq; | ||
| 2706 | dumper->cur_idx = clear_idx; | ||
| 2707 | dumper->next_seq = log_next_seq; | ||
| 2708 | dumper->next_idx = log_next_idx; | ||
| 2709 | } | ||
| 2710 | |||
| 2711 | /** | ||
| 2712 | * kmsg_dump_rewind - reset the iterator | ||
| 2713 | * @dumper: registered kmsg dumper | ||
| 2714 | * | ||
| 2715 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
| 2716 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
| 2717 | * times within the same dumper.dump() callback. | ||
| 2718 | */ | ||
| 2719 | void kmsg_dump_rewind(struct kmsg_dumper *dumper) | ||
| 2720 | { | ||
| 2721 | unsigned long flags; | ||
| 2722 | |||
| 2723 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
| 2724 | kmsg_dump_rewind_nolock(dumper); | ||
| 2725 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2346 | } | 2726 | } |
| 2727 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | ||
| 2347 | #endif | 2728 | #endif |
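kmsg_dump_get_buffer() and kmsg_dump_rewind() combine naturally when one dump() callback has to feed more than one consumer. A hedged fragment, meant to be hooked up as the .dump callback of a registered dumper like the sketch after kmsg_dump() above; write_to_backend_a()/write_to_backend_b() are hypothetical helpers, not kernel functions.

    #include <linux/kmsg_dump.h>

    /* hypothetical backends, assumed to exist only for this sketch */
    void write_to_backend_a(const char *buf, size_t len);
    void write_to_backend_b(const char *buf, size_t len);

    static char buf[4096];

    static void two_pass_dump(struct kmsg_dumper *dumper,
                              enum kmsg_dump_reason reason)
    {
        size_t len;

        /* youngest records that fit, then progressively older blocks */
        while (kmsg_dump_get_buffer(dumper, true, buf, sizeof(buf), &len))
            write_to_backend_a(buf, len);

        /* replay the same records for a second consumer */
        kmsg_dump_rewind(dumper);
        while (kmsg_dump_get_buffer(dumper, false, buf, sizeof(buf), &len))
            write_to_backend_b(buf, len);
    }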
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) | |||
| 201 | { | 201 | { |
| 202 | trace_rcu_utilization("Start context switch"); | 202 | trace_rcu_utilization("Start context switch"); |
| 203 | rcu_sched_qs(cpu); | 203 | rcu_sched_qs(cpu); |
| 204 | rcu_preempt_note_context_switch(cpu); | ||
| 204 | trace_rcu_utilization("End context switch"); | 205 | trace_rcu_utilization("End context switch"); |
| 205 | } | 206 | } |
| 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 207 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
| @@ -1397,6 +1398,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
| 1397 | rdp->qlen_lazy += rsp->qlen_lazy; | 1398 | rdp->qlen_lazy += rsp->qlen_lazy; |
| 1398 | rdp->qlen += rsp->qlen; | 1399 | rdp->qlen += rsp->qlen; |
| 1399 | rdp->n_cbs_adopted += rsp->qlen; | 1400 | rdp->n_cbs_adopted += rsp->qlen; |
| 1401 | if (rsp->qlen_lazy != rsp->qlen) | ||
| 1402 | rcu_idle_count_callbacks_posted(); | ||
| 1400 | rsp->qlen_lazy = 0; | 1403 | rsp->qlen_lazy = 0; |
| 1401 | rsp->qlen = 0; | 1404 | rsp->qlen = 0; |
| 1402 | 1405 | ||
| @@ -1528,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1528 | { | 1531 | { |
| 1529 | unsigned long flags; | 1532 | unsigned long flags; |
| 1530 | struct rcu_head *next, *list, **tail; | 1533 | struct rcu_head *next, *list, **tail; |
| 1531 | int bl, count, count_lazy; | 1534 | int bl, count, count_lazy, i; |
| 1532 | 1535 | ||
| 1533 | /* If no callbacks are ready, just return.*/ | 1536 | /* If no callbacks are ready, just return.*/ |
| 1534 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1537 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
| @@ -1551,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1551 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1554 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
| 1552 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1555 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
| 1553 | tail = rdp->nxttail[RCU_DONE_TAIL]; | 1556 | tail = rdp->nxttail[RCU_DONE_TAIL]; |
| 1554 | for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) | 1557 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) |
| 1555 | if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) | 1558 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) |
| 1556 | rdp->nxttail[count] = &rdp->nxtlist; | 1559 | rdp->nxttail[i] = &rdp->nxtlist; |
| 1557 | local_irq_restore(flags); | 1560 | local_irq_restore(flags); |
| 1558 | 1561 | ||
| 1559 | /* Invoke callbacks. */ | 1562 | /* Invoke callbacks. */ |
| @@ -1581,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1581 | if (list != NULL) { | 1584 | if (list != NULL) { |
| 1582 | *tail = rdp->nxtlist; | 1585 | *tail = rdp->nxtlist; |
| 1583 | rdp->nxtlist = list; | 1586 | rdp->nxtlist = list; |
| 1584 | for (count = 0; count < RCU_NEXT_SIZE; count++) | 1587 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
| 1585 | if (&rdp->nxtlist == rdp->nxttail[count]) | 1588 | if (&rdp->nxtlist == rdp->nxttail[i]) |
| 1586 | rdp->nxttail[count] = tail; | 1589 | rdp->nxttail[i] = tail; |
| 1587 | else | 1590 | else |
| 1588 | break; | 1591 | break; |
| 1589 | } | 1592 | } |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -84,6 +84,20 @@ struct rcu_dynticks { | |||
| 84 | /* Process level is worth LLONG_MAX/2. */ | 84 | /* Process level is worth LLONG_MAX/2. */ |
| 85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
| 86 | atomic_t dynticks; /* Even value for idle, else odd. */ | 86 | atomic_t dynticks; /* Even value for idle, else odd. */ |
| 87 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
| 88 | int dyntick_drain; /* Prepare-for-idle state variable. */ | ||
| 89 | unsigned long dyntick_holdoff; | ||
| 90 | /* No retries for the jiffy of failure. */ | ||
| 91 | struct timer_list idle_gp_timer; | ||
| 92 | /* Wake up CPU sleeping with callbacks. */ | ||
| 93 | unsigned long idle_gp_timer_expires; | ||
| 94 | /* When to wake up CPU (for repost). */ | ||
| 95 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
| 96 | unsigned long nonlazy_posted; | ||
| 97 | /* # times non-lazy CBs posted to CPU. */ | ||
| 98 | unsigned long nonlazy_posted_snap; | ||
| 99 | /* idle-period nonlazy_posted snapshot. */ | ||
| 100 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
| 87 | }; | 101 | }; |
| 88 | 102 | ||
| 89 | /* RCU's kthread states for tracing. */ | 103 | /* RCU's kthread states for tracing. */ |
| @@ -430,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 430 | /* Forward declarations for rcutree_plugin.h */ | 444 | /* Forward declarations for rcutree_plugin.h */ |
| 431 | static void rcu_bootup_announce(void); | 445 | static void rcu_bootup_announce(void); |
| 432 | long rcu_batches_completed(void); | 446 | long rcu_batches_completed(void); |
| 447 | static void rcu_preempt_note_context_switch(int cpu); | ||
| 433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 448 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 434 | #ifdef CONFIG_HOTPLUG_CPU | 449 | #ifdef CONFIG_HOTPLUG_CPU |
| 435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 450 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) | |||
| 153 | * | 153 | * |
| 154 | * Caller must disable preemption. | 154 | * Caller must disable preemption. |
| 155 | */ | 155 | */ |
| 156 | void rcu_preempt_note_context_switch(void) | 156 | static void rcu_preempt_note_context_switch(int cpu) |
| 157 | { | 157 | { |
| 158 | struct task_struct *t = current; | 158 | struct task_struct *t = current; |
| 159 | unsigned long flags; | 159 | unsigned long flags; |
| @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) | |||
| 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
| 165 | 165 | ||
| 166 | /* Possibly blocking in an RCU read-side critical section. */ | 166 | /* Possibly blocking in an RCU read-side critical section. */ |
| 167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); | 167 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
| 168 | rnp = rdp->mynode; | 168 | rnp = rdp->mynode; |
| 169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 169 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
| @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) | |||
| 228 | * means that we continue to block the current grace period. | 228 | * means that we continue to block the current grace period. |
| 229 | */ | 229 | */ |
| 230 | local_irq_save(flags); | 230 | local_irq_save(flags); |
| 231 | rcu_preempt_qs(smp_processor_id()); | 231 | rcu_preempt_qs(cpu); |
| 232 | local_irq_restore(flags); | 232 | local_irq_restore(flags); |
| 233 | } | 233 | } |
| 234 | 234 | ||
| @@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void) | |||
| 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
| 1003 | 1003 | ||
| 1004 | /* | 1004 | /* |
| 1005 | * Because preemptible RCU does not exist, we never have to check for | ||
| 1006 | * CPUs being in quiescent states. | ||
| 1007 | */ | ||
| 1008 | static void rcu_preempt_note_context_switch(int cpu) | ||
| 1009 | { | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* | ||
| 1005 | * Because preemptible RCU does not exist, there are never any preempted | 1013 | * Because preemptible RCU does not exist, there are never any preempted |
| 1006 | * RCU readers. | 1014 | * RCU readers. |
| 1007 | */ | 1015 | */ |
| @@ -1886,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
| 1886 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs | 1894 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
| 1887 | * any flavor of RCU. | 1895 | * any flavor of RCU. |
| 1888 | */ | 1896 | */ |
| 1889 | int rcu_needs_cpu(int cpu) | 1897 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
| 1890 | { | 1898 | { |
| 1899 | *delta_jiffies = ULONG_MAX; | ||
| 1891 | return rcu_cpu_has_callbacks(cpu); | 1900 | return rcu_cpu_has_callbacks(cpu); |
| 1892 | } | 1901 | } |
| 1893 | 1902 | ||
| @@ -1962,41 +1971,6 @@ static void rcu_idle_count_callbacks_posted(void) | |||
| 1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1971 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
| 1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1972 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
| 1964 | 1973 | ||
| 1965 | /* Loop counter for rcu_prepare_for_idle(). */ | ||
| 1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
| 1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
| 1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
| 1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ | ||
| 1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); | ||
| 1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ | ||
| 1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
| 1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
| 1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
| 1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
| 1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
| 1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
| 1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
| 1979 | |||
| 1980 | /* | ||
| 1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
| 1982 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
| 1983 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
| 1984 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
| 1985 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
| 1986 | * it is better to incur scheduling-clock interrupts than to spin | ||
| 1987 | * continuously for the same time duration! | ||
| 1988 | */ | ||
| 1989 | int rcu_needs_cpu(int cpu) | ||
| 1990 | { | ||
| 1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
| 1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
| 1993 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
| 1994 | if (!rcu_cpu_has_callbacks(cpu)) | ||
| 1995 | return 0; | ||
| 1996 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
| 1997 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
| 1998 | } | ||
| 1999 | |||
| 2000 | /* | 1974 | /* |
| 2001 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1975 | * Does the specified flavor of RCU have non-lazy callbacks pending on |
| 2002 | * the specified CPU? Both RCU flavor and CPU are specified by the | 1976 | * the specified CPU? Both RCU flavor and CPU are specified by the |
| @@ -2040,6 +2014,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
| 2040 | } | 2014 | } |
| 2041 | 2015 | ||
| 2042 | /* | 2016 | /* |
| 2017 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
| 2018 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
| 2019 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
| 2020 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
| 2021 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
| 2022 | * it is better to incur scheduling-clock interrupts than to spin | ||
| 2023 | * continuously for the same time duration! | ||
| 2024 | * | ||
| 2025 | * The delta_jiffies argument is used to store the time when RCU is | ||
| 2026 | * going to need the CPU again if it still has callbacks. The reason | ||
| 2027 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
| 2028 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
| 2029 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
| 2030 | * delayed until the wakeup time, which defeats the purpose of posting | ||
| 2031 | * a timer. | ||
| 2032 | */ | ||
| 2033 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
| 2034 | { | ||
| 2035 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 2036 | |||
| 2037 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
| 2038 | rdtp->idle_first_pass = 1; | ||
| 2039 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
| 2040 | if (!rcu_cpu_has_callbacks(cpu)) { | ||
| 2041 | *delta_jiffies = ULONG_MAX; | ||
| 2042 | return 0; | ||
| 2043 | } | ||
| 2044 | if (rdtp->dyntick_holdoff == jiffies) { | ||
| 2045 | /* RCU recently tried and failed, so don't try again. */ | ||
| 2046 | *delta_jiffies = 1; | ||
| 2047 | return 1; | ||
| 2048 | } | ||
| 2049 | /* Set up for the possibility that RCU will post a timer. */ | ||
| 2050 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | ||
| 2051 | *delta_jiffies = RCU_IDLE_GP_DELAY; | ||
| 2052 | else | ||
| 2053 | *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; | ||
| 2054 | return 0; | ||
| 2055 | } | ||
| 2056 | |||
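The delta_jiffies out-parameter gives the tick-stop path an upper bound to program before rcu_prepare_for_idle() has had a chance to post its timer. The fragment below sketches the caller's side of that contract; nohz_sleep_jiffies() is an illustrative helper, the real logic of this shape lives in tick_nohz_stop_sched_tick().

    /* Sketch: bound the nohz sleep length by RCU's wakeup horizon. */
    static unsigned long nohz_sleep_jiffies(int cpu)
    {
        unsigned long rcu_delta, delta, last = jiffies;

        if (rcu_needs_cpu(cpu, &rcu_delta))
            return 1;               /* RCU needs this CPU: re-check next jiffy */

        delta = get_next_timer_interrupt(last) - last;
        if (rcu_delta < delta)      /* don't sleep past RCU's idle_gp_timer */
            delta = rcu_delta;
        return delta;
    }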
| 2057 | /* | ||
| 2043 | * Handler for smp_call_function_single(). The only point of this | 2058 | * Handler for smp_call_function_single(). The only point of this |
| 2044 | * handler is to wake the CPU up, so the handler does only tracing. | 2059 | * handler is to wake the CPU up, so the handler does only tracing. |
| 2045 | */ | 2060 | */ |
| @@ -2075,21 +2090,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) | |||
| 2075 | */ | 2090 | */ |
| 2076 | static void rcu_prepare_for_idle_init(int cpu) | 2091 | static void rcu_prepare_for_idle_init(int cpu) |
| 2077 | { | 2092 | { |
| 2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2093 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), | 2094 | |
| 2080 | rcu_idle_gp_timer_func, cpu); | 2095 | rdtp->dyntick_holdoff = jiffies - 1; |
| 2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; | 2096 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); |
| 2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; | 2097 | rdtp->idle_gp_timer_expires = jiffies - 1; |
| 2098 | rdtp->idle_first_pass = 1; | ||
| 2083 | } | 2099 | } |
| 2084 | 2100 | ||
| 2085 | /* | 2101 | /* |
| 2086 | * Clean up for exit from idle. Because we are exiting from idle, there | 2102 | * Clean up for exit from idle. Because we are exiting from idle, there |
| 2087 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | 2103 | * is no longer any point to ->idle_gp_timer, so cancel it. This will |
| 2088 | * do nothing if this timer is not active, so just cancel it unconditionally. | 2104 | * do nothing if this timer is not active, so just cancel it unconditionally. |
| 2089 | */ | 2105 | */ |
| 2090 | static void rcu_cleanup_after_idle(int cpu) | 2106 | static void rcu_cleanup_after_idle(int cpu) |
| 2091 | { | 2107 | { |
| 2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); | 2108 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 2109 | |||
| 2110 | del_timer(&rdtp->idle_gp_timer); | ||
| 2093 | trace_rcu_prep_idle("Cleanup after idle"); | 2111 | trace_rcu_prep_idle("Cleanup after idle"); |
| 2094 | } | 2112 | } |
| 2095 | 2113 | ||
| @@ -2108,42 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu) | |||
| 2108 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2126 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
| 2109 | * disabled, we do one pass of force_quiescent_state(), then do a | 2127 | * disabled, we do one pass of force_quiescent_state(), then do a |
| 2110 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2128 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
| 2111 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2129 | * later. The ->dyntick_drain field controls the sequencing. |
| 2112 | * | 2130 | * |
| 2113 | * The caller must have disabled interrupts. | 2131 | * The caller must have disabled interrupts. |
| 2114 | */ | 2132 | */ |
| 2115 | static void rcu_prepare_for_idle(int cpu) | 2133 | static void rcu_prepare_for_idle(int cpu) |
| 2116 | { | 2134 | { |
| 2117 | struct timer_list *tp; | 2135 | struct timer_list *tp; |
| 2136 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 2118 | 2137 | ||
| 2119 | /* | 2138 | /* |
| 2120 | * If this is an idle re-entry, for example, due to use of | 2139 | * If this is an idle re-entry, for example, due to use of |
| 2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 2140 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle |
| 2122 | * loop, then don't take any state-machine actions, unless the | 2141 | * loop, then don't take any state-machine actions, unless the |
| 2123 | * momentary exit from idle queued additional non-lazy callbacks. | 2142 | * momentary exit from idle queued additional non-lazy callbacks. |
| 2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | 2143 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks |
| 2125 | * pending. | 2144 | * pending. |
| 2126 | */ | 2145 | */ |
| 2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | 2146 | if (!rdtp->idle_first_pass && |
| 2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | 2147 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { |
| 2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
| 2130 | if (rcu_cpu_has_callbacks(cpu)) { | 2148 | if (rcu_cpu_has_callbacks(cpu)) { |
| 2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2149 | tp = &rdtp->idle_gp_timer; |
| 2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2150 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
| 2133 | } | 2151 | } |
| 2134 | return; | 2152 | return; |
| 2135 | } | 2153 | } |
| 2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | 2154 | rdtp->idle_first_pass = 0; |
| 2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2155 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; |
| 2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
| 2139 | 2156 | ||
| 2140 | /* | 2157 | /* |
| 2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2158 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
| 2142 | * Also reset state to avoid prejudicing later attempts. | 2159 | * Also reset state to avoid prejudicing later attempts. |
| 2143 | */ | 2160 | */ |
| 2144 | if (!rcu_cpu_has_callbacks(cpu)) { | 2161 | if (!rcu_cpu_has_callbacks(cpu)) { |
| 2145 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2162 | rdtp->dyntick_holdoff = jiffies - 1; |
| 2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2163 | rdtp->dyntick_drain = 0; |
| 2147 | trace_rcu_prep_idle("No callbacks"); | 2164 | trace_rcu_prep_idle("No callbacks"); |
| 2148 | return; | 2165 | return; |
| 2149 | } | 2166 | } |
| @@ -2152,36 +2169,37 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 2152 | * If in holdoff mode, just return. We will presumably have | 2169 | * If in holdoff mode, just return. We will presumably have |
| 2153 | * refrained from disabling the scheduling-clock tick. | 2170 | * refrained from disabling the scheduling-clock tick. |
| 2154 | */ | 2171 | */ |
| 2155 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2172 | if (rdtp->dyntick_holdoff == jiffies) { |
| 2156 | trace_rcu_prep_idle("In holdoff"); | 2173 | trace_rcu_prep_idle("In holdoff"); |
| 2157 | return; | 2174 | return; |
| 2158 | } | 2175 | } |
| 2159 | 2176 | ||
| 2160 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2177 | /* Check and update the ->dyntick_drain sequencing. */ |
| 2161 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2178 | if (rdtp->dyntick_drain <= 0) { |
| 2162 | /* First time through, initialize the counter. */ | 2179 | /* First time through, initialize the counter. */ |
| 2163 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2180 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; |
| 2164 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2181 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && |
| 2165 | !rcu_pending(cpu) && | 2182 | !rcu_pending(cpu) && |
| 2166 | !local_softirq_pending()) { | 2183 | !local_softirq_pending()) { |
| 2167 | /* Can we go dyntick-idle despite still having callbacks? */ | 2184 | /* Can we go dyntick-idle despite still having callbacks? */ |
| 2168 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2185 | rdtp->dyntick_drain = 0; |
| 2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2186 | rdtp->dyntick_holdoff = jiffies; |
| 2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2187 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
| 2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2188 | trace_rcu_prep_idle("Dyntick with callbacks"); |
| 2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2189 | rdtp->idle_gp_timer_expires = |
| 2173 | jiffies + RCU_IDLE_GP_DELAY; | 2190 | jiffies + RCU_IDLE_GP_DELAY; |
| 2174 | else | 2191 | } else { |
| 2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2192 | rdtp->idle_gp_timer_expires = |
| 2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; | 2193 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
| 2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2194 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); |
| 2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2195 | } |
| 2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2196 | tp = &rdtp->idle_gp_timer; |
| 2180 | per_cpu(rcu_nonlazy_posted, cpu); | 2197 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
| 2198 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
| 2181 | return; /* Nothing more to do immediately. */ | 2199 | return; /* Nothing more to do immediately. */ |
| 2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2200 | } else if (--(rdtp->dyntick_drain) <= 0) { |
| 2183 | /* We have hit the limit, so time to give up. */ | 2201 | /* We have hit the limit, so time to give up. */ |
| 2184 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2202 | rdtp->dyntick_holdoff = jiffies; |
| 2185 | trace_rcu_prep_idle("Begin holdoff"); | 2203 | trace_rcu_prep_idle("Begin holdoff"); |
| 2186 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2204 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
| 2187 | return; | 2205 | return; |
| @@ -2227,7 +2245,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 2227 | */ | 2245 | */ |
| 2228 | static void rcu_idle_count_callbacks_posted(void) | 2246 | static void rcu_idle_count_callbacks_posted(void) |
| 2229 | { | 2247 | { |
| 2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | 2248 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); |
| 2231 | } | 2249 | } |
| 2232 | 2250 | ||
| 2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2251 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
| @@ -2238,11 +2256,12 @@ static void rcu_idle_count_callbacks_posted(void) | |||
| 2238 | 2256 | ||
| 2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2257 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
| 2240 | { | 2258 | { |
| 2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); | 2259 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 2260 | struct timer_list *tltp = &rdtp->idle_gp_timer; | ||
| 2242 | 2261 | ||
| 2243 | sprintf(cp, "drain=%d %c timer=%lu", | 2262 | sprintf(cp, "drain=%d %c timer=%lu", |
| 2244 | per_cpu(rcu_dyntick_drain, cpu), | 2263 | rdtp->dyntick_drain, |
| 2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2264 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', |
| 2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); | 2265 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
| 2247 | } | 2266 | } |
| 2248 | 2267 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
| 142 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
| 143 | #name , | 143 | #name , |
| 144 | 144 | ||
| 145 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
| 146 | #include "features.h" | 146 | #include "features.h" |
| 147 | NULL | ||
| 148 | }; | 147 | }; |
| 149 | 148 | ||
| 150 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
| @@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2082 | #endif | 2081 | #endif |
| 2083 | 2082 | ||
| 2084 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
| 2085 | rcu_switch_from(prev); | ||
| 2086 | switch_to(prev, next, prev); | 2084 | switch_to(prev, next, prev); |
| 2087 | 2085 | ||
| 2088 | barrier(); | 2086 | barrier(); |
| @@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
| 2162 | } | 2160 | } |
| 2163 | 2161 | ||
| 2164 | 2162 | ||
| 2163 | /* | ||
| 2164 | * Global load-average calculations | ||
| 2165 | * | ||
| 2166 | * We take a distributed and async approach to calculating the global load-avg | ||
| 2167 | * in order to minimize overhead. | ||
| 2168 | * | ||
| 2169 | * The global load average is an exponentially decaying average of nr_running + | ||
| 2170 | * nr_uninterruptible. | ||
| 2171 | * | ||
| 2172 | * Once every LOAD_FREQ: | ||
| 2173 | * | ||
| 2174 | * nr_active = 0; | ||
| 2175 | * for_each_possible_cpu(cpu) | ||
| 2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
| 2177 | * | ||
| 2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
| 2179 | * | ||
| 2180 | * Due to a number of reasons the above turns in the mess below: | ||
| 2181 | * | ||
| 2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
| 2183 | * a serious number of cpus, therefore we need to take a distributed approach | ||
| 2184 | * to calculating nr_active. | ||
| 2185 | * | ||
| 2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
| 2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
| 2188 | * | ||
| 2189 | * So assuming nr_active := 0 when we start out -- true by definition, we | ||
| 2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
| 2191 | * to obtain the same result. See calc_load_fold_active(). | ||
| 2192 | * | ||
| 2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
| 2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
| 2195 | * cpu to have completed this task. | ||
| 2196 | * | ||
| 2197 | * This places an upper bound on the IRQ-off latency of the machine. Then | ||
| 2198 | * again, being late doesn't lose the delta, it just wrecks the sample. | ||
| 2199 | * | ||
| 2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
| 2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
| 2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
| 2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
| 2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
| 2205 | * all cpus yields the correct result. | ||
| 2206 | * | ||
| 2207 | * This covers the NO_HZ=n code; for extra headaches, see the comment below. | ||
| 2208 | */ | ||
| 2209 | |||
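The avenrun[n] update described above is ordinary fixed-point exponential decay. The stand-alone illustration below uses FSHIFT, FIXED_1 and EXP_1 as quoted from include/linux/sched.h of this era (11, 2048 and 1884, the latter being 1/e^(5s/1min) in fixed point); the ten-window loop and the three runnable tasks are made up for the demonstration.

    #include <stdio.h>

    #define FSHIFT  11                  /* bits of fractional precision */
    #define FIXED_1 (1 << FSHIFT)       /* 1.0 in fixed point */
    #define EXP_1   1884                /* 1/exp(5s/1min) in fixed point */

    /* a1 = a0 * e + active * (1 - e), everything in FSHIFT fixed point */
    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
    }

    int main(void)
    {
        unsigned long load1 = 0;
        int i;

        /* pretend three tasks stay runnable for ten 5-second LOAD_FREQ windows */
        for (i = 0; i < 10; i++) {
            load1 = calc_load(load1, EXP_1, 3 * FIXED_1);
            printf("after %2d s: load1 = %lu.%02lu\n", (i + 1) * 5,
                   load1 >> FSHIFT,
                   ((load1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
    }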
| 2165 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
| 2166 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
| 2167 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
| 2168 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
| 2169 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
| 2215 | |||
| 2216 | /** | ||
| 2217 | * get_avenrun - get the load average array | ||
| 2218 | * @loads: pointer to dest load array | ||
| 2219 | * @offset: offset to add | ||
| 2220 | * @shift: shift count to shift the result left | ||
| 2221 | * | ||
| 2222 | * These values are estimates at best, so no need for locking. | ||
| 2223 | */ | ||
| 2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2225 | { | ||
| 2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2229 | } | ||
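[Editor's note] As a usage sketch only: a hypothetical in-kernel caller could format the three averages the way /proc/loadavg conventionally does, passing FIXED_1/200 as the offset so the truncation rounds to the nearest hundredth. print_loadavg() and the LOAD_INT/LOAD_FRAC helpers are local illustrations here, not part of this patch.

    /* Hypothetical reader; LOAD_INT/LOAD_FRAC follow the usual /proc/loadavg idiom. */
    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    static void print_loadavg(void)
    {
        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1 / 200, 0);  /* +1/200 rounds to 2 decimals */

        printk(KERN_INFO "load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
               LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
               LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
               LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
    }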
| 2170 | 2230 | ||
| 2171 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
| 2172 | { | 2232 | { |
| @@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
| 2183 | return delta; | 2243 | return delta; |
| 2184 | } | 2244 | } |
| 2185 | 2245 | ||
| 2246 | /* | ||
| 2247 | * a1 = a0 * e + a * (1 - e) | ||
| 2248 | */ | ||
| 2186 | static unsigned long | 2249 | static unsigned long |
| 2187 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
| 2188 | { | 2251 | { |
| @@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 2194 | 2257 | ||
| 2195 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
| 2196 | /* | 2259 | /* |
| 2197 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
| 2261 | * | ||
| 2262 | * Since the above described distributed algorithm to compute the global | ||
| 2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
| 2264 | * NO_HZ. | ||
| 2265 | * | ||
| 2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
| 2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
| 2268 | * when we read the global state. | ||
| 2269 | * | ||
| 2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
| 2271 | * | ||
| 2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
| 2273 | * contribution, causing under-accounting. | ||
| 2274 | * | ||
| 2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
| 2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
| 2277 | * | ||
| 2278 | * The only trick is the slight shift in index flip for read vs write. | ||
| 2279 | * | ||
| 2280 | * 0s 5s 10s 15s | ||
| 2281 | * +10 +10 +10 +10 | ||
| 2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
| 2283 | * r:0 0 1 1 0 0 1 1 0 | ||
| 2284 | * w:0 1 1 0 0 1 1 0 0 | ||
| 2285 | * | ||
| 2286 | * This ensures we'll fold the old idle contribution in this window while | ||
| 2287 | * accumulating the new one. | ||
| 2288 | * | ||
| 2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
| 2290 | * contribution, since we effectively move our sample point to a known | ||
| 2291 | * busy state. | ||
| 2292 | * | ||
| 2293 | * This is solved by pushing the window forward, and thus skipping the | ||
| 2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
| 2295 | * was in effect at the time the window opened). This also solves the issue | ||
| 2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
| 2297 | * LOAD_FREQ intervals. | ||
| 2198 | * | 2298 | * |
| 2199 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
| 2200 | */ | 2300 | */ |
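[Editor's note] The read/write index shift is subtle enough to deserve a toy model. The user-space sketch below is illustrative only: the names, the 'now'/'window_open' stand-in clock and the sample deltas are invented for the demo, not kernel code. It shows how a delta folded before the window opens lands in the slot the next global update will consume, while a delta folded after the window opens is deferred to the following one.

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long idle_delta[2];
    static int idx;                 /* flipped once per LOAD_FREQ window        */
    static long now, window_open;   /* stand-ins for jiffies / calc_load_update */

    /* writers target the next slot once the fold window has opened */
    static int write_idx(void) { return (idx + (now >= window_open)) & 1; }
    static int read_idx(void)  { return idx & 1; }

    int main(void)
    {
        /* A cpu goes idle before the window opens: current slot. */
        now = 4; window_open = 5;
        atomic_fetch_add(&idle_delta[write_idx()], 2);

        /* Another cpu goes idle after the window opened but before the
         * flip: next slot, i.e. it only counts toward the next sample. */
        now = 6;
        atomic_fetch_add(&idle_delta[write_idx()], 3);

        /* Global update: consume the old slot, then flip the index. */
        long folded = atomic_exchange(&idle_delta[read_idx()], 0);
        idx++;
        printf("folded %ld now, %ld deferred to the next window\n",
               folded, atomic_load(&idle_delta[read_idx()]));
        return 0;
    }

The piece the toy leaves out is the smp_rmb()/smp_wmb() pairing in calc_load_write_idx() and calc_global_nohz() below, which orders the index flip against the update of calc_load_update.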
| 2201 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
| 2302 | static int calc_load_idx; | ||
| 2303 | |||
| 2304 | static inline int calc_load_write_idx(void) | ||
| 2305 | { | ||
| 2306 | int idx = calc_load_idx; | ||
| 2307 | |||
| 2308 | /* | ||
| 2309 | * See calc_global_nohz(); if we observe the new index, we also | ||
| 2310 | * need to observe the new update time. | ||
| 2311 | */ | ||
| 2312 | smp_rmb(); | ||
| 2313 | |||
| 2314 | /* | ||
| 2315 | * If the folding window started, make sure we start writing in the | ||
| 2316 | * next idle-delta. | ||
| 2317 | */ | ||
| 2318 | if (!time_before(jiffies, calc_load_update)) | ||
| 2319 | idx++; | ||
| 2202 | 2320 | ||
| 2203 | void calc_load_account_idle(struct rq *this_rq) | 2321 | return idx & 1; |
| 2322 | } | ||
| 2323 | |||
| 2324 | static inline int calc_load_read_idx(void) | ||
| 2204 | { | 2325 | { |
| 2326 | return calc_load_idx & 1; | ||
| 2327 | } | ||
| 2328 | |||
| 2329 | void calc_load_enter_idle(void) | ||
| 2330 | { | ||
| 2331 | struct rq *this_rq = this_rq(); | ||
| 2205 | long delta; | 2332 | long delta; |
| 2206 | 2333 | ||
| 2334 | /* | ||
| 2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
| 2336 | * into the pending idle delta. | ||
| 2337 | */ | ||
| 2207 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
| 2208 | if (delta) | 2339 | if (delta) { |
| 2209 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
| 2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
| 2342 | } | ||
| 2210 | } | 2343 | } |
| 2211 | 2344 | ||
| 2212 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
| 2213 | { | 2346 | { |
| 2214 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
| 2215 | 2348 | ||
| 2216 | /* | 2349 | /* |
| 2217 | * Its got a race, we don't care... | 2350 | * If we're still before the sample window, we're done. |
| 2218 | */ | 2351 | */ |
| 2219 | if (atomic_long_read(&calc_load_tasks_idle)) | 2352 | if (time_before(jiffies, this_rq->calc_load_update)) |
| 2220 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2353 | return; |
| 2354 | |||
| 2355 | /* | ||
| 2356 | * We woke inside or after the sample window, which means we're already | ||
| 2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
| 2358 | * sync up for the next window. | ||
| 2359 | */ | ||
| 2360 | this_rq->calc_load_update = calc_load_update; | ||
| 2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
| 2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | static long calc_load_fold_idle(void) | ||
| 2366 | { | ||
| 2367 | int idx = calc_load_read_idx(); | ||
| 2368 | long delta = 0; | ||
| 2369 | |||
| 2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
| 2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
| 2221 | 2372 | ||
| 2222 | return delta; | 2373 | return delta; |
| 2223 | } | 2374 | } |
| @@ -2303,66 +2454,39 @@ static void calc_global_nohz(void) | |||
| 2303 | { | 2454 | { |
| 2304 | long delta, active, n; | 2455 | long delta, active, n; |
| 2305 | 2456 | ||
| 2306 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
| 2307 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
| 2308 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
| 2309 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
| 2310 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
| 2311 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
| 2312 | delta = calc_load_fold_idle(); | ||
| 2313 | if (delta) | ||
| 2314 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2315 | |||
| 2316 | /* | ||
| 2317 | * It could be the one fold was all it took, we done! | ||
| 2318 | */ | ||
| 2319 | if (time_before(jiffies, calc_load_update + 10)) | ||
| 2320 | return; | ||
| 2321 | |||
| 2322 | /* | ||
| 2323 | * Catch-up, fold however many we are behind still | ||
| 2324 | */ | ||
| 2325 | delta = jiffies - calc_load_update - 10; | ||
| 2326 | n = 1 + (delta / LOAD_FREQ); | ||
| 2327 | 2463 | ||
| 2328 | active = atomic_long_read(&calc_load_tasks); | 2464 | active = atomic_long_read(&calc_load_tasks); |
| 2329 | active = active > 0 ? active * FIXED_1 : 0; | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
| 2330 | 2466 | ||
| 2331 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
| 2332 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
| 2333 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
| 2334 | 2470 | ||
| 2335 | calc_load_update += n * LOAD_FREQ; | 2471 | calc_load_update += n * LOAD_FREQ; |
| 2336 | } | 2472 | } |
| 2337 | #else | ||
| 2338 | void calc_load_account_idle(struct rq *this_rq) | ||
| 2339 | { | ||
| 2340 | } | ||
| 2341 | 2473 | ||
| 2342 | static inline long calc_load_fold_idle(void) | 2474 | /* |
| 2343 | { | 2475 | * Flip the idle index... |
| 2344 | return 0; | 2476 | * |
| 2477 | * Make sure we first write the new time then flip the index, so that | ||
| 2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
| 2479 | * index, this avoids a double flip messing things up. | ||
| 2480 | */ | ||
| 2481 | smp_wmb(); | ||
| 2482 | calc_load_idx++; | ||
| 2345 | } | 2483 | } |
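[Editor's note] A rough worked example of the catch-up branch above, under the usual 5-second LOAD_FREQ: if jiffies is two-and-a-bit LOAD_FREQ periods past the pending update point when we get here, delta/LOAD_FREQ is 2, so n = 3 and calc_load_n() applies each decay factor cubed in a single step rather than iterating once per missed window (the fixed-point exponentiation helper sits next to calc_load_n() in this file).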
| 2484 | #else /* !CONFIG_NO_HZ */ | ||
| 2346 | 2485 | ||
| 2347 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
| 2348 | { | 2487 | static inline void calc_global_nohz(void) { } |
| 2349 | } | ||
| 2350 | #endif | ||
| 2351 | 2488 | ||
| 2352 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
| 2353 | * get_avenrun - get the load average array | ||
| 2354 | * @loads: pointer to dest load array | ||
| 2355 | * @offset: offset to add | ||
| 2356 | * @shift: shift count to shift the result left | ||
| 2357 | * | ||
| 2358 | * These values are estimates at best, so no need for locking. | ||
| 2359 | */ | ||
| 2360 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2361 | { | ||
| 2362 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2363 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2364 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2365 | } | ||
| 2366 | 2490 | ||
| 2367 | /* | 2491 | /* |
| 2368 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
| @@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
| 2370 | */ | 2494 | */ |
| 2371 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
| 2372 | { | 2496 | { |
| 2373 | long active; | 2497 | long active, delta; |
| 2374 | 2498 | ||
| 2375 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
| 2376 | return; | 2500 | return; |
| 2377 | 2501 | ||
| 2502 | /* | ||
| 2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
| 2504 | */ | ||
| 2505 | delta = calc_load_fold_idle(); | ||
| 2506 | if (delta) | ||
| 2507 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2508 | |||
| 2378 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
| 2379 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
| 2380 | 2511 | ||
| @@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
| 2385 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
| 2386 | 2517 | ||
| 2387 | /* | 2518 | /* |
| 2388 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
| 2389 | * folding in the nohz state and ageing the entire idle period. | ||
| 2390 | * | ||
| 2391 | * This avoids loosing a sample when we go idle between | ||
| 2392 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
| 2393 | * under-accounting. | ||
| 2394 | */ | 2520 | */ |
| 2395 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
| 2396 | } | 2522 | } |
| @@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2407 | return; | 2533 | return; |
| 2408 | 2534 | ||
| 2409 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
| 2410 | delta += calc_load_fold_idle(); | ||
| 2411 | if (delta) | 2536 | if (delta) |
| 2412 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
| 2413 | 2538 | ||
| @@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2415 | } | 2540 | } |
| 2416 | 2541 | ||
| 2417 | /* | 2542 | /* |
| 2543 | * End of global load-average stuff | ||
| 2544 | */ | ||
| 2545 | |||
| 2546 | /* | ||
| 2418 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
| 2419 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
| 2420 | * | 2549 | * |
| @@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
| 2517 | sched_avg_update(this_rq); | 2646 | sched_avg_update(this_rq); |
| 2518 | } | 2647 | } |
| 2519 | 2648 | ||
| 2649 | #ifdef CONFIG_NO_HZ | ||
| 2650 | /* | ||
| 2651 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
| 2652 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
| 2653 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
| 2654 | * | ||
| 2655 | * Therefore we cannot use the delta approach from the regular tick since that | ||
| 2656 | * would seriously skew the load calculation. However we'll make do for those | ||
| 2657 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
| 2658 | * (tick_nohz_idle_exit). | ||
| 2659 | * | ||
| 2660 | * This means we might still be one tick off for nohz periods. | ||
| 2661 | */ | ||
| 2662 | |||
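[Editor's note] A concrete instance of the skew being described: if the jiffies-updating cpu ticks at t and t+1 while another cpu samples jiffies just before the first update and again just after the second, the observer counts zero pending updates the first time and two the next, even though both intervals were one tick long. That is why the nohz paths below settle for being one tick off rather than trying to derive tick-accurate deltas from jiffies.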
| 2520 | /* | 2663 | /* |
| 2521 | * Called from nohz_idle_balance() to update the load ratings before doing the | 2664 | * Called from nohz_idle_balance() to update the load ratings before doing the |
| 2522 | * idle balance. | 2665 | * idle balance. |
| 2523 | */ | 2666 | */ |
| 2524 | void update_idle_cpu_load(struct rq *this_rq) | 2667 | void update_idle_cpu_load(struct rq *this_rq) |
| 2525 | { | 2668 | { |
| 2526 | unsigned long curr_jiffies = jiffies; | 2669 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); |
| 2527 | unsigned long load = this_rq->load.weight; | 2670 | unsigned long load = this_rq->load.weight; |
| 2528 | unsigned long pending_updates; | 2671 | unsigned long pending_updates; |
| 2529 | 2672 | ||
| 2530 | /* | 2673 | /* |
| 2531 | * Bloody broken means of dealing with nohz, but better than nothing.. | 2674 | * bail if there's load or we're actually up-to-date. |
| 2532 | * jiffies is updated by one cpu, another cpu can drift wrt the jiffy | ||
| 2533 | * update and see 0 difference the one time and 2 the next, even though | ||
| 2534 | * we ticked at roughtly the same rate. | ||
| 2535 | * | ||
| 2536 | * Hence we only use this from nohz_idle_balance() and skip this | ||
| 2537 | * nonsense when called from the scheduler_tick() since that's | ||
| 2538 | * guaranteed a stable rate. | ||
| 2539 | */ | 2675 | */ |
| 2540 | if (load || curr_jiffies == this_rq->last_load_update_tick) | 2676 | if (load || curr_jiffies == this_rq->last_load_update_tick) |
| 2541 | return; | 2677 | return; |
| @@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq) | |||
| 2547 | } | 2683 | } |
| 2548 | 2684 | ||
| 2549 | /* | 2685 | /* |
| 2686 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
| 2687 | */ | ||
| 2688 | void update_cpu_load_nohz(void) | ||
| 2689 | { | ||
| 2690 | struct rq *this_rq = this_rq(); | ||
| 2691 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
| 2692 | unsigned long pending_updates; | ||
| 2693 | |||
| 2694 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
| 2695 | return; | ||
| 2696 | |||
| 2697 | raw_spin_lock(&this_rq->lock); | ||
| 2698 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 2699 | if (pending_updates) { | ||
| 2700 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 2701 | /* | ||
| 2702 | * We were idle, which means load 0; the current load might be | ||
| 2703 | * !0 due to remote wakeups and the like. | ||
| 2704 | */ | ||
| 2705 | __update_cpu_load(this_rq, 0, pending_updates); | ||
| 2706 | } | ||
| 2707 | raw_spin_unlock(&this_rq->lock); | ||
| 2708 | } | ||
| 2709 | #endif /* CONFIG_NO_HZ */ | ||
| 2710 | |||
| 2711 | /* | ||
| 2550 | * Called from scheduler_tick() | 2712 | * Called from scheduler_tick() |
| 2551 | */ | 2713 | */ |
| 2552 | static void update_cpu_load_active(struct rq *this_rq) | 2714 | static void update_cpu_load_active(struct rq *this_rq) |
| 2553 | { | 2715 | { |
| 2554 | /* | 2716 | /* |
| 2555 | * See the mess in update_idle_cpu_load(). | 2717 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). |
| 2556 | */ | 2718 | */ |
| 2557 | this_rq->last_load_update_tick = jiffies; | 2719 | this_rq->last_load_update_tick = jiffies; |
| 2558 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | 2720 | __update_cpu_load(this_rq, this_rq->load.weight, 1); |
| @@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
| 4982 | p->sched_class->set_cpus_allowed(p, new_mask); | 5144 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 4983 | 5145 | ||
| 4984 | cpumask_copy(&p->cpus_allowed, new_mask); | 5146 | cpumask_copy(&p->cpus_allowed, new_mask); |
| 4985 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5147 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
| 4986 | } | 5148 | } |
| 4987 | 5149 | ||
| 4988 | /* | 5150 | /* |
| @@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
| 5524 | 5686 | ||
| 5525 | #ifdef CONFIG_SCHED_DEBUG | 5687 | #ifdef CONFIG_SCHED_DEBUG |
| 5526 | 5688 | ||
| 5527 | static __read_mostly int sched_domain_debug_enabled; | 5689 | static __read_mostly int sched_debug_enabled; |
| 5528 | 5690 | ||
| 5529 | static int __init sched_domain_debug_setup(char *str) | 5691 | static int __init sched_debug_setup(char *str) |
| 5530 | { | 5692 | { |
| 5531 | sched_domain_debug_enabled = 1; | 5693 | sched_debug_enabled = 1; |
| 5532 | 5694 | ||
| 5533 | return 0; | 5695 | return 0; |
| 5534 | } | 5696 | } |
| 5535 | early_param("sched_debug", sched_domain_debug_setup); | 5697 | early_param("sched_debug", sched_debug_setup); |
| 5698 | |||
| 5699 | static inline bool sched_debug(void) | ||
| 5700 | { | ||
| 5701 | return sched_debug_enabled; | ||
| 5702 | } | ||
| 5536 | 5703 | ||
| 5537 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5704 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 5538 | struct cpumask *groupmask) | 5705 | struct cpumask *groupmask) |
| @@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5572 | break; | 5739 | break; |
| 5573 | } | 5740 | } |
| 5574 | 5741 | ||
| 5575 | if (!group->sgp->power) { | 5742 | /* |
| 5743 | * Even though we initialize ->power to something semi-sane, | ||
| 5744 | * we leave power_orig unset. This allows us to detect if | ||
| 5745 | * domain iteration is still funny without causing /0 traps. | ||
| 5746 | */ | ||
| 5747 | if (!group->sgp->power_orig) { | ||
| 5576 | printk(KERN_CONT "\n"); | 5748 | printk(KERN_CONT "\n"); |
| 5577 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5749 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 5578 | "set\n"); | 5750 | "set\n"); |
| @@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5620 | { | 5792 | { |
| 5621 | int level = 0; | 5793 | int level = 0; |
| 5622 | 5794 | ||
| 5623 | if (!sched_domain_debug_enabled) | 5795 | if (!sched_debug_enabled) |
| 5624 | return; | 5796 | return; |
| 5625 | 5797 | ||
| 5626 | if (!sd) { | 5798 | if (!sd) { |
| @@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5641 | } | 5813 | } |
| 5642 | #else /* !CONFIG_SCHED_DEBUG */ | 5814 | #else /* !CONFIG_SCHED_DEBUG */ |
| 5643 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5815 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 5816 | static inline bool sched_debug(void) | ||
| 5817 | { | ||
| 5818 | return false; | ||
| 5819 | } | ||
| 5644 | #endif /* CONFIG_SCHED_DEBUG */ | 5820 | #endif /* CONFIG_SCHED_DEBUG */ |
| 5645 | 5821 | ||
| 5646 | static int sd_degenerate(struct sched_domain *sd) | 5822 | static int sd_degenerate(struct sched_domain *sd) |
| @@ -5962,6 +6138,44 @@ struct sched_domain_topology_level { | |||
| 5962 | struct sd_data data; | 6138 | struct sd_data data; |
| 5963 | }; | 6139 | }; |
| 5964 | 6140 | ||
| 6141 | /* | ||
| 6142 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
| 6143 | * domain traversal. | ||
| 6144 | * | ||
| 6145 | * Asymmetric node setups can result in situations where the domain tree is of | ||
| 6146 | * unequal depth, make sure to skip domains that already cover the entire | ||
| 6147 | * range. | ||
| 6148 | * | ||
| 6149 | * In that case build_sched_domains() will have terminated the iteration early | ||
| 6150 | * and our sibling sd spans will be empty. Domains should always include the | ||
| 6151 | * cpu they're built on, so check that. | ||
| 6152 | * | ||
| 6153 | */ | ||
| 6154 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
| 6155 | { | ||
| 6156 | const struct cpumask *span = sched_domain_span(sd); | ||
| 6157 | struct sd_data *sdd = sd->private; | ||
| 6158 | struct sched_domain *sibling; | ||
| 6159 | int i; | ||
| 6160 | |||
| 6161 | for_each_cpu(i, span) { | ||
| 6162 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
| 6163 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
| 6164 | continue; | ||
| 6165 | |||
| 6166 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
| 6167 | } | ||
| 6168 | } | ||
| 6169 | |||
| 6170 | /* | ||
| 6171 | * Return the canonical balance cpu for this group, this is the first cpu | ||
| 6172 | * of this group that's also in the iteration mask. | ||
| 6173 | */ | ||
| 6174 | int group_balance_cpu(struct sched_group *sg) | ||
| 6175 | { | ||
| 6176 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
| 6177 | } | ||
| 6178 | |||
| 5965 | static int | 6179 | static int |
| 5966 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6180 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
| 5967 | { | 6181 | { |
| @@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5980 | if (cpumask_test_cpu(i, covered)) | 6194 | if (cpumask_test_cpu(i, covered)) |
| 5981 | continue; | 6195 | continue; |
| 5982 | 6196 | ||
| 6197 | child = *per_cpu_ptr(sdd->sd, i); | ||
| 6198 | |||
| 6199 | /* See the comment near build_group_mask(). */ | ||
| 6200 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
| 6201 | continue; | ||
| 6202 | |||
| 5983 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6203 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| 5984 | GFP_KERNEL, cpu_to_node(cpu)); | 6204 | GFP_KERNEL, cpu_to_node(cpu)); |
| 5985 | 6205 | ||
| @@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5987 | goto fail; | 6207 | goto fail; |
| 5988 | 6208 | ||
| 5989 | sg_span = sched_group_cpus(sg); | 6209 | sg_span = sched_group_cpus(sg); |
| 5990 | |||
| 5991 | child = *per_cpu_ptr(sdd->sd, i); | ||
| 5992 | if (child->child) { | 6210 | if (child->child) { |
| 5993 | child = child->child; | 6211 | child = child->child; |
| 5994 | cpumask_copy(sg_span, sched_domain_span(child)); | 6212 | cpumask_copy(sg_span, sched_domain_span(child)); |
| @@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5997 | 6215 | ||
| 5998 | cpumask_or(covered, covered, sg_span); | 6216 | cpumask_or(covered, covered, sg_span); |
| 5999 | 6217 | ||
| 6000 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6218 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
| 6001 | atomic_inc(&sg->sgp->ref); | 6219 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
| 6220 | build_group_mask(sd, sg); | ||
| 6002 | 6221 | ||
| 6003 | if (cpumask_test_cpu(cpu, sg_span)) | 6222 | /* |
| 6223 | * Initialize sgp->power such that even if we mess up the | ||
| 6224 | * domains and no possible iteration will get us here, we won't | ||
| 6225 | * die on a /0 trap. | ||
| 6226 | */ | ||
| 6227 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
| 6228 | |||
| 6229 | /* | ||
| 6230 | * Make sure the first group of this domain contains the | ||
| 6231 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
| 6232 | * breaks. See update_sg_lb_stats(). | ||
| 6233 | */ | ||
| 6234 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
| 6235 | group_balance_cpu(sg) == cpu) | ||
| 6004 | groups = sg; | 6236 | groups = sg; |
| 6005 | 6237 | ||
| 6006 | if (!first) | 6238 | if (!first) |
| @@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 6074 | 6306 | ||
| 6075 | cpumask_clear(sched_group_cpus(sg)); | 6307 | cpumask_clear(sched_group_cpus(sg)); |
| 6076 | sg->sgp->power = 0; | 6308 | sg->sgp->power = 0; |
| 6309 | cpumask_setall(sched_group_mask(sg)); | ||
| 6077 | 6310 | ||
| 6078 | for_each_cpu(j, span) { | 6311 | for_each_cpu(j, span) { |
| 6079 | if (get_group(j, sdd, NULL) != group) | 6312 | if (get_group(j, sdd, NULL) != group) |
| @@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6115 | sg = sg->next; | 6348 | sg = sg->next; |
| 6116 | } while (sg != sd->groups); | 6349 | } while (sg != sd->groups); |
| 6117 | 6350 | ||
| 6118 | if (cpu != group_first_cpu(sg)) | 6351 | if (cpu != group_balance_cpu(sg)) |
| 6119 | return; | 6352 | return; |
| 6120 | 6353 | ||
| 6121 | update_group_power(sd, cpu); | 6354 | update_group_power(sd, cpu); |
| @@ -6165,11 +6398,8 @@ int sched_domain_level_max; | |||
| 6165 | 6398 | ||
| 6166 | static int __init setup_relax_domain_level(char *str) | 6399 | static int __init setup_relax_domain_level(char *str) |
| 6167 | { | 6400 | { |
| 6168 | unsigned long val; | 6401 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
| 6169 | 6402 | pr_warn("Unable to set relax_domain_level\n"); | |
| 6170 | val = simple_strtoul(str, NULL, 0); | ||
| 6171 | if (val < sched_domain_level_max) | ||
| 6172 | default_relax_domain_level = val; | ||
| 6173 | 6403 | ||
| 6174 | return 1; | 6404 | return 1; |
| 6175 | } | 6405 | } |
| @@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol | |||
| 6279 | #ifdef CONFIG_NUMA | 6509 | #ifdef CONFIG_NUMA |
| 6280 | 6510 | ||
| 6281 | static int sched_domains_numa_levels; | 6511 | static int sched_domains_numa_levels; |
| 6282 | static int sched_domains_numa_scale; | ||
| 6283 | static int *sched_domains_numa_distance; | 6512 | static int *sched_domains_numa_distance; |
| 6284 | static struct cpumask ***sched_domains_numa_masks; | 6513 | static struct cpumask ***sched_domains_numa_masks; |
| 6285 | static int sched_domains_curr_level; | 6514 | static int sched_domains_curr_level; |
| 6286 | 6515 | ||
| 6287 | static inline int sd_local_flags(int level) | 6516 | static inline int sd_local_flags(int level) |
| 6288 | { | 6517 | { |
| 6289 | if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) | 6518 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) |
| 6290 | return 0; | 6519 | return 0; |
| 6291 | 6520 | ||
| 6292 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | 6521 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; |
| @@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu) | |||
| 6344 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | 6573 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; |
| 6345 | } | 6574 | } |
| 6346 | 6575 | ||
| 6576 | static void sched_numa_warn(const char *str) | ||
| 6577 | { | ||
| 6578 | static int done = false; | ||
| 6579 | int i,j; | ||
| 6580 | |||
| 6581 | if (done) | ||
| 6582 | return; | ||
| 6583 | |||
| 6584 | done = true; | ||
| 6585 | |||
| 6586 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
| 6587 | |||
| 6588 | for (i = 0; i < nr_node_ids; i++) { | ||
| 6589 | printk(KERN_WARNING " "); | ||
| 6590 | for (j = 0; j < nr_node_ids; j++) | ||
| 6591 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
| 6592 | printk(KERN_CONT "\n"); | ||
| 6593 | } | ||
| 6594 | printk(KERN_WARNING "\n"); | ||
| 6595 | } | ||
| 6596 | |||
| 6597 | static bool find_numa_distance(int distance) | ||
| 6598 | { | ||
| 6599 | int i; | ||
| 6600 | |||
| 6601 | if (distance == node_distance(0, 0)) | ||
| 6602 | return true; | ||
| 6603 | |||
| 6604 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
| 6605 | if (sched_domains_numa_distance[i] == distance) | ||
| 6606 | return true; | ||
| 6607 | } | ||
| 6608 | |||
| 6609 | return false; | ||
| 6610 | } | ||
| 6611 | |||
| 6347 | static void sched_init_numa(void) | 6612 | static void sched_init_numa(void) |
| 6348 | { | 6613 | { |
| 6349 | int next_distance, curr_distance = node_distance(0, 0); | 6614 | int next_distance, curr_distance = node_distance(0, 0); |
| @@ -6351,7 +6616,6 @@ static void sched_init_numa(void) | |||
| 6351 | int level = 0; | 6616 | int level = 0; |
| 6352 | int i, j, k; | 6617 | int i, j, k; |
| 6353 | 6618 | ||
| 6354 | sched_domains_numa_scale = curr_distance; | ||
| 6355 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | 6619 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); |
| 6356 | if (!sched_domains_numa_distance) | 6620 | if (!sched_domains_numa_distance) |
| 6357 | return; | 6621 | return; |
| @@ -6362,23 +6626,41 @@ static void sched_init_numa(void) | |||
| 6362 | * | 6626 | * |
| 6363 | * Assumes node_distance(0,j) includes all distances in | 6627 | * Assumes node_distance(0,j) includes all distances in |
| 6364 | * node_distance(i,j) in order to avoid cubic time. | 6628 | * node_distance(i,j) in order to avoid cubic time. |
| 6365 | * | ||
| 6366 | * XXX: could be optimized to O(n log n) by using sort() | ||
| 6367 | */ | 6629 | */ |
| 6368 | next_distance = curr_distance; | 6630 | next_distance = curr_distance; |
| 6369 | for (i = 0; i < nr_node_ids; i++) { | 6631 | for (i = 0; i < nr_node_ids; i++) { |
| 6370 | for (j = 0; j < nr_node_ids; j++) { | 6632 | for (j = 0; j < nr_node_ids; j++) { |
| 6371 | int distance = node_distance(0, j); | 6633 | for (k = 0; k < nr_node_ids; k++) { |
| 6372 | if (distance > curr_distance && | 6634 | int distance = node_distance(i, k); |
| 6373 | (distance < next_distance || | 6635 | |
| 6374 | next_distance == curr_distance)) | 6636 | if (distance > curr_distance && |
| 6375 | next_distance = distance; | 6637 | (distance < next_distance || |
| 6638 | next_distance == curr_distance)) | ||
| 6639 | next_distance = distance; | ||
| 6640 | |||
| 6641 | /* | ||
| 6642 | * While not a strong assumption, it would be nice to know | ||
| 6643 | * about cases where node A is connected to B but B is not | ||
| 6644 | * equally connected to A. | ||
| 6645 | */ | ||
| 6646 | if (sched_debug() && node_distance(k, i) != distance) | ||
| 6647 | sched_numa_warn("Node-distance not symmetric"); | ||
| 6648 | |||
| 6649 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
| 6650 | sched_numa_warn("Node-0 not representative"); | ||
| 6651 | } | ||
| 6652 | if (next_distance != curr_distance) { | ||
| 6653 | sched_domains_numa_distance[level++] = next_distance; | ||
| 6654 | sched_domains_numa_levels = level; | ||
| 6655 | curr_distance = next_distance; | ||
| 6656 | } else break; | ||
| 6376 | } | 6657 | } |
| 6377 | if (next_distance != curr_distance) { | 6658 | |
| 6378 | sched_domains_numa_distance[level++] = next_distance; | 6659 | /* |
| 6379 | sched_domains_numa_levels = level; | 6660 | * In case of sched_debug() we verify the above assumption. |
| 6380 | curr_distance = next_distance; | 6661 | */ |
| 6381 | } else break; | 6662 | if (!sched_debug()) |
| 6663 | break; | ||
| 6382 | } | 6664 | } |
| 6383 | /* | 6665 | /* |
| 6384 | * 'level' contains the number of unique distances, excluding the | 6666 | * 'level' contains the number of unique distances, excluding the |
| @@ -6403,7 +6685,7 @@ static void sched_init_numa(void) | |||
| 6403 | return; | 6685 | return; |
| 6404 | 6686 | ||
| 6405 | for (j = 0; j < nr_node_ids; j++) { | 6687 | for (j = 0; j < nr_node_ids; j++) { |
| 6406 | struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); | 6688 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); |
| 6407 | if (!mask) | 6689 | if (!mask) |
| 6408 | return; | 6690 | return; |
| 6409 | 6691 | ||
| @@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6490 | 6772 | ||
| 6491 | *per_cpu_ptr(sdd->sg, j) = sg; | 6773 | *per_cpu_ptr(sdd->sg, j) = sg; |
| 6492 | 6774 | ||
| 6493 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6775 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
| 6494 | GFP_KERNEL, cpu_to_node(j)); | 6776 | GFP_KERNEL, cpu_to_node(j)); |
| 6495 | if (!sgp) | 6777 | if (!sgp) |
| 6496 | return -ENOMEM; | 6778 | return -ENOMEM; |
| @@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6543 | if (!sd) | 6825 | if (!sd) |
| 6544 | return child; | 6826 | return child; |
| 6545 | 6827 | ||
| 6546 | set_domain_attribute(sd, attr); | ||
| 6547 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6828 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
| 6548 | if (child) { | 6829 | if (child) { |
| 6549 | sd->level = child->level + 1; | 6830 | sd->level = child->level + 1; |
| @@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6551 | child->parent = sd; | 6832 | child->parent = sd; |
| 6552 | } | 6833 | } |
| 6553 | sd->child = child; | 6834 | sd->child = child; |
| 6835 | set_domain_attribute(sd, attr); | ||
| 6554 | 6836 | ||
| 6555 | return sd; | 6837 | return sd; |
| 6556 | } | 6838 | } |
| @@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
| 6691 | if (!doms_cur) | 6973 | if (!doms_cur) |
| 6692 | doms_cur = &fallback_doms; | 6974 | doms_cur = &fallback_doms; |
| 6693 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6975 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
| 6694 | dattr_cur = NULL; | ||
| 6695 | err = build_sched_domains(doms_cur[0], NULL); | 6976 | err = build_sched_domains(doms_cur[0], NULL); |
| 6696 | register_sched_domain_sysctl(); | 6977 | register_sched_domain_sysctl(); |
| 6697 | 6978 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..c099cc6eebe3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
| 2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
| 2705 | 2705 | ||
| 2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
| 2707 | return prev_cpu; | 2707 | return prev_cpu; |
| 2708 | 2708 | ||
| 2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
| @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
| 3503 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
| 3504 | { | 3504 | { |
| 3505 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
| 3506 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
| 3507 | 3507 | ||
| 3508 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3508 | /* |
| 3509 | * Since we're reading these variables without serialization make sure | ||
| 3510 | * we read them once before doing sanity checks on them. | ||
| 3511 | */ | ||
| 3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
| 3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
| 3514 | |||
| 3515 | total = sched_avg_period() + (rq->clock - age_stamp); | ||
| 3509 | 3516 | ||
| 3510 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
| 3511 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
| 3512 | available = 0; | 3519 | available = 0; |
| 3513 | } else { | 3520 | } else { |
| 3514 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
| 3515 | } | 3522 | } |
| 3516 | 3523 | ||
| 3517 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
| @@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 3574 | 3581 | ||
| 3575 | power = 0; | 3582 | power = 0; |
| 3576 | 3583 | ||
| 3577 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
| 3578 | do { | 3585 | /* |
| 3579 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
| 3580 | group = group->next; | 3587 | * span the current group. |
| 3581 | } while (group != child->groups); | 3588 | */ |
| 3582 | 3589 | ||
| 3583 | sdg->sgp->power = power; | 3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) |
| 3591 | power += power_of(cpu); | ||
| 3592 | } else { | ||
| 3593 | /* | ||
| 3594 | * !SD_OVERLAP domains can assume that child groups | ||
| 3595 | * span the current group. | ||
| 3596 | */ | ||
| 3597 | |||
| 3598 | group = child->groups; | ||
| 3599 | do { | ||
| 3600 | power += group->sgp->power; | ||
| 3601 | group = group->next; | ||
| 3602 | } while (group != child->groups); | ||
| 3603 | } | ||
| 3604 | |||
| 3605 | sdg->sgp->power_orig = sdg->sgp->power = power; | ||
| 3584 | } | 3606 | } |
| 3585 | 3607 | ||
| 3586 | /* | 3608 | /* |
| @@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 3610 | 3632 | ||
| 3611 | /** | 3633 | /** |
| 3612 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 3613 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @env: The load balancing environment. |
| 3614 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
| 3615 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
| 3616 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
| @@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 3630 | int i; | 3652 | int i; |
| 3631 | 3653 | ||
| 3632 | if (local_group) | 3654 | if (local_group) |
| 3633 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_balance_cpu(group); |
| 3634 | 3656 | ||
| 3635 | /* Tally up the load of all CPUs in the group */ | 3657 | /* Tally up the load of all CPUs in the group */ |
| 3636 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
| @@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 3645 | 3667 | ||
| 3646 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
| 3647 | if (local_group) { | 3669 | if (local_group) { |
| 3648 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu && |
| 3671 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
| 3649 | first_idle_cpu = 1; | 3672 | first_idle_cpu = 1; |
| 3650 | balance_cpu = i; | 3673 | balance_cpu = i; |
| 3651 | } | 3674 | } |
| @@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 3719 | 3742 | ||
| 3720 | /** | 3743 | /** |
| 3721 | * update_sd_pick_busiest - return 1 on busiest group | 3744 | * update_sd_pick_busiest - return 1 on busiest group |
| 3722 | * @sd: sched_domain whose statistics are to be checked | 3745 | * @env: The load balancing environment. |
| 3723 | * @sds: sched_domain statistics | 3746 | * @sds: sched_domain statistics |
| 3724 | * @sg: sched_group candidate to be checked for being the busiest | 3747 | * @sg: sched_group candidate to be checked for being the busiest |
| 3725 | * @sgs: sched_group statistics | 3748 | * @sgs: sched_group statistics |
| 3726 | * @this_cpu: the current cpu | ||
| 3727 | * | 3749 | * |
| 3728 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
| 3729 | * busiest group. | 3751 | * busiest group. |
| @@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 3761 | 3783 | ||
| 3762 | /** | 3784 | /** |
| 3763 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3785 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
| 3764 | * @sd: sched_domain whose statistics are to be updated. | 3786 | * @env: The load balancing environment. |
| 3765 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 3766 | * @idle: Idle status of this_cpu | ||
| 3767 | * @cpus: Set of cpus considered for load balancing. | 3787 | * @cpus: Set of cpus considered for load balancing. |
| 3768 | * @balance: Should we balance. | 3788 | * @balance: Should we balance. |
| 3769 | * @sds: variable to hold the statistics for this sched_domain. | 3789 | * @sds: variable to hold the statistics for this sched_domain. |
| @@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 3852 | * Returns 1 when packing is required and a task should be moved to | 3872 | * Returns 1 when packing is required and a task should be moved to |
| 3853 | * this CPU. The amount of the imbalance is returned in *imbalance. | 3873 | * this CPU. The amount of the imbalance is returned in *imbalance. |
| 3854 | * | 3874 | * |
| 3855 | * @sd: The sched_domain whose packing is to be checked. | 3875 | * @env: The load balancing environment. |
| 3856 | * @sds: Statistics of the sched_domain which is to be packed | 3876 | * @sds: Statistics of the sched_domain which is to be packed |
| 3857 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 3858 | * @imbalance: returns amount of imbalanced due to packing. | ||
| 3859 | */ | 3877 | */ |
| 3860 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | 3878 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
| 3861 | { | 3879 | { |
| @@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 3881 | * fix_small_imbalance - Calculate the minor imbalance that exists | 3899 | * fix_small_imbalance - Calculate the minor imbalance that exists |
| 3882 | * amongst the groups of a sched_domain, during | 3900 | * amongst the groups of a sched_domain, during |
| 3883 | * load balancing. | 3901 | * load balancing. |
| 3902 | * @env: The load balancing environment. | ||
| 3884 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 3903 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. |
| 3885 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 3886 | * @imbalance: Variable to store the imbalance. | ||
| 3887 | */ | 3904 | */ |
| 3888 | static inline | 3905 | static inline |
| 3889 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 3906 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
| @@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4026 | * Also calculates the amount of weighted load which should be moved | 4043 | * Also calculates the amount of weighted load which should be moved |
| 4027 | * to restore balance. | 4044 | * to restore balance. |
| 4028 | * | 4045 | * |
| 4029 | * @sd: The sched_domain whose busiest group is to be returned. | 4046 | * @env: The load balancing environment. |
| 4030 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
| 4031 | * @imbalance: Variable which stores amount of weighted load which should | ||
| 4032 | * be moved to restore balance/put a group to idle. | ||
| 4033 | * @idle: The idle status of this_cpu. | ||
| 4034 | * @cpus: The set of CPUs under consideration for load-balancing. | 4047 | * @cpus: The set of CPUs under consideration for load-balancing. |
| 4035 | * @balance: Pointer to a variable indicating if this_cpu | 4048 | * @balance: Pointer to a variable indicating if this_cpu |
| 4036 | * is the appropriate cpu to perform load balancing at this_level. | 4049 | * is the appropriate cpu to perform load balancing at this_level. |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
| 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
| 26 | { | 26 | { |
| 27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
| 28 | calc_load_account_idle(rq); | ||
| 29 | return rq->idle; | 28 | return rq->idle; |
| 30 | } | 29 | } |
| 31 | 30 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
| 274 | 274 | ||
| 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
| 276 | { | 276 | { |
| 277 | struct task_struct *p; | ||
| 278 | |||
| 277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
| 278 | return; | 280 | return; |
| 279 | 281 | ||
| 282 | p = rt_task_of(rt_se); | ||
| 280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
| 281 | 284 | ||
| 282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
| 283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
| 284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
| 285 | 288 | ||
| 286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
| @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 288 | 291 | ||
| 289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
| 290 | { | 293 | { |
| 294 | struct task_struct *p; | ||
| 295 | |||
| 291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
| 292 | return; | 297 | return; |
| 293 | 298 | ||
| 299 | p = rt_task_of(rt_se); | ||
| 294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
| 295 | 301 | ||
| 296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
| 297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
| 298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
| 299 | 305 | ||
| 300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
| @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 1161 | 1167 | ||
| 1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
| 1163 | 1169 | ||
| 1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
| 1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
| 1166 | 1172 | ||
| 1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
| @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1225 | 1231 | ||
| 1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
| 1227 | 1233 | ||
| 1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
| 1229 | goto out; | 1235 | goto out; |
| 1230 | 1236 | ||
| 1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
| @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
| 1261 | */ | 1267 | */ |
| 1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
| 1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
| 1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
| 1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
| 1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
| 1267 | 1273 | ||
| 1268 | if (target != -1) | 1274 | if (target != -1) |
| @@ -1276,10 +1282,10 @@ out: | |||
| 1276 | 1282 | ||
| 1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
| 1278 | { | 1284 | { |
| 1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
| 1280 | return; | 1286 | return; |
| 1281 | 1287 | ||
| 1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
| 1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
| 1284 | return; | 1290 | return; |
| 1285 | 1291 | ||
| @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
| 1396 | * if it is still active | 1402 | * if it is still active |
| 1397 | */ | 1403 | */ |
| 1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
| 1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
| 1400 | } | 1406 | } |
| 1401 | 1407 | ||
| @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
| 1408 | { | 1414 | { |
| 1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
| 1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
| 1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
| 1412 | return 1; | 1418 | return 1; |
| 1413 | return 0; | 1419 | return 0; |
| 1414 | } | 1420 | } |
| @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
| 1465 | return -1; | 1471 | return -1; |
| 1466 | 1472 | ||
| 1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
| 1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
| 1469 | 1475 | ||
| 1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
| @@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1556 | task_running(rq, task) || | 1562 | task_running(rq, task) || |
| 1557 | !task->on_rq)) { | 1563 | !task->on_rq)) { |
| 1558 | 1564 | ||
| 1559 | raw_spin_unlock(&lowest_rq->lock); | 1565 | double_unlock_balance(rq, lowest_rq); |
| 1560 | lowest_rq = NULL; | 1566 | lowest_rq = NULL; |
| 1561 | break; | 1567 | break; |
| 1562 | } | 1568 | } |
| @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
| 1586 | 1592 | ||
| 1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
| 1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
| 1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1590 | 1596 | ||
| 1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
| 1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
| @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
| 1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
| 1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
| 1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
| 1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
| 1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
| 1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
| 1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
| 1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
| 1801 | } | 1807 | } |
| @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1817 | * Only update if the process changes its state from whether it | 1823 | * Only update if the process changes its state from whether it |
| 1818 | * can migrate or not. | 1824 | * can migrate or not. |
| 1819 | */ | 1825 | */ |
| 1820 | if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
| 1821 | return; | 1827 | return; |
| 1822 | 1828 | ||
| 1823 | rq = task_rq(p); | 1829 | rq = task_rq(p); |
| @@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
| 1979 | 1985 | ||
| 1980 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
| 1981 | { | 1987 | { |
| 1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
| 1989 | |||
| 1982 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
| 1983 | 1991 | ||
| 1984 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
| @@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 1996 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
| 1997 | 2005 | ||
| 1998 | /* | 2006 | /* |
| 1999 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (and all of our ancestors) are the |
| 2000 | * on the queue: | 2008 | * only element on the queue |
| 2001 | */ | 2009 | */ |
| 2002 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
| 2003 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
| 2004 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
| 2013 | set_tsk_need_resched(p); | ||
| 2014 | return; | ||
| 2015 | } | ||
| 2005 | } | 2016 | } |
| 2006 | } | 2017 | } |
| 2007 | 2018 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
| 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
| 527 | DECLARE_PER_CPU(int, sd_llc_id); | 527 | DECLARE_PER_CPU(int, sd_llc_id); |
| 528 | 528 | ||
| 529 | extern int group_balance_cpu(struct sched_group *sg); | ||
| 530 | |||
| 529 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
| 530 | 532 | ||
| 531 | #include "stats.h" | 533 | #include "stats.h" |
| @@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void) | |||
| 940 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
| 941 | } | 943 | } |
| 942 | 944 | ||
| 943 | void calc_load_account_idle(struct rq *this_rq); | ||
| 944 | |||
| 945 | #ifdef CONFIG_SCHED_HRTICK | 945 | #ifdef CONFIG_SCHED_HRTICK |
| 946 | 946 | ||
| 947 | /* | 947 | /* |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) | |||
| 31 | per_cpu(idle_threads, smp_processor_id()) = current; | 31 | per_cpu(idle_threads, smp_processor_id()) = current; |
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | /** | ||
| 35 | * idle_init - Initialize the idle thread for a cpu | ||
| 36 | * @cpu: The cpu for which the idle thread should be initialized | ||
| 37 | * | ||
| 38 | * Creates the thread if it does not exist. | ||
| 39 | */ | ||
| 34 | static inline void idle_init(unsigned int cpu) | 40 | static inline void idle_init(unsigned int cpu) |
| 35 | { | 41 | { |
| 36 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | 42 | struct task_struct *tsk = per_cpu(idle_threads, cpu); |
| @@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu) | |||
| 45 | } | 51 | } |
| 46 | 52 | ||
| 47 | /** | 53 | /** |
| 48 | * idle_thread_init - Initialize the idle thread for a cpu | 54 | * idle_threads_init - Initialize idle threads for all cpus |
| 49 | * @cpu: The cpu for which the idle thread should be initialized | ||
| 50 | * | ||
| 51 | * Creates the thread if it does not exist. | ||
| 52 | */ | 55 | */ |
| 53 | void __init idle_threads_init(void) | 56 | void __init idle_threads_init(void) |
| 54 | { | 57 | { |
| 55 | unsigned int cpu; | 58 | unsigned int cpu, boot_cpu; |
| 59 | |||
| 60 | boot_cpu = smp_processor_id(); | ||
| 56 | 61 | ||
| 57 | for_each_possible_cpu(cpu) { | 62 | for_each_possible_cpu(cpu) { |
| 58 | if (cpu != smp_processor_id()) | 63 | if (cpu != boot_cpu) |
| 59 | idle_init(cpu); | 64 | idle_init(cpu); |
| 60 | } | 65 | } |
| 61 | } | 66 | } |
diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657a..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -1786,27 +1786,12 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
| 1786 | } | 1786 | } |
| 1787 | 1787 | ||
| 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
| 1789 | static bool vma_flags_mismatch(struct vm_area_struct *vma, | ||
| 1790 | unsigned long required, | ||
| 1791 | unsigned long banned) | ||
| 1792 | { | ||
| 1793 | return (vma->vm_flags & required) != required || | ||
| 1794 | (vma->vm_flags & banned); | ||
| 1795 | } | ||
| 1796 | |||
| 1797 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
| 1798 | { | 1790 | { |
| 1799 | struct file *exe_file; | 1791 | struct file *exe_file; |
| 1800 | struct dentry *dentry; | 1792 | struct dentry *dentry; |
| 1801 | int err; | 1793 | int err; |
| 1802 | 1794 | ||
| 1803 | /* | ||
| 1804 | * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's | ||
| 1805 | * remain. So perform a quick test first. | ||
| 1806 | */ | ||
| 1807 | if (mm->num_exe_file_vmas) | ||
| 1808 | return -EBUSY; | ||
| 1809 | |||
| 1810 | exe_file = fget(fd); | 1795 | exe_file = fget(fd); |
| 1811 | if (!exe_file) | 1796 | if (!exe_file) |
| 1812 | return -EBADF; | 1797 | return -EBADF; |
| @@ -1827,17 +1812,35 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1827 | if (err) | 1812 | if (err) |
| 1828 | goto exit; | 1813 | goto exit; |
| 1829 | 1814 | ||
| 1815 | down_write(&mm->mmap_sem); | ||
| 1816 | |||
| 1817 | /* | ||
| 1818 | * Forbid mm->exe_file change if old file still mapped. | ||
| 1819 | */ | ||
| 1820 | err = -EBUSY; | ||
| 1821 | if (mm->exe_file) { | ||
| 1822 | struct vm_area_struct *vma; | ||
| 1823 | |||
| 1824 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
| 1825 | if (vma->vm_file && | ||
| 1826 | path_equal(&vma->vm_file->f_path, | ||
| 1827 | &mm->exe_file->f_path)) | ||
| 1828 | goto exit_unlock; | ||
| 1829 | } | ||
| 1830 | |||
| 1830 | /* | 1831 | /* |
| 1831 | * The symlink can be changed only once, just to disallow arbitrary | 1832 | * The symlink can be changed only once, just to disallow arbitrary |
| 1832 | * transitions malicious software might bring in. This means one | 1833 | * transitions malicious software might bring in. This means one |
| 1833 | * could make a snapshot over all processes running and monitor | 1834 | * could make a snapshot over all processes running and monitor |
| 1834 | * /proc/pid/exe changes to notice unusual activity if needed. | 1835 | * /proc/pid/exe changes to notice unusual activity if needed. |
| 1835 | */ | 1836 | */ |
| 1836 | down_write(&mm->mmap_sem); | 1837 | err = -EPERM; |
| 1837 | if (likely(!mm->exe_file)) | 1838 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
| 1838 | set_mm_exe_file(mm, exe_file); | 1839 | goto exit_unlock; |
| 1839 | else | 1840 | |
| 1840 | err = -EBUSY; | 1841 | err = 0; |
| 1842 | set_mm_exe_file(mm, exe_file); | ||
| 1843 | exit_unlock: | ||
| 1841 | up_write(&mm->mmap_sem); | 1844 | up_write(&mm->mmap_sem); |
| 1842 | 1845 | ||
| 1843 | exit: | 1846 | exit: |
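A minimal userspace sketch of the behaviour enforced above (hypothetical program; assumes CONFIG_CHECKPOINT_RESTORE, CAP_SYS_RESOURCE, and a <linux/prctl.h> providing PR_SET_MM_EXE_FILE): the call now fails with EBUSY while any VMA still maps the old executable, and with EPERM on a second attempt because MMF_EXE_FILE_CHANGED is a one-shot flag.

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>    /* PR_SET_MM, PR_SET_MM_EXE_FILE */

    int main(void)
    {
        /* any readable file can serve as the new /proc/self/exe target */
        int fd = open("/tmp/new-exe", O_RDONLY);    /* hypothetical path */

        if (fd < 0 || prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0, 0) < 0)
            perror("PR_SET_MM_EXE_FILE");    /* EBUSY, EPERM or EINVAL */
        return 0;
    }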
| @@ -1862,7 +1865,7 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 1862 | if (opt == PR_SET_MM_EXE_FILE) | 1865 | if (opt == PR_SET_MM_EXE_FILE) |
| 1863 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | 1866 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); |
| 1864 | 1867 | ||
| 1865 | if (addr >= TASK_SIZE) | 1868 | if (addr >= TASK_SIZE || addr < mmap_min_addr) |
| 1866 | return -EINVAL; | 1869 | return -EINVAL; |
| 1867 | 1870 | ||
| 1868 | error = -EINVAL; | 1871 | error = -EINVAL; |
| @@ -1924,12 +1927,6 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 1924 | error = -EFAULT; | 1927 | error = -EFAULT; |
| 1925 | goto out; | 1928 | goto out; |
| 1926 | } | 1929 | } |
| 1927 | #ifdef CONFIG_STACK_GROWSUP | ||
| 1928 | if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) | ||
| 1929 | #else | ||
| 1930 | if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) | ||
| 1931 | #endif | ||
| 1932 | goto out; | ||
| 1933 | if (opt == PR_SET_MM_START_STACK) | 1930 | if (opt == PR_SET_MM_START_STACK) |
| 1934 | mm->start_stack = addr; | 1931 | mm->start_stack = addr; |
| 1935 | else if (opt == PR_SET_MM_ARG_START) | 1932 | else if (opt == PR_SET_MM_ARG_START) |
| @@ -1981,12 +1978,22 @@ out: | |||
| 1981 | up_read(&mm->mmap_sem); | 1978 | up_read(&mm->mmap_sem); |
| 1982 | return error; | 1979 | return error; |
| 1983 | } | 1980 | } |
| 1981 | |||
| 1982 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
| 1983 | { | ||
| 1984 | return put_user(me->clear_child_tid, tid_addr); | ||
| 1985 | } | ||
| 1986 | |||
| 1984 | #else /* CONFIG_CHECKPOINT_RESTORE */ | 1987 | #else /* CONFIG_CHECKPOINT_RESTORE */ |
| 1985 | static int prctl_set_mm(int opt, unsigned long addr, | 1988 | static int prctl_set_mm(int opt, unsigned long addr, |
| 1986 | unsigned long arg4, unsigned long arg5) | 1989 | unsigned long arg4, unsigned long arg5) |
| 1987 | { | 1990 | { |
| 1988 | return -EINVAL; | 1991 | return -EINVAL; |
| 1989 | } | 1992 | } |
| 1993 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
| 1994 | { | ||
| 1995 | return -EINVAL; | ||
| 1996 | } | ||
| 1990 | #endif | 1997 | #endif |
| 1991 | 1998 | ||
| 1992 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1999 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
| @@ -2141,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2141 | case PR_SET_MM: | 2148 | case PR_SET_MM: |
| 2142 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 2149 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
| 2143 | break; | 2150 | break; |
| 2151 | case PR_GET_TID_ADDRESS: | ||
| 2152 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
| 2153 | break; | ||
| 2144 | case PR_SET_CHILD_SUBREAPER: | 2154 | case PR_SET_CHILD_SUBREAPER: |
| 2145 | me->signal->is_child_subreaper = !!arg2; | 2155 | me->signal->is_child_subreaper = !!arg2; |
| 2146 | error = 0; | 2156 | error = 0; |
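The new PR_GET_TID_ADDRESS option is the read-only counterpart of set_tid_address(2); a hedged usage sketch follows (the fallback define carries the value this series adds to <linux/prctl.h>):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_GET_TID_ADDRESS
    #define PR_GET_TID_ADDRESS 40    /* from <linux/prctl.h> in this series */
    #endif

    int main(void)
    {
        int *tid_addr = NULL;

        /* Copies current->clear_child_tid into tid_addr; fails with EINVAL
         * on kernels built without CONFIG_CHECKPOINT_RESTORE. */
        if (prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0) == 0)
            printf("clear_child_tid = %p\n", (void *)tid_addr);
        return 0;
    }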
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
| 297 | } | 297 | } |
| 298 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 298 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
| 299 | 299 | ||
| 300 | static void clockevents_config(struct clock_event_device *dev, | 300 | void clockevents_config(struct clock_event_device *dev, u32 freq) |
| 301 | u32 freq) | ||
| 302 | { | 301 | { |
| 303 | u64 sec; | 302 | u64 sec; |
| 304 | 303 | ||
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) | |||
| 409 | time_state = TIME_DEL; | 409 | time_state = TIME_DEL; |
| 410 | break; | 410 | break; |
| 411 | case TIME_INS: | 411 | case TIME_INS: |
| 412 | if (secs % 86400 == 0) { | 412 | if (!(time_status & STA_INS)) |
| 413 | time_state = TIME_OK; | ||
| 414 | else if (secs % 86400 == 0) { | ||
| 413 | leap = -1; | 415 | leap = -1; |
| 414 | time_state = TIME_OOP; | 416 | time_state = TIME_OOP; |
| 415 | time_tai++; | 417 | time_tai++; |
| @@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) | |||
| 418 | } | 420 | } |
| 419 | break; | 421 | break; |
| 420 | case TIME_DEL: | 422 | case TIME_DEL: |
| 421 | if ((secs + 1) % 86400 == 0) { | 423 | if (!(time_status & STA_DEL)) |
| 424 | time_state = TIME_OK; | ||
| 425 | else if ((secs + 1) % 86400 == 0) { | ||
| 422 | leap = 1; | 426 | leap = 1; |
| 423 | time_tai--; | 427 | time_tai--; |
| 424 | time_state = TIME_WAIT; | 428 | time_state = TIME_WAIT; |
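With these checks a pending leap second is honoured only while user space keeps STA_INS (or STA_DEL) set, so clearing the bit before the end of the day now cancels the leap instead of leaving the state machine armed. A hedged sketch of how a time daemon arms an insertion via adjtimex(2) (requires CAP_SYS_TIME):

    #include <sys/timex.h>

    /* Arm a leap-second insertion for the end of the current UTC day. */
    static int arm_leap_insert(void)
    {
        struct timex tx = { 0 };

        tx.modes  = ADJ_STATUS;    /* change only the status word */
        tx.status = STA_INS;       /* another ADJ_STATUS call without STA_INS cancels it */
        return adjtimex(&tx);      /* return value is the clock state, e.g. TIME_INS */
    }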
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | |||
| 274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | 274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
| 275 | { | 275 | { |
| 276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
| 277 | unsigned long rcu_delta_jiffies; | ||
| 277 | ktime_t last_update, expires, now; | 278 | ktime_t last_update, expires, now; |
| 278 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 279 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
| 279 | u64 time_delta; | 280 | u64 time_delta; |
| @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
| 322 | time_delta = timekeeping_max_deferment(); | 323 | time_delta = timekeeping_max_deferment(); |
| 323 | } while (read_seqretry(&xtime_lock, seq)); | 324 | } while (read_seqretry(&xtime_lock, seq)); |
| 324 | 325 | ||
| 325 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 326 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
| 326 | arch_needs_cpu(cpu)) { | 327 | arch_needs_cpu(cpu)) { |
| 327 | next_jiffies = last_jiffies + 1; | 328 | next_jiffies = last_jiffies + 1; |
| 328 | delta_jiffies = 1; | 329 | delta_jiffies = 1; |
| @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
| 330 | /* Get the next timer wheel timer */ | 331 | /* Get the next timer wheel timer */ |
| 331 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 332 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
| 332 | delta_jiffies = next_jiffies - last_jiffies; | 333 | delta_jiffies = next_jiffies - last_jiffies; |
| 334 | if (rcu_delta_jiffies < delta_jiffies) { | ||
| 335 | next_jiffies = last_jiffies + rcu_delta_jiffies; | ||
| 336 | delta_jiffies = rcu_delta_jiffies; | ||
| 337 | } | ||
| 333 | } | 338 | } |
| 334 | /* | 339 | /* |
| 335 | * Do not stop the tick, if we are only one off | 340 | * Do not stop the tick, if we are only one off |
| @@ -401,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
| 401 | */ | 406 | */ |
| 402 | if (!ts->tick_stopped) { | 407 | if (!ts->tick_stopped) { |
| 403 | select_nohz_load_balancer(1); | 408 | select_nohz_load_balancer(1); |
| 409 | calc_load_enter_idle(); | ||
| 404 | 410 | ||
| 405 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 411 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
| 406 | ts->tick_stopped = 1; | 412 | ts->tick_stopped = 1; |
| @@ -576,6 +582,7 @@ void tick_nohz_idle_exit(void) | |||
| 576 | /* Update jiffies first */ | 582 | /* Update jiffies first */ |
| 577 | select_nohz_load_balancer(0); | 583 | select_nohz_load_balancer(0); |
| 578 | tick_do_update_jiffies64(now); | 584 | tick_do_update_jiffies64(now); |
| 585 | update_cpu_load_nohz(); | ||
| 579 | 586 | ||
| 580 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 587 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
| 581 | /* | 588 | /* |
| @@ -591,6 +598,7 @@ void tick_nohz_idle_exit(void) | |||
| 591 | account_idle_ticks(ticks); | 598 | account_idle_ticks(ticks); |
| 592 | #endif | 599 | #endif |
| 593 | 600 | ||
| 601 | calc_load_exit_idle(); | ||
| 594 | touch_softlockup_watchdog(); | 602 | touch_softlockup_watchdog(); |
| 595 | /* | 603 | /* |
| 596 | * Cancel the scheduled timer and restore the tick | 604 | * Cancel the scheduled timer and restore the tick |
| @@ -814,6 +822,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 814 | return HRTIMER_RESTART; | 822 | return HRTIMER_RESTART; |
| 815 | } | 823 | } |
| 816 | 824 | ||
| 825 | static int sched_skew_tick; | ||
| 826 | |||
| 827 | static int __init skew_tick(char *str) | ||
| 828 | { | ||
| 829 | get_option(&str, &sched_skew_tick); | ||
| 830 | |||
| 831 | return 0; | ||
| 832 | } | ||
| 833 | early_param("skew_tick", skew_tick); | ||
| 834 | |||
| 817 | /** | 835 | /** |
| 818 | * tick_setup_sched_timer - setup the tick emulation timer | 836 | * tick_setup_sched_timer - setup the tick emulation timer |
| 819 | */ | 837 | */ |
| @@ -831,6 +849,14 @@ void tick_setup_sched_timer(void) | |||
| 831 | /* Get the next period (per cpu) */ | 849 | /* Get the next period (per cpu) */ |
| 832 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 850 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
| 833 | 851 | ||
| 852 | /* Offset the tick to avert xtime_lock contention. */ | ||
| 853 | if (sched_skew_tick) { | ||
| 854 | u64 offset = ktime_to_ns(tick_period) >> 1; | ||
| 855 | do_div(offset, num_possible_cpus()); | ||
| 856 | offset *= smp_processor_id(); | ||
| 857 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
| 858 | } | ||
| 859 | |||
| 834 | for (;;) { | 860 | for (;;) { |
| 835 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 861 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
| 836 | hrtimer_start_expires(&ts->sched_timer, | 862 | hrtimer_start_expires(&ts->sched_timer, |
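To make the skew_tick offset concrete, a worked example with illustrative values (HZ=250, so tick_period = 4,000,000 ns, and 8 possible CPUs):

    offset_step = (tick_period >> 1) / num_possible_cpus()
                = 2,000,000 ns / 8 = 250,000 ns
    CPU 0: +0, CPU 1: +250 us, ..., CPU 7: +1.75 ms

Booting with skew_tick=1 therefore spreads the per-CPU ticks across the first half of the tick period instead of letting every CPU contend for xtime_lock at the same instant.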
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -70,6 +70,12 @@ struct timekeeper { | |||
| 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ |
| 71 | struct timespec raw_time; | 71 | struct timespec raw_time; |
| 72 | 72 | ||
| 73 | /* Offset clock monotonic -> clock realtime */ | ||
| 74 | ktime_t offs_real; | ||
| 75 | |||
| 76 | /* Offset clock monotonic -> clock boottime */ | ||
| 77 | ktime_t offs_boot; | ||
| 78 | |||
| 73 | /* Seqlock for all timekeeper values */ | 79 | /* Seqlock for all timekeeper values */ |
| 74 | seqlock_t lock; | 80 | seqlock_t lock; |
| 75 | }; | 81 | }; |
| @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
| 172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 178 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
| 173 | } | 179 | } |
| 174 | 180 | ||
| 181 | static void update_rt_offset(void) | ||
| 182 | { | ||
| 183 | struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; | ||
| 184 | |||
| 185 | set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); | ||
| 186 | timekeeper.offs_real = timespec_to_ktime(tmp); | ||
| 187 | } | ||
| 188 | |||
| 175 | /* must hold write on timekeeper.lock */ | 189 | /* must hold write on timekeeper.lock */ |
| 176 | static void timekeeping_update(bool clearntp) | 190 | static void timekeeping_update(bool clearntp) |
| 177 | { | 191 | { |
| @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) | |||
| 179 | timekeeper.ntp_error = 0; | 193 | timekeeper.ntp_error = 0; |
| 180 | ntp_clear(); | 194 | ntp_clear(); |
| 181 | } | 195 | } |
| 196 | update_rt_offset(); | ||
| 182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | 197 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, |
| 183 | timekeeper.clock, timekeeper.mult); | 198 | timekeeper.clock, timekeeper.mult); |
| 184 | } | 199 | } |
| @@ -604,6 +619,7 @@ void __init timekeeping_init(void) | |||
| 604 | } | 619 | } |
| 605 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 620 | set_normalized_timespec(&timekeeper.wall_to_monotonic, |
| 606 | -boot.tv_sec, -boot.tv_nsec); | 621 | -boot.tv_sec, -boot.tv_nsec); |
| 622 | update_rt_offset(); | ||
| 607 | timekeeper.total_sleep_time.tv_sec = 0; | 623 | timekeeper.total_sleep_time.tv_sec = 0; |
| 608 | timekeeper.total_sleep_time.tv_nsec = 0; | 624 | timekeeper.total_sleep_time.tv_nsec = 0; |
| 609 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 625 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| @@ -612,6 +628,12 @@ void __init timekeeping_init(void) | |||
| 612 | /* time in seconds when suspend began */ | 628 | /* time in seconds when suspend began */ |
| 613 | static struct timespec timekeeping_suspend_time; | 629 | static struct timespec timekeeping_suspend_time; |
| 614 | 630 | ||
| 631 | static void update_sleep_time(struct timespec t) | ||
| 632 | { | ||
| 633 | timekeeper.total_sleep_time = t; | ||
| 634 | timekeeper.offs_boot = timespec_to_ktime(t); | ||
| 635 | } | ||
| 636 | |||
| 615 | /** | 637 | /** |
| 616 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 638 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
| 617 | * @delta: pointer to a timespec delta value | 639 | * @delta: pointer to a timespec delta value |
| @@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
| 630 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); | 652 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); |
| 631 | timekeeper.wall_to_monotonic = | 653 | timekeeper.wall_to_monotonic = |
| 632 | timespec_sub(timekeeper.wall_to_monotonic, *delta); | 654 | timespec_sub(timekeeper.wall_to_monotonic, *delta); |
| 633 | timekeeper.total_sleep_time = timespec_add( | 655 | update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); |
| 634 | timekeeper.total_sleep_time, *delta); | ||
| 635 | } | 656 | } |
| 636 | 657 | ||
| 637 | 658 | ||
| @@ -696,6 +717,7 @@ static void timekeeping_resume(void) | |||
| 696 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 717 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
| 697 | timekeeper.ntp_error = 0; | 718 | timekeeper.ntp_error = 0; |
| 698 | timekeeping_suspended = 0; | 719 | timekeeping_suspended = 0; |
| 720 | timekeeping_update(false); | ||
| 699 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 721 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| 700 | 722 | ||
| 701 | touch_softlockup_watchdog(); | 723 | touch_softlockup_watchdog(); |
| @@ -962,6 +984,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
| 962 | timekeeper.xtime.tv_sec++; | 984 | timekeeper.xtime.tv_sec++; |
| 963 | leap = second_overflow(timekeeper.xtime.tv_sec); | 985 | leap = second_overflow(timekeeper.xtime.tv_sec); |
| 964 | timekeeper.xtime.tv_sec += leap; | 986 | timekeeper.xtime.tv_sec += leap; |
| 987 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
| 988 | if (leap) | ||
| 989 | clock_was_set_delayed(); | ||
| 965 | } | 990 | } |
| 966 | 991 | ||
| 967 | /* Accumulate raw time */ | 992 | /* Accumulate raw time */ |
| @@ -1077,6 +1102,9 @@ static void update_wall_time(void) | |||
| 1077 | timekeeper.xtime.tv_sec++; | 1102 | timekeeper.xtime.tv_sec++; |
| 1078 | leap = second_overflow(timekeeper.xtime.tv_sec); | 1103 | leap = second_overflow(timekeeper.xtime.tv_sec); |
| 1079 | timekeeper.xtime.tv_sec += leap; | 1104 | timekeeper.xtime.tv_sec += leap; |
| 1105 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
| 1106 | if (leap) | ||
| 1107 | clock_was_set_delayed(); | ||
| 1080 | } | 1108 | } |
| 1081 | 1109 | ||
| 1082 | timekeeping_update(false); | 1110 | timekeeping_update(false); |
| @@ -1244,6 +1272,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
| 1244 | } while (read_seqretry(&timekeeper.lock, seq)); | 1272 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 1245 | } | 1273 | } |
| 1246 | 1274 | ||
| 1275 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
| 1276 | /** | ||
| 1277 | * ktime_get_update_offsets - hrtimer helper | ||
| 1278 | * @offs_real: pointer to storage for monotonic -> realtime offset | ||
| 1279 | * @offs_boot: pointer to storage for monotonic -> boottime offset | ||
| 1280 | * | ||
| 1281 | * Returns current monotonic time and updates the offsets | ||
| 1282 | * Called from hrtimer_interrupt() or retrigger_next_event() | ||
| 1283 | */ | ||
| 1284 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | ||
| 1285 | { | ||
| 1286 | ktime_t now; | ||
| 1287 | unsigned int seq; | ||
| 1288 | u64 secs, nsecs; | ||
| 1289 | |||
| 1290 | do { | ||
| 1291 | seq = read_seqbegin(&timekeeper.lock); | ||
| 1292 | |||
| 1293 | secs = timekeeper.xtime.tv_sec; | ||
| 1294 | nsecs = timekeeper.xtime.tv_nsec; | ||
| 1295 | nsecs += timekeeping_get_ns(); | ||
| 1296 | /* If arch requires, add in gettimeoffset() */ | ||
| 1297 | nsecs += arch_gettimeoffset(); | ||
| 1298 | |||
| 1299 | *offs_real = timekeeper.offs_real; | ||
| 1300 | *offs_boot = timekeeper.offs_boot; | ||
| 1301 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
| 1302 | |||
| 1303 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
| 1304 | now = ktime_sub(now, *offs_real); | ||
| 1305 | return now; | ||
| 1306 | } | ||
| 1307 | #endif | ||
| 1308 | |||
| 1247 | /** | 1309 | /** |
| 1248 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | 1310 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format |
| 1249 | */ | 1311 | */ |
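A hedged sketch (not part of the patch) of the intended consumer of the new helper: the hrtimer interrupt path can take one seqlock-consistent snapshot and derive its CLOCK_REALTIME and CLOCK_BOOTTIME bases by addition, instead of re-reading the timekeeper once per clock:

    ktime_t offs_real, offs_boot;

    /* monotonic "now" plus both offsets from a single consistent read */
    ktime_t base_mono = ktime_get_update_offsets(&offs_real, &offs_boot);
    ktime_t base_real = ktime_add(base_mono, offs_real);    /* CLOCK_REALTIME */
    ktime_t base_boot = ktime_add(base_mono, offs_boot);    /* CLOCK_BOOTTIME */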
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
| 1075 | rb_init_page(bpage->page); | 1075 | rb_init_page(bpage->page); |
| 1076 | 1076 | ||
| 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
| 1078 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
| 1078 | 1079 | ||
| 1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); | 1080 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
| 1080 | if (ret < 0) | 1081 | if (ret < 0) |
| @@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | |||
| 1346 | * If something was added to this page, it was full | 1347 | * If something was added to this page, it was full |
| 1347 | * since it is not the tail page. So we deduct the | 1348 | * since it is not the tail page. So we deduct the |
| 1348 | * bytes consumed in ring buffer from here. | 1349 | * bytes consumed in ring buffer from here. |
| 1349 | * No need to update overruns, since this page is | 1350 | * Increment overrun to account for the lost events. |
| 1350 | * deleted from ring buffer and its entries are | ||
| 1351 | * already accounted for. | ||
| 1352 | */ | 1351 | */ |
| 1352 | local_add(page_entries, &cpu_buffer->overrun); | ||
| 1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | 1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); |
| 1354 | } | 1354 | } |
| 1355 | 1355 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 288488082224..a7fa0702be1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
| 371 | void tracing_off(void) | 371 | void tracing_off(void) |
| 372 | { | 372 | { |
| 373 | if (global_trace.buffer) | 373 | if (global_trace.buffer) |
| 374 | ring_buffer_record_on(global_trace.buffer); | 374 | ring_buffer_record_off(global_trace.buffer); |
| 375 | /* | 375 | /* |
| 376 | * This flag is only looked at when buffers haven't been | 376 | * This flag is only looked at when buffers haven't been |
| 377 | * allocated yet. We don't really care about the race | 377 | * allocated yet. We don't really care about the race |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -372,6 +372,13 @@ static int watchdog(void *unused) | |||
| 372 | 372 | ||
| 373 | 373 | ||
| 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 375 | /* | ||
| 376 | * People like the simple clean cpu node info on boot. | ||
| 377 | * Reduce the watchdog noise by only printing messages | ||
| 378 | * that are different from what cpu0 displayed. | ||
| 379 | */ | ||
| 380 | static unsigned long cpu0_err; | ||
| 381 | |||
| 375 | static int watchdog_nmi_enable(int cpu) | 382 | static int watchdog_nmi_enable(int cpu) |
| 376 | { | 383 | { |
| 377 | struct perf_event_attr *wd_attr; | 384 | struct perf_event_attr *wd_attr; |
| @@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) | |||
| 390 | 397 | ||
| 391 | /* Try to register using hardware perf events */ | 398 | /* Try to register using hardware perf events */ |
| 392 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 399 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
| 400 | |||
| 401 | /* save cpu0 error for future comparison */ | ||
| 402 | if (cpu == 0 && IS_ERR(event)) | ||
| 403 | cpu0_err = PTR_ERR(event); | ||
| 404 | |||
| 393 | if (!IS_ERR(event)) { | 405 | if (!IS_ERR(event)) { |
| 394 | pr_info("enabled, takes one hw-pmu counter.\n"); | 406 | /* only print for cpu0 or when different from cpu0 */ |
| 407 | if (cpu == 0 || cpu0_err) | ||
| 408 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
| 395 | goto out_save; | 409 | goto out_save; |
| 396 | } | 410 | } |
| 397 | 411 | ||
| 412 | /* skip displaying the same error again */ | ||
| 413 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
| 414 | return PTR_ERR(event); | ||
| 398 | 415 | ||
| 399 | /* vary the KERN level based on the returned errno */ | 416 | /* vary the KERN level based on the returned errno */ |
| 400 | if (PTR_ERR(event) == -EOPNOTSUPP) | 417 | if (PTR_ERR(event) == -EOPNOTSUPP) |
