Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c             30
-rw-r--r--  kernel/events/core.c        11
-rw-r--r--  kernel/exit.c               19
-rw-r--r--  kernel/irq/chip.c            8
-rw-r--r--  kernel/irq/internals.h       3
-rw-r--r--  kernel/irq/manage.c         39
-rw-r--r--  kernel/irq/migration.c      13
-rw-r--r--  kernel/panic.c               6
-rw-r--r--  kernel/pid_namespace.c      20
-rw-r--r--  kernel/printk.c            532
-rw-r--r--  kernel/rcutree.c            16
-rw-r--r--  kernel/rcutree.h            14
-rw-r--r--  kernel/rcutree_plugin.h    165
-rw-r--r--  kernel/sched/core.c        249
-rw-r--r--  kernel/sched/fair.c         71
-rw-r--r--  kernel/sched/rt.c           53
-rw-r--r--  kernel/sched/sched.h         2
-rw-r--r--  kernel/smpboot.c            17
-rw-r--r--  kernel/sys.c                60
-rw-r--r--  kernel/time/clockevents.c    3
-rw-r--r--  kernel/time/tick-sched.c    26
-rw-r--r--  kernel/time/timekeeping.c    2
-rw-r--r--  kernel/trace/trace.c         2
-rw-r--r--  kernel/watchdog.c           19
24 files changed, 996 insertions, 384 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f3527d6184a..2097684cf194 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+static int css_unbias_refcnt(int refcnt)
+{
+	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
+}
+
 /* the current nr of refs, always >= 0 whether @css is deactivated or not */
 static int css_refcnt(struct cgroup_subsys_state *css)
 {
 	int v = atomic_read(&css->refcnt);
 
-	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+	return css_unbias_refcnt(v);
 }
 
 /* convenient tests for these bits */
@@ -896,10 +901,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		mutex_unlock(&cgroup_mutex);
 
 		/*
-		 * Drop the active superblock reference that we took when we
-		 * created the cgroup
+		 * We want to drop the active superblock reference from the
+		 * cgroup creation after all the dentry refs are gone -
+		 * kill_sb gets mighty unhappy otherwise.  Mark
+		 * dentry->d_fsdata with cgroup_diput() to tell
+		 * cgroup_d_release() to call deactivate_super().
 		 */
-		deactivate_super(cgrp->root->sb);
+		dentry->d_fsdata = cgroup_diput;
 
 		/*
 		 * if we're getting rid of the cgroup, refcount should ensure
@@ -925,6 +933,13 @@ static int cgroup_delete(const struct dentry *d)
 	return 1;
 }
 
+static void cgroup_d_release(struct dentry *dentry)
+{
+	/* did cgroup_diput() tell me to deactivate super? */
+	if (dentry->d_fsdata == cgroup_diput)
+		deactivate_super(dentry->d_sb);
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -1532,6 +1547,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
 	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
 		.d_delete = cgroup_delete,
+		.d_release = cgroup_d_release,
 	};
 
 	struct inode *inode =
@@ -4971,10 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
 void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	int v;
 
 	rcu_read_lock();
-	atomic_dec(&css->refcnt);
-	switch (css_refcnt(css)) {
+	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
+
+	switch (v) {
 	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
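A note on the refcount arithmetic the cgroup.c hunks above factor out: deactivating a css adds a large negative bias (CSS_DEACT_BIAS, INT_MIN in kernel/cgroup.c) to ->refcnt, so the raw atomic value goes negative while genuine references may remain. css_unbias_refcnt() subtracts the bias back out, which is what lets __css_put() switch directly on the unbiased result of atomic_dec_return(). A minimal userspace sketch of that arithmetic (illustrative only, not part of the patch):

#include <limits.h>
#include <stdio.h>

/* Sketch only: mirrors css_unbias_refcnt() from the hunk above.
 * Assumes CSS_DEACT_BIAS == INT_MIN, as defined in kernel/cgroup.c. */
#define CSS_DEACT_BIAS INT_MIN

static int css_unbias_refcnt(int refcnt)
{
	/* a negative refcnt means "deactivated": subtract the bias back out */
	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
}

int main(void)
{
	int live = 3;				/* active css, 3 refs      */
	int dead = CSS_DEACT_BIAS + 3;		/* deactivated css, 3 refs */

	printf("%d %d\n", css_unbias_refcnt(live), css_unbias_refcnt(dead));
	return 0;	/* prints "3 3" */
}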
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
 	return !event->cgrp || event->cgrp == cpuctx->cgrp;
 }
 
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
 {
-	css_get(&event->cgrp->css);
+	return css_tryget(&event->cgrp->css);
 }
 
 static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		event->cgrp = cgrp;
 
 		/* must be done before we fput() the file */
-		perf_get_cgroup(event);
+		if (!perf_tryget_cgroup(event)) {
+			event->cgrp = NULL;
+			ret = -ENOENT;
+			goto out;
+		}
 
 		/*
 		 * all events in a group must monitor
@@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event,
 	event = event->group_leader;
 
 	perf_event_for_each_child(event, func);
-	func(event);
 	list_for_each_entry(sibling, &event->sibling_list, group_entry)
 		perf_event_for_each_child(sibling, func);
 	mutex_unlock(&ctx->mutex);
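The events/core.c change above works because css_tryget(), unlike css_get(), refuses to take a reference once the css has begun dying, so perf_cgroup_connect() can now fail cleanly with -ENOENT instead of pinning a half-destroyed cgroup. A toy model of the get-vs-tryget contract (single-threaded and illustrative; the real css_tryget() does this atomically against the biased refcount shown earlier):

#include <stdbool.h>
#include <stdio.h>

struct obj {
	int refcnt;
	bool dying;
};

/* Sketch only: tryget refuses to pin an object whose removal started,
 * forcing the caller to handle failure instead of racing teardown. */
static bool tryget(struct obj *o)
{
	if (o->dying)
		return false;	/* removal in progress: do not pin */
	o->refcnt++;
	return true;
}

int main(void)
{
	struct obj live = { .refcnt = 1, .dying = false };
	struct obj dead = { .refcnt = 1, .dying = true };

	printf("%d %d\n", tryget(&live), tryget(&dead));	/* prints "1 0" */
	return 0;
}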
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
+		/*
+		 * If we are the last child process in a pid namespace to be
+		 * reaped, notify the reaper sleeping in zap_pid_ns_processes().
+		 */
+		if (IS_ENABLED(CONFIG_PID_NS)) {
+			struct task_struct *parent = p->real_parent;
+
+			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
+			    list_empty(&parent->children) &&
+			    (parent->flags & PF_EXITING))
+				wake_up_process(parent);
+		}
 	}
 	list_del_rcu(&p->thread_group);
 }
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
 	mm_release(tsk, mm);
 	if (!mm)
 		return;
+	sync_mm_rss(mm);
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
-		/*
-		 * We can not clear ->child_reaper or leave it alone.
-		 * There may by stealth EXIT_DEAD tasks on ->children,
-		 * forget_original_parent() must move them somewhere.
-		 */
-		pid_ns->child_reaper = init_pid_ns.child_reaper;
 	} else if (father->signal->has_child_subreaper) {
 		struct task_struct *reaper;
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	action = desc->action;
-	if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
+	if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+		desc->istate |= IRQS_PENDING;
 		goto out_unlock;
+	}
 
 	irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
 	raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+		desc->istate |= IRQS_PENDING;
 		goto out_unlock;
+	}
 
 	handle_irq_event(desc);
 
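The irq/chip.c hunks latch IRQS_PENDING when an interrupt arrives on a disabled line. Without the flag, an edge interrupt delivered while the line is disabled would simply be lost; with it, the core can replay the interrupt when the line is re-enabled (see check_irq_resend() in kernel/irq/resend.c). A toy latch-and-replay model (the IRQS_PENDING value mirrors kernel/irq/internals.h; everything else is illustrative):

#include <stdio.h>

#define IRQS_PENDING 0x200	/* same bit value as kernel/irq/internals.h */

struct desc { unsigned istate; int disabled; int handled; };

static void handle(struct desc *d)
{
	if (d->disabled) {
		d->istate |= IRQS_PENDING;	/* latch instead of dropping */
		return;
	}
	d->handled++;
}

static void enable(struct desc *d)
{
	d->disabled = 0;
	if (d->istate & IRQS_PENDING) {		/* replay, as check_irq_resend() would */
		d->istate &= ~IRQS_PENDING;
		handle(d);
	}
}

int main(void)
{
	struct desc d = { 0, 1, 0 };

	handle(&d);			/* fires while disabled: latched, not lost */
	enable(&d);			/* replayed on enable */
	printf("%d\n", d.handled);	/* prints "1" */
	return 0;
}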
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
+extern int irq_do_set_affinity(struct irq_data *data,
+			       const struct cpumask *dest, bool force);
+
 /* Inline functions for support of irq chips on slow busses */
 static inline void chip_bus_lock(struct irq_desc *desc)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -142,6 +142,25 @@ static inline void
 irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
 #endif
 
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			bool force)
+{
+	struct irq_desc *desc = irq_data_to_desc(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+	int ret;
+
+	ret = chip->irq_set_affinity(data, mask, false);
+	switch (ret) {
+	case IRQ_SET_MASK_OK:
+		cpumask_copy(data->affinity, mask);
+	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_set_thread_affinity(desc);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
 {
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
 		return -EINVAL;
 
 	if (irq_can_move_pcntxt(data)) {
-		ret = chip->irq_set_affinity(data, mask, false);
-		switch (ret) {
-		case IRQ_SET_MASK_OK:
-			cpumask_copy(data->affinity, mask);
-		case IRQ_SET_MASK_OK_NOCOPY:
-			irq_set_thread_affinity(desc);
-			ret = 0;
-		}
+		ret = irq_do_set_affinity(data, mask, false);
 	} else {
 		irqd_set_move_pending(data);
 		irq_copy_pending(desc, mask);
@@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 static int
 setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
-	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct cpumask *set = irq_default_affinity;
-	int ret, node = desc->irq_data.node;
+	int node = desc->irq_data.node;
 
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
@@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 		if (cpumask_intersects(mask, nodemask))
 			cpumask_and(mask, mask, nodemask);
 	}
-	ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
-	switch (ret) {
-	case IRQ_SET_MASK_OK:
-		cpumask_copy(desc->irq_data.affinity, mask);
-	case IRQ_SET_MASK_OK_NOCOPY:
-		irq_set_thread_affinity(desc);
-	}
+	irq_do_set_affinity(&desc->irq_data, mask, false);
 	return 0;
 }
 #else
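The irq/manage.c hunks collapse three copies of the same switch into irq_do_set_affinity(). The contract being consolidated: a chip's ->irq_set_affinity() returns IRQ_SET_MASK_OK when the core should copy the new mask into data->affinity, or IRQ_SET_MASK_OK_NOCOPY when the chip already updated it; the missing break between the two cases is deliberate, since both success paths must update the threaded-handler affinity and report 0. A standalone model of that switch (the IRQ_SET_MASK_* names and values mirror <linux/irq.h>; the rest is a sketch):

#include <stdio.h>

enum { IRQ_SET_MASK_OK, IRQ_SET_MASK_OK_NOCOPY };	/* as in <linux/irq.h> */

/* Sketch only: models irq_do_set_affinity()'s handling of the chip's
 * return code. Both OK codes fall into the thread-affinity update and
 * collapse to 0; they differ only in who copies the new mask. */
static int do_set_affinity(int chip_ret, int *copied)
{
	int ret = chip_ret;

	switch (ret) {
	case IRQ_SET_MASK_OK:
		*copied = 1;	/* core copies mask into data->affinity */
		/* fall through: both OK codes take the next steps */
	case IRQ_SET_MASK_OK_NOCOPY:
		/* irq_set_thread_affinity(desc) would run here */
		ret = 0;
	}
	return ret;	/* chip errors (negative) pass straight through */
}

int main(void)
{
	int copied = 0;
	int ret = do_set_affinity(IRQ_SET_MASK_OK, &copied);

	printf("OK:      ret=%d copied=%d\n", ret, copied);	/* 0 1 */

	copied = 0;
	ret = do_set_affinity(IRQ_SET_MASK_OK_NOCOPY, &copied);
	printf("NOCOPY:  ret=%d copied=%d\n", ret, copied);	/* 0 0 */

	copied = 0;
	ret = do_set_affinity(-22, &copied);	/* e.g. -EINVAL from the chip */
	printf("failure: ret=%d copied=%d\n", ret, copied);	/* -22 0 */
	return 0;
}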
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-		   < nr_cpu_ids)) {
-		int ret = chip->irq_set_affinity(&desc->irq_data,
-						 desc->pending_mask, false);
-		switch (ret) {
-		case IRQ_SET_MASK_OK:
-			cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
-		case IRQ_SET_MASK_OK_NOCOPY:
-			irq_set_thread_affinity(desc);
-		}
-	}
+	if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+		irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
 
 	cpumask_clear(desc->pending_mask);
 }
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
 
-int panic_on_oops;
+int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
 static unsigned long tainted_mask;
 static int pause_on_oops;
 static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
 	 */
 	crash_kexec(NULL);
 
-	kmsg_dump(KMSG_DUMP_PANIC);
-
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
 	 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
 	 */
 	smp_send_stop();
 
+	kmsg_dump(KMSG_DUMP_PANIC);
+
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	}
 	read_unlock(&tasklist_lock);
 
+	/* Firstly reap the EXIT_ZOMBIE children we may have. */
 	do {
 		clear_thread_flag(TIF_SIGPENDING);
 		rc = sys_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);
 
+	/*
+	 * sys_wait4() above can't reap the TASK_DEAD children.
+	 * Make sure they all go away, see __unhash_process().
+	 */
+	for (;;) {
+		bool need_wait = false;
+
+		read_lock(&tasklist_lock);
+		if (!list_empty(&current->children)) {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			need_wait = true;
+		}
+		read_unlock(&tasklist_lock);
+
+		if (!need_wait)
+			break;
+		schedule();
+	}
+
 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..dba18211685e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,19 @@ static int console_may_schedule;
  * separated by ',', and find the message after the ';' character.
  */
 
+enum log_flags {
+	LOG_DEFAULT = 0,
+	LOG_NOCONS = 1,		/* already flushed, do not print to console */
+};
+
 struct log {
 	u64 ts_nsec;		/* timestamp in nanoseconds */
 	u16 len;		/* length of entire record */
 	u16 text_len;		/* length of text buffer */
 	u16 dict_len;		/* length of dictionary buffer */
-	u16 level;		/* syslog level + facility */
+	u8 facility;		/* syslog facility */
+	u8 flags:5;		/* internal record flags */
+	u8 level:3;		/* syslog level */
 };
 
 /*
@@ -227,10 +234,10 @@ static u32 clear_idx;
 #define LOG_LINE_MAX 1024
 
 /* record buffer */
-#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 #define LOG_ALIGN 4
 #else
-#define LOG_ALIGN 8
+#define LOG_ALIGN __alignof__(struct log)
 #endif
 #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
 static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +293,7 @@ static u32 log_next(u32 idx)
 
 /* insert record into the buffer, discard old ones, update heads */
 static void log_store(int facility, int level,
+		      enum log_flags flags, u64 ts_nsec,
 		      const char *dict, u16 dict_len,
 		      const char *text, u16 text_len)
 {
@@ -329,8 +337,13 @@ static void log_store(int facility, int level,
 	msg->text_len = text_len;
 	memcpy(log_dict(msg), dict, dict_len);
 	msg->dict_len = dict_len;
-	msg->level = (facility << 3) | (level & 7);
-	msg->ts_nsec = local_clock();
+	msg->facility = facility;
+	msg->level = level & 7;
+	msg->flags = flags & 0x1f;
+	if (ts_nsec > 0)
+		msg->ts_nsec = ts_nsec;
+	else
+		msg->ts_nsec = local_clock();
 	memset(log_dict(msg) + dict_len, 0, pad_len);
 	msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
 
@@ -414,7 +427,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (!user)
 		return -EBADF;
 
-	mutex_lock(&user->lock);
+	ret = mutex_lock_interruptible(&user->lock);
+	if (ret)
+		return ret;
 	raw_spin_lock(&logbuf_lock);
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
@@ -444,7 +459,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	ts_usec = msg->ts_nsec;
 	do_div(ts_usec, 1000);
 	len = sprintf(user->buf, "%u,%llu,%llu;",
-		      msg->level, user->seq, ts_usec);
+		      (msg->facility << 3) | msg->level, user->seq, ts_usec);
 
 	/* escape non-printable characters */
 	for (i = 0; i < msg->text_len; i++) {
@@ -785,6 +800,21 @@ static bool printk_time;
 #endif
 module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
 
+static size_t print_time(u64 ts, char *buf)
+{
+	unsigned long rem_nsec;
+
+	if (!printk_time)
+		return 0;
+
+	if (!buf)
+		return 15;
+
+	rem_nsec = do_div(ts, 1000000000);
+	return sprintf(buf, "[%5lu.%06lu] ",
+		       (unsigned long)ts, rem_nsec / 1000);
+}
+
 static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
 {
 	size_t len = 0;
@@ -801,18 +831,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
 		}
 	}
 
-	if (printk_time) {
-		if (buf) {
-			unsigned long long ts = msg->ts_nsec;
-			unsigned long rem_nsec = do_div(ts, 1000000000);
-
-			len += sprintf(buf + len, "[%5lu.%06lu] ",
-				       (unsigned long) ts, rem_nsec / 1000);
-		} else {
-			len += 15;
-		}
-	}
-
+	len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
 	return len;
 }
 
@@ -860,26 +879,49 @@ static int syslog_print(char __user *buf, int size)
 {
 	char *text;
 	struct log *msg;
-	int len;
+	int len = 0;
 
 	text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
 	if (!text)
 		return -ENOMEM;
 
-	raw_spin_lock_irq(&logbuf_lock);
-	if (syslog_seq < log_first_seq) {
-		/* messages are gone, move to first one */
-		syslog_seq = log_first_seq;
-		syslog_idx = log_first_idx;
-	}
-	msg = log_from_idx(syslog_idx);
-	len = msg_print_text(msg, true, text, LOG_LINE_MAX);
-	syslog_idx = log_next(syslog_idx);
-	syslog_seq++;
-	raw_spin_unlock_irq(&logbuf_lock);
+	while (size > 0) {
+		size_t n;
+
+		raw_spin_lock_irq(&logbuf_lock);
+		if (syslog_seq < log_first_seq) {
+			/* messages are gone, move to first one */
+			syslog_seq = log_first_seq;
+			syslog_idx = log_first_idx;
+		}
+		if (syslog_seq == log_next_seq) {
+			raw_spin_unlock_irq(&logbuf_lock);
+			break;
+		}
+		msg = log_from_idx(syslog_idx);
+		n = msg_print_text(msg, true, text, LOG_LINE_MAX);
+		if (n <= size) {
+			syslog_idx = log_next(syslog_idx);
+			syslog_seq++;
+		} else
+			n = 0;
+		raw_spin_unlock_irq(&logbuf_lock);
+
+		if (!n)
+			break;
+
+		len += n;
+		size -= n;
+		buf += n;
+		n = copy_to_user(buf - n, text, n);
 
-	if (len > 0 && copy_to_user(buf, text, len))
-		len = -EFAULT;
+		if (n) {
+			len -= n;
+			if (!len)
+				len = -EFAULT;
+			break;
+		}
+	}
 
 	kfree(text);
 	return len;
@@ -909,7 +951,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	/*
	 * Find first record that fits, including all following records,
	 * into the user-provided buffer for this dump.
	 */
 	seq = clear_seq;
 	idx = clear_idx;
 	while (seq < log_next_seq) {
@@ -919,6 +961,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		idx = log_next(idx);
 		seq++;
 	}
+
+	/* move first record forward until length fits into the buffer */
 	seq = clear_seq;
 	idx = clear_idx;
 	while (len > size && seq < log_next_seq) {
@@ -929,7 +973,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		seq++;
 	}
 
-	/* last message in this dump */
+	/* last message fitting into this dump */
 	next_seq = log_next_seq;
 
 	len = 0;
@@ -974,6 +1018,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	bool clear = false;
 	static int saved_console_loglevel = -1;
+	static DEFINE_MUTEX(syslog_mutex);
 	int error;
 
 	error = check_syslog_permissions(type, from_file);
@@ -1000,11 +1045,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
+		error = mutex_lock_interruptible(&syslog_mutex);
+		if (error)
+			goto out;
 		error = wait_event_interruptible(log_wait,
 						 syslog_seq != log_next_seq);
-		if (error)
+		if (error) {
+			mutex_unlock(&syslog_mutex);
 			goto out;
+		}
 		error = syslog_print(buf, len);
+		mutex_unlock(&syslog_mutex);
 		break;
 	/* Read/clear last kernel messages */
 	case SYSLOG_ACTION_READ_CLEAR:
@@ -1027,6 +1078,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	/* Clear ring buffer */
 	case SYSLOG_ACTION_CLEAR:
 		syslog_print_all(NULL, 0, true);
+		break;
 	/* Disable logging to console */
 	case SYSLOG_ACTION_CONSOLE_OFF:
 		if (saved_console_loglevel == -1)
@@ -1259,15 +1311,92 @@ static inline void printk_delay(void)
 	}
 }
 
+/*
+ * Continuation lines are buffered, and not committed to the record buffer
+ * until the line is complete, or a race forces it. The line fragments
+ * though, are printed immediately to the consoles to ensure everything has
+ * reached the console in case of a kernel crash.
+ */
+static struct cont {
+	char buf[LOG_LINE_MAX];
+	size_t len;			/* length == 0 means unused buffer */
+	size_t cons;			/* bytes written to console */
+	struct task_struct *owner;	/* task of first print */
+	u64 ts_nsec;			/* time of first print */
+	u8 level;			/* log level of first message */
+	u8 facility;			/* log facility of first message */
+	bool flushed:1;			/* buffer sealed and committed */
+} cont;
+
+static void cont_flush(void)
+{
+	if (cont.flushed)
+		return;
+	if (cont.len == 0)
+		return;
+
+	log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
+		  NULL, 0, cont.buf, cont.len);
+
+	cont.flushed = true;
+}
+
+static bool cont_add(int facility, int level, const char *text, size_t len)
+{
+	if (cont.len && cont.flushed)
+		return false;
+
+	if (cont.len + len > sizeof(cont.buf)) {
+		cont_flush();
+		return false;
+	}
+
+	if (!cont.len) {
+		cont.facility = facility;
+		cont.level = level;
+		cont.owner = current;
+		cont.ts_nsec = local_clock();
+		cont.cons = 0;
+		cont.flushed = false;
+	}
+
+	memcpy(cont.buf + cont.len, text, len);
+	cont.len += len;
+	return true;
+}
+
+static size_t cont_print_text(char *text, size_t size)
+{
+	size_t textlen = 0;
+	size_t len;
+
+	if (cont.cons == 0) {
+		textlen += print_time(cont.ts_nsec, text);
+		size -= textlen;
+	}
+
+	len = cont.len - cont.cons;
+	if (len > 0) {
+		if (len+1 > size)
+			len = size-1;
+		memcpy(text + textlen, cont.buf + cont.cons, len);
+		textlen += len;
+		cont.cons = cont.len;
+	}
+
+	if (cont.flushed) {
+		text[textlen++] = '\n';
+		/* got everything, release buffer */
+		cont.len = 0;
+	}
+	return textlen;
+}
+
 asmlinkage int vprintk_emit(int facility, int level,
 			    const char *dict, size_t dictlen,
 			    const char *fmt, va_list args)
 {
 	static int recursion_bug;
-	static char cont_buf[LOG_LINE_MAX];
-	static size_t cont_len;
-	static int cont_level;
-	static struct task_struct *cont_task;
 	static char textbuf[LOG_LINE_MAX];
 	char *text = textbuf;
 	size_t text_len;
@@ -1313,7 +1442,8 @@ asmlinkage int vprintk_emit(int facility, int level,
 		recursion_bug = 0;
 		printed_len += strlen(recursion_msg);
 		/* emit KERN_CRIT message */
-		log_store(0, 2, NULL, 0, recursion_msg, printed_len);
+		log_store(0, 2, LOG_DEFAULT, 0,
+			  NULL, 0, recursion_msg, printed_len);
 	}
 
 	/*
@@ -1351,55 +1481,37 @@ asmlinkage int vprintk_emit(int facility, int level,
 	}
 
 	if (!newline) {
-		if (cont_len && (prefix || cont_task != current)) {
-			/*
-			 * Flush earlier buffer, which is either from a
-			 * different thread, or when we got a new prefix.
-			 */
-			log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
-			cont_len = 0;
-		}
-
-		if (!cont_len) {
-			cont_level = level;
-			cont_task = current;
-		}
+		/*
+		 * Flush the conflicting buffer. An earlier newline was missing,
+		 * or another task also prints continuation lines.
+		 */
+		if (cont.len && (prefix || cont.owner != current))
+			cont_flush();
 
-		/* buffer or append to earlier buffer from the same thread */
-		if (cont_len + text_len > sizeof(cont_buf))
-			text_len = sizeof(cont_buf) - cont_len;
-		memcpy(cont_buf + cont_len, text, text_len);
-		cont_len += text_len;
+		/* buffer line if possible, otherwise store it right away */
+		if (!cont_add(facility, level, text, text_len))
+			log_store(facility, level, LOG_DEFAULT, 0,
+				  dict, dictlen, text, text_len);
 	} else {
-		if (cont_len && cont_task == current) {
-			if (prefix) {
-				/*
-				 * New prefix from the same thread; flush. We
-				 * either got no earlier newline, or we race
-				 * with an interrupt.
-				 */
-				log_store(facility, cont_level,
-					  NULL, 0, cont_buf, cont_len);
-				cont_len = 0;
-			}
+		bool stored = false;
 
-			/* append to the earlier buffer and flush */
-			if (cont_len + text_len > sizeof(cont_buf))
-				text_len = sizeof(cont_buf) - cont_len;
-			memcpy(cont_buf + cont_len, text, text_len);
-			cont_len += text_len;
-			log_store(facility, cont_level,
-				  NULL, 0, cont_buf, cont_len);
-			cont_len = 0;
-			cont_task = NULL;
-			printed_len = cont_len;
-		} else {
-			/* ordinary single and terminated line */
-			log_store(facility, level,
-				  dict, dictlen, text, text_len);
-			printed_len = text_len;
+		/*
+		 * If an earlier newline was missing and it was the same task,
+		 * either merge it with the current buffer and flush, or if
+		 * there was a race with interrupts (prefix == true) then just
+		 * flush it out and store this line separately.
+		 */
+		if (cont.len && cont.owner == current) {
+			if (!prefix)
+				stored = cont_add(facility, level, text, text_len);
+			cont_flush();
 		}
+
+		if (!stored)
+			log_store(facility, level, LOG_DEFAULT, 0,
+				  dict, dictlen, text, text_len);
 	}
+	printed_len += text_len;
 
 	/*
 	 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1598,18 @@ EXPORT_SYMBOL(printk);
 #else
 
 #define LOG_LINE_MAX 0
+static struct cont {
+	size_t len;
+	size_t cons;
+	u8 level;
+	bool flushed:1;
+} cont;
 static struct log *log_from_idx(u32 idx) { return NULL; }
 static u32 log_next(u32 idx) { return 0; }
 static void call_console_drivers(int level, const char *text, size_t len) {}
 static size_t msg_print_text(const struct log *msg, bool syslog,
 			     char *buf, size_t size) { return 0; }
+static size_t cont_print_text(char *text, size_t size) { return 0; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -1782,6 +1901,7 @@ static u32 console_idx;
  */
 void console_unlock(void)
 {
+	static char text[LOG_LINE_MAX];
 	static u64 seen_seq;
 	unsigned long flags;
 	bool wake_klogd = false;
@@ -1794,10 +1914,23 @@ void console_unlock(void)
 
 	console_may_schedule = 0;
 
+	/* flush buffered message fragment immediately to console */
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	if (cont.len && (cont.cons < cont.len || cont.flushed)) {
+		size_t len;
+
+		len = cont_print_text(text, sizeof(text));
+		raw_spin_unlock(&logbuf_lock);
+		stop_critical_timings();
+		call_console_drivers(cont.level, text, len);
+		start_critical_timings();
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
 again:
 	for (;;) {
 		struct log *msg;
-		static char text[LOG_LINE_MAX];
 		size_t len;
 		int level;
 
@@ -1812,13 +1945,22 @@ again:
 			console_seq = log_first_seq;
 			console_idx = log_first_idx;
 		}
-
+skip:
 		if (console_seq == log_next_seq)
 			break;
 
 		msg = log_from_idx(console_idx);
-		level = msg->level & 7;
+		if (msg->flags & LOG_NOCONS) {
+			/*
+			 * Skip record we have buffered and already printed
+			 * directly to the console when we received it.
+			 */
+			console_idx = log_next(console_idx);
+			console_seq++;
+			goto skip;
+		}
 
+		level = msg->level;
 		len = msg_print_text(msg, false, text, sizeof(text));
 
 		console_idx = log_next(console_idx);
@@ -2300,48 +2442,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
  * kmsg_dump - dump kernel log to kernel message dumpers.
  * @reason: the reason (oops, panic etc) for dumping
  *
- * Iterate through each of the dump devices and call the oops/panic
- * callbacks with the log buffer.
+ * Call each of the registered dumper's dump() callback, which can
+ * retrieve the kmsg records with kmsg_dump_get_line() or
+ * kmsg_dump_get_buffer().
  */
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
-	u64 idx;
 	struct kmsg_dumper *dumper;
-	const char *s1, *s2;
-	unsigned long l1, l2;
 	unsigned long flags;
 
 	if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
 		return;
 
-	/* Theoretically, the log could move on after we do this, but
-	   there's not a lot we can do about that. The new messages
-	   will overwrite the start of what we dump. */
-
-	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	if (syslog_seq < log_first_seq)
-		idx = syslog_idx;
-	else
-		idx = log_first_idx;
-
-	if (idx > log_next_idx) {
-		s1 = log_buf;
-		l1 = log_next_idx;
-
-		s2 = log_buf + idx;
-		l2 = log_buf_len - idx;
-	} else {
-		s1 = "";
-		l1 = 0;
-
-		s2 = log_buf + idx;
-		l2 = log_next_idx - idx;
-	}
-	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
 	rcu_read_lock();
-	list_for_each_entry_rcu(dumper, &dump_list, list)
-		dumper->dump(dumper, reason, s1, l1, s2, l2);
+	list_for_each_entry_rcu(dumper, &dump_list, list) {
+		if (dumper->max_reason && reason > dumper->max_reason)
+			continue;
+
+		/* initialize iterator with data about the stored records */
+		dumper->active = true;
+
+		raw_spin_lock_irqsave(&logbuf_lock, flags);
+		dumper->cur_seq = clear_seq;
+		dumper->cur_idx = clear_idx;
+		dumper->next_seq = log_next_seq;
+		dumper->next_idx = log_next_idx;
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+		/* invoke dumper which will iterate over records */
+		dumper->dump(dumper, reason);
+
+		/* reset iterator */
+		dumper->active = false;
+	}
 	rcu_read_unlock();
 }
+
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+			char *line, size_t size, size_t *len)
+{
+	unsigned long flags;
+	struct log *msg;
+	size_t l = 0;
+	bool ret = false;
+
+	if (!dumper->active)
+		goto out;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	if (dumper->cur_seq < log_first_seq) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = log_first_seq;
+		dumper->cur_idx = log_first_idx;
+	}
+
+	/* last entry */
+	if (dumper->cur_seq >= log_next_seq) {
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}
+
+	msg = log_from_idx(dumper->cur_idx);
+	l = msg_print_text(msg, syslog,
+			   line, size);
+
+	dumper->cur_idx = log_next(dumper->cur_idx);
+	dumper->cur_seq++;
+	ret = true;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+	if (len)
+		*len = l;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
+
+/**
+ * kmsg_dump_get_buffer - copy kmsg log lines
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @buf: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the end of the kmsg buffer and fill the provided buffer
+ * with as many of the *youngest* kmsg records that fit into it.
+ * If the buffer is large enough, all available kmsg records will be
+ * copied with a single call.
+ *
+ * Consecutive calls will fill the buffer with the next block of
+ * available older records, not including the earlier retrieved ones.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+			  char *buf, size_t size, size_t *len)
+{
+	unsigned long flags;
+	u64 seq;
+	u32 idx;
+	u64 next_seq;
+	u32 next_idx;
+	size_t l = 0;
+	bool ret = false;
+
+	if (!dumper->active)
+		goto out;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	if (dumper->cur_seq < log_first_seq) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = log_first_seq;
+		dumper->cur_idx = log_first_idx;
+	}
+
+	/* last entry */
+	if (dumper->cur_seq >= dumper->next_seq) {
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}
+
+	/* calculate length of entire buffer */
+	seq = dumper->cur_seq;
+	idx = dumper->cur_idx;
+	while (seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l += msg_print_text(msg, true, NULL, 0);
+		idx = log_next(idx);
+		seq++;
+	}
+
+	/* move first record forward until length fits into the buffer */
+	seq = dumper->cur_seq;
+	idx = dumper->cur_idx;
+	while (l > size && seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l -= msg_print_text(msg, true, NULL, 0);
+		idx = log_next(idx);
+		seq++;
+	}
+
+	/* last message in next iteration */
+	next_seq = seq;
+	next_idx = idx;
+
+	l = 0;
+	while (seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l += msg_print_text(msg, syslog,
+				    buf + l, size - l);
+
+		idx = log_next(idx);
+		seq++;
+	}
+
+	dumper->next_seq = next_seq;
+	dumper->next_idx = next_idx;
+	ret = true;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+	if (len)
+		*len = l;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
+
+/**
+ * kmsg_dump_rewind - reset the iterator
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	dumper->cur_seq = clear_seq;
+	dumper->cur_idx = clear_idx;
+	dumper->next_seq = log_next_seq;
+	dumper->next_idx = log_next_idx;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 #endif
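The printk.c hunks above replace the old dump(dumper, reason, s1, l1, s2, l2) callback, which handed dumpers two raw halves of the ring buffer, with an iterator that copies out whole records. A minimal sketch of a dumper module built on the new API (the example_* names are hypothetical; kmsg_dump_register()/kmsg_dump_unregister() and the ->dump()/max_reason fields are the real interface per this patch and linux/kmsg_dump.h):

#include <linux/kernel.h>
#include <linux/kmsg_dump.h>
#include <linux/module.h>

/* Hypothetical dumper: walks the records oldest to newest with the
 * iterator API added above and counts the bytes it would persist. */
static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason)
{
	static char line[256];
	size_t len;
	size_t total = 0;

	/* one record per call; returns false when no records remain */
	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		total += len;	/* a real dumper would write to flash etc. */

	pr_info("example dumper: reason %d, %zu bytes\n", reason, total);
}

static struct kmsg_dumper example_dumper = {
	.dump = example_dump,
	.max_reason = KMSG_DUMP_OOPS,	/* skipped for reasons beyond oops */
};

static int __init example_init(void)
{
	return kmsg_dump_register(&example_dumper);
}

static void __exit example_exit(void)
{
	kmsg_dump_unregister(&example_dumper);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");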
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d0..38ecdda3f55f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 	rdp->qlen_lazy += rsp->qlen_lazy;
 	rdp->qlen += rsp->qlen;
 	rdp->n_cbs_adopted += rsp->qlen;
+	if (rsp->qlen_lazy != rsp->qlen)
+		rcu_idle_count_callbacks_posted();
 	rsp->qlen_lazy = 0;
 	rsp->qlen = 0;
 
@@ -1528,7 +1530,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list, **tail;
-	int bl, count, count_lazy;
+	int bl, count, count_lazy, i;
 
 	/* If no callbacks are ready, just return.*/
 	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1551,9 +1553,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
 	tail = rdp->nxttail[RCU_DONE_TAIL];
-	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
-		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
-			rdp->nxttail[count] = &rdp->nxtlist;
+	for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
+		if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[i] = &rdp->nxtlist;
 	local_irq_restore(flags);
 
 	/* Invoke callbacks. */
@@ -1581,9 +1583,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	if (list != NULL) {
 		*tail = rdp->nxtlist;
 		rdp->nxtlist = list;
-		for (count = 0; count < RCU_NEXT_SIZE; count++)
-			if (&rdp->nxtlist == rdp->nxttail[count])
-				rdp->nxttail[count] = tail;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			if (&rdp->nxtlist == rdp->nxttail[i])
+				rdp->nxttail[i] = tail;
 			else
 				break;
 	}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf5..ea056495783e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
 				    /* Process level is worth LLONG_MAX/2. */
 	int dynticks_nmi_nesting;   /* Track NMI nesting level. */
 	atomic_t dynticks;	    /* Even value for idle, else odd. */
+#ifdef CONFIG_RCU_FAST_NO_HZ
+	int dyntick_drain;	    /* Prepare-for-idle state variable. */
+	unsigned long dyntick_holdoff;
+				    /* No retries for the jiffy of failure. */
+	struct timer_list idle_gp_timer;
+				    /* Wake up CPU sleeping with callbacks. */
+	unsigned long idle_gp_timer_expires;
+				    /* When to wake up CPU (for repost). */
+	bool idle_first_pass;	    /* First pass of attempt to go idle? */
+	unsigned long nonlazy_posted;
+				    /* # times non-lazy CBs posted to CPU. */
+	unsigned long nonlazy_posted_snap;
+				    /* idle-period nonlazy_posted snapshot. */
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
 /* RCU's kthread states for tracing. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d9869..5271a020887e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
-int rcu_needs_cpu(int cpu)
+int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
+	*delta_jiffies = ULONG_MAX;
 	return rcu_cpu_has_callbacks(cpu);
 }
 
@@ -1962,41 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void)
 #define RCU_IDLE_GP_DELAY 6		/* Roughly one grace period. */
 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */
 
-/* Loop counter for rcu_prepare_for_idle(). */
-static DEFINE_PER_CPU(int, rcu_dyntick_drain);
-/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
-static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
-static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
-/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
-static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
-/* Enable special processing on first attempt to enter dyntick-idle mode. */
-static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
-/* Running count of non-lazy callbacks posted, never decremented. */
-static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
-/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
-static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
-
-/*
- * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
- * callbacks on this CPU, (2) this CPU has not yet attempted to enter
- * dyntick-idle mode, or (3) this CPU is in the process of attempting to
- * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
- * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
- * it is better to incur scheduling-clock interrupts than to spin
- * continuously for the same time duration!
- */
-int rcu_needs_cpu(int cpu)
-{
-	/* Flag a new idle sojourn to the idle-entry state machine. */
-	per_cpu(rcu_idle_first_pass, cpu) = 1;
-	/* If no callbacks, RCU doesn't need the CPU. */
-	if (!rcu_cpu_has_callbacks(cpu))
-		return 0;
-	/* Otherwise, RCU needs the CPU only if it recently tried and failed. */
-	return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
-}
-
 /*
  * Does the specified flavor of RCU have non-lazy callbacks pending on
  * the specified CPU?  Both RCU flavor and CPU are specified by the
@@ -2040,6 +2006,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
 }
 
 /*
+ * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
+ * callbacks on this CPU, (2) this CPU has not yet attempted to enter
+ * dyntick-idle mode, or (3) this CPU is in the process of attempting to
+ * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
+ * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
+ * it is better to incur scheduling-clock interrupts than to spin
+ * continuously for the same time duration!
+ *
+ * The delta_jiffies argument is used to store the time when RCU is
+ * going to need the CPU again if it still has callbacks.  The reason
+ * for this is that rcu_prepare_for_idle() might need to post a timer,
+ * but if so, it will do so after tick_nohz_stop_sched_tick() has set
+ * the wakeup time for this CPU.  This means that RCU's timer can be
+ * delayed until the wakeup time, which defeats the purpose of posting
+ * a timer.
+ */
+int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+{
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	/* Flag a new idle sojourn to the idle-entry state machine. */
+	rdtp->idle_first_pass = 1;
+	/* If no callbacks, RCU doesn't need the CPU. */
+	if (!rcu_cpu_has_callbacks(cpu)) {
+		*delta_jiffies = ULONG_MAX;
+		return 0;
+	}
2036 | if (rdtp->dyntick_holdoff == jiffies) { | ||
2037 | /* RCU recently tried and failed, so don't try again. */ | ||
2038 | *delta_jiffies = 1; | ||
2039 | return 1; | ||
2040 | } | ||
2041 | /* Set up for the possibility that RCU will post a timer. */ | ||
2042 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | ||
2043 | *delta_jiffies = RCU_IDLE_GP_DELAY; | ||
2044 | else | ||
2045 | *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; | ||
2046 | return 0; | ||
2047 | } | ||
2048 | |||
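Without RCU_FAST_NO_HZ the answer is simply "never on RCU's account", hence ULONG_MAX. The out-parameter exists so the tick-stopping path can bound its sleep; a hedged userspace sketch of a caller honoring that contract (the stub and the wake-up math are illustrative, not the kernel's tick-sched code):

#include <limits.h>
#include <stdio.h>

/* Stub with the same contract as the patched rcu_needs_cpu(): returns 0
 * when the tick may stop and reports how soon RCU wants a timer. */
static int rcu_needs_cpu_stub(int cpu, unsigned long *delta_jiffies)
{
	(void)cpu;
	*delta_jiffies = 6;		/* cf. RCU_IDLE_GP_DELAY */
	return 0;
}

int main(void)
{
	unsigned long jiffies_now = 1000, rcu_delta, wake;

	if (!rcu_needs_cpu_stub(0, &rcu_delta)) {
		/* Stop the tick, but wake no later than RCU's deadline. */
		wake = (rcu_delta == ULONG_MAX) ? ULONG_MAX
						: jiffies_now + rcu_delta;
		printf("tick stopped, wake by jiffy %lu\n", wake);
	}
	return 0;
}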
2049 | /* | ||
2043 | * Handler for smp_call_function_single(). The only point of this | 2050 | * Handler for smp_call_function_single(). The only point of this |
2044 | * handler is to wake the CPU up, so the handler does only tracing. | 2051 | * handler is to wake the CPU up, so the handler does only tracing. |
2045 | */ | 2052 | */ |
@@ -2075,21 +2082,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) | |||
2075 | */ | 2082 | */ |
2076 | static void rcu_prepare_for_idle_init(int cpu) | 2083 | static void rcu_prepare_for_idle_init(int cpu) |
2077 | { | 2084 | { |
2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2085 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), | 2086 | |
2080 | rcu_idle_gp_timer_func, cpu); | 2087 | rdtp->dyntick_holdoff = jiffies - 1; |
2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; | 2088 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); |
2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; | 2089 | rdtp->idle_gp_timer_expires = jiffies - 1; |
2090 | rdtp->idle_first_pass = 1; | ||
2083 | } | 2091 | } |
2084 | 2092 | ||
2085 | /* | 2093 | /* |
2086 | * Clean up for exit from idle. Because we are exiting from idle, there | 2094 | * Clean up for exit from idle. Because we are exiting from idle, there |
2087 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | 2095 | * is no longer any point to ->idle_gp_timer, so cancel it. This will |
2088 | * do nothing if this timer is not active, so just cancel it unconditionally. | 2096 | * do nothing if this timer is not active, so just cancel it unconditionally. |
2089 | */ | 2097 | */ |
2090 | static void rcu_cleanup_after_idle(int cpu) | 2098 | static void rcu_cleanup_after_idle(int cpu) |
2091 | { | 2099 | { |
2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); | 2100 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2101 | |||
2102 | del_timer(&rdtp->idle_gp_timer); | ||
2093 | trace_rcu_prep_idle("Cleanup after idle"); | 2103 | trace_rcu_prep_idle("Cleanup after idle"); |
2094 | } | 2104 | } |
2095 | 2105 | ||
@@ -2108,42 +2118,41 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2118 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
2109 | * disabled, we do one pass of force_quiescent_state(), then do an | 2119 | * disabled, we do one pass of force_quiescent_state(), then do an |
2110 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2120 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
2111 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2121 | * later. The ->dyntick_drain field controls the sequencing. |
2112 | * | 2122 | * |
2113 | * The caller must have disabled interrupts. | 2123 | * The caller must have disabled interrupts. |
2114 | */ | 2124 | */ |
2115 | static void rcu_prepare_for_idle(int cpu) | 2125 | static void rcu_prepare_for_idle(int cpu) |
2116 | { | 2126 | { |
2117 | struct timer_list *tp; | 2127 | struct timer_list *tp; |
2128 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2118 | 2129 | ||
2119 | /* | 2130 | /* |
2120 | * If this is an idle re-entry, for example, due to use of | 2131 | * If this is an idle re-entry, for example, due to use of |
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 2132 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle |
2122 | * loop, then don't take any state-machine actions, unless the | 2133 | * loop, then don't take any state-machine actions, unless the |
2123 | * momentary exit from idle queued additional non-lazy callbacks. | 2134 | * momentary exit from idle queued additional non-lazy callbacks. |
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | 2135 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks |
2125 | * pending. | 2136 | * pending. |
2126 | */ | 2137 | */ |
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | 2138 | if (!rdtp->idle_first_pass && |
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | 2139 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { |
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | 2140 | if (rcu_cpu_has_callbacks(cpu)) { |
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2141 | tp = &rdtp->idle_gp_timer; |
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2142 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
2133 | } | 2143 | } |
2134 | return; | 2144 | return; |
2135 | } | 2145 | } |
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | 2146 | rdtp->idle_first_pass = 0; |
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2147 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; |
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | 2148 | ||
2140 | /* | 2149 | /* |
2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2150 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2142 | * Also reset state to avoid prejudicing later attempts. | 2151 | * Also reset state to avoid prejudicing later attempts. |
2143 | */ | 2152 | */ |
2144 | if (!rcu_cpu_has_callbacks(cpu)) { | 2153 | if (!rcu_cpu_has_callbacks(cpu)) { |
2145 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2154 | rdtp->dyntick_holdoff = jiffies - 1; |
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2155 | rdtp->dyntick_drain = 0; |
2147 | trace_rcu_prep_idle("No callbacks"); | 2156 | trace_rcu_prep_idle("No callbacks"); |
2148 | return; | 2157 | return; |
2149 | } | 2158 | } |
@@ -2152,36 +2161,37 @@ static void rcu_prepare_for_idle(int cpu) | |||
2152 | * If in holdoff mode, just return. We will presumably have | 2161 | * If in holdoff mode, just return. We will presumably have |
2153 | * refrained from disabling the scheduling-clock tick. | 2162 | * refrained from disabling the scheduling-clock tick. |
2154 | */ | 2163 | */ |
2155 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2164 | if (rdtp->dyntick_holdoff == jiffies) { |
2156 | trace_rcu_prep_idle("In holdoff"); | 2165 | trace_rcu_prep_idle("In holdoff"); |
2157 | return; | 2166 | return; |
2158 | } | 2167 | } |
2159 | 2168 | ||
2160 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2169 | /* Check and update the ->dyntick_drain sequencing. */ |
2161 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2170 | if (rdtp->dyntick_drain <= 0) { |
2162 | /* First time through, initialize the counter. */ | 2171 | /* First time through, initialize the counter. */ |
2163 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2172 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; |
2164 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2173 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && |
2165 | !rcu_pending(cpu) && | 2174 | !rcu_pending(cpu) && |
2166 | !local_softirq_pending()) { | 2175 | !local_softirq_pending()) { |
2167 | /* Can we go dyntick-idle despite still having callbacks? */ | 2176 | /* Can we go dyntick-idle despite still having callbacks? */ |
2168 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2177 | rdtp->dyntick_drain = 0; |
2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2178 | rdtp->dyntick_holdoff = jiffies; |
2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2179 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2180 | trace_rcu_prep_idle("Dyntick with callbacks"); |
2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2181 | rdtp->idle_gp_timer_expires = |
2173 | jiffies + RCU_IDLE_GP_DELAY; | 2182 | jiffies + RCU_IDLE_GP_DELAY; |
2174 | else | 2183 | } else { |
2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2184 | rdtp->idle_gp_timer_expires = |
2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; | 2185 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2186 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); |
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2187 | } |
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2188 | tp = &rdtp->idle_gp_timer; |
2180 | per_cpu(rcu_nonlazy_posted, cpu); | 2189 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
2190 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
2181 | return; /* Nothing more to do immediately. */ | 2191 | return; /* Nothing more to do immediately. */ |
2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2192 | } else if (--(rdtp->dyntick_drain) <= 0) { |
2183 | /* We have hit the limit, so time to give up. */ | 2193 | /* We have hit the limit, so time to give up. */ |
2184 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2194 | rdtp->dyntick_holdoff = jiffies; |
2185 | trace_rcu_prep_idle("Begin holdoff"); | 2195 | trace_rcu_prep_idle("Begin holdoff"); |
2186 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2196 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
2187 | return; | 2197 | return; |
@@ -2227,7 +2237,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
2227 | */ | 2237 | */ |
2228 | static void rcu_idle_count_callbacks_posted(void) | 2238 | static void rcu_idle_count_callbacks_posted(void) |
2229 | { | 2239 | { |
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | 2240 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); |
2231 | } | 2241 | } |
2232 | 2242 | ||
2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2243 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
@@ -2238,11 +2248,12 @@ static void rcu_idle_count_callbacks_posted(void) | |||
2238 | 2248 | ||
2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2249 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2240 | { | 2250 | { |
2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); | 2251 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2252 | struct timer_list *tltp = &rdtp->idle_gp_timer; | ||
2242 | 2253 | ||
2243 | sprintf(cp, "drain=%d %c timer=%lu", | 2254 | sprintf(cp, "drain=%d %c timer=%lu", |
2244 | per_cpu(rcu_dyntick_drain, cpu), | 2255 | rdtp->dyntick_drain, |
2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2256 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', |
2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); | 2257 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2247 | } | 2258 | } |
2248 | 2259 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..d5594a4268d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
142 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
143 | #name , | 143 | #name , |
144 | 144 | ||
145 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
146 | #include "features.h" | 146 | #include "features.h" |
147 | NULL | ||
148 | }; | 147 | }; |
149 | 148 | ||
150 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
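Dropping the NULL sentinel is safe because iteration can be bounded by the array's size, which the preprocessor trick yields for free. A userspace toy of the same x-macro pattern (the feature list is made up, and SCHED_FEAT is condensed to one argument; the kernel's takes an 'enabled' flag and includes "features.h" between the braces):

#include <stdio.h>

#define FEATURES(X) X(GENTLE_FAIR_SLEEPERS) X(START_DEBIT) X(LB_BIAS)

#define SCHED_FEAT(name) #name,
static const char * const feat_names[] = { FEATURES(SCHED_FEAT) };
#undef SCHED_FEAT

int main(void)
{
	/* No NULL sentinel: bound the loop by the array size instead. */
	unsigned int i, n = sizeof(feat_names) / sizeof(feat_names[0]);

	for (i = 0; i < n; i++)
		printf("%s\n", feat_names[i]);
	return 0;
}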
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
2517 | sched_avg_update(this_rq); | 2516 | sched_avg_update(this_rq); |
2518 | } | 2517 | } |
2519 | 2518 | ||
2519 | #ifdef CONFIG_NO_HZ | ||
2520 | /* | ||
2521 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2522 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2523 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2524 | * | ||
2525 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2526 | * would seriously skew the load calculation. However we'll make do for those | ||
2527 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2528 | * (tick_nohz_idle_exit). | ||
2529 | * | ||
2530 | * This means we might still be one tick off for nohz periods. | ||
2531 | */ | ||
2532 | |||
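A toy with made-up samples showing the skew the comment describes: a reader whose tick drifts against the CPU updating jiffies sees deltas of {0,2} where {1,1} actually elapsed:

#include <stdio.h>

int main(void)
{
	/* jiffies as seen by a drifting reader at three of its own ticks:
	 * the second read lands just before the remote update, the third
	 * just after two of them. */
	unsigned long samples[] = { 100, 100, 102 };
	int i;

	for (i = 1; i < 3; i++)
		printf("observed delta %d: %lu\n", i,
		       samples[i] - samples[i - 1]);	/* 0, then 2 */
	return 0;
}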
2520 | /* | 2533 | /* |
2521 | * Called from nohz_idle_balance() to update the load ratings before doing the | 2534 | * Called from nohz_idle_balance() to update the load ratings before doing the |
2522 | * idle balance. | 2535 | * idle balance. |
2523 | */ | 2536 | */ |
2524 | void update_idle_cpu_load(struct rq *this_rq) | 2537 | void update_idle_cpu_load(struct rq *this_rq) |
2525 | { | 2538 | { |
2526 | unsigned long curr_jiffies = jiffies; | 2539 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); |
2527 | unsigned long load = this_rq->load.weight; | 2540 | unsigned long load = this_rq->load.weight; |
2528 | unsigned long pending_updates; | 2541 | unsigned long pending_updates; |
2529 | 2542 | ||
2530 | /* | 2543 | /* |
2531 | * Bloody broken means of dealing with nohz, but better than nothing.. | 2544 | * bail if there's load or we're actually up-to-date. |
2532 | * jiffies is updated by one cpu, another cpu can drift wrt the jiffy | ||
2533 | * update and see 0 difference the one time and 2 the next, even though | ||
2534 | * we ticked at roughly the same rate. | ||
2535 | * | ||
2536 | * Hence we only use this from nohz_idle_balance() and skip this | ||
2537 | * nonsense when called from the scheduler_tick() since that's | ||
2538 | * guaranteed a stable rate. | ||
2539 | */ | 2545 | */ |
2540 | if (load || curr_jiffies == this_rq->last_load_update_tick) | 2546 | if (load || curr_jiffies == this_rq->last_load_update_tick) |
2541 | return; | 2547 | return; |
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq) | |||
2547 | } | 2553 | } |
2548 | 2554 | ||
2549 | /* | 2555 | /* |
2556 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2557 | */ | ||
2558 | void update_cpu_load_nohz(void) | ||
2559 | { | ||
2560 | struct rq *this_rq = this_rq(); | ||
2561 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2562 | unsigned long pending_updates; | ||
2563 | |||
2564 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2565 | return; | ||
2566 | |||
2567 | raw_spin_lock(&this_rq->lock); | ||
2568 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2569 | if (pending_updates) { | ||
2570 | this_rq->last_load_update_tick = curr_jiffies; | ||
2571 | /* | ||
2572 | * We were idle, this means load 0, the current load might be | ||
2573 | * !0 due to remote wakeups and the like. | ||
2574 | */ | ||
2575 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2576 | } | ||
2577 | raw_spin_unlock(&this_rq->lock); | ||
2578 | } | ||
2579 | #endif /* CONFIG_NO_HZ */ | ||
2580 | |||
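The pending_updates fold treats each missed tick as load 0. A toy model of that idea (the single 1/2 decay rate is an assumption for illustration; the kernel's __update_cpu_load() decays per load index):

#include <stdio.h>

int main(void)
{
	unsigned long cpu_load = 512;	/* old decayed average, made up */
	unsigned long pending = 3;	/* ticks missed while idle      */

	/* Fold each missed tick with an instantaneous load of 0. */
	while (pending--)
		cpu_load /= 2;

	printf("load after idle exit: %lu\n", cpu_load);	/* 64 */
	return 0;
}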
2581 | /* | ||
2550 | * Called from scheduler_tick() | 2582 | * Called from scheduler_tick() |
2551 | */ | 2583 | */ |
2552 | static void update_cpu_load_active(struct rq *this_rq) | 2584 | static void update_cpu_load_active(struct rq *this_rq) |
2553 | { | 2585 | { |
2554 | /* | 2586 | /* |
2555 | * See the mess in update_idle_cpu_load(). | 2587 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). |
2556 | */ | 2588 | */ |
2557 | this_rq->last_load_update_tick = jiffies; | 2589 | this_rq->last_load_update_tick = jiffies; |
2558 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | 2590 | __update_cpu_load(this_rq, this_rq->load.weight, 1); |
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
4982 | p->sched_class->set_cpus_allowed(p, new_mask); | 5014 | p->sched_class->set_cpus_allowed(p, new_mask); |
4983 | 5015 | ||
4984 | cpumask_copy(&p->cpus_allowed, new_mask); | 5016 | cpumask_copy(&p->cpus_allowed, new_mask); |
4985 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5017 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
4986 | } | 5018 | } |
4987 | 5019 | ||
4988 | /* | 5020 | /* |
@@ -5524,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
5524 | 5556 | ||
5525 | #ifdef CONFIG_SCHED_DEBUG | 5557 | #ifdef CONFIG_SCHED_DEBUG |
5526 | 5558 | ||
5527 | static __read_mostly int sched_domain_debug_enabled; | 5559 | static __read_mostly int sched_debug_enabled; |
5528 | 5560 | ||
5529 | static int __init sched_domain_debug_setup(char *str) | 5561 | static int __init sched_debug_setup(char *str) |
5530 | { | 5562 | { |
5531 | sched_domain_debug_enabled = 1; | 5563 | sched_debug_enabled = 1; |
5532 | 5564 | ||
5533 | return 0; | 5565 | return 0; |
5534 | } | 5566 | } |
5535 | early_param("sched_debug", sched_domain_debug_setup); | 5567 | early_param("sched_debug", sched_debug_setup); |
5568 | |||
5569 | static inline bool sched_debug(void) | ||
5570 | { | ||
5571 | return sched_debug_enabled; | ||
5572 | } | ||
5536 | 5573 | ||
5537 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5574 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
5538 | struct cpumask *groupmask) | 5575 | struct cpumask *groupmask) |
@@ -5572,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5572 | break; | 5609 | break; |
5573 | } | 5610 | } |
5574 | 5611 | ||
5575 | if (!group->sgp->power) { | 5612 | /* |
5613 | * Even though we initialize ->power to something semi-sane, | ||
5614 | * we leave power_orig unset. This allows us to detect if | ||
5615 | * domain iteration is still funny without causing /0 traps. | ||
5616 | */ | ||
5617 | if (!group->sgp->power_orig) { | ||
5576 | printk(KERN_CONT "\n"); | 5618 | printk(KERN_CONT "\n"); |
5577 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5619 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5578 | "set\n"); | 5620 | "set\n"); |
@@ -5620,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5620 | { | 5662 | { |
5621 | int level = 0; | 5663 | int level = 0; |
5622 | 5664 | ||
5623 | if (!sched_domain_debug_enabled) | 5665 | if (!sched_debug_enabled) |
5624 | return; | 5666 | return; |
5625 | 5667 | ||
5626 | if (!sd) { | 5668 | if (!sd) { |
@@ -5641,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5641 | } | 5683 | } |
5642 | #else /* !CONFIG_SCHED_DEBUG */ | 5684 | #else /* !CONFIG_SCHED_DEBUG */ |
5643 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5685 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5686 | static inline bool sched_debug(void) | ||
5687 | { | ||
5688 | return false; | ||
5689 | } | ||
5644 | #endif /* CONFIG_SCHED_DEBUG */ | 5690 | #endif /* CONFIG_SCHED_DEBUG */ |
5645 | 5691 | ||
5646 | static int sd_degenerate(struct sched_domain *sd) | 5692 | static int sd_degenerate(struct sched_domain *sd) |
@@ -5962,6 +6008,44 @@ struct sched_domain_topology_level { | |||
5962 | struct sd_data data; | 6008 | struct sd_data data; |
5963 | }; | 6009 | }; |
5964 | 6010 | ||
6011 | /* | ||
6012 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6013 | * domain traversal. | ||
6014 | * | ||
6015 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6016 | * unequal depth; make sure to skip domains that already cover the entire | ||
6017 | * range. | ||
6018 | * | ||
6019 | * In that case build_sched_domains() will have terminated the iteration early | ||
6020 | * and our sibling sd spans will be empty. Domains should always include the | ||
6021 | * cpu they're built on, so check that. | ||
6022 | * | ||
6023 | */ | ||
6024 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6025 | { | ||
6026 | const struct cpumask *span = sched_domain_span(sd); | ||
6027 | struct sd_data *sdd = sd->private; | ||
6028 | struct sched_domain *sibling; | ||
6029 | int i; | ||
6030 | |||
6031 | for_each_cpu(i, span) { | ||
6032 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6033 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6034 | continue; | ||
6035 | |||
6036 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6037 | } | ||
6038 | } | ||
6039 | |||
6040 | /* | ||
6041 | * Return the canonical balance cpu for this group; this is the first cpu | ||
6042 | * of this group that's also in the iteration mask. | ||
6043 | */ | ||
6044 | int group_balance_cpu(struct sched_group *sg) | ||
6045 | { | ||
6046 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6047 | } | ||
6048 | |||
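In other words, the balance cpu is the first CPU that is in both the group span and its iteration mask. A userspace toy with hypothetical masks (uses a GCC builtin):

#include <stdio.h>

/* Toy cpumask_first_and(): first bit set in both masks. */
static int first_and(unsigned long a, unsigned long b)
{
	unsigned long both = a & b;

	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long group_cpus = 0x1c;	/* group spans CPUs {2,3,4} */
	unsigned long group_mask = 0x18;	/* iteration mask {3,4}     */

	/* Prints 3: the first CPU allowed to balance this group. */
	printf("balance cpu = %d\n", first_and(group_cpus, group_mask));
	return 0;
}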
5965 | static int | 6049 | static int |
5966 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6050 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
5967 | { | 6051 | { |
@@ -5980,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5980 | if (cpumask_test_cpu(i, covered)) | 6064 | if (cpumask_test_cpu(i, covered)) |
5981 | continue; | 6065 | continue; |
5982 | 6066 | ||
6067 | child = *per_cpu_ptr(sdd->sd, i); | ||
6068 | |||
6069 | /* See the comment near build_group_mask(). */ | ||
6070 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
6071 | continue; | ||
6072 | |||
5983 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6073 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
5984 | GFP_KERNEL, cpu_to_node(cpu)); | 6074 | GFP_KERNEL, cpu_to_node(cpu)); |
5985 | 6075 | ||
@@ -5987,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5987 | goto fail; | 6077 | goto fail; |
5988 | 6078 | ||
5989 | sg_span = sched_group_cpus(sg); | 6079 | sg_span = sched_group_cpus(sg); |
5990 | |||
5991 | child = *per_cpu_ptr(sdd->sd, i); | ||
5992 | if (child->child) { | 6080 | if (child->child) { |
5993 | child = child->child; | 6081 | child = child->child; |
5994 | cpumask_copy(sg_span, sched_domain_span(child)); | 6082 | cpumask_copy(sg_span, sched_domain_span(child)); |
@@ -5997,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5997 | 6085 | ||
5998 | cpumask_or(covered, covered, sg_span); | 6086 | cpumask_or(covered, covered, sg_span); |
5999 | 6087 | ||
6000 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6088 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
6001 | atomic_inc(&sg->sgp->ref); | 6089 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
6090 | build_group_mask(sd, sg); | ||
6002 | 6091 | ||
6003 | if (cpumask_test_cpu(cpu, sg_span)) | 6092 | /* |
6093 | * Initialize sgp->power such that even if we mess up the | ||
6094 | * domains and no possible iteration will get us here, we won't | ||
6095 | * die on a /0 trap. | ||
6096 | */ | ||
6097 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
6098 | |||
6099 | /* | ||
6100 | * Make sure the first group of this domain contains the | ||
6101 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6102 | * breaks. See update_sg_lb_stats(). | ||
6103 | */ | ||
6104 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6105 | group_balance_cpu(sg) == cpu) | ||
6004 | groups = sg; | 6106 | groups = sg; |
6005 | 6107 | ||
6006 | if (!first) | 6108 | if (!first) |
@@ -6074,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
6074 | 6176 | ||
6075 | cpumask_clear(sched_group_cpus(sg)); | 6177 | cpumask_clear(sched_group_cpus(sg)); |
6076 | sg->sgp->power = 0; | 6178 | sg->sgp->power = 0; |
6179 | cpumask_setall(sched_group_mask(sg)); | ||
6077 | 6180 | ||
6078 | for_each_cpu(j, span) { | 6181 | for_each_cpu(j, span) { |
6079 | if (get_group(j, sdd, NULL) != group) | 6182 | if (get_group(j, sdd, NULL) != group) |
@@ -6115,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6115 | sg = sg->next; | 6218 | sg = sg->next; |
6116 | } while (sg != sd->groups); | 6219 | } while (sg != sd->groups); |
6117 | 6220 | ||
6118 | if (cpu != group_first_cpu(sg)) | 6221 | if (cpu != group_balance_cpu(sg)) |
6119 | return; | 6222 | return; |
6120 | 6223 | ||
6121 | update_group_power(sd, cpu); | 6224 | update_group_power(sd, cpu); |
@@ -6165,11 +6268,8 @@ int sched_domain_level_max; | |||
6165 | 6268 | ||
6166 | static int __init setup_relax_domain_level(char *str) | 6269 | static int __init setup_relax_domain_level(char *str) |
6167 | { | 6270 | { |
6168 | unsigned long val; | 6271 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
6169 | 6272 | pr_warn("Unable to set relax_domain_level\n"); | |
6170 | val = simple_strtoul(str, NULL, 0); | ||
6171 | if (val < sched_domain_level_max) | ||
6172 | default_relax_domain_level = val; | ||
6173 | 6273 | ||
6174 | return 1; | 6274 | return 1; |
6175 | } | 6275 | } |
@@ -6279,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol | |||
6279 | #ifdef CONFIG_NUMA | 6379 | #ifdef CONFIG_NUMA |
6280 | 6380 | ||
6281 | static int sched_domains_numa_levels; | 6381 | static int sched_domains_numa_levels; |
6282 | static int sched_domains_numa_scale; | ||
6283 | static int *sched_domains_numa_distance; | 6382 | static int *sched_domains_numa_distance; |
6284 | static struct cpumask ***sched_domains_numa_masks; | 6383 | static struct cpumask ***sched_domains_numa_masks; |
6285 | static int sched_domains_curr_level; | 6384 | static int sched_domains_curr_level; |
6286 | 6385 | ||
6287 | static inline int sd_local_flags(int level) | 6386 | static inline int sd_local_flags(int level) |
6288 | { | 6387 | { |
6289 | if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) | 6388 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) |
6290 | return 0; | 6389 | return 0; |
6291 | 6390 | ||
6292 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | 6391 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; |
@@ -6344,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu) | |||
6344 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | 6443 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; |
6345 | } | 6444 | } |
6346 | 6445 | ||
6446 | static void sched_numa_warn(const char *str) | ||
6447 | { | ||
6448 | static int done = false; | ||
6449 | int i, j; | ||
6450 | |||
6451 | if (done) | ||
6452 | return; | ||
6453 | |||
6454 | done = true; | ||
6455 | |||
6456 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6457 | |||
6458 | for (i = 0; i < nr_node_ids; i++) { | ||
6459 | printk(KERN_WARNING " "); | ||
6460 | for (j = 0; j < nr_node_ids; j++) | ||
6461 | printk(KERN_CONT "%02d ", node_distance(i, j)); | ||
6462 | printk(KERN_CONT "\n"); | ||
6463 | } | ||
6464 | printk(KERN_WARNING "\n"); | ||
6465 | } | ||
6466 | |||
6467 | static bool find_numa_distance(int distance) | ||
6468 | { | ||
6469 | int i; | ||
6470 | |||
6471 | if (distance == node_distance(0, 0)) | ||
6472 | return true; | ||
6473 | |||
6474 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6475 | if (sched_domains_numa_distance[i] == distance) | ||
6476 | return true; | ||
6477 | } | ||
6478 | |||
6479 | return false; | ||
6480 | } | ||
6481 | |||
6347 | static void sched_init_numa(void) | 6482 | static void sched_init_numa(void) |
6348 | { | 6483 | { |
6349 | int next_distance, curr_distance = node_distance(0, 0); | 6484 | int next_distance, curr_distance = node_distance(0, 0); |
@@ -6351,7 +6486,6 @@ static void sched_init_numa(void) | |||
6351 | int level = 0; | 6486 | int level = 0; |
6352 | int i, j, k; | 6487 | int i, j, k; |
6353 | 6488 | ||
6354 | sched_domains_numa_scale = curr_distance; | ||
6355 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | 6489 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); |
6356 | if (!sched_domains_numa_distance) | 6490 | if (!sched_domains_numa_distance) |
6357 | return; | 6491 | return; |
@@ -6362,23 +6496,41 @@ static void sched_init_numa(void) | |||
6362 | * | 6496 | * |
6363 | * Assumes node_distance(0,j) includes all distances in | 6497 | * Assumes node_distance(0,j) includes all distances in |
6364 | * node_distance(i,j) in order to avoid cubic time. | 6498 | * node_distance(i,j) in order to avoid cubic time. |
6365 | * | ||
6366 | * XXX: could be optimized to O(n log n) by using sort() | ||
6367 | */ | 6499 | */ |
6368 | next_distance = curr_distance; | 6500 | next_distance = curr_distance; |
6369 | for (i = 0; i < nr_node_ids; i++) { | 6501 | for (i = 0; i < nr_node_ids; i++) { |
6370 | for (j = 0; j < nr_node_ids; j++) { | 6502 | for (j = 0; j < nr_node_ids; j++) { |
6371 | int distance = node_distance(0, j); | 6503 | for (k = 0; k < nr_node_ids; k++) { |
6372 | if (distance > curr_distance && | 6504 | int distance = node_distance(i, k); |
6373 | (distance < next_distance || | 6505 | |
6374 | next_distance == curr_distance)) | 6506 | if (distance > curr_distance && |
6375 | next_distance = distance; | 6507 | (distance < next_distance || |
6508 | next_distance == curr_distance)) | ||
6509 | next_distance = distance; | ||
6510 | |||
6511 | /* | ||
6512 | * While not a strong assumption, it would be nice to know | ||
6513 | * about cases where node A is connected to B but B is not | ||
6514 | * equally connected to A. | ||
6515 | */ | ||
6516 | if (sched_debug() && node_distance(k, i) != distance) | ||
6517 | sched_numa_warn("Node-distance not symmetric"); | ||
6518 | |||
6519 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6520 | sched_numa_warn("Node-0 not representative"); | ||
6521 | } | ||
6522 | if (next_distance != curr_distance) { | ||
6523 | sched_domains_numa_distance[level++] = next_distance; | ||
6524 | sched_domains_numa_levels = level; | ||
6525 | curr_distance = next_distance; | ||
6526 | } else break; | ||
6376 | } | 6527 | } |
6377 | if (next_distance != curr_distance) { | 6528 | |
6378 | sched_domains_numa_distance[level++] = next_distance; | 6529 | /* |
6379 | sched_domains_numa_levels = level; | 6530 | * When sched_debug() is enabled we verify the above assumption. |
6380 | curr_distance = next_distance; | 6531 | */ |
6381 | } else break; | 6532 | if (!sched_debug()) |
6533 | break; | ||
6382 | } | 6534 | } |
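The scan repeatedly picks the smallest distance strictly greater than the last one found, so the recorded levels come out sorted and unique. A userspace toy of the same scan on a made-up 3-node distance table:

#include <stdio.h>

static int dist[3][3] = {
	{ 10, 20, 30 },
	{ 20, 10, 20 },
	{ 30, 20, 10 },
};

int main(void)
{
	int curr = dist[0][0], next = curr, level = 0;

	for (;;) {
		int i, j, found = 0;

		/* Smallest distance strictly greater than curr. */
		for (i = 0; i < 3; i++)
			for (j = 0; j < 3; j++)
				if (dist[i][j] > curr &&
				    (!found || dist[i][j] < next)) {
					next = dist[i][j];
					found = 1;
				}
		if (!found)
			break;
		printf("level %d: distance %d\n", level++, next);
		curr = next;	/* prints 20, then 30 */
	}
	return 0;
}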
6383 | /* | 6535 | /* |
6384 | * 'level' contains the number of unique distances, excluding the | 6536 | * 'level' contains the number of unique distances, excluding the |
@@ -6403,7 +6555,7 @@ static void sched_init_numa(void) | |||
6403 | return; | 6555 | return; |
6404 | 6556 | ||
6405 | for (j = 0; j < nr_node_ids; j++) { | 6557 | for (j = 0; j < nr_node_ids; j++) { |
6406 | struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); | 6558 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); |
6407 | if (!mask) | 6559 | if (!mask) |
6408 | return; | 6560 | return; |
6409 | 6561 | ||
@@ -6490,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6490 | 6642 | ||
6491 | *per_cpu_ptr(sdd->sg, j) = sg; | 6643 | *per_cpu_ptr(sdd->sg, j) = sg; |
6492 | 6644 | ||
6493 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6645 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
6494 | GFP_KERNEL, cpu_to_node(j)); | 6646 | GFP_KERNEL, cpu_to_node(j)); |
6495 | if (!sgp) | 6647 | if (!sgp) |
6496 | return -ENOMEM; | 6648 | return -ENOMEM; |
@@ -6543,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6543 | if (!sd) | 6695 | if (!sd) |
6544 | return child; | 6696 | return child; |
6545 | 6697 | ||
6546 | set_domain_attribute(sd, attr); | ||
6547 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6698 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
6548 | if (child) { | 6699 | if (child) { |
6549 | sd->level = child->level + 1; | 6700 | sd->level = child->level + 1; |
@@ -6551,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6551 | child->parent = sd; | 6702 | child->parent = sd; |
6552 | } | 6703 | } |
6553 | sd->child = child; | 6704 | sd->child = child; |
6705 | set_domain_attribute(sd, attr); | ||
6554 | 6706 | ||
6555 | return sd; | 6707 | return sd; |
6556 | } | 6708 | } |
@@ -6691,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
6691 | if (!doms_cur) | 6843 | if (!doms_cur) |
6692 | doms_cur = &fallback_doms; | 6844 | doms_cur = &fallback_doms; |
6693 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6845 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
6694 | dattr_cur = NULL; | ||
6695 | err = build_sched_domains(doms_cur[0], NULL); | 6846 | err = build_sched_domains(doms_cur[0], NULL); |
6696 | register_sched_domain_sysctl(); | 6847 | register_sched_domain_sysctl(); |
6697 | 6848 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..c099cc6eebe3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
2705 | 2705 | ||
2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
2707 | return prev_cpu; | 2707 | return prev_cpu; |
2708 | 2708 | ||
2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
3503 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
3504 | { | 3504 | { |
3505 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
3506 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
3507 | 3507 | ||
3508 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3508 | /* |
3509 | * Since we're reading these variables without serialization, make sure | ||
3510 | * we read them once before doing sanity checks on them. | ||
3511 | */ | ||
3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
3514 | |||
3515 | total = sched_avg_period() + (rq->clock - age_stamp); | ||
3509 | 3516 | ||
3510 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
3511 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
3512 | available = 0; | 3519 | available = 0; |
3513 | } else { | 3520 | } else { |
3514 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
3515 | } | 3522 | } |
3516 | 3523 | ||
3517 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
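Past the hunk, scale_rt_power() hands CFS the fraction of the period not eaten by RT/IRQ time; snapshotting age_stamp and rt_avg once keeps total and avg mutually consistent. Toy arithmetic with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long total = 1000000;	/* avg period + clock delta */
	unsigned long long rt_avg = 250000;	/* time consumed by RT/IRQ  */
	unsigned long long avail = total > rt_avg ? total - rt_avg : 0;
	unsigned long scale = 1024;		/* SCHED_POWER_SCALE        */

	/* CFS keeps 75% of the power: prints 768. */
	printf("scaled power = %llu\n", scale * avail / total);
	return 0;
}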
@@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3574 | 3581 | ||
3575 | power = 0; | 3582 | power = 0; |
3576 | 3583 | ||
3577 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
3578 | do { | 3585 | /* |
3579 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
3580 | group = group->next; | 3587 | * span the current group. |
3581 | } while (group != child->groups); | 3588 | */ |
3582 | 3589 | ||
3583 | sdg->sgp->power = power; | 3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) |
3591 | power += power_of(cpu); | ||
3592 | } else { | ||
3593 | /* | ||
3594 | * !SD_OVERLAP domains can assume that child groups | ||
3595 | * span the current group. | ||
3596 | */ | ||
3597 | |||
3598 | group = child->groups; | ||
3599 | do { | ||
3600 | power += group->sgp->power; | ||
3601 | group = group->next; | ||
3602 | } while (group != child->groups); | ||
3603 | } | ||
3604 | |||
3605 | sdg->sgp->power_orig = sdg->sgp->power = power; | ||
3584 | } | 3606 | } |
3585 | 3607 | ||
3586 | /* | 3608 | /* |
@@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3610 | 3632 | ||
3611 | /** | 3633 | /** |
3612 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3613 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @env: The load balancing environment. |
3614 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
3615 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3616 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
@@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
3630 | int i; | 3652 | int i; |
3631 | 3653 | ||
3632 | if (local_group) | 3654 | if (local_group) |
3633 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_balance_cpu(group); |
3634 | 3656 | ||
3635 | /* Tally up the load of all CPUs in the group */ | 3657 | /* Tally up the load of all CPUs in the group */ |
3636 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
@@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
3645 | 3667 | ||
3646 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
3647 | if (local_group) { | 3669 | if (local_group) { |
3648 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu && |
3671 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
3649 | first_idle_cpu = 1; | 3672 | first_idle_cpu = 1; |
3650 | balance_cpu = i; | 3673 | balance_cpu = i; |
3651 | } | 3674 | } |
@@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
3719 | 3742 | ||
3720 | /** | 3743 | /** |
3721 | * update_sd_pick_busiest - return 1 on busiest group | 3744 | * update_sd_pick_busiest - return 1 on busiest group |
3722 | * @sd: sched_domain whose statistics are to be checked | 3745 | * @env: The load balancing environment. |
3723 | * @sds: sched_domain statistics | 3746 | * @sds: sched_domain statistics |
3724 | * @sg: sched_group candidate to be checked for being the busiest | 3747 | * @sg: sched_group candidate to be checked for being the busiest |
3725 | * @sgs: sched_group statistics | 3748 | * @sgs: sched_group statistics |
3726 | * @this_cpu: the current cpu | ||
3727 | * | 3749 | * |
3728 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
3729 | * busiest group. | 3751 | * busiest group. |
@@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
3761 | 3783 | ||
3762 | /** | 3784 | /** |
3763 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3785 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3764 | * @sd: sched_domain whose statistics are to be updated. | 3786 | * @env: The load balancing environment. |
3765 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3766 | * @idle: Idle status of this_cpu | ||
3767 | * @cpus: Set of cpus considered for load balancing. | 3787 | * @cpus: Set of cpus considered for load balancing. |
3768 | * @balance: Should we balance. | 3788 | * @balance: Should we balance. |
3769 | * @sds: variable to hold the statistics for this sched_domain. | 3789 | * @sds: variable to hold the statistics for this sched_domain. |
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
3852 | * Returns 1 when packing is required and a task should be moved to | 3872 | * Returns 1 when packing is required and a task should be moved to |
3853 | * this CPU. The amount of the imbalance is returned in *imbalance. | 3873 | * this CPU. The amount of the imbalance is returned in *imbalance. |
3854 | * | 3874 | * |
3855 | * @sd: The sched_domain whose packing is to be checked. | 3875 | * @env: The load balancing environment. |
3856 | * @sds: Statistics of the sched_domain which is to be packed | 3876 | * @sds: Statistics of the sched_domain which is to be packed |
3857 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
3858 | * @imbalance: returns amount of imbalanced due to packing. | ||
3859 | */ | 3877 | */ |
3860 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | 3878 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
3861 | { | 3879 | { |
@@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
3881 | * fix_small_imbalance - Calculate the minor imbalance that exists | 3899 | * fix_small_imbalance - Calculate the minor imbalance that exists |
3882 | * amongst the groups of a sched_domain, during | 3900 | * amongst the groups of a sched_domain, during |
3883 | * load balancing. | 3901 | * load balancing. |
3902 | * @env: The load balancing environment. | ||
3884 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 3903 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. |
3885 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
3886 | * @imbalance: Variable to store the imbalance. | ||
3887 | */ | 3904 | */ |
3888 | static inline | 3905 | static inline |
3889 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 3906 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
@@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4026 | * Also calculates the amount of weighted load which should be moved | 4043 | * Also calculates the amount of weighted load which should be moved |
4027 | * to restore balance. | 4044 | * to restore balance. |
4028 | * | 4045 | * |
4029 | * @sd: The sched_domain whose busiest group is to be returned. | 4046 | * @env: The load balancing environment. |
4030 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4031 | * @imbalance: Variable which stores amount of weighted load which should | ||
4032 | * be moved to restore balance/put a group to idle. | ||
4033 | * @idle: The idle status of this_cpu. | ||
4034 | * @cpus: The set of CPUs under consideration for load-balancing. | 4047 | * @cpus: The set of CPUs under consideration for load-balancing. |
4035 | * @balance: Pointer to a variable indicating if this_cpu | 4048 | * @balance: Pointer to a variable indicating if this_cpu |
4036 | * is the appropriate cpu to perform load balancing at this_level. | 4049 | * is the appropriate cpu to perform load balancing at this_level. |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
274 | 274 | ||
275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
276 | { | 276 | { |
277 | struct task_struct *p; | ||
278 | |||
277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
278 | return; | 280 | return; |
279 | 281 | ||
282 | p = rt_task_of(rt_se); | ||
280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
281 | 284 | ||
282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
285 | 288 | ||
286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
288 | 291 | ||
289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
290 | { | 293 | { |
294 | struct task_struct *p; | ||
295 | |||
291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
292 | return; | 297 | return; |
293 | 298 | ||
299 | p = rt_task_of(rt_se); | ||
294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
295 | 301 | ||
296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
299 | 305 | ||
300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1161 | 1167 | ||
1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
1163 | 1169 | ||
1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
1166 | 1172 | ||
1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1225 | 1231 | ||
1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
1227 | 1233 | ||
1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
1229 | goto out; | 1235 | goto out; |
1230 | 1236 | ||
1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
1261 | */ | 1267 | */ |
1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
1267 | 1273 | ||
1268 | if (target != -1) | 1274 | if (target != -1) |
@@ -1276,10 +1282,10 @@ out: | |||
1276 | 1282 | ||
1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
1278 | { | 1284 | { |
1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
1280 | return; | 1286 | return; |
1281 | 1287 | ||
1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
1284 | return; | 1290 | return; |
1285 | 1291 | ||
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
1396 | * if it is still active | 1402 | * if it is still active |
1397 | */ | 1403 | */ |
1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
1400 | } | 1406 | } |
1401 | 1407 | ||
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1408 | { | 1414 | { |
1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
1412 | return 1; | 1418 | return 1; |
1413 | return 0; | 1419 | return 0; |
1414 | } | 1420 | } |
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
1465 | return -1; | 1471 | return -1; |
1466 | 1472 | ||
1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
1469 | 1475 | ||
1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1556 | task_running(rq, task) || | 1562 | task_running(rq, task) || |
1557 | !task->on_rq)) { | 1563 | !task->on_rq)) { |
1558 | 1564 | ||
1559 | raw_spin_unlock(&lowest_rq->lock); | 1565 | double_unlock_balance(rq, lowest_rq); |
1560 | lowest_rq = NULL; | 1566 | lowest_rq = NULL; |
1561 | break; | 1567 | break; |
1562 | } | 1568 | } |
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1586 | 1592 | ||
1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
1590 | 1596 | ||
1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
1801 | } | 1807 | } |
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1817 | * Only update if the process changes its state from whether it | 1823 | * Only update if the process changes its state from whether it |
1818 | * can migrate or not. | 1824 | * can migrate or not. |
1819 | */ | 1825 | */ |
1820 | if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1821 | return; | 1827 | return; |
1822 | 1828 | ||
1823 | rq = task_rq(p); | 1829 | rq = task_rq(p); |
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1979 | 1985 | ||
1980 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
1981 | { | 1987 | { |
1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
1989 | |||
1982 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
1983 | 1991 | ||
1984 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
1996 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
1997 | 2005 | ||
1998 | /* | 2006 | /* |
1999 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (and all of our ancestors) are not the |
2000 | * on the queue: | 2008 | * only element on the queue |
2001 | */ | 2009 | */ |
2002 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
2003 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
2004 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
2013 | set_tsk_need_resched(p); | ||
2014 | return; | ||
2015 | } | ||
2005 | } | 2016 | } |
2006 | } | 2017 | } |
2007 | 2018 | ||
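The fix walks the whole rt_se hierarchy: a SCHED_RR task that is alone in its group must still round-robin when some ancestor level has siblings. A userspace toy of that check (the types are stand-ins, not the kernel's sched_rt_entity):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rt_entity {
	struct rt_entity *parent;	/* group hierarchy, NULL at top */
	int queue_len;			/* entities on this run list    */
};

/* Requeue if this entity or any ancestor shares its queue. */
static bool should_requeue(struct rt_entity *se)
{
	for (; se; se = se->parent)
		if (se->queue_len > 1)
			return true;
	return false;
}

int main(void)
{
	struct rt_entity top = { NULL, 2 };	/* two groups at top level */
	struct rt_entity task = { &top, 1 };	/* alone inside its group  */

	/* Prints "requeue": the lone task still yields to sibling groups. */
	printf("%s\n", should_requeue(&task) ? "requeue" : "keep running");
	return 0;
}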
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..6d52cea7f33d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
527 | DECLARE_PER_CPU(int, sd_llc_id); | 527 | DECLARE_PER_CPU(int, sd_llc_id); |
528 | 528 | ||
529 | extern int group_balance_cpu(struct sched_group *sg); | ||
530 | |||
529 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
530 | 532 | ||
531 | #include "stats.h" | 533 | #include "stats.h" |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) | |||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | 31 | per_cpu(idle_threads, smp_processor_id()) = current; |
32 | } | 32 | } |
33 | 33 | ||
34 | /** | ||
35 | * idle_init - Initialize the idle thread for a cpu | ||
36 | * @cpu: The cpu for which the idle thread should be initialized | ||
37 | * | ||
38 | * Creates the thread if it does not exist. | ||
39 | */ | ||
34 | static inline void idle_init(unsigned int cpu) | 40 | static inline void idle_init(unsigned int cpu) |
35 | { | 41 | { |
36 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | 42 | struct task_struct *tsk = per_cpu(idle_threads, cpu); |
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu) | |||
45 | } | 51 | } |
46 | 52 | ||
47 | /** | 53 | /** |
48 | * idle_thread_init - Initialize the idle thread for a cpu | 54 | * idle_threads_init - Initialize idle threads for all cpus |
49 | * @cpu: The cpu for which the idle thread should be initialized | ||
50 | * | ||
51 | * Creates the thread if it does not exist. | ||
52 | */ | 55 | */ |
53 | void __init idle_threads_init(void) | 56 | void __init idle_threads_init(void) |
54 | { | 57 | { |
55 | unsigned int cpu; | 58 | unsigned int cpu, boot_cpu; |
59 | |||
60 | boot_cpu = smp_processor_id(); | ||
56 | 61 | ||
57 | for_each_possible_cpu(cpu) { | 62 | for_each_possible_cpu(cpu) { |
58 | if (cpu != smp_processor_id()) | 63 | if (cpu != boot_cpu) |
59 | idle_init(cpu); | 64 | idle_init(cpu); |
60 | } | 65 | } |
61 | } | 66 | } |
diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657a..e0c8ffc50d7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1786,27 +1786,13 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1786 | } | 1786 | } |
1787 | 1787 | ||
1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static bool vma_flags_mismatch(struct vm_area_struct *vma, | ||
1790 | unsigned long required, | ||
1791 | unsigned long banned) | ||
1792 | { | ||
1793 | return (vma->vm_flags & required) != required || | ||
1794 | (vma->vm_flags & banned); | ||
1795 | } | ||
1796 | |||
1797 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1798 | { | 1790 | { |
1791 | struct vm_area_struct *vma; | ||
1799 | struct file *exe_file; | 1792 | struct file *exe_file; |
1800 | struct dentry *dentry; | 1793 | struct dentry *dentry; |
1801 | int err; | 1794 | int err; |
1802 | 1795 | ||
1803 | /* | ||
1804 | * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's | ||
1805 | * remain. So perform a quick test first. | ||
1806 | */ | ||
1807 | if (mm->num_exe_file_vmas) | ||
1808 | return -EBUSY; | ||
1809 | |||
1810 | exe_file = fget(fd); | 1796 | exe_file = fget(fd); |
1811 | if (!exe_file) | 1797 | if (!exe_file) |
1812 | return -EBADF; | 1798 | return -EBADF; |
@@ -1827,17 +1813,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1827 | if (err) | 1813 | if (err) |
1828 | goto exit; | 1814 | goto exit; |
1829 | 1815 | ||
1816 | down_write(&mm->mmap_sem); | ||
1817 | |||
1818 | /* | ||
1819 | * Forbid mm->exe_file change if other files are mapped. | ||
1820 | */ | ||
1821 | err = -EBUSY; | ||
1822 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
1823 | if (vma->vm_file && !path_equal(&vma->vm_file->f_path, | ||
1824 | &exe_file->f_path)) | ||
1825 | goto exit_unlock; | ||
1826 | } | ||
1827 | |||
1830 | /* | 1828 | /* |
1831 | * The symlink can be changed only once, just to disallow arbitrary | 1829 | * The symlink can be changed only once, just to disallow arbitrary |
1832 | * transitions malicious software might bring in. This means one | 1830 | * transitions malicious software might bring in. This means one |
1833 | * could make a snapshot over all processes running and monitor | 1831 | * could make a snapshot over all processes running and monitor |
1834 | * /proc/pid/exe changes to notice unusual activity if needed. | 1832 | * /proc/pid/exe changes to notice unusual activity if needed. |
1835 | */ | 1833 | */ |
1836 | down_write(&mm->mmap_sem); | 1834 | err = -EPERM; |
1837 | if (likely(!mm->exe_file)) | 1835 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
1838 | set_mm_exe_file(mm, exe_file); | 1836 | goto exit_unlock; |
1839 | else | 1837 | |
1840 | err = -EBUSY; | 1838 | set_mm_exe_file(mm, exe_file); |
1839 | exit_unlock: | ||
1841 | up_write(&mm->mmap_sem); | 1840 | up_write(&mm->mmap_sem); |
1842 | 1841 | ||
1843 | exit: | 1842 | exit: |
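After this change, rewriting the /proc/pid/exe symlink is permitted exactly once per mm (guarded by the MMF_EXE_FILE_CHANGED bit) and only while no other file-backed mappings remain. A hedged userspace sketch of the call, for orientation only; the constants come from <linux/prctl.h> and the error codes are the ones introduced above:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	/* Point /proc/self/exe at path: the kernel replies EBUSY while
	 * other files are still mapped and EPERM on a second attempt. */
	int set_exe_file(const char *path)
	{
		int ret, fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0, 0);
		if (ret)
			perror("PR_SET_MM_EXE_FILE");
		close(fd);	/* the kernel holds its own reference */
		return ret;
	}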
@@ -1862,7 +1861,7 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1862 | if (opt == PR_SET_MM_EXE_FILE) | 1861 | if (opt == PR_SET_MM_EXE_FILE) |
1863 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | 1862 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); |
1864 | 1863 | ||
1865 | if (addr >= TASK_SIZE) | 1864 | if (addr >= TASK_SIZE || addr < mmap_min_addr) |
1866 | return -EINVAL; | 1865 | return -EINVAL; |
1867 | 1866 | ||
1868 | error = -EINVAL; | 1867 | error = -EINVAL; |
@@ -1924,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1924 | error = -EFAULT; | 1923 | error = -EFAULT; |
1925 | goto out; | 1924 | goto out; |
1926 | } | 1925 | } |
1927 | #ifdef CONFIG_STACK_GROWSUP | ||
1928 | if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) | ||
1929 | #else | ||
1930 | if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) | ||
1931 | #endif | ||
1932 | goto out; | ||
1933 | if (opt == PR_SET_MM_START_STACK) | 1926 | if (opt == PR_SET_MM_START_STACK) |
1934 | mm->start_stack = addr; | 1927 | mm->start_stack = addr; |
1935 | else if (opt == PR_SET_MM_ARG_START) | 1928 | else if (opt == PR_SET_MM_ARG_START) |
@@ -1981,12 +1974,22 @@ out: | |||
1981 | up_read(&mm->mmap_sem); | 1974 | up_read(&mm->mmap_sem); |
1982 | return error; | 1975 | return error; |
1983 | } | 1976 | } |
1977 | |||
1978 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
1979 | { | ||
1980 | return put_user(me->clear_child_tid, tid_addr); | ||
1981 | } | ||
1982 | |||
1984 | #else /* CONFIG_CHECKPOINT_RESTORE */ | 1983 | #else /* CONFIG_CHECKPOINT_RESTORE */ |
1985 | static int prctl_set_mm(int opt, unsigned long addr, | 1984 | static int prctl_set_mm(int opt, unsigned long addr, |
1986 | unsigned long arg4, unsigned long arg5) | 1985 | unsigned long arg4, unsigned long arg5) |
1987 | { | 1986 | { |
1988 | return -EINVAL; | 1987 | return -EINVAL; |
1989 | } | 1988 | } |
1989 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
1990 | { | ||
1991 | return -EINVAL; | ||
1992 | } | ||
1990 | #endif | 1993 | #endif |
1991 | 1994 | ||
1992 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1995 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
@@ -2141,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2141 | case PR_SET_MM: | 2144 | case PR_SET_MM: |
2142 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 2145 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
2143 | break; | 2146 | break; |
2147 | case PR_GET_TID_ADDRESS: | ||
2148 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2149 | break; | ||
2144 | case PR_SET_CHILD_SUBREAPER: | 2150 | case PR_SET_CHILD_SUBREAPER: |
2145 | me->signal->is_child_subreaper = !!arg2; | 2151 | me->signal->is_child_subreaper = !!arg2; |
2146 | error = 0; | 2152 | error = 0; |
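PR_GET_TID_ADDRESS reads back the clear_child_tid pointer that a thread registered with set_tid_address(2); checkpoint/restore tools need it to re-arm the futex-wakeup address after a restore. A short usage sketch, valid only on CONFIG_CHECKPOINT_RESTORE kernels:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int main(void)
	{
		int *tid_addr;

		/* Ask where the kernel will write 0 on thread exit. */
		if (prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr,
			  0, 0, 0)) {
			perror("PR_GET_TID_ADDRESS");
			return 1;
		}
		printf("clear_child_tid = %p\n", (void *)tid_addr);
		return 0;
	}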
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
297 | } | 297 | } |
298 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 298 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
299 | 299 | ||
300 | static void clockevents_config(struct clock_event_device *dev, | 300 | void clockevents_config(struct clock_event_device *dev, u32 freq) |
301 | u32 freq) | ||
302 | { | 301 | { |
303 | u64 sec; | 302 | u64 sec; |
304 | 303 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..869997833928 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | |||
274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | 274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
275 | { | 275 | { |
276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
277 | unsigned long rcu_delta_jiffies; | ||
277 | ktime_t last_update, expires, now; | 278 | ktime_t last_update, expires, now; |
278 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 279 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
279 | u64 time_delta; | 280 | u64 time_delta; |
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
322 | time_delta = timekeeping_max_deferment(); | 323 | time_delta = timekeeping_max_deferment(); |
323 | } while (read_seqretry(&xtime_lock, seq)); | 324 | } while (read_seqretry(&xtime_lock, seq)); |
324 | 325 | ||
325 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 326 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
326 | arch_needs_cpu(cpu)) { | 327 | arch_needs_cpu(cpu)) { |
327 | next_jiffies = last_jiffies + 1; | 328 | next_jiffies = last_jiffies + 1; |
328 | delta_jiffies = 1; | 329 | delta_jiffies = 1; |
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
330 | /* Get the next timer wheel timer */ | 331 | /* Get the next timer wheel timer */ |
331 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 332 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
332 | delta_jiffies = next_jiffies - last_jiffies; | 333 | delta_jiffies = next_jiffies - last_jiffies; |
334 | if (rcu_delta_jiffies < delta_jiffies) { | ||
335 | next_jiffies = last_jiffies + rcu_delta_jiffies; | ||
336 | delta_jiffies = rcu_delta_jiffies; | ||
337 | } | ||
333 | } | 338 | } |
334 | /* | 339 | /* |
335 | * Do not stop the tick, if we are only one off | 340 | * Do not stop the tick, if we are only one off |
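The two tick-sched hunks above let RCU bound the idle tick: rcu_needs_cpu() now also reports when the CPU will next be needed, and the stop path sleeps until the earlier of that deadline and the next timer-wheel timer. Worked example: if the next timer is 100 jiffies out but RCU callbacks are due in 4, the tick now stops for only 4 jiffies. The clamp, reduced to an illustrative sketch rather than the kernel code itself:

	/* Pick whichever deadline comes first. */
	static unsigned long tick_stop_delta(unsigned long timer_delta,
					     unsigned long rcu_delta)
	{
		return rcu_delta < timer_delta ? rcu_delta : timer_delta;
	}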
@@ -576,6 +581,7 @@ void tick_nohz_idle_exit(void) | |||
576 | /* Update jiffies first */ | 581 | /* Update jiffies first */ |
577 | select_nohz_load_balancer(0); | 582 | select_nohz_load_balancer(0); |
578 | tick_do_update_jiffies64(now); | 583 | tick_do_update_jiffies64(now); |
584 | update_cpu_load_nohz(); | ||
579 | 585 | ||
580 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 586 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
581 | /* | 587 | /* |
@@ -814,6 +820,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
814 | return HRTIMER_RESTART; | 820 | return HRTIMER_RESTART; |
815 | } | 821 | } |
816 | 822 | ||
823 | static int sched_skew_tick; | ||
824 | |||
825 | static int __init skew_tick(char *str) | ||
826 | { | ||
827 | get_option(&str, &sched_skew_tick); | ||
828 | |||
829 | return 0; | ||
830 | } | ||
831 | early_param("skew_tick", skew_tick); | ||
832 | |||
817 | /** | 833 | /** |
818 | * tick_setup_sched_timer - setup the tick emulation timer | 834 | * tick_setup_sched_timer - setup the tick emulation timer |
819 | */ | 835 | */ |
@@ -831,6 +847,14 @@ void tick_setup_sched_timer(void) | |||
831 | /* Get the next period (per cpu) */ | 847 | /* Get the next period (per cpu) */ |
832 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 848 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
833 | 849 | ||
850 | /* Offset the tick to avert xtime_lock contention. */ | ||
851 | if (sched_skew_tick) { | ||
852 | u64 offset = ktime_to_ns(tick_period) >> 1; | ||
853 | do_div(offset, num_possible_cpus()); | ||
854 | offset *= smp_processor_id(); | ||
855 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
856 | } | ||
857 | |||
834 | for (;;) { | 858 | for (;;) { |
835 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 859 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
836 | hrtimer_start_expires(&ts->sched_timer, | 860 | hrtimer_start_expires(&ts->sched_timer, |
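With skew_tick=1 on the kernel command line, each CPU's sched timer is offset by cpu * (tick_period / 2) / num_possible_cpus(), so the per-tick xtime_lock acquisitions stop lining up across CPUs. Assuming HZ=1000 (a 1 ms tick) and 4 possible CPUs, that yields offsets of 0, 125, 250 and 375 microseconds for CPUs 0 through 3. The same arithmetic as a standalone sketch with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long tick_period_ns = 1000000;	/* HZ=1000 */
		unsigned int cpu, ncpus = 4;

		for (cpu = 0; cpu < ncpus; cpu++)
			printf("cpu%u: +%llu ns\n", cpu,
			       (tick_period_ns >> 1) / ncpus * cpu);
		return 0;
	}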
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..6f46a00a1e8a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
962 | timekeeper.xtime.tv_sec++; | 962 | timekeeper.xtime.tv_sec++; |
963 | leap = second_overflow(timekeeper.xtime.tv_sec); | 963 | leap = second_overflow(timekeeper.xtime.tv_sec); |
964 | timekeeper.xtime.tv_sec += leap; | 964 | timekeeper.xtime.tv_sec += leap; |
965 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
965 | } | 966 | } |
966 | 967 | ||
967 | /* Accumulate raw time */ | 968 | /* Accumulate raw time */ |
@@ -1077,6 +1078,7 @@ static void update_wall_time(void) | |||
1077 | timekeeper.xtime.tv_sec++; | 1078 | timekeeper.xtime.tv_sec++; |
1078 | leap = second_overflow(timekeeper.xtime.tv_sec); | 1079 | leap = second_overflow(timekeeper.xtime.tv_sec); |
1079 | timekeeper.xtime.tv_sec += leap; | 1080 | timekeeper.xtime.tv_sec += leap; |
1081 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
1080 | } | 1082 | } |
1081 | 1083 | ||
1082 | timekeeping_update(false); | 1084 | timekeeping_update(false); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 288488082224..a7fa0702be1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
371 | void tracing_off(void) | 371 | void tracing_off(void) |
372 | { | 372 | { |
373 | if (global_trace.buffer) | 373 | if (global_trace.buffer) |
374 | ring_buffer_record_on(global_trace.buffer); | 374 | ring_buffer_record_off(global_trace.buffer); |
375 | /* | 375 | /* |
376 | * This flag is only looked at when buffers haven't been | 376 | * This flag is only looked at when buffers haven't been |
377 | * allocated yet. We don't really care about the race | 377 | * allocated yet. We don't really care about the race |
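The trace.c one-liner fixes a copy-and-paste bug: tracing_off() was calling ring_buffer_record_on(), re-enabling the very buffer it is meant to freeze. The usual in-kernel debugging pattern that relies on the fixed behavior, with a hypothetical condition name:

	/* Freeze the ring buffer at the moment of interest so the
	 * events leading up to it survive for later inspection. */
	if (unlikely(suspicious_condition))
		tracing_off();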
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -372,6 +372,13 @@ static int watchdog(void *unused) | |||
372 | 372 | ||
373 | 373 | ||
374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
375 | /* | ||
376 | * People like the simple clean cpu node info on boot. | ||
377 | * Reduce the watchdog noise by only printing messages | ||
378 | * that are different from what cpu0 displayed. | ||
379 | */ | ||
380 | static unsigned long cpu0_err; | ||
381 | |||
375 | static int watchdog_nmi_enable(int cpu) | 382 | static int watchdog_nmi_enable(int cpu) |
376 | { | 383 | { |
377 | struct perf_event_attr *wd_attr; | 384 | struct perf_event_attr *wd_attr; |
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) | |||
390 | 397 | ||
391 | /* Try to register using hardware perf events */ | 398 | /* Try to register using hardware perf events */ |
392 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 399 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
400 | |||
401 | /* save cpu0 error for future comparison */ | ||
402 | if (cpu == 0 && IS_ERR(event)) | ||
403 | cpu0_err = PTR_ERR(event); | ||
404 | |||
393 | if (!IS_ERR(event)) { | 405 | if (!IS_ERR(event)) { |
394 | pr_info("enabled, takes one hw-pmu counter.\n"); | 406 | /* only print for cpu0, or when the error differs from cpu0's */ |
407 | if (cpu == 0 || cpu0_err) | ||
408 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
395 | goto out_save; | 409 | goto out_save; |
396 | } | 410 | } |
397 | 411 | ||
412 | /* skip displaying the same error again */ | ||
413 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
414 | return PTR_ERR(event); | ||
398 | 415 | ||
399 | /* vary the KERN level based on the returned errno */ | 416 | /* vary the KERN level based on the returned errno */ |
400 | if (PTR_ERR(event) == -EOPNOTSUPP) | 417 | if (PTR_ERR(event) == -EOPNOTSUPP) |