Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c            |  30
-rw-r--r--  kernel/events/core.c       |  11
-rw-r--r--  kernel/exit.c              |  19
-rw-r--r--  kernel/irq/chip.c          |   8
-rw-r--r--  kernel/irq/internals.h     |   3
-rw-r--r--  kernel/irq/manage.c        |  39
-rw-r--r--  kernel/irq/migration.c     |  13
-rw-r--r--  kernel/panic.c             |   6
-rw-r--r--  kernel/pid_namespace.c     |  20
-rw-r--r--  kernel/printk.c            | 532
-rw-r--r--  kernel/rcutree.c           |  16
-rw-r--r--  kernel/rcutree.h           |  14
-rw-r--r--  kernel/rcutree_plugin.h    | 165
-rw-r--r--  kernel/sched/core.c        | 249
-rw-r--r--  kernel/sched/fair.c        |  71
-rw-r--r--  kernel/sched/rt.c          |  53
-rw-r--r--  kernel/sched/sched.h       |   2
-rw-r--r--  kernel/smpboot.c           |  17
-rw-r--r--  kernel/sys.c               |  60
-rw-r--r--  kernel/time/clockevents.c  |   3
-rw-r--r--  kernel/time/tick-sched.c   |  26
-rw-r--r--  kernel/time/timekeeping.c  |   2
-rw-r--r--  kernel/trace/trace.c       |   2
-rw-r--r--  kernel/watchdog.c          |  19
24 files changed, 996 insertions(+), 384 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f3527d6184a..2097684cf194 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
255 255
256EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
257 257
258static int css_unbias_refcnt(int refcnt)
259{
260 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
261}
262
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */ 263/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css) 264static int css_refcnt(struct cgroup_subsys_state *css)
260{ 265{
261 int v = atomic_read(&css->refcnt); 266 int v = atomic_read(&css->refcnt);
262 267
263 return v >= 0 ? v : v - CSS_DEACT_BIAS; 268 return css_unbias_refcnt(v);
264} 269}
265 270
266/* convenient tests for these bits */ 271/* convenient tests for these bits */
@@ -896,10 +901,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
896 mutex_unlock(&cgroup_mutex); 901 mutex_unlock(&cgroup_mutex);
897 902
898 /* 903 /*
899 * Drop the active superblock reference that we took when we 904 * We want to drop the active superblock reference from the
900 * created the cgroup 905 * cgroup creation after all the dentry refs are gone -
906 * kill_sb gets mighty unhappy otherwise. Mark
907 * dentry->d_fsdata with cgroup_diput() to tell
908 * cgroup_d_release() to call deactivate_super().
901 */ 909 */
902 deactivate_super(cgrp->root->sb); 910 dentry->d_fsdata = cgroup_diput;
903 911
904 /* 912 /*
905 * if we're getting rid of the cgroup, refcount should ensure 913 * if we're getting rid of the cgroup, refcount should ensure
@@ -925,6 +933,13 @@ static int cgroup_delete(const struct dentry *d)
925 return 1; 933 return 1;
926} 934}
927 935
936static void cgroup_d_release(struct dentry *dentry)
937{
938 /* did cgroup_diput() tell me to deactivate super? */
939 if (dentry->d_fsdata == cgroup_diput)
940 deactivate_super(dentry->d_sb);
941}
942
928static void remove_dir(struct dentry *d) 943static void remove_dir(struct dentry *d)
929{ 944{
930 struct dentry *parent = dget(d->d_parent); 945 struct dentry *parent = dget(d->d_parent);
@@ -1532,6 +1547,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1532 static const struct dentry_operations cgroup_dops = { 1547 static const struct dentry_operations cgroup_dops = {
1533 .d_iput = cgroup_diput, 1548 .d_iput = cgroup_diput,
1534 .d_delete = cgroup_delete, 1549 .d_delete = cgroup_delete,
1550 .d_release = cgroup_d_release,
1535 }; 1551 };
1536 1552
1537 struct inode *inode = 1553 struct inode *inode =
@@ -4971,10 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
4971void __css_put(struct cgroup_subsys_state *css) 4987void __css_put(struct cgroup_subsys_state *css)
4972{ 4988{
4973 struct cgroup *cgrp = css->cgroup; 4989 struct cgroup *cgrp = css->cgroup;
4990 int v;
4974 4991
4975 rcu_read_lock(); 4992 rcu_read_lock();
4976 atomic_dec(&css->refcnt); 4993 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4977 switch (css_refcnt(css)) { 4994
4995 switch (v) {
4978 case 1: 4996 case 1:
4979 if (notify_on_release(cgrp)) { 4997 if (notify_on_release(cgrp)) {
4980 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4998 set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
253 return !event->cgrp || event->cgrp == cpuctx->cgrp; 253 return !event->cgrp || event->cgrp == cpuctx->cgrp;
254} 254}
255 255
256static inline void perf_get_cgroup(struct perf_event *event) 256static inline bool perf_tryget_cgroup(struct perf_event *event)
257{ 257{
258 css_get(&event->cgrp->css); 258 return css_tryget(&event->cgrp->css);
259} 259}
260 260
261static inline void perf_put_cgroup(struct perf_event *event) 261static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
484 event->cgrp = cgrp; 484 event->cgrp = cgrp;
485 485
486 /* must be done before we fput() the file */ 486 /* must be done before we fput() the file */
487 perf_get_cgroup(event); 487 if (!perf_tryget_cgroup(event)) {
488 event->cgrp = NULL;
489 ret = -ENOENT;
490 goto out;
491 }
488 492
489 /* 493 /*
490 * all events in a group must monitor 494 * all events in a group must monitor
@@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event,
3181 event = event->group_leader; 3185 event = event->group_leader;
3182 3186
3183 perf_event_for_each_child(event, func); 3187 perf_event_for_each_child(event, func);
3184 func(event);
3185 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3188 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3186 perf_event_for_each_child(sibling, func); 3189 perf_event_for_each_child(sibling, func);
3187 mutex_unlock(&ctx->mutex); 3190 mutex_unlock(&ctx->mutex);
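Two changes in events/core.c: perf_get_cgroup() becomes perf_tryget_cgroup(), so attaching an event to a cgroup whose css is already being torn down fails with -ENOENT instead of taking a reference on a dying object, and the stray second func(event) call in perf_event_for_each() is dropped so the group leader is visited only once. The general try-get pattern looks roughly like the sketch below (C11 atomics, purely illustrative; css_tryget() itself is implemented differently, on top of the deactivation bias):

#include <stdatomic.h>
#include <stdbool.h>

/* take a reference only if the object is still live */
static bool tryget(atomic_int *refcnt)
{
        int old = atomic_load(refcnt);

        while (old > 0) {
                /* on failure, 'old' is reloaded with the current value */
                if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
                        return true;
        }
        return false;   /* object already dying, caller must back off */
}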
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
 77 * reaped, notify the reaper sleeping in zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
75 } 87 }
76 list_del_rcu(&p->thread_group); 88 list_del_rcu(&p->thread_group);
77} 89}
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
643 mm_release(tsk, mm); 655 mm_release(tsk, mm);
644 if (!mm) 656 if (!mm)
645 return; 657 return;
658 sync_mm_rss(mm);
646 /* 659 /*
647 * Serialize with any possible pending coredump. 660 * Serialize with any possible pending coredump.
648 * We must hold mmap_sem around checking core_state 661 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
719 732
720 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
721 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
722 /*
723 * We can not clear ->child_reaper or leave it alone.
724 * There may by stealth EXIT_DEAD tasks on ->children,
725 * forget_original_parent() must move them somewhere.
726 */
727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) { 735 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper; 736 struct task_struct *reaper;
730 737
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
275 kstat_incr_irqs_this_cpu(irq, desc); 275 kstat_incr_irqs_this_cpu(irq, desc);
276 276
277 action = desc->action; 277 action = desc->action;
278 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) 278 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
279 desc->istate |= IRQS_PENDING;
279 goto out_unlock; 280 goto out_unlock;
281 }
280 282
281 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 283 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
282 raw_spin_unlock_irq(&desc->lock); 284 raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
324 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 326 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
325 kstat_incr_irqs_this_cpu(irq, desc); 327 kstat_incr_irqs_this_cpu(irq, desc);
326 328
327 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) 329 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
330 desc->istate |= IRQS_PENDING;
328 goto out_unlock; 331 goto out_unlock;
332 }
329 333
330 handle_irq_event(desc); 334 handle_irq_event(desc);
331 335
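Both flow handlers now latch IRQS_PENDING when an interrupt arrives while the line has no action or is disabled, so the event can be resent once the interrupt is enabled again rather than silently dropped. The shape of that latch-and-replay pattern, reduced to a toy sketch (not the genirq code, which does the replay in the resend path):

#include <stdbool.h>

static bool pending;            /* stands in for the IRQS_PENDING bit */

static void handle_event(void)
{
        /* the work the real flow handler would do */
}

static void irq_arrives(bool can_handle)
{
        if (!can_handle) {
                pending = true; /* latch the event instead of losing it */
                return;
        }
        handle_event();
}

static void irq_reenabled(void)
{
        if (pending) {
                pending = false;
                handle_event(); /* replay the latched event */
        }
}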
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
101 101
102extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
103 103
104extern int irq_do_set_affinity(struct irq_data *data,
105 const struct cpumask *dest, bool force);
106
104/* Inline functions for support of irq chips on slow busses */ 107/* Inline functions for support of irq chips on slow busses */
105static inline void chip_bus_lock(struct irq_desc *desc) 108static inline void chip_bus_lock(struct irq_desc *desc)
106{ 109{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -142,6 +142,25 @@ static inline void
142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } 142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
143#endif 143#endif
144 144
145int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
146 bool force)
147{
148 struct irq_desc *desc = irq_data_to_desc(data);
149 struct irq_chip *chip = irq_data_get_irq_chip(data);
150 int ret;
151
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160
161 return ret;
162}
163
145int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) 164int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
146{ 165{
147 struct irq_chip *chip = irq_data_get_irq_chip(data); 166 struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
152 return -EINVAL; 171 return -EINVAL;
153 172
154 if (irq_can_move_pcntxt(data)) { 173 if (irq_can_move_pcntxt(data)) {
155 ret = chip->irq_set_affinity(data, mask, false); 174 ret = irq_do_set_affinity(data, mask, false);
156 switch (ret) {
157 case IRQ_SET_MASK_OK:
158 cpumask_copy(data->affinity, mask);
159 case IRQ_SET_MASK_OK_NOCOPY:
160 irq_set_thread_affinity(desc);
161 ret = 0;
162 }
163 } else { 175 } else {
164 irqd_set_move_pending(data); 176 irqd_set_move_pending(data);
165 irq_copy_pending(desc, mask); 177 irq_copy_pending(desc, mask);
@@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
283static int 295static int
284setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 296setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
285{ 297{
286 struct irq_chip *chip = irq_desc_get_chip(desc);
287 struct cpumask *set = irq_default_affinity; 298 struct cpumask *set = irq_default_affinity;
288 int ret, node = desc->irq_data.node; 299 int node = desc->irq_data.node;
289 300
290 /* Excludes PER_CPU and NO_BALANCE interrupts */ 301 /* Excludes PER_CPU and NO_BALANCE interrupts */
291 if (!irq_can_set_affinity(irq)) 302 if (!irq_can_set_affinity(irq))
@@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
311 if (cpumask_intersects(mask, nodemask)) 322 if (cpumask_intersects(mask, nodemask))
312 cpumask_and(mask, mask, nodemask); 323 cpumask_and(mask, mask, nodemask);
313 } 324 }
314 ret = chip->irq_set_affinity(&desc->irq_data, mask, false); 325 irq_do_set_affinity(&desc->irq_data, mask, false);
315 switch (ret) {
316 case IRQ_SET_MASK_OK:
317 cpumask_copy(desc->irq_data.affinity, mask);
318 case IRQ_SET_MASK_OK_NOCOPY:
319 irq_set_thread_affinity(desc);
320 }
321 return 0; 326 return 0;
322} 327}
323#else 328#else
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
42 * For correct operation this depends on the caller 42 * For correct operation this depends on the caller
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
46 < nr_cpu_ids)) { 46 irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
47 int ret = chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false);
49 switch (ret) {
50 case IRQ_SET_MASK_OK:
51 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
52 case IRQ_SET_MASK_OK_NOCOPY:
53 irq_set_thread_affinity(desc);
54 }
55 }
56 47
57 cpumask_clear(desc->pending_mask); 48 cpumask_clear(desc->pending_mask);
58} 49}
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30int panic_on_oops; 30int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
31static unsigned long tainted_mask; 31static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
108 */ 108 */
109 crash_kexec(NULL); 109 crash_kexec(NULL);
110 110
111 kmsg_dump(KMSG_DUMP_PANIC);
112
113 /* 111 /*
114 * Note smp_send_stop is the usual smp shutdown function, which 112 * Note smp_send_stop is the usual smp shutdown function, which
115 * unfortunately means it may not be hardened to work in a panic 113 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
117 */ 115 */
118 smp_send_stop(); 116 smp_send_stop();
119 117
118 kmsg_dump(KMSG_DUMP_PANIC);
119
120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
121 121
122 bust_spinlocks(0); 122 bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
184 } 184 }
185 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
186 186
187 /* Firstly reap the EXIT_ZOMBIE children we may have. */
187 do { 188 do {
188 clear_thread_flag(TIF_SIGPENDING); 189 clear_thread_flag(TIF_SIGPENDING);
189 rc = sys_wait4(-1, NULL, __WALL, NULL); 190 rc = sys_wait4(-1, NULL, __WALL, NULL);
190 } while (rc != -ECHILD); 191 } while (rc != -ECHILD);
191 192
193 /*
194 * sys_wait4() above can't reap the TASK_DEAD children.
195 * Make sure they all go away, see __unhash_process().
196 */
197 for (;;) {
198 bool need_wait = false;
199
200 read_lock(&tasklist_lock);
201 if (!list_empty(&current->children)) {
202 __set_current_state(TASK_UNINTERRUPTIBLE);
203 need_wait = true;
204 }
205 read_unlock(&tasklist_lock);
206
207 if (!need_wait)
208 break;
209 schedule();
210 }
211
192 if (pid_ns->reboot) 212 if (pid_ns->reboot)
193 current->signal->group_exit_code = pid_ns->reboot; 213 current->signal->group_exit_code = pid_ns->reboot;
194 214
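Together with the exit.c hunk earlier, this closes a window in which zap_pid_ns_processes() could return while EXIT_DEAD children were still on ->children: sys_wait4() cannot reap those, so the reaper now sleeps until its child list empties, and __unhash_process() wakes it when the last child goes away. A user-space analogue of that handshake, sketched with pthreads rather than tasklist_lock and schedule():

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gone = PTHREAD_COND_INITIALIZER;
static int nr_children;

/* last child being torn down, cf. the wake_up_process() in __unhash_process() */
static void child_unhashed(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_children == 0)
                pthread_cond_signal(&gone);
        pthread_mutex_unlock(&lock);
}

/* tail of the namespace reaper, cf. the new loop in zap_pid_ns_processes() */
static void wait_for_children(void)
{
        pthread_mutex_lock(&lock);
        while (nr_children > 0)
                pthread_cond_wait(&gone, &lock);
        pthread_mutex_unlock(&lock);
}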
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..dba18211685e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,19 @@ static int console_may_schedule;
193 * separated by ',', and find the message after the ';' character. 193 * separated by ',', and find the message after the ';' character.
194 */ 194 */
195 195
196enum log_flags {
197 LOG_DEFAULT = 0,
198 LOG_NOCONS = 1, /* already flushed, do not print to console */
199};
200
196struct log { 201struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */ 202 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */ 203 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */ 204 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */ 205 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */ 206 u8 facility; /* syslog facility */
207 u8 flags:5; /* internal record flags */
208 u8 level:3; /* syslog level */
202}; 209};
203 210
204/* 211/*
@@ -227,10 +234,10 @@ static u32 clear_idx;
227#define LOG_LINE_MAX 1024 234#define LOG_LINE_MAX 1024
228 235
229/* record buffer */ 236/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 237#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4 238#define LOG_ALIGN 4
232#else 239#else
233#define LOG_ALIGN 8 240#define LOG_ALIGN __alignof__(struct log)
234#endif 241#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 242#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 243static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +293,7 @@ static u32 log_next(u32 idx)
286 293
287/* insert record into the buffer, discard old ones, update heads */ 294/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level, 295static void log_store(int facility, int level,
296 enum log_flags flags, u64 ts_nsec,
289 const char *dict, u16 dict_len, 297 const char *dict, u16 dict_len,
290 const char *text, u16 text_len) 298 const char *text, u16 text_len)
291{ 299{
@@ -329,8 +337,13 @@ static void log_store(int facility, int level,
329 msg->text_len = text_len; 337 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len); 338 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len; 339 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7); 340 msg->facility = facility;
333 msg->ts_nsec = local_clock(); 341 msg->level = level & 7;
342 msg->flags = flags & 0x1f;
343 if (ts_nsec > 0)
344 msg->ts_nsec = ts_nsec;
345 else
346 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len); 347 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 348 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336 349
@@ -414,7 +427,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
414 if (!user) 427 if (!user)
415 return -EBADF; 428 return -EBADF;
416 429
417 mutex_lock(&user->lock); 430 ret = mutex_lock_interruptible(&user->lock);
431 if (ret)
432 return ret;
418 raw_spin_lock(&logbuf_lock); 433 raw_spin_lock(&logbuf_lock);
419 while (user->seq == log_next_seq) { 434 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) { 435 if (file->f_flags & O_NONBLOCK) {
@@ -444,7 +459,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
444 ts_usec = msg->ts_nsec; 459 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000); 460 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;", 461 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec); 462 (msg->facility << 3) | msg->level, user->seq, ts_usec);
448 463
449 /* escape non-printable characters */ 464 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) { 465 for (i = 0; i < msg->text_len; i++) {
@@ -785,6 +800,21 @@ static bool printk_time;
785#endif 800#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 801module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787 802
803static size_t print_time(u64 ts, char *buf)
804{
805 unsigned long rem_nsec;
806
807 if (!printk_time)
808 return 0;
809
810 if (!buf)
811 return 15;
812
813 rem_nsec = do_div(ts, 1000000000);
814 return sprintf(buf, "[%5lu.%06lu] ",
815 (unsigned long)ts, rem_nsec / 1000);
816}
817
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 818static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{ 819{
790 size_t len = 0; 820 size_t len = 0;
@@ -801,18 +831,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
801 } 831 }
802 } 832 }
803 833
804 if (printk_time) { 834 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len; 835 return len;
817} 836}
818 837
@@ -860,26 +879,49 @@ static int syslog_print(char __user *buf, int size)
860{ 879{
861 char *text; 880 char *text;
862 struct log *msg; 881 struct log *msg;
863 int len; 882 int len = 0;
864 883
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 884 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text) 885 if (!text)
867 return -ENOMEM; 886 return -ENOMEM;
868 887
869 raw_spin_lock_irq(&logbuf_lock); 888 while (size > 0) {
870 if (syslog_seq < log_first_seq) { 889 size_t n;
871 /* messages are gone, move to first one */ 890
872 syslog_seq = log_first_seq; 891 raw_spin_lock_irq(&logbuf_lock);
873 syslog_idx = log_first_idx; 892 if (syslog_seq < log_first_seq) {
874 } 893 /* messages are gone, move to first one */
875 msg = log_from_idx(syslog_idx); 894 syslog_seq = log_first_seq;
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX); 895 syslog_idx = log_first_idx;
877 syslog_idx = log_next(syslog_idx); 896 }
878 syslog_seq++; 897 if (syslog_seq == log_next_seq) {
879 raw_spin_unlock_irq(&logbuf_lock); 898 raw_spin_unlock_irq(&logbuf_lock);
899 break;
900 }
901 msg = log_from_idx(syslog_idx);
902 n = msg_print_text(msg, true, text, LOG_LINE_MAX);
903 if (n <= size) {
904 syslog_idx = log_next(syslog_idx);
905 syslog_seq++;
906 } else
907 n = 0;
908 raw_spin_unlock_irq(&logbuf_lock);
909
910 if (!n)
911 break;
912
913 len += n;
914 size -= n;
915 buf += n;
916 n = copy_to_user(buf - n, text, n);
880 917
881 if (len > 0 && copy_to_user(buf, text, len)) 918 if (n) {
882 len = -EFAULT; 919 len -= n;
920 if (!len)
921 len = -EFAULT;
922 break;
923 }
924 }
883 925
884 kfree(text); 926 kfree(text);
885 return len; 927 return len;
@@ -909,7 +951,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
909 /* 951 /*
910 * Find first record that fits, including all following records, 952 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump. 953 * into the user-provided buffer for this dump.
912 */ 954 */
913 seq = clear_seq; 955 seq = clear_seq;
914 idx = clear_idx; 956 idx = clear_idx;
915 while (seq < log_next_seq) { 957 while (seq < log_next_seq) {
@@ -919,6 +961,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
919 idx = log_next(idx); 961 idx = log_next(idx);
920 seq++; 962 seq++;
921 } 963 }
964
965 /* move first record forward until length fits into the buffer */
922 seq = clear_seq; 966 seq = clear_seq;
923 idx = clear_idx; 967 idx = clear_idx;
924 while (len > size && seq < log_next_seq) { 968 while (len > size && seq < log_next_seq) {
@@ -929,7 +973,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
929 seq++; 973 seq++;
930 } 974 }
931 975
932 /* last message in this dump */ 976 /* last message fitting into this dump */
933 next_seq = log_next_seq; 977 next_seq = log_next_seq;
934 978
935 len = 0; 979 len = 0;
@@ -974,6 +1018,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
974{ 1018{
975 bool clear = false; 1019 bool clear = false;
976 static int saved_console_loglevel = -1; 1020 static int saved_console_loglevel = -1;
1021 static DEFINE_MUTEX(syslog_mutex);
977 int error; 1022 int error;
978 1023
979 error = check_syslog_permissions(type, from_file); 1024 error = check_syslog_permissions(type, from_file);
@@ -1000,11 +1045,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1000 error = -EFAULT; 1045 error = -EFAULT;
1001 goto out; 1046 goto out;
1002 } 1047 }
1048 error = mutex_lock_interruptible(&syslog_mutex);
1049 if (error)
1050 goto out;
1003 error = wait_event_interruptible(log_wait, 1051 error = wait_event_interruptible(log_wait,
1004 syslog_seq != log_next_seq); 1052 syslog_seq != log_next_seq);
1005 if (error) 1053 if (error) {
1054 mutex_unlock(&syslog_mutex);
1006 goto out; 1055 goto out;
1056 }
1007 error = syslog_print(buf, len); 1057 error = syslog_print(buf, len);
1058 mutex_unlock(&syslog_mutex);
1008 break; 1059 break;
1009 /* Read/clear last kernel messages */ 1060 /* Read/clear last kernel messages */
1010 case SYSLOG_ACTION_READ_CLEAR: 1061 case SYSLOG_ACTION_READ_CLEAR:
@@ -1027,6 +1078,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1027 /* Clear ring buffer */ 1078 /* Clear ring buffer */
1028 case SYSLOG_ACTION_CLEAR: 1079 case SYSLOG_ACTION_CLEAR:
1029 syslog_print_all(NULL, 0, true); 1080 syslog_print_all(NULL, 0, true);
1081 break;
1030 /* Disable logging to console */ 1082 /* Disable logging to console */
1031 case SYSLOG_ACTION_CONSOLE_OFF: 1083 case SYSLOG_ACTION_CONSOLE_OFF:
1032 if (saved_console_loglevel == -1) 1084 if (saved_console_loglevel == -1)
@@ -1259,15 +1311,92 @@ static inline void printk_delay(void)
1259 } 1311 }
1260} 1312}
1261 1313
1314/*
1315 * Continuation lines are buffered, and not committed to the record buffer
1316 * until the line is complete, or a race forces it. The line fragments
1317 * though, are printed immediately to the consoles to ensure everything has
1318 * reached the console in case of a kernel crash.
1319 */
1320static struct cont {
1321 char buf[LOG_LINE_MAX];
1322 size_t len; /* length == 0 means unused buffer */
1323 size_t cons; /* bytes written to console */
 1324 struct task_struct *owner; /* task of first print */
1325 u64 ts_nsec; /* time of first print */
1326 u8 level; /* log level of first message */
 1327 u8 facility; /* log facility of first message */
1328 bool flushed:1; /* buffer sealed and committed */
1329} cont;
1330
1331static void cont_flush(void)
1332{
1333 if (cont.flushed)
1334 return;
1335 if (cont.len == 0)
1336 return;
1337
1338 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
1339 NULL, 0, cont.buf, cont.len);
1340
1341 cont.flushed = true;
1342}
1343
1344static bool cont_add(int facility, int level, const char *text, size_t len)
1345{
1346 if (cont.len && cont.flushed)
1347 return false;
1348
1349 if (cont.len + len > sizeof(cont.buf)) {
1350 cont_flush();
1351 return false;
1352 }
1353
1354 if (!cont.len) {
1355 cont.facility = facility;
1356 cont.level = level;
1357 cont.owner = current;
1358 cont.ts_nsec = local_clock();
1359 cont.cons = 0;
1360 cont.flushed = false;
1361 }
1362
1363 memcpy(cont.buf + cont.len, text, len);
1364 cont.len += len;
1365 return true;
1366}
1367
1368static size_t cont_print_text(char *text, size_t size)
1369{
1370 size_t textlen = 0;
1371 size_t len;
1372
1373 if (cont.cons == 0) {
1374 textlen += print_time(cont.ts_nsec, text);
1375 size -= textlen;
1376 }
1377
1378 len = cont.len - cont.cons;
1379 if (len > 0) {
1380 if (len+1 > size)
1381 len = size-1;
1382 memcpy(text + textlen, cont.buf + cont.cons, len);
1383 textlen += len;
1384 cont.cons = cont.len;
1385 }
1386
1387 if (cont.flushed) {
1388 text[textlen++] = '\n';
1389 /* got everything, release buffer */
1390 cont.len = 0;
1391 }
1392 return textlen;
1393}
1394
1262asmlinkage int vprintk_emit(int facility, int level, 1395asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen, 1396 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args) 1397 const char *fmt, va_list args)
1265{ 1398{
1266 static int recursion_bug; 1399 static int recursion_bug;
1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX]; 1400 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf; 1401 char *text = textbuf;
1273 size_t text_len; 1402 size_t text_len;
@@ -1313,7 +1442,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1313 recursion_bug = 0; 1442 recursion_bug = 0;
1314 printed_len += strlen(recursion_msg); 1443 printed_len += strlen(recursion_msg);
1315 /* emit KERN_CRIT message */ 1444 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len); 1445 log_store(0, 2, LOG_DEFAULT, 0,
1446 NULL, 0, recursion_msg, printed_len);
1317 } 1447 }
1318 1448
1319 /* 1449 /*
@@ -1351,55 +1481,37 @@ asmlinkage int vprintk_emit(int facility, int level,
1351 } 1481 }
1352 1482
1353 if (!newline) { 1483 if (!newline) {
1354 if (cont_len && (prefix || cont_task != current)) { 1484 /*
1355 /* 1485 * Flush the conflicting buffer. An earlier newline was missing,
1356 * Flush earlier buffer, which is either from a 1486 * or another task also prints continuation lines.
1357 * different thread, or when we got a new prefix. 1487 */
1358 */ 1488 if (cont.len && (prefix || cont.owner != current))
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); 1489 cont_flush();
1360 cont_len = 0;
1361 }
1362
1363 if (!cont_len) {
1364 cont_level = level;
1365 cont_task = current;
1366 }
1367 1490
1368 /* buffer or append to earlier buffer from the same thread */ 1491 /* buffer line if possible, otherwise store it right away */
1369 if (cont_len + text_len > sizeof(cont_buf)) 1492 if (!cont_add(facility, level, text, text_len))
1370 text_len = sizeof(cont_buf) - cont_len; 1493 log_store(facility, level, LOG_DEFAULT, 0,
1371 memcpy(cont_buf + cont_len, text, text_len); 1494 dict, dictlen, text, text_len);
1372 cont_len += text_len;
1373 } else { 1495 } else {
1374 if (cont_len && cont_task == current) { 1496 bool stored = false;
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385 1497
1386 /* append to the earlier buffer and flush */ 1498 /*
1387 if (cont_len + text_len > sizeof(cont_buf)) 1499 * If an earlier newline was missing and it was the same task,
1388 text_len = sizeof(cont_buf) - cont_len; 1500 * either merge it with the current buffer and flush, or if
1389 memcpy(cont_buf + cont_len, text, text_len); 1501 * there was a race with interrupts (prefix == true) then just
1390 cont_len += text_len; 1502 * flush it out and store this line separately.
1391 log_store(facility, cont_level, 1503 */
1392 NULL, 0, cont_buf, cont_len); 1504 if (cont.len && cont.owner == current) {
1393 cont_len = 0; 1505 if (!prefix)
1394 cont_task = NULL; 1506 stored = cont_add(facility, level, text, text_len);
1395 printed_len = cont_len; 1507 cont_flush();
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 } 1508 }
1509
1510 if (!stored)
1511 log_store(facility, level, LOG_DEFAULT, 0,
1512 dict, dictlen, text, text_len);
1402 } 1513 }
1514 printed_len += text_len;
1403 1515
1404 /* 1516 /*
1405 * Try to acquire and then immediately release the console semaphore. 1517 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1598,18 @@ EXPORT_SYMBOL(printk);
1486#else 1598#else
1487 1599
1488#define LOG_LINE_MAX 0 1600#define LOG_LINE_MAX 0
1601static struct cont {
1602 size_t len;
1603 size_t cons;
1604 u8 level;
1605 bool flushed:1;
1606} cont;
1489static struct log *log_from_idx(u32 idx) { return NULL; } 1607static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; } 1608static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {} 1609static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog, 1610static size_t msg_print_text(const struct log *msg, bool syslog,
1493 char *buf, size_t size) { return 0; } 1611 char *buf, size_t size) { return 0; }
1612static size_t cont_print_text(char *text, size_t size) { return 0; }
1494 1613
1495#endif /* CONFIG_PRINTK */ 1614#endif /* CONFIG_PRINTK */
1496 1615
@@ -1782,6 +1901,7 @@ static u32 console_idx;
1782 */ 1901 */
1783void console_unlock(void) 1902void console_unlock(void)
1784{ 1903{
1904 static char text[LOG_LINE_MAX];
1785 static u64 seen_seq; 1905 static u64 seen_seq;
1786 unsigned long flags; 1906 unsigned long flags;
1787 bool wake_klogd = false; 1907 bool wake_klogd = false;
@@ -1794,10 +1914,23 @@ void console_unlock(void)
1794 1914
1795 console_may_schedule = 0; 1915 console_may_schedule = 0;
1796 1916
1917 /* flush buffered message fragment immediately to console */
1918 raw_spin_lock_irqsave(&logbuf_lock, flags);
1919 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1920 size_t len;
1921
1922 len = cont_print_text(text, sizeof(text));
1923 raw_spin_unlock(&logbuf_lock);
1924 stop_critical_timings();
1925 call_console_drivers(cont.level, text, len);
1926 start_critical_timings();
1927 local_irq_restore(flags);
1928 } else
1929 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1930
1797again: 1931again:
1798 for (;;) { 1932 for (;;) {
1799 struct log *msg; 1933 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len; 1934 size_t len;
1802 int level; 1935 int level;
1803 1936
@@ -1812,13 +1945,22 @@ again:
1812 console_seq = log_first_seq; 1945 console_seq = log_first_seq;
1813 console_idx = log_first_idx; 1946 console_idx = log_first_idx;
1814 } 1947 }
1815 1948skip:
1816 if (console_seq == log_next_seq) 1949 if (console_seq == log_next_seq)
1817 break; 1950 break;
1818 1951
1819 msg = log_from_idx(console_idx); 1952 msg = log_from_idx(console_idx);
1820 level = msg->level & 7; 1953 if (msg->flags & LOG_NOCONS) {
1954 /*
1955 * Skip record we have buffered and already printed
1956 * directly to the console when we received it.
1957 */
1958 console_idx = log_next(console_idx);
1959 console_seq++;
1960 goto skip;
1961 }
1821 1962
1963 level = msg->level;
1822 len = msg_print_text(msg, false, text, sizeof(text)); 1964 len = msg_print_text(msg, false, text, sizeof(text));
1823 1965
1824 console_idx = log_next(console_idx); 1966 console_idx = log_next(console_idx);
@@ -2300,48 +2442,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2300 * kmsg_dump - dump kernel log to kernel message dumpers. 2442 * kmsg_dump - dump kernel log to kernel message dumpers.
2301 * @reason: the reason (oops, panic etc) for dumping 2443 * @reason: the reason (oops, panic etc) for dumping
2302 * 2444 *
 2303 * Iterate through each of the dump devices and call the oops/panic 2445 * Call each registered dumper's dump() callback, which can
2304 * callbacks with the log buffer. 2446 * retrieve the kmsg records with kmsg_dump_get_line() or
2447 * kmsg_dump_get_buffer().
2305 */ 2448 */
2306void kmsg_dump(enum kmsg_dump_reason reason) 2449void kmsg_dump(enum kmsg_dump_reason reason)
2307{ 2450{
2308 u64 idx;
2309 struct kmsg_dumper *dumper; 2451 struct kmsg_dumper *dumper;
2310 const char *s1, *s2;
2311 unsigned long l1, l2;
2312 unsigned long flags; 2452 unsigned long flags;
2313 2453
2314 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 2454 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
2315 return; 2455 return;
2316 2456
2317 /* Theoretically, the log could move on after we do this, but 2457 rcu_read_lock();
2318 there's not a lot we can do about that. The new messages 2458 list_for_each_entry_rcu(dumper, &dump_list, list) {
2319 will overwrite the start of what we dump. */ 2459 if (dumper->max_reason && reason > dumper->max_reason)
2460 continue;
2461
2462 /* initialize iterator with data about the stored records */
2463 dumper->active = true;
2464
2465 raw_spin_lock_irqsave(&logbuf_lock, flags);
2466 dumper->cur_seq = clear_seq;
2467 dumper->cur_idx = clear_idx;
2468 dumper->next_seq = log_next_seq;
2469 dumper->next_idx = log_next_idx;
2470 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2471
2472 /* invoke dumper which will iterate over records */
2473 dumper->dump(dumper, reason);
2474
2475 /* reset iterator */
2476 dumper->active = false;
2477 }
2478 rcu_read_unlock();
2479}
2480
2481/**
2482 * kmsg_dump_get_line - retrieve one kmsg log line
2483 * @dumper: registered kmsg dumper
2484 * @syslog: include the "<4>" prefixes
2485 * @line: buffer to copy the line to
2486 * @size: maximum size of the buffer
2487 * @len: length of line placed into buffer
2488 *
2489 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2490 * record, and copy one record into the provided buffer.
2491 *
2492 * Consecutive calls will return the next available record moving
2493 * towards the end of the buffer with the youngest messages.
2494 *
2495 * A return value of FALSE indicates that there are no more records to
2496 * read.
2497 */
2498bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2499 char *line, size_t size, size_t *len)
2500{
2501 unsigned long flags;
2502 struct log *msg;
2503 size_t l = 0;
2504 bool ret = false;
2505
2506 if (!dumper->active)
2507 goto out;
2320 2508
2321 raw_spin_lock_irqsave(&logbuf_lock, flags); 2509 raw_spin_lock_irqsave(&logbuf_lock, flags);
2322 if (syslog_seq < log_first_seq) 2510 if (dumper->cur_seq < log_first_seq) {
2323 idx = syslog_idx; 2511 /* messages are gone, move to first available one */
2324 else 2512 dumper->cur_seq = log_first_seq;
2325 idx = log_first_idx; 2513 dumper->cur_idx = log_first_idx;
2514 }
2326 2515
2327 if (idx > log_next_idx) { 2516 /* last entry */
2328 s1 = log_buf; 2517 if (dumper->cur_seq >= log_next_seq) {
2329 l1 = log_next_idx; 2518 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2519 goto out;
2520 }
2330 2521
2331 s2 = log_buf + idx; 2522 msg = log_from_idx(dumper->cur_idx);
2332 l2 = log_buf_len - idx; 2523 l = msg_print_text(msg, syslog,
2333 } else { 2524 line, size);
2334 s1 = "";
2335 l1 = 0;
2336 2525
2337 s2 = log_buf + idx; 2526 dumper->cur_idx = log_next(dumper->cur_idx);
2338 l2 = log_next_idx - idx; 2527 dumper->cur_seq++;
2528 ret = true;
2529 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2530out:
2531 if (len)
2532 *len = l;
2533 return ret;
2534}
2535EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2536
2537/**
2538 * kmsg_dump_get_buffer - copy kmsg log lines
2539 * @dumper: registered kmsg dumper
2540 * @syslog: include the "<4>" prefixes
2541 * @buf: buffer to copy the line to
2542 * @size: maximum size of the buffer
2543 * @len: length of line placed into buffer
2544 *
2545 * Start at the end of the kmsg buffer and fill the provided buffer
 2546 * with as many of the *youngest* kmsg records as fit into it.
2547 * If the buffer is large enough, all available kmsg records will be
2548 * copied with a single call.
2549 *
2550 * Consecutive calls will fill the buffer with the next block of
2551 * available older records, not including the earlier retrieved ones.
2552 *
2553 * A return value of FALSE indicates that there are no more records to
2554 * read.
2555 */
2556bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2557 char *buf, size_t size, size_t *len)
2558{
2559 unsigned long flags;
2560 u64 seq;
2561 u32 idx;
2562 u64 next_seq;
2563 u32 next_idx;
2564 size_t l = 0;
2565 bool ret = false;
2566
2567 if (!dumper->active)
2568 goto out;
2569
2570 raw_spin_lock_irqsave(&logbuf_lock, flags);
2571 if (dumper->cur_seq < log_first_seq) {
2572 /* messages are gone, move to first available one */
2573 dumper->cur_seq = log_first_seq;
2574 dumper->cur_idx = log_first_idx;
2339 } 2575 }
2576
2577 /* last entry */
2578 if (dumper->cur_seq >= dumper->next_seq) {
2579 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2580 goto out;
2581 }
2582
2583 /* calculate length of entire buffer */
2584 seq = dumper->cur_seq;
2585 idx = dumper->cur_idx;
2586 while (seq < dumper->next_seq) {
2587 struct log *msg = log_from_idx(idx);
2588
2589 l += msg_print_text(msg, true, NULL, 0);
2590 idx = log_next(idx);
2591 seq++;
2592 }
2593
2594 /* move first record forward until length fits into the buffer */
2595 seq = dumper->cur_seq;
2596 idx = dumper->cur_idx;
2597 while (l > size && seq < dumper->next_seq) {
2598 struct log *msg = log_from_idx(idx);
2599
2600 l -= msg_print_text(msg, true, NULL, 0);
2601 idx = log_next(idx);
2602 seq++;
2603 }
2604
 2605 /* last message in the next iteration */
2606 next_seq = seq;
2607 next_idx = idx;
2608
2609 l = 0;
2610 while (seq < dumper->next_seq) {
2611 struct log *msg = log_from_idx(idx);
2612
2613 l += msg_print_text(msg, syslog,
2614 buf + l, size - l);
2615
2616 idx = log_next(idx);
2617 seq++;
2618 }
2619
2620 dumper->next_seq = next_seq;
2621 dumper->next_idx = next_idx;
2622 ret = true;
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2623 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2624out:
2625 if (len)
2626 *len = l;
2627 return ret;
2628}
2629EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2341 2630
2342 rcu_read_lock(); 2631/**
 2343 list_for_each_entry_rcu(dumper, &dump_list, list) 2632 * kmsg_dump_rewind - reset the iterator
2344 dumper->dump(dumper, reason, s1, l1, s2, l2); 2633 * @dumper: registered kmsg dumper
2345 rcu_read_unlock(); 2634 *
2635 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2636 * kmsg_dump_get_buffer() can be called again and used multiple
2637 * times within the same dumper.dump() callback.
2638 */
2639void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2640{
2641 unsigned long flags;
2642
2643 raw_spin_lock_irqsave(&logbuf_lock, flags);
2644 dumper->cur_seq = clear_seq;
2645 dumper->cur_idx = clear_idx;
2646 dumper->next_seq = log_next_seq;
2647 dumper->next_idx = log_next_idx;
2648 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2346} 2649}
2650EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2347#endif 2651#endif
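The printk.c rework replaces the old kmsg_dump() scheme, which handed every dumper two raw chunks of the ring buffer, with a per-dumper iterator: kmsg_dump() seeds cur_seq/cur_idx and next_seq/next_idx, and the dumper walks the records itself via kmsg_dump_get_line() or kmsg_dump_get_buffer(). A sketch of a dumper written against the new interface; the module and its flash back end are placeholders, not an existing driver:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void my_write_to_flash(const char *buf, size_t len)
{
        /* placeholder back end */
}

static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
{
        static char line[1024];
        size_t len;

        /* oldest to youngest, one record per call */
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                my_write_to_flash(line, len);
}

static struct kmsg_dumper my_dumper = {
        .dump       = my_dump,
        .max_reason = KMSG_DUMP_PANIC,  /* filter honoured by the new kmsg_dump() loop above */
};

static int __init my_init(void)
{
        return kmsg_dump_register(&my_dumper);
}
module_init(my_init);
MODULE_LICENSE("GPL");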
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d0..38ecdda3f55f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1397 rdp->qlen_lazy += rsp->qlen_lazy; 1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen; 1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen; 1399 rdp->n_cbs_adopted += rsp->qlen;
1400 if (rsp->qlen_lazy != rsp->qlen)
1401 rcu_idle_count_callbacks_posted();
1400 rsp->qlen_lazy = 0; 1402 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0; 1403 rsp->qlen = 0;
1402 1404
@@ -1528,7 +1530,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1528{ 1530{
1529 unsigned long flags; 1531 unsigned long flags;
1530 struct rcu_head *next, *list, **tail; 1532 struct rcu_head *next, *list, **tail;
1531 int bl, count, count_lazy; 1533 int bl, count, count_lazy, i;
1532 1534
1533 /* If no callbacks are ready, just return.*/ 1535 /* If no callbacks are ready, just return.*/
1534 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1536 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1551,9 +1553,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1551 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1553 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1552 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1554 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1553 tail = rdp->nxttail[RCU_DONE_TAIL]; 1555 tail = rdp->nxttail[RCU_DONE_TAIL];
1554 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1556 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
1555 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1557 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1556 rdp->nxttail[count] = &rdp->nxtlist; 1558 rdp->nxttail[i] = &rdp->nxtlist;
1557 local_irq_restore(flags); 1559 local_irq_restore(flags);
1558 1560
1559 /* Invoke callbacks. */ 1561 /* Invoke callbacks. */
@@ -1581,9 +1583,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1581 if (list != NULL) { 1583 if (list != NULL) {
1582 *tail = rdp->nxtlist; 1584 *tail = rdp->nxtlist;
1583 rdp->nxtlist = list; 1585 rdp->nxtlist = list;
1584 for (count = 0; count < RCU_NEXT_SIZE; count++) 1586 for (i = 0; i < RCU_NEXT_SIZE; i++)
1585 if (&rdp->nxtlist == rdp->nxttail[count]) 1587 if (&rdp->nxtlist == rdp->nxttail[i])
1586 rdp->nxttail[count] = tail; 1588 rdp->nxttail[i] = tail;
1587 else 1589 else
1588 break; 1590 break;
1589 } 1591 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf5..ea056495783e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
84 /* Process level is worth LLONG_MAX/2. */ 84 /* Process level is worth LLONG_MAX/2. */
85 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 85 int dynticks_nmi_nesting; /* Track NMI nesting level. */
86 atomic_t dynticks; /* Even value for idle, else odd. */ 86 atomic_t dynticks; /* Even value for idle, else odd. */
87#ifdef CONFIG_RCU_FAST_NO_HZ
88 int dyntick_drain; /* Prepare-for-idle state variable. */
89 unsigned long dyntick_holdoff;
90 /* No retries for the jiffy of failure. */
91 struct timer_list idle_gp_timer;
92 /* Wake up CPU sleeping with callbacks. */
93 unsigned long idle_gp_timer_expires;
94 /* When to wake up CPU (for repost). */
95 bool idle_first_pass; /* First pass of attempt to go idle? */
96 unsigned long nonlazy_posted;
97 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
87}; 101};
88 102
89/* RCU's kthread states for tracing. */ 103/* RCU's kthread states for tracing. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d9869..5271a020887e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
 1886 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs 1886 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1887 * any flavor of RCU. 1887 * any flavor of RCU.
1888 */ 1888 */
1889int rcu_needs_cpu(int cpu) 1889int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1890{ 1890{
1891 *delta_jiffies = ULONG_MAX;
1891 return rcu_cpu_has_callbacks(cpu); 1892 return rcu_cpu_has_callbacks(cpu);
1892} 1893}
1893 1894
@@ -1962,41 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void)
1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1963#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1964#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1964 1965
1965/* Loop counter for rcu_prepare_for_idle(). */
1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1979
1980/*
1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1982 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1983 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1984 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1985 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1986 * it is better to incur scheduling-clock interrupts than to spin
1987 * continuously for the same time duration!
1988 */
1989int rcu_needs_cpu(int cpu)
1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1993 /* If no callbacks, RCU doesn't need the CPU. */
1994 if (!rcu_cpu_has_callbacks(cpu))
1995 return 0;
1996 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
1997 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
1998}
1999
2000/* 1966/*
2001 * Does the specified flavor of RCU have non-lazy callbacks pending on 1967 * Does the specified flavor of RCU have non-lazy callbacks pending on
2002 * the specified CPU? Both RCU flavor and CPU are specified by the 1968 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2040,6 +2006,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2040} 2006}
2041 2007
2042/* 2008/*
2009 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
2010 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
2011 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
2012 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2013 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2014 * it is better to incur scheduling-clock interrupts than to spin
2015 * continuously for the same time duration!
2016 *
2017 * The delta_jiffies argument is used to store the time when RCU is
2018 * going to need the CPU again if it still has callbacks. The reason
2019 * for this is that rcu_prepare_for_idle() might need to post a timer,
2020 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
2021 * the wakeup time for this CPU. This means that RCU's timer can be
2022 * delayed until the wakeup time, which defeats the purpose of posting
2023 * a timer.
2024 */
2025int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2026{
2027 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2028
2029 /* Flag a new idle sojourn to the idle-entry state machine. */
2030 rdtp->idle_first_pass = 1;
2031 /* If no callbacks, RCU doesn't need the CPU. */
2032 if (!rcu_cpu_has_callbacks(cpu)) {
2033 *delta_jiffies = ULONG_MAX;
2034 return 0;
2035 }
2036 if (rdtp->dyntick_holdoff == jiffies) {
2037 /* RCU recently tried and failed, so don't try again. */
2038 *delta_jiffies = 1;
2039 return 1;
2040 }
2041 /* Set up for the possibility that RCU will post a timer. */
2042 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2043 *delta_jiffies = RCU_IDLE_GP_DELAY;
2044 else
2045 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
2046 return 0;
2047}
2048
2049/*
2043 * Handler for smp_call_function_single(). The only point of this 2050 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing. 2051 * handler is to wake the CPU up, so the handler does only tracing.
2045 */ 2052 */
@@ -2075,21 +2082,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2075 */ 2082 */
2076static void rcu_prepare_for_idle_init(int cpu) 2083static void rcu_prepare_for_idle_init(int cpu)
2077{ 2084{
2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2085 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), 2086
2080 rcu_idle_gp_timer_func, cpu); 2087 rdtp->dyntick_holdoff = jiffies - 1;
2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; 2088 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
2082 per_cpu(rcu_idle_first_pass, cpu) = 1; 2089 rdtp->idle_gp_timer_expires = jiffies - 1;
2090 rdtp->idle_first_pass = 1;
2083} 2091}
2084 2092
2085/* 2093/*
2086 * Clean up for exit from idle. Because we are exiting from idle, there 2094 * Clean up for exit from idle. Because we are exiting from idle, there
2087 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will 2095 * is no longer any point to ->idle_gp_timer, so cancel it. This will
2088 * do nothing if this timer is not active, so just cancel it unconditionally. 2096 * do nothing if this timer is not active, so just cancel it unconditionally.
2089 */ 2097 */
2090static void rcu_cleanup_after_idle(int cpu) 2098static void rcu_cleanup_after_idle(int cpu)
2091{ 2099{
2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); 2100 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2101
2102 del_timer(&rdtp->idle_gp_timer);
2093 trace_rcu_prep_idle("Cleanup after idle"); 2103 trace_rcu_prep_idle("Cleanup after idle");
2094} 2104}
2095 2105
@@ -2108,42 +2118,41 @@ static void rcu_cleanup_after_idle(int cpu)
2108 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2118 * Because it is not legal to invoke rcu_process_callbacks() with irqs
2109 * disabled, we do one pass of force_quiescent_state(), then do a 2119 * disabled, we do one pass of force_quiescent_state(), then do a
2110 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2120 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
2111 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2121 * later. The ->dyntick_drain field controls the sequencing.
2112 * 2122 *
2113 * The caller must have disabled interrupts. 2123 * The caller must have disabled interrupts.
2114 */ 2124 */
2115static void rcu_prepare_for_idle(int cpu) 2125static void rcu_prepare_for_idle(int cpu)
2116{ 2126{
2117 struct timer_list *tp; 2127 struct timer_list *tp;
2128 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2118 2129
2119 /* 2130 /*
2120 * If this is an idle re-entry, for example, due to use of 2131 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 2132 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the 2133 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks. 2134 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks 2135 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
2125 * pending. 2136 * pending.
2126 */ 2137 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) && 2138 if (!rdtp->idle_first_pass &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) == 2139 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) { 2140 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2141 tp = &rdtp->idle_gp_timer;
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2142 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2133 } 2143 }
2134 return; 2144 return;
2135 } 2145 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0; 2146 rdtp->idle_first_pass = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2147 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139 2148
2140 /* 2149 /*
2141 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2150 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2142 * Also reset state to avoid prejudicing later attempts. 2151 * Also reset state to avoid prejudicing later attempts.
2143 */ 2152 */
2144 if (!rcu_cpu_has_callbacks(cpu)) { 2153 if (!rcu_cpu_has_callbacks(cpu)) {
2145 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2154 rdtp->dyntick_holdoff = jiffies - 1;
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2155 rdtp->dyntick_drain = 0;
2147 trace_rcu_prep_idle("No callbacks"); 2156 trace_rcu_prep_idle("No callbacks");
2148 return; 2157 return;
2149 } 2158 }
@@ -2152,36 +2161,37 @@ static void rcu_prepare_for_idle(int cpu)
2152 * If in holdoff mode, just return. We will presumably have 2161 * If in holdoff mode, just return. We will presumably have
2153 * refrained from disabling the scheduling-clock tick. 2162 * refrained from disabling the scheduling-clock tick.
2154 */ 2163 */
2155 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2164 if (rdtp->dyntick_holdoff == jiffies) {
2156 trace_rcu_prep_idle("In holdoff"); 2165 trace_rcu_prep_idle("In holdoff");
2157 return; 2166 return;
2158 } 2167 }
2159 2168
2160 /* Check and update the rcu_dyntick_drain sequencing. */ 2169 /* Check and update the ->dyntick_drain sequencing. */
2161 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2170 if (rdtp->dyntick_drain <= 0) {
2162 /* First time through, initialize the counter. */ 2171 /* First time through, initialize the counter. */
2163 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2172 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
2164 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2173 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
2165 !rcu_pending(cpu) && 2174 !rcu_pending(cpu) &&
2166 !local_softirq_pending()) { 2175 !local_softirq_pending()) {
2167 /* Can we go dyntick-idle despite still having callbacks? */ 2176 /* Can we go dyntick-idle despite still having callbacks? */
2168 trace_rcu_prep_idle("Dyntick with callbacks"); 2177 rdtp->dyntick_drain = 0;
2169 per_cpu(rcu_dyntick_drain, cpu) = 0; 2178 rdtp->dyntick_holdoff = jiffies;
2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2179 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2171 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2180 trace_rcu_prep_idle("Dyntick with callbacks");
2172 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2181 rdtp->idle_gp_timer_expires =
2173 jiffies + RCU_IDLE_GP_DELAY; 2182 jiffies + RCU_IDLE_GP_DELAY;
2174 else 2183 } else {
2175 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2184 rdtp->idle_gp_timer_expires =
2176 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2185 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2186 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2187 }
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2188 tp = &rdtp->idle_gp_timer;
2180 per_cpu(rcu_nonlazy_posted, cpu); 2189 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2190 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
2181 return; /* Nothing more to do immediately. */ 2191 return; /* Nothing more to do immediately. */
2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2192 } else if (--(rdtp->dyntick_drain) <= 0) {
2183 /* We have hit the limit, so time to give up. */ 2193 /* We have hit the limit, so time to give up. */
2184 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2194 rdtp->dyntick_holdoff = jiffies;
2185 trace_rcu_prep_idle("Begin holdoff"); 2195 trace_rcu_prep_idle("Begin holdoff");
2186 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2196 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2187 return; 2197 return;
@@ -2227,7 +2237,7 @@ static void rcu_prepare_for_idle(int cpu)
2227 */ 2237 */
2228static void rcu_idle_count_callbacks_posted(void) 2238static void rcu_idle_count_callbacks_posted(void)
2229{ 2239{
2230 __this_cpu_add(rcu_nonlazy_posted, 1); 2240 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2231} 2241}
2232 2242
2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2243#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
@@ -2238,11 +2248,12 @@ static void rcu_idle_count_callbacks_posted(void)
2238 2248
2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2249static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2240{ 2250{
2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); 2251 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2252 struct timer_list *tltp = &rdtp->idle_gp_timer;
2242 2253
2243 sprintf(cp, "drain=%d %c timer=%lu", 2254 sprintf(cp, "drain=%d %c timer=%lu",
2244 per_cpu(rcu_dyntick_drain, cpu), 2255 rdtp->dyntick_drain,
2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2256 rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
2246 timer_pending(tltp) ? tltp->expires - jiffies : -1); 2257 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2247} 2258}
2248 2259
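The rcutree_plugin.h hunks above fold the old rcu_idle_* / rcu_dyntick_* per-CPU variables into fields of struct rcu_dynticks, so rcu_prepare_for_idle() drives its whole drain/holdoff state machine from a single per-CPU structure. Below is a minimal, standalone userspace sketch of that sequencing under simplifying assumptions: the struct fields and the FLUSHES/holdoff idea mirror the patch, but the constants, the enum of outcomes and prepare_for_idle() itself are illustrative stand-ins, and the re-entry (idle_first_pass) and timer-reposting paths are omitted.

#include <stdbool.h>
#include <stdio.h>

#define IDLE_FLUSHES      5   /* stand-in for RCU_IDLE_FLUSHES */
#define IDLE_OPT_FLUSHES  4   /* stand-in for RCU_IDLE_OPT_FLUSHES */

struct dynticks {                     /* models the relevant rcu_dynticks fields */
	int dyntick_drain;
	unsigned long dyntick_holdoff;
};

enum idle_action { GO_DYNTICK, ARM_GP_TIMER, KICK_CORE, HOLDING_OFF, DRAIN_PASS };

/* One pass of the prepare-for-idle decision, simplified from the patched code. */
static enum idle_action prepare_for_idle(struct dynticks *d,
					 unsigned long jiffies,
					 bool has_callbacks, bool quiescent)
{
	if (!has_callbacks) {
		d->dyntick_holdoff = jiffies - 1;   /* reset state, go dyntick-idle */
		d->dyntick_drain = 0;
		return GO_DYNTICK;
	}
	if (d->dyntick_holdoff == jiffies)
		return HOLDING_OFF;                 /* already gave up this jiffy */
	if (d->dyntick_drain <= 0) {
		d->dyntick_drain = IDLE_FLUSHES;    /* first pass: init the counter */
	} else if (d->dyntick_drain <= IDLE_OPT_FLUSHES && quiescent) {
		d->dyntick_drain = 0;               /* dyntick-idle despite callbacks */
		d->dyntick_holdoff = jiffies;
		return ARM_GP_TIMER;                /* the patch arms ->idle_gp_timer here */
	} else if (--d->dyntick_drain <= 0) {
		d->dyntick_holdoff = jiffies;       /* hit the limit: begin holdoff */
		return KICK_CORE;
	}
	return DRAIN_PASS;                          /* keep draining on later passes */
}

int main(void)
{
	struct dynticks d = { 0 };
	unsigned long jiffies = 1000;

	for (int pass = 0; pass < 5; pass++) {
		enum idle_action a = prepare_for_idle(&d, jiffies, true, true);

		printf("pass %d -> action %d, drain=%d\n", pass, a, d.dyntick_drain);
	}
	return 0;
}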
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
142#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
143 #name , 143 #name ,
144 144
145static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
146#include "features.h" 146#include "features.h"
147 NULL
148}; 147};
149 148
150#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2517 sched_avg_update(this_rq); 2516 sched_avg_update(this_rq);
2518} 2517}
2519 2518
2519#ifdef CONFIG_NO_HZ
2520/*
2521 * There is no sane way to deal with nohz on smp when using jiffies because the
2522 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2523 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2524 *
2525 * Therefore we cannot use the delta approach from the regular tick since that
2526 * would seriously skew the load calculation. However we'll make do for those
2527 * updates happening while idle (nohz_idle_balance) or coming out of idle
2528 * (tick_nohz_idle_exit).
2529 *
2530 * This means we might still be one tick off for nohz periods.
2531 */
2532
2520/* 2533/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the 2534 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance. 2535 * idle balance.
2523 */ 2536 */
2524void update_idle_cpu_load(struct rq *this_rq) 2537void update_idle_cpu_load(struct rq *this_rq)
2525{ 2538{
2526 unsigned long curr_jiffies = jiffies; 2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2527 unsigned long load = this_rq->load.weight; 2540 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates; 2541 unsigned long pending_updates;
2529 2542
2530 /* 2543 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing.. 2544 * bail if there's load or we're actually up-to-date.
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */ 2545 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick) 2546 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return; 2547 return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
2547} 2553}
2548 2554
2549/* 2555/*
2556 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2557 */
2558void update_cpu_load_nohz(void)
2559{
2560 struct rq *this_rq = this_rq();
2561 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2562 unsigned long pending_updates;
2563
2564 if (curr_jiffies == this_rq->last_load_update_tick)
2565 return;
2566
2567 raw_spin_lock(&this_rq->lock);
2568 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2569 if (pending_updates) {
2570 this_rq->last_load_update_tick = curr_jiffies;
2571 /*
2572 * We were idle, this means load 0, the current load might be
2573 * !0 due to remote wakeups and the sort.
2574 */
2575 __update_cpu_load(this_rq, 0, pending_updates);
2576 }
2577 raw_spin_unlock(&this_rq->lock);
2578}
2579#endif /* CONFIG_NO_HZ */
2580
2581/*
2550 * Called from scheduler_tick() 2582 * Called from scheduler_tick()
2551 */ 2583 */
2552static void update_cpu_load_active(struct rq *this_rq) 2584static void update_cpu_load_active(struct rq *this_rq)
2553{ 2585{
2554 /* 2586 /*
2555 * See the mess in update_idle_cpu_load(). 2587 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2556 */ 2588 */
2557 this_rq->last_load_update_tick = jiffies; 2589 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2590 __update_cpu_load(this_rq, this_rq->load.weight, 1);
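Both the removed "bloody broken" comment and the new update_cpu_load_nohz() revolve around one idea: sample jiffies exactly once (ACCESS_ONCE) and fold every tick missed while idle into a single __update_cpu_load() call with load 0. A minimal userspace sketch of that pattern follows; the volatile counter is a crude stand-in for jiffies, and rq_model / fold_missed_ticks are illustrative names rather than kernel code.

#include <stdio.h>

static volatile unsigned long jiffies_counter;  /* stands in for jiffies */

struct rq_model {
	unsigned long last_update_tick;
	unsigned long load;
};

/* Fold every tick missed while idle into a single load update. */
static void fold_missed_ticks(struct rq_model *rq)
{
	/* read the counter exactly once so both uses below agree */
	unsigned long now = jiffies_counter;
	unsigned long pending = now - rq->last_update_tick;

	if (!pending)
		return;                         /* already up to date */

	rq->last_update_tick = now;
	/* the CPU was idle, so its contribution over 'pending' ticks is 0 */
	printf("decaying load %lu over %lu missed ticks\n", rq->load, pending);
	rq->load = 0;
}

int main(void)
{
	struct rq_model rq = { .last_update_tick = 0, .load = 42 };

	jiffies_counter = 7;                    /* pretend 7 ticks passed while idle */
	fold_missed_ticks(&rq);
	fold_missed_ticks(&rq);                 /* nothing pending the second time */
	return 0;
}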
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4982 p->sched_class->set_cpus_allowed(p, new_mask); 5014 p->sched_class->set_cpus_allowed(p, new_mask);
4983 5015
4984 cpumask_copy(&p->cpus_allowed, new_mask); 5016 cpumask_copy(&p->cpus_allowed, new_mask);
4985 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5017 p->nr_cpus_allowed = cpumask_weight(new_mask);
4986} 5018}
4987 5019
4988/* 5020/*
@@ -5524,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5524 5556
5525#ifdef CONFIG_SCHED_DEBUG 5557#ifdef CONFIG_SCHED_DEBUG
5526 5558
5527static __read_mostly int sched_domain_debug_enabled; 5559static __read_mostly int sched_debug_enabled;
5528 5560
5529static int __init sched_domain_debug_setup(char *str) 5561static int __init sched_debug_setup(char *str)
5530{ 5562{
5531 sched_domain_debug_enabled = 1; 5563 sched_debug_enabled = 1;
5532 5564
5533 return 0; 5565 return 0;
5534} 5566}
5535early_param("sched_debug", sched_domain_debug_setup); 5567early_param("sched_debug", sched_debug_setup);
5568
5569static inline bool sched_debug(void)
5570{
5571 return sched_debug_enabled;
5572}
5536 5573
5537static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5574static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5538 struct cpumask *groupmask) 5575 struct cpumask *groupmask)
@@ -5572,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 break; 5609 break;
5573 } 5610 }
5574 5611
5575 if (!group->sgp->power) { 5612 /*
5613 * Even though we initialize ->power to something semi-sane,
5614 * we leave power_orig unset. This allows us to detect if
5615 * domain iteration is still funny without causing /0 traps.
5616 */
5617 if (!group->sgp->power_orig) {
5576 printk(KERN_CONT "\n"); 5618 printk(KERN_CONT "\n");
5577 printk(KERN_ERR "ERROR: domain->cpu_power not " 5619 printk(KERN_ERR "ERROR: domain->cpu_power not "
5578 "set\n"); 5620 "set\n");
@@ -5620,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5620{ 5662{
5621 int level = 0; 5663 int level = 0;
5622 5664
5623 if (!sched_domain_debug_enabled) 5665 if (!sched_debug_enabled)
5624 return; 5666 return;
5625 5667
5626 if (!sd) { 5668 if (!sd) {
@@ -5641,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5641} 5683}
5642#else /* !CONFIG_SCHED_DEBUG */ 5684#else /* !CONFIG_SCHED_DEBUG */
5643# define sched_domain_debug(sd, cpu) do { } while (0) 5685# define sched_domain_debug(sd, cpu) do { } while (0)
5686static inline bool sched_debug(void)
5687{
5688 return false;
5689}
5644#endif /* CONFIG_SCHED_DEBUG */ 5690#endif /* CONFIG_SCHED_DEBUG */
5645 5691
5646static int sd_degenerate(struct sched_domain *sd) 5692static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6008,44 @@ struct sched_domain_topology_level {
5962 struct sd_data data; 6008 struct sd_data data;
5963}; 6009};
5964 6010
6011/*
6012 * Build an iteration mask that can exclude certain CPUs from the upwards
6013 * domain traversal.
6014 *
6015 * Asymmetric node setups can result in situations where the domain tree is of
6016 * unequal depth; make sure to skip domains that already cover the entire
6017 * range.
6018 *
6019 * In that case build_sched_domains() will have terminated the iteration early
6020 * and our sibling sd spans will be empty. Domains should always include the
6021 * cpu they're built on, so check that.
6022 *
6023 */
6024static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6025{
6026 const struct cpumask *span = sched_domain_span(sd);
6027 struct sd_data *sdd = sd->private;
6028 struct sched_domain *sibling;
6029 int i;
6030
6031 for_each_cpu(i, span) {
6032 sibling = *per_cpu_ptr(sdd->sd, i);
6033 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6034 continue;
6035
6036 cpumask_set_cpu(i, sched_group_mask(sg));
6037 }
6038}
6039
6040/*
6041 * Return the canonical balance cpu for this group; this is the first cpu
6042 * of this group that's also in the iteration mask.
6043 */
6044int group_balance_cpu(struct sched_group *sg)
6045{
6046 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6047}
6048
5965static int 6049static int
5966build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6050build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5967{ 6051{
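build_group_mask() and group_balance_cpu() above give each overlapping sched_group an iteration mask so that exactly one CPU per group, the first one present in both the group span and the mask, does the balancing. With plain 64-bit bitmaps the same selection looks like the sketch below; find_first_common is an illustrative stand-in for cpumask_first_and(), and the mask values are invented.

#include <stdint.h>
#include <stdio.h>

/* first set bit common to both masks, or -1 if the intersection is empty */
static int find_first_common(uint64_t span, uint64_t balance_mask)
{
	uint64_t both = span & balance_mask;

	return both ? __builtin_ctzll(both) : -1;
}

int main(void)
{
	uint64_t group_span   = 0xf0;   /* group covers CPUs 4-7 */
	uint64_t balance_mask = 0xcc;   /* CPUs 2,3,6,7 may balance here */

	/* CPU 6 is the canonical balance cpu for this group */
	printf("balance cpu = %d\n", find_first_common(group_span, balance_mask));
	return 0;
}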
@@ -5980,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5980 if (cpumask_test_cpu(i, covered)) 6064 if (cpumask_test_cpu(i, covered))
5981 continue; 6065 continue;
5982 6066
6067 child = *per_cpu_ptr(sdd->sd, i);
6068
6069 /* See the comment near build_group_mask(). */
6070 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6071 continue;
6072
5983 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6073 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5984 GFP_KERNEL, cpu_to_node(cpu)); 6074 GFP_KERNEL, cpu_to_node(cpu));
5985 6075
@@ -5987,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5987 goto fail; 6077 goto fail;
5988 6078
5989 sg_span = sched_group_cpus(sg); 6079 sg_span = sched_group_cpus(sg);
5990
5991 child = *per_cpu_ptr(sdd->sd, i);
5992 if (child->child) { 6080 if (child->child) {
5993 child = child->child; 6081 child = child->child;
5994 cpumask_copy(sg_span, sched_domain_span(child)); 6082 cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5997 6085
5998 cpumask_or(covered, covered, sg_span); 6086 cpumask_or(covered, covered, sg_span);
5999 6087
6000 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6088 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6001 atomic_inc(&sg->sgp->ref); 6089 if (atomic_inc_return(&sg->sgp->ref) == 1)
6090 build_group_mask(sd, sg);
6002 6091
6003 if (cpumask_test_cpu(cpu, sg_span)) 6092 /*
6093 * Initialize sgp->power such that even if we mess up the
6094 * domains and no possible iteration will get us here, we won't
6095 * die on a /0 trap.
6096 */
6097 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6098
6099 /*
6100 * Make sure the first group of this domain contains the
6101 * canonical balance cpu. Otherwise the sched_domain iteration
6102 * breaks. See update_sg_lb_stats().
6103 */
6104 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6105 group_balance_cpu(sg) == cpu)
6004 groups = sg; 6106 groups = sg;
6005 6107
6006 if (!first) 6108 if (!first)
@@ -6074,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6074 6176
6075 cpumask_clear(sched_group_cpus(sg)); 6177 cpumask_clear(sched_group_cpus(sg));
6076 sg->sgp->power = 0; 6178 sg->sgp->power = 0;
6179 cpumask_setall(sched_group_mask(sg));
6077 6180
6078 for_each_cpu(j, span) { 6181 for_each_cpu(j, span) {
6079 if (get_group(j, sdd, NULL) != group) 6182 if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6115 sg = sg->next; 6218 sg = sg->next;
6116 } while (sg != sd->groups); 6219 } while (sg != sd->groups);
6117 6220
6118 if (cpu != group_first_cpu(sg)) 6221 if (cpu != group_balance_cpu(sg))
6119 return; 6222 return;
6120 6223
6121 update_group_power(sd, cpu); 6224 update_group_power(sd, cpu);
@@ -6165,11 +6268,8 @@ int sched_domain_level_max;
6165 6268
6166static int __init setup_relax_domain_level(char *str) 6269static int __init setup_relax_domain_level(char *str)
6167{ 6270{
6168 unsigned long val; 6271 if (kstrtoint(str, 0, &default_relax_domain_level))
6169 6272 pr_warn("Unable to set relax_domain_level\n");
6170 val = simple_strtoul(str, NULL, 0);
6171 if (val < sched_domain_level_max)
6172 default_relax_domain_level = val;
6173 6273
6174 return 1; 6274 return 1;
6175} 6275}
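The setup_relax_domain_level() hunk above replaces an unchecked simple_strtoul() with kstrtoint(), which fails on malformed input instead of silently accepting a numeric prefix. A userspace equivalent built on strtol() is sketched below; parse_int is an illustrative helper, not a kernel function.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* strict integer parse: the whole string must be a valid int */
static int parse_int(const char *s, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, 0);
	if (errno || end == s || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -1;
	*out = (int)val;
	return 0;
}

int main(void)
{
	int level;

	if (parse_int("2", &level))
		fprintf(stderr, "Unable to set relax_domain_level\n");
	else
		printf("relax_domain_level = %d\n", level);

	if (parse_int("2bogus", &level))        /* rejected, unlike strtoul */
		fprintf(stderr, "Unable to set relax_domain_level\n");
	return 0;
}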
@@ -6279,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
6279#ifdef CONFIG_NUMA 6379#ifdef CONFIG_NUMA
6280 6380
6281static int sched_domains_numa_levels; 6381static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance; 6382static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks; 6383static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level; 6384static int sched_domains_curr_level;
6286 6385
6287static inline int sd_local_flags(int level) 6386static inline int sd_local_flags(int level)
6288{ 6387{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6388 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6290 return 0; 6389 return 0;
6291 6390
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6391 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6443 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345} 6444}
6346 6445
6446static void sched_numa_warn(const char *str)
6447{
6448 static int done = false;
6449 int i,j;
6450
6451 if (done)
6452 return;
6453
6454 done = true;
6455
6456 printk(KERN_WARNING "ERROR: %s\n\n", str);
6457
6458 for (i = 0; i < nr_node_ids; i++) {
6459 printk(KERN_WARNING " ");
6460 for (j = 0; j < nr_node_ids; j++)
6461 printk(KERN_CONT "%02d ", node_distance(i,j));
6462 printk(KERN_CONT "\n");
6463 }
6464 printk(KERN_WARNING "\n");
6465}
6466
6467static bool find_numa_distance(int distance)
6468{
6469 int i;
6470
6471 if (distance == node_distance(0, 0))
6472 return true;
6473
6474 for (i = 0; i < sched_domains_numa_levels; i++) {
6475 if (sched_domains_numa_distance[i] == distance)
6476 return true;
6477 }
6478
6479 return false;
6480}
6481
6347static void sched_init_numa(void) 6482static void sched_init_numa(void)
6348{ 6483{
6349 int next_distance, curr_distance = node_distance(0, 0); 6484 int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6486,6 @@ static void sched_init_numa(void)
6351 int level = 0; 6486 int level = 0;
6352 int i, j, k; 6487 int i, j, k;
6353 6488
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6489 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance) 6490 if (!sched_domains_numa_distance)
6357 return; 6491 return;
@@ -6362,23 +6496,41 @@ static void sched_init_numa(void)
6362 * 6496 *
6363 * Assumes node_distance(0,j) includes all distances in 6497 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time. 6498 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */ 6499 */
6368 next_distance = curr_distance; 6500 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) { 6501 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) { 6502 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j); 6503 for (k = 0; k < nr_node_ids; k++) {
6372 if (distance > curr_distance && 6504 int distance = node_distance(i, k);
6373 (distance < next_distance || 6505
6374 next_distance == curr_distance)) 6506 if (distance > curr_distance &&
6375 next_distance = distance; 6507 (distance < next_distance ||
6508 next_distance == curr_distance))
6509 next_distance = distance;
6510
6511 /*
6512 * While not a strong assumption, it would be nice to know
6513 * about cases where node A is connected to B but B is not
6514 * equally connected to A.
6515 */
6516 if (sched_debug() && node_distance(k, i) != distance)
6517 sched_numa_warn("Node-distance not symmetric");
6518
6519 if (sched_debug() && i && !find_numa_distance(distance))
6520 sched_numa_warn("Node-0 not representative");
6521 }
6522 if (next_distance != curr_distance) {
6523 sched_domains_numa_distance[level++] = next_distance;
6524 sched_domains_numa_levels = level;
6525 curr_distance = next_distance;
6526 } else break;
6376 } 6527 }
6377 if (next_distance != curr_distance) { 6528
6378 sched_domains_numa_distance[level++] = next_distance; 6529 /*
6379 sched_domains_numa_levels = level; 6530 * In case of sched_debug() we verify the above assumption.
6380 curr_distance = next_distance; 6531 */
6381 } else break; 6532 if (!sched_debug())
6533 break;
6382 } 6534 }
6383 /* 6535 /*
6384 * 'level' contains the number of unique distances, excluding the 6536 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6555,7 @@ static void sched_init_numa(void)
6403 return; 6555 return;
6404 6556
6405 for (j = 0; j < nr_node_ids; j++) { 6557 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6558 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6407 if (!mask) 6559 if (!mask)
6408 return; 6560 return;
6409 6561
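The sched_init_numa() hunks above derive the NUMA topology levels by repeatedly searching the whole node_distance() matrix (the new k loop) for the next-larger distance, warning when the matrix is asymmetric or node 0 is not representative. The standalone program below runs the same next-larger-distance extraction, restructured as its own loop, over a hard-coded 4-node table; the table values are invented for illustration.

#include <stdio.h>

#define NR_NODES 4

/* illustrative SLIT-style table: 10 = local, larger = farther */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES * NR_NODES];
	int nlevels = 0;
	int curr = node_distance[0][0];

	for (;;) {
		int next = curr;

		/* find the smallest distance strictly larger than curr */
		for (int i = 0; i < NR_NODES; i++)
			for (int k = 0; k < NR_NODES; k++) {
				int d = node_distance[i][k];

				if (d > curr && (d < next || next == curr))
					next = d;
			}
		if (next == curr)
			break;                  /* no larger distance left */
		levels[nlevels++] = next;
		curr = next;
	}

	printf("%d unique non-local distances:", nlevels);
	for (int i = 0; i < nlevels; i++)
		printf(" %d", levels[i]);
	printf("\n");
	return 0;
}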
@@ -6490,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6490 6642
6491 *per_cpu_ptr(sdd->sg, j) = sg; 6643 *per_cpu_ptr(sdd->sg, j) = sg;
6492 6644
6493 sgp = kzalloc_node(sizeof(struct sched_group_power), 6645 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6494 GFP_KERNEL, cpu_to_node(j)); 6646 GFP_KERNEL, cpu_to_node(j));
6495 if (!sgp) 6647 if (!sgp)
6496 return -ENOMEM; 6648 return -ENOMEM;
@@ -6543,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6543 if (!sd) 6695 if (!sd)
6544 return child; 6696 return child;
6545 6697
6546 set_domain_attribute(sd, attr);
6547 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6698 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6548 if (child) { 6699 if (child) {
6549 sd->level = child->level + 1; 6700 sd->level = child->level + 1;
@@ -6551,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6551 child->parent = sd; 6702 child->parent = sd;
6552 } 6703 }
6553 sd->child = child; 6704 sd->child = child;
6705 set_domain_attribute(sd, attr);
6554 6706
6555 return sd; 6707 return sd;
6556} 6708}
@@ -6691,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6691 if (!doms_cur) 6843 if (!doms_cur)
6692 doms_cur = &fallback_doms; 6844 doms_cur = &fallback_doms;
6693 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6845 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6694 dattr_cur = NULL;
6695 err = build_sched_domains(doms_cur[0], NULL); 6846 err = build_sched_domains(doms_cur[0], NULL);
6696 register_sched_domain_sysctl(); 6847 register_sched_domain_sysctl();
6697 6848
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3503unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3504{ 3504{
3505 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3506 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507 3507
3508 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3514
3515 total = sched_avg_period() + (rq->clock - age_stamp);
3509 3516
3510 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3511 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3512 available = 0; 3519 available = 0;
3513 } else { 3520 } else {
3514 available = total - rq->rt_avg; 3521 available = total - avg;
3515 } 3522 }
3516 3523
3517 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
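scale_rt_power() above now snapshots rq->age_stamp and rq->rt_avg once before using them, so the unsigned subtraction cannot be fed values taken at two different moments. The underflow guard on its own, with made-up numbers; available_power is an illustrative name, not the kernel function.

#include <stdint.h>
#include <stdio.h>

/* compute "available = total - avg" without letting the unsigned math wrap */
static uint64_t available_power(uint64_t period, uint64_t clock,
				uint64_t age_stamp, uint64_t avg)
{
	uint64_t total = period + (clock - age_stamp);

	/* guard before subtracting, as the patch does */
	return total < avg ? 0 : total - avg;
}

int main(void)
{
	/* a stale avg larger than total would otherwise wrap to ~2^64 */
	printf("%llu\n", (unsigned long long)available_power(1000, 50, 40, 2000));
	printf("%llu\n", (unsigned long long)available_power(1000, 50, 40, 300));
	return 0;
}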
@@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3574 3581
3575 power = 0; 3582 power = 0;
3576 3583
3577 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3578 do { 3585 /*
3579 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3580 group = group->next; 3587 * span the current group.
3581 } while (group != child->groups); 3588 */
3582 3589
3583 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3584} 3606}
3585 3607
3586/* 3608/*
@@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3610 3632
3611/** 3633/**
3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3613 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3614 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3615 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3616 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
@@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3630 int i; 3652 int i;
3631 3653
3632 if (local_group) 3654 if (local_group)
3633 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3634 3656
3635 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3636 max_cpu_load = 0; 3658 max_cpu_load = 0;
@@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3645 3667
3646 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3647 if (local_group) { 3669 if (local_group) {
3648 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3649 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3650 balance_cpu = i; 3673 balance_cpu = i;
3651 } 3674 }
@@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3719 3742
3720/** 3743/**
3721 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3722 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3723 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3724 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3725 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3726 * @this_cpu: the current cpu
3727 * 3749 *
3728 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3729 * busiest group. 3751 * busiest group.
@@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3761 3783
3762/** 3784/**
3763 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3764 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3768 * @balance: Should we balance. 3788 * @balance: Should we balance.
3769 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3852 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
3853 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
3854 * 3874 *
3855 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
3856 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3858 * @imbalance: returns amount of imbalanced due to packing.
3859 */ 3877 */
3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3861{ 3879{
@@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3881 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3882 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
3883 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
3884 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3886 * @imbalance: Variable to store the imbalance.
3887 */ 3904 */
3888static inline 3905static inline
3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
@@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4026 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4027 * to restore balance. 4044 * to restore balance.
4028 * 4045 *
4029 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4030 * @this_cpu: The cpu for which load balancing is currently being performed.
4031 * @imbalance: Variable which stores amount of weighted load which should
4032 * be moved to restore balance/put a group to idle.
4033 * @idle: The idle status of this_cpu.
4034 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4035 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4036 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1817 * Only update if the process changes its state from whether it 1823 * Only update if the process changes its state from whether it
1818 * can migrate or not. 1824 * can migrate or not.
1819 */ 1825 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1821 return; 1827 return;
1822 1828
1823 rq = task_rq(p); 1829 rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1979 1985
1980static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1981{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1982 update_curr_rt(rq); 1990 update_curr_rt(rq);
1983 1991
1984 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
1997 2005
1998 /* 2006 /*
1999 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are not
2000 * on the queue: 2008 * the only element on the queue
2001 */ 2009 */
2002 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2003 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2004 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2005 } 2016 }
2006} 2017}
2007 2018
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e1a797e028a3..98f60c5caa1b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void)
31 per_cpu(idle_threads, smp_processor_id()) = current; 31 per_cpu(idle_threads, smp_processor_id()) = current;
32} 32}
33 33
34/**
35 * idle_init - Initialize the idle thread for a cpu
36 * @cpu: The cpu for which the idle thread should be initialized
37 *
38 * Creates the thread if it does not exist.
39 */
34static inline void idle_init(unsigned int cpu) 40static inline void idle_init(unsigned int cpu)
35{ 41{
36 struct task_struct *tsk = per_cpu(idle_threads, cpu); 42 struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu)
45} 51}
46 52
47/** 53/**
48 * idle_thread_init - Initialize the idle thread for a cpu 54 * idle_threads_init - Initialize idle threads for all cpus
49 * @cpu: The cpu for which the idle thread should be initialized
50 *
51 * Creates the thread if it does not exist.
52 */ 55 */
53void __init idle_threads_init(void) 56void __init idle_threads_init(void)
54{ 57{
55 unsigned int cpu; 58 unsigned int cpu, boot_cpu;
59
60 boot_cpu = smp_processor_id();
56 61
57 for_each_possible_cpu(cpu) { 62 for_each_possible_cpu(cpu) {
58 if (cpu != smp_processor_id()) 63 if (cpu != boot_cpu)
59 idle_init(cpu); 64 idle_init(cpu);
60 } 65 }
61} 66}
diff --git a/kernel/sys.c b/kernel/sys.c
index 9ff89cb9657a..e0c8ffc50d7f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,27 +1786,13 @@ SYSCALL_DEFINE1(umask, int, mask)
1786} 1786}
1787 1787
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static bool vma_flags_mismatch(struct vm_area_struct *vma,
1790 unsigned long required,
1791 unsigned long banned)
1792{
1793 return (vma->vm_flags & required) != required ||
1794 (vma->vm_flags & banned);
1795}
1796
1797static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1798{ 1790{
1791 struct vm_area_struct *vma;
1799 struct file *exe_file; 1792 struct file *exe_file;
1800 struct dentry *dentry; 1793 struct dentry *dentry;
1801 int err; 1794 int err;
1802 1795
1803 /*
1804 * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
1805 * remain. So perform a quick test first.
1806 */
1807 if (mm->num_exe_file_vmas)
1808 return -EBUSY;
1809
1810 exe_file = fget(fd); 1796 exe_file = fget(fd);
1811 if (!exe_file) 1797 if (!exe_file)
1812 return -EBADF; 1798 return -EBADF;
@@ -1827,17 +1813,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1827 if (err) 1813 if (err)
1828 goto exit; 1814 goto exit;
1829 1815
1816 down_write(&mm->mmap_sem);
1817
1818 /*
1819 * Forbid mm->exe_file change if there are mapped other files.
1820 */
1821 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path,
1824 &exe_file->f_path))
1825 goto exit_unlock;
1826 }
1827
1830 /* 1828 /*
1831 * The symlink can be changed only once, just to disallow arbitrary 1829 * The symlink can be changed only once, just to disallow arbitrary
1832 * transitions malicious software might bring in. This means one 1830 * transitions malicious software might bring in. This means one
1833 * could make a snapshot over all processes running and monitor 1831 * could make a snapshot over all processes running and monitor
1834 * /proc/pid/exe changes to notice unusual activity if needed. 1832 * /proc/pid/exe changes to notice unusual activity if needed.
1835 */ 1833 */
1836 down_write(&mm->mmap_sem); 1834 err = -EPERM;
1837 if (likely(!mm->exe_file)) 1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1838 set_mm_exe_file(mm, exe_file); 1836 goto exit_unlock;
1839 else 1837
1840 err = -EBUSY; 1838 set_mm_exe_file(mm, exe_file);
1839exit_unlock:
1841 up_write(&mm->mmap_sem); 1840 up_write(&mm->mmap_sem);
1842 1841
1843exit: 1842exit:
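prctl_set_mm_exe_file() above now checks, under mmap_sem, that no other file is mapped and that MMF_EXE_FILE_CHANGED has never been set, using test_and_set_bit() as a one-shot gate so /proc/pid/exe can be rewritten at most once. The one-shot part maps directly onto a C11 atomic flag; change_exe_symlink below is an illustrative name, not the kernel function.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag exe_file_changed = ATOMIC_FLAG_INIT;

/* allow the symlink to be changed exactly once; later callers are refused */
static int change_exe_symlink(const char *path)
{
	if (atomic_flag_test_and_set(&exe_file_changed))
		return -1;                      /* already changed once */

	printf("exe symlink now points at %s\n", path);
	return 0;
}

int main(void)
{
	printf("first:  %d\n", change_exe_symlink("/usr/bin/new-exe"));
	printf("second: %d\n", change_exe_symlink("/usr/bin/other"));  /* refused */
	return 0;
}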
@@ -1862,7 +1861,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1862 if (opt == PR_SET_MM_EXE_FILE) 1861 if (opt == PR_SET_MM_EXE_FILE)
1863 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1862 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1864 1863
1865 if (addr >= TASK_SIZE) 1864 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1866 return -EINVAL; 1865 return -EINVAL;
1867 1866
1868 error = -EINVAL; 1867 error = -EINVAL;
@@ -1924,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
1924 error = -EFAULT; 1923 error = -EFAULT;
1925 goto out; 1924 goto out;
1926 } 1925 }
1927#ifdef CONFIG_STACK_GROWSUP
1928 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
1929#else
1930 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
1931#endif
1932 goto out;
1933 if (opt == PR_SET_MM_START_STACK) 1926 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr; 1927 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START) 1928 else if (opt == PR_SET_MM_ARG_START)
@@ -1981,12 +1974,22 @@ out:
1981 up_read(&mm->mmap_sem); 1974 up_read(&mm->mmap_sem);
1982 return error; 1975 return error;
1983} 1976}
1977
1978static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1979{
1980 return put_user(me->clear_child_tid, tid_addr);
1981}
1982
1984#else /* CONFIG_CHECKPOINT_RESTORE */ 1983#else /* CONFIG_CHECKPOINT_RESTORE */
1985static int prctl_set_mm(int opt, unsigned long addr, 1984static int prctl_set_mm(int opt, unsigned long addr,
1986 unsigned long arg4, unsigned long arg5) 1985 unsigned long arg4, unsigned long arg5)
1987{ 1986{
1988 return -EINVAL; 1987 return -EINVAL;
1989} 1988}
1989static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1990{
1991 return -EINVAL;
1992}
1990#endif 1993#endif
1991 1994
1992SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1995SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2141,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2141 case PR_SET_MM: 2144 case PR_SET_MM:
2142 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2145 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2143 break; 2146 break;
2147 case PR_GET_TID_ADDRESS:
2148 error = prctl_get_tid_address(me, (int __user **)arg2);
2149 break;
2144 case PR_SET_CHILD_SUBREAPER: 2150 case PR_SET_CHILD_SUBREAPER:
2145 me->signal->is_child_subreaper = !!arg2; 2151 me->signal->is_child_subreaper = !!arg2;
2146 error = 0; 2152 error = 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
297} 297}
298EXPORT_SYMBOL_GPL(clockevents_register_device); 298EXPORT_SYMBOL_GPL(clockevents_register_device);
299 299
300static void clockevents_config(struct clock_event_device *dev, 300void clockevents_config(struct clock_event_device *dev, u32 freq)
301 u32 freq)
302{ 301{
303 u64 sec; 302 u64 sec;
304 303
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..869997833928 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
274static void tick_nohz_stop_sched_tick(struct tick_sched *ts) 274static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
275{ 275{
276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
277 unsigned long rcu_delta_jiffies;
277 ktime_t last_update, expires, now; 278 ktime_t last_update, expires, now;
278 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
279 u64 time_delta; 280 u64 time_delta;
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
322 time_delta = timekeeping_max_deferment(); 323 time_delta = timekeeping_max_deferment();
323 } while (read_seqretry(&xtime_lock, seq)); 324 } while (read_seqretry(&xtime_lock, seq));
324 325
325 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 326 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
326 arch_needs_cpu(cpu)) { 327 arch_needs_cpu(cpu)) {
327 next_jiffies = last_jiffies + 1; 328 next_jiffies = last_jiffies + 1;
328 delta_jiffies = 1; 329 delta_jiffies = 1;
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
330 /* Get the next timer wheel timer */ 331 /* Get the next timer wheel timer */
331 next_jiffies = get_next_timer_interrupt(last_jiffies); 332 next_jiffies = get_next_timer_interrupt(last_jiffies);
332 delta_jiffies = next_jiffies - last_jiffies; 333 delta_jiffies = next_jiffies - last_jiffies;
334 if (rcu_delta_jiffies < delta_jiffies) {
335 next_jiffies = last_jiffies + rcu_delta_jiffies;
336 delta_jiffies = rcu_delta_jiffies;
337 }
333 } 338 }
334 /* 339 /*
335 * Do not stop the tick, if we are only one off 340 * Do not stop the tick, if we are only one off
@@ -576,6 +581,7 @@ void tick_nohz_idle_exit(void)
576 /* Update jiffies first */ 581 /* Update jiffies first */
577 select_nohz_load_balancer(0); 582 select_nohz_load_balancer(0);
578 tick_do_update_jiffies64(now); 583 tick_do_update_jiffies64(now);
584 update_cpu_load_nohz();
579 585
580#ifndef CONFIG_VIRT_CPU_ACCOUNTING 586#ifndef CONFIG_VIRT_CPU_ACCOUNTING
581 /* 587 /*
@@ -814,6 +820,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
814 return HRTIMER_RESTART; 820 return HRTIMER_RESTART;
815} 821}
816 822
823static int sched_skew_tick;
824
825static int __init skew_tick(char *str)
826{
827 get_option(&str, &sched_skew_tick);
828
829 return 0;
830}
831early_param("skew_tick", skew_tick);
832
817/** 833/**
818 * tick_setup_sched_timer - setup the tick emulation timer 834 * tick_setup_sched_timer - setup the tick emulation timer
819 */ 835 */
@@ -831,6 +847,14 @@ void tick_setup_sched_timer(void)
831 /* Get the next period (per cpu) */ 847 /* Get the next period (per cpu) */
832 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 848 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
833 849
850 /* Offset the tick to avert xtime_lock contention. */
851 if (sched_skew_tick) {
852 u64 offset = ktime_to_ns(tick_period) >> 1;
853 do_div(offset, num_possible_cpus());
854 offset *= smp_processor_id();
855 hrtimer_add_expires_ns(&ts->sched_timer, offset);
856 }
857
834 for (;;) { 858 for (;;) {
835 hrtimer_forward(&ts->sched_timer, now, tick_period); 859 hrtimer_forward(&ts->sched_timer, now, tick_period);
836 hrtimer_start_expires(&ts->sched_timer, 860 hrtimer_start_expires(&ts->sched_timer,
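The skew_tick boot parameter above offsets each CPU's tick by (tick_period / 2) / num_possible_cpus() * cpu so the per-CPU tick timers do not all contend on xtime_lock at the same instant. The arithmetic on its own, assuming a 1 ms tick (HZ=1000) and 8 CPUs, both of which are made-up values for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t tick_period_ns = 1000000;   /* assumes HZ=1000 */
	const unsigned int num_cpus = 8;

	for (unsigned int cpu = 0; cpu < num_cpus; cpu++) {
		/* same arithmetic as the patch: half a period spread over all CPUs */
		uint64_t offset = (tick_period_ns >> 1) / num_cpus * cpu;

		printf("cpu%u tick offset: %llu ns\n", cpu,
		       (unsigned long long)offset);
	}
	return 0;
}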
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6e46cacf5969..6f46a00a1e8a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
962 timekeeper.xtime.tv_sec++; 962 timekeeper.xtime.tv_sec++;
963 leap = second_overflow(timekeeper.xtime.tv_sec); 963 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap; 964 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap;
965 } 966 }
966 967
967 /* Accumulate raw time */ 968 /* Accumulate raw time */
@@ -1077,6 +1078,7 @@ static void update_wall_time(void)
1077 timekeeper.xtime.tv_sec++; 1078 timekeeper.xtime.tv_sec++;
1078 leap = second_overflow(timekeeper.xtime.tv_sec); 1079 leap = second_overflow(timekeeper.xtime.tv_sec);
1079 timekeeper.xtime.tv_sec += leap; 1080 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap;
1080 } 1082 }
1081 1083
1082 timekeeping_update(false); 1084 timekeeping_update(false);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 288488082224..a7fa0702be1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
371void tracing_off(void) 371void tracing_off(void)
372{ 372{
373 if (global_trace.buffer) 373 if (global_trace.buffer)
374 ring_buffer_record_on(global_trace.buffer); 374 ring_buffer_record_off(global_trace.buffer);
375 /* 375 /*
376 * This flag is only looked at when buffers haven't been 376 * This flag is only looked at when buffers haven't been
377 * allocated yet. We don't really care about the race 377 * allocated yet. We don't really care about the race
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e5e1d85b8c7c..4b1dfba70f7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -372,6 +372,13 @@ static int watchdog(void *unused)
372 372
373 373
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 374#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/*
376 * People like the simple clean cpu node info on boot.
377 * Reduce the watchdog noise by only printing messages
378 * that are different from what cpu0 displayed.
379 */
380static unsigned long cpu0_err;
381
375static int watchdog_nmi_enable(int cpu) 382static int watchdog_nmi_enable(int cpu)
376{ 383{
377 struct perf_event_attr *wd_attr; 384 struct perf_event_attr *wd_attr;
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu)
390 397
391 /* Try to register using hardware perf events */ 398 /* Try to register using hardware perf events */
392 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 399 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
400
401 /* save cpu0 error for future comparison */
402 if (cpu == 0 && IS_ERR(event))
403 cpu0_err = PTR_ERR(event);
404
393 if (!IS_ERR(event)) { 405 if (!IS_ERR(event)) {
394 pr_info("enabled, takes one hw-pmu counter.\n"); 406 /* only print for cpu0 or different than cpu0 */
407 if (cpu == 0 || cpu0_err)
408 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
395 goto out_save; 409 goto out_save;
396 } 410 }
397 411
412 /* skip displaying the same error again */
413 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
414 return PTR_ERR(event);
398 415
399 /* vary the KERN level based on the returned errno */ 416 /* vary the KERN level based on the returned errno */
400 if (PTR_ERR(event) == -EOPNOTSUPP) 417 if (PTR_ERR(event) == -EOPNOTSUPP)
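The watchdog_nmi_enable() hunk above (it continues past the end of this excerpt) records the perf-counter error seen on cpu0 and suppresses the identical error from every later CPU, so an unsupported PMU produces one message instead of one per CPU. A userspace model of that dedup logic; report_probe_result and the -95 (EOPNOTSUPP-like) value are illustrative.

#include <stdio.h>

static long cpu0_err;   /* error recorded while probing cpu0, 0 if it worked */

static void report_probe_result(int cpu, long err)
{
	if (cpu == 0)
		cpu0_err = err;                    /* save for later comparison */

	if (!err) {
		/* success: announce once, on cpu0 or on the first CPU that
		 * succeeds after cpu0 failed */
		if (cpu == 0 || cpu0_err)
			printf("watchdog enabled, takes one hw-pmu counter\n");
		return;
	}

	if (cpu > 0 && err == cpu0_err)
		return;                            /* same failure as cpu0: stay quiet */

	fprintf(stderr, "cpu%d: watchdog probe failed (%ld)\n", cpu, err);
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		report_probe_result(cpu, -95);     /* pretend EOPNOTSUPP everywhere */
	return 0;
}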