aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2008-11-07 04:29:58 -0500
committer Ingo Molnar <mingo@elte.hu> 2008-11-07 04:29:58 -0500
commit 258594a138f4ca9adf214f5272592d7f21def610 (patch)
tree d97ee71c997b0412f79b9ec4150cb52ce838fe13 /kernel
parent a87d091434ed2a34d647979ab12084139ee1fe41 (diff)
parent ca3273f9646694e0419cfb9d6c12deb1c9aff27c (diff)
Merge branch 'sched/urgent' into sched/core
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup.c 1
-rw-r--r-- kernel/sched.c 13
-rw-r--r-- kernel/sched_fair.c 76
-rw-r--r-- kernel/sched_features.h 1
-rw-r--r-- kernel/smp.c 18
-rw-r--r-- kernel/timer.c 129
-rw-r--r-- kernel/trace/Kconfig 2
-rw-r--r-- kernel/trace/ring_buffer.c 56
-rw-r--r-- kernel/trace/trace.c 41
9 files changed, 250 insertions, 87 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 35eebd5510c..358e77564e6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2497,7 +2497,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2497 list_del(&cgrp->sibling); 2497 list_del(&cgrp->sibling);
2498 spin_lock(&cgrp->dentry->d_lock); 2498 spin_lock(&cgrp->dentry->d_lock);
2499 d = dget(cgrp->dentry); 2499 d = dget(cgrp->dentry);
2500 cgrp->dentry = NULL;
2501 spin_unlock(&d->d_lock); 2500 spin_unlock(&d->d_lock);
2502 2501
2503 cgroup_d_remove_dir(d); 2502 cgroup_d_remove_dir(d);
diff --git a/kernel/sched.c b/kernel/sched.c
index 213cad5e50a..b24e57a10f6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -397,7 +397,7 @@ struct cfs_rq {
397 * 'curr' points to currently running entity on this cfs_rq. 397 * 'curr' points to currently running entity on this cfs_rq.
398 * It is set to NULL otherwise (i.e when none are currently running). 398 * It is set to NULL otherwise (i.e when none are currently running).
399 */ 399 */
400 struct sched_entity *curr, *next; 400 struct sched_entity *curr, *next, *last;
401 401
402 unsigned long nr_spread_over; 402 unsigned long nr_spread_over;
403 403
@@ -1785,7 +1785,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1785 /* 1785 /*
1786 * Buddy candidates are cache hot: 1786 * Buddy candidates are cache hot:
1787 */ 1787 */
1788 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) 1788 if (sched_feat(CACHE_HOT_BUDDY) &&
1789 (&p->se == cfs_rq_of(&p->se)->next ||
1790 &p->se == cfs_rq_of(&p->se)->last))
1789 return 1; 1791 return 1;
1790 1792
1791 if (p->sched_class != &fair_sched_class) 1793 if (p->sched_class != &fair_sched_class)
@@ -6832,15 +6834,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6832 struct sched_domain *tmp; 6834 struct sched_domain *tmp;
6833 6835
6834 /* Remove the sched domains which do not contribute to scheduling. */ 6836 /* Remove the sched domains which do not contribute to scheduling. */
6835 for (tmp = sd; tmp; tmp = tmp->parent) { 6837 for (tmp = sd; tmp; ) {
6836 struct sched_domain *parent = tmp->parent; 6838 struct sched_domain *parent = tmp->parent;
6837 if (!parent) 6839 if (!parent)
6838 break; 6840 break;
6841
6839 if (sd_parent_degenerate(tmp, parent)) { 6842 if (sd_parent_degenerate(tmp, parent)) {
6840 tmp->parent = parent->parent; 6843 tmp->parent = parent->parent;
6841 if (parent->parent) 6844 if (parent->parent)
6842 parent->parent->child = tmp; 6845 parent->parent->child = tmp;
6843 } 6846 } else
6847 tmp = tmp->parent;
6844 } 6848 }
6845 6849
6846 if (sd && sd_degenerate(sd)) { 6850 if (sd && sd_degenerate(sd)) {
@@ -7629,6 +7633,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7629error: 7633error:
7630 free_sched_groups(cpu_map, tmpmask); 7634 free_sched_groups(cpu_map, tmpmask);
7631 SCHED_CPUMASK_FREE((void *)allmasks); 7635 SCHED_CPUMASK_FREE((void *)allmasks);
7636 kfree(rd);
7632 return -ENOMEM; 7637 return -ENOMEM;
7633#endif 7638#endif
7634} 7639}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce514afd78f..51aa3e102ac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -341,23 +341,20 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
341 cfs_rq->rb_leftmost = next_node; 341 cfs_rq->rb_leftmost = next_node;
342 } 342 }
343 343
344 if (cfs_rq->next == se)
345 cfs_rq->next = NULL;
346
347 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 344 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
348} 345}
349 346
350static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
351{
352 return cfs_rq->rb_leftmost;
353}
354
355static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 347static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
356{ 348{
357 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 349 struct rb_node *left = cfs_rq->rb_leftmost;
350
351 if (!left)
352 return NULL;
353
354 return rb_entry(left, struct sched_entity, run_node);
358} 355}
359 356
360static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 357static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
361{ 358{
362 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 359 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
363 360
@@ -741,6 +738,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
741#endif 738#endif
742 } 739 }
743 740
741 if (cfs_rq->last == se)
742 cfs_rq->last = NULL;
743
744 if (cfs_rq->next == se)
745 cfs_rq->next = NULL;
746
744 if (se != cfs_rq->curr) 747 if (se != cfs_rq->curr)
745 __dequeue_entity(cfs_rq, se); 748 __dequeue_entity(cfs_rq, se);
746 account_entity_dequeue(cfs_rq, se); 749 account_entity_dequeue(cfs_rq, se);
@@ -794,24 +797,15 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
794static int 797static int
795wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 798wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
796 799
797static struct sched_entity *
798pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
799{
800 if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
801 return se;
802
803 return cfs_rq->next;
804}
805
806static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 800static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
807{ 801{
808 struct sched_entity *se = NULL; 802 struct sched_entity *se = __pick_next_entity(cfs_rq);
809 803
810 if (first_fair(cfs_rq)) { 804 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
811 se = __pick_next_entity(cfs_rq); 805 return cfs_rq->next;
812 se = pick_next(cfs_rq, se); 806
813 set_next_entity(cfs_rq, se); 807 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
814 } 808 return cfs_rq->last;
815 809
816 return se; 810 return se;
817} 811}
@@ -1325,26 +1319,53 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1325 return 0; 1319 return 0;
1326} 1320}
1327 1321
1322static void set_last_buddy(struct sched_entity *se)
1323{
1324 for_each_sched_entity(se)
1325 cfs_rq_of(se)->last = se;
1326}
1327
1328static void set_next_buddy(struct sched_entity *se)
1329{
1330 for_each_sched_entity(se)
1331 cfs_rq_of(se)->next = se;
1332}
1333
1328/* 1334/*
1329 * Preempt the current task with a newly woken task if needed: 1335 * Preempt the current task with a newly woken task if needed:
1330 */ 1336 */
1331static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1337static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1332{ 1338{
1333 struct task_struct *curr = rq->curr; 1339 struct task_struct *curr = rq->curr;
1334 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1335 struct sched_entity *se = &curr->se, *pse = &p->se; 1340 struct sched_entity *se = &curr->se, *pse = &p->se;
1336 1341
1337 if (unlikely(rt_prio(p->prio))) { 1342 if (unlikely(rt_prio(p->prio))) {
1343 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1344
1338 update_rq_clock(rq); 1345 update_rq_clock(rq);
1339 update_curr(cfs_rq); 1346 update_curr(cfs_rq);
1340 resched_task(curr); 1347 resched_task(curr);
1341 return; 1348 return;
1342 } 1349 }
1343 1350
1351 if (unlikely(p->sched_class != &fair_sched_class))
1352 return;
1353
1344 if (unlikely(se == pse)) 1354 if (unlikely(se == pse))
1345 return; 1355 return;
1346 1356
1347 cfs_rq_of(pse)->next = pse; 1357 /*
1358 * Only set the backward buddy when the current task is still on the
1359 * rq. This can happen when a wakeup gets interleaved with schedule on
1360 * the ->pre_schedule() or idle_balance() point, either of which can
1361 * drop the rq lock.
1362 *
1363 * Also, during early boot the idle thread is in the fair class, for
1364 * obvious reasons it's a bad idea to schedule back to the idle thread.
1365 */
1366 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1367 set_last_buddy(se);
1368 set_next_buddy(pse);
1348 1369
1349 /* 1370 /*
1350 * We can come here with TIF_NEED_RESCHED already set from new task 1371 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1396,6 +1417,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1396 1417
1397 do { 1418 do {
1398 se = pick_next_entity(cfs_rq); 1419 se = pick_next_entity(cfs_rq);
1420 set_next_entity(cfs_rq, se);
1399 cfs_rq = group_cfs_rq(se); 1421 cfs_rq = group_cfs_rq(se);
1400 } while (cfs_rq); 1422 } while (cfs_rq);
1401 1423
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fda01621829..da5d93b5d2c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0) 14SCHED_FEAT(WAKEUP_OVERLAP, 0)
15SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a855377..75c8dde58c5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
51{ 51{
52 /* Wait for response */ 52 /* Wait for response */
53 do { 53 do {
54 /*
55 * We need to see the flags store in the IPI handler
56 */
57 smp_mb();
58 if (!(data->flags & CSD_FLAG_WAIT)) 54 if (!(data->flags & CSD_FLAG_WAIT))
59 break; 55 break;
60 cpu_relax(); 56 cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
76 list_add_tail(&data->list, &dst->list); 72 list_add_tail(&data->list, &dst->list);
77 spin_unlock_irqrestore(&dst->lock, flags); 73 spin_unlock_irqrestore(&dst->lock, flags);
78 74
75 /*
76 * Make the list addition visible before sending the ipi.
77 */
78 smp_mb();
79
79 if (ipi) 80 if (ipi)
80 arch_send_call_function_single_ipi(cpu); 81 arch_send_call_function_single_ipi(cpu);
81 82
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
157 * Need to see other stores to list head for checking whether 158 * Need to see other stores to list head for checking whether
158 * list is empty without holding q->lock 159 * list is empty without holding q->lock
159 */ 160 */
160 smp_mb(); 161 smp_read_barrier_depends();
161 while (!list_empty(&q->list)) { 162 while (!list_empty(&q->list)) {
162 unsigned int data_flags; 163 unsigned int data_flags;
163 164
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
191 /* 192 /*
192 * See comment on outer loop 193 * See comment on outer loop
193 */ 194 */
194 smp_mb(); 195 smp_read_barrier_depends();
195 } 196 }
196} 197}
197 198
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
370 list_add_tail_rcu(&data->csd.list, &call_function_queue); 371 list_add_tail_rcu(&data->csd.list, &call_function_queue);
371 spin_unlock_irqrestore(&call_function_lock, flags); 372 spin_unlock_irqrestore(&call_function_lock, flags);
372 373
374 /*
375 * Make the list addition visible before sending the ipi.
376 */
377 smp_mb();
378
373 /* Send a message to all CPUs in the map */ 379 /* Send a message to all CPUs in the map */
374 arch_send_call_function_ipi(mask); 380 arch_send_call_function_ipi(mask);
375 381
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c5..dbd50fabe4c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
113} 113}
114 114
115/** 115static unsigned long round_jiffies_common(unsigned long j, int cpu,
116 * __round_jiffies - function to round jiffies to a full second 116 bool force_up)
117 * @j: the time in (absolute) jiffies that should be rounded
118 * @cpu: the processor number on which the timeout will happen
119 *
120 * __round_jiffies() rounds an absolute time in the future (in jiffies)
121 * up or down to (approximately) full seconds. This is useful for timers
122 * for which the exact time they fire does not matter too much, as long as
123 * they fire approximately every X seconds.
124 *
125 * By rounding these timers to whole seconds, all such timers will fire
126 * at the same time, rather than at various times spread out. The goal
127 * of this is to have the CPU wake up less, which saves power.
128 *
129 * The exact rounding is skewed for each processor to avoid all
130 * processors firing at the exact same time, which could lead
131 * to lock contention or spurious cache line bouncing.
132 *
133 * The return value is the rounded version of the @j parameter.
134 */
135unsigned long __round_jiffies(unsigned long j, int cpu)
136{ 117{
137 int rem; 118 int rem;
138 unsigned long original = j; 119 unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
154 * due to delays of the timer irq, long irq off times etc etc) then 135 * due to delays of the timer irq, long irq off times etc etc) then
155 * we should round down to the whole second, not up. Use 1/4th second 136 * we should round down to the whole second, not up. Use 1/4th second
156 * as cutoff for this rounding as an extreme upper bound for this. 137 * as cutoff for this rounding as an extreme upper bound for this.
138 * But never round down if @force_up is set.
157 */ 139 */
158 if (rem < HZ/4) /* round down */ 140 if (rem < HZ/4 && !force_up) /* round down */
159 j = j - rem; 141 j = j - rem;
160 else /* round up */ 142 else /* round up */
161 j = j - rem + HZ; 143 j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
167 return original; 149 return original;
168 return j; 150 return j;
169} 151}
152
153/**
154 * __round_jiffies - function to round jiffies to a full second
155 * @j: the time in (absolute) jiffies that should be rounded
156 * @cpu: the processor number on which the timeout will happen
157 *
158 * __round_jiffies() rounds an absolute time in the future (in jiffies)
159 * up or down to (approximately) full seconds. This is useful for timers
160 * for which the exact time they fire does not matter too much, as long as
161 * they fire approximately every X seconds.
162 *
163 * By rounding these timers to whole seconds, all such timers will fire
164 * at the same time, rather than at various times spread out. The goal
165 * of this is to have the CPU wake up less, which saves power.
166 *
167 * The exact rounding is skewed for each processor to avoid all
168 * processors firing at the exact same time, which could lead
169 * to lock contention or spurious cache line bouncing.
170 *
171 * The return value is the rounded version of the @j parameter.
172 */
173unsigned long __round_jiffies(unsigned long j, int cpu)
174{
175 return round_jiffies_common(j, cpu, false);
176}
170EXPORT_SYMBOL_GPL(__round_jiffies); 177EXPORT_SYMBOL_GPL(__round_jiffies);
171 178
172/** 179/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
191 */ 198 */
192unsigned long __round_jiffies_relative(unsigned long j, int cpu) 199unsigned long __round_jiffies_relative(unsigned long j, int cpu)
193{ 200{
194 /* 201 unsigned long j0 = jiffies;
195 * In theory the following code can skip a jiffy in case jiffies 202
196 * increments right between the addition and the later subtraction. 203 /* Use j0 because jiffies might change while we run */
197 * However since the entire point of this function is to use approximate 204 return round_jiffies_common(j + j0, cpu, false) - j0;
198 * timeouts, it's entirely ok to not handle that.
199 */
200 return __round_jiffies(j + jiffies, cpu) - jiffies;
201} 205}
202EXPORT_SYMBOL_GPL(__round_jiffies_relative); 206EXPORT_SYMBOL_GPL(__round_jiffies_relative);
203 207
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
218 */ 222 */
219unsigned long round_jiffies(unsigned long j) 223unsigned long round_jiffies(unsigned long j)
220{ 224{
221 return __round_jiffies(j, raw_smp_processor_id()); 225 return round_jiffies_common(j, raw_smp_processor_id(), false);
222} 226}
223EXPORT_SYMBOL_GPL(round_jiffies); 227EXPORT_SYMBOL_GPL(round_jiffies);
224 228
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
243} 247}
244EXPORT_SYMBOL_GPL(round_jiffies_relative); 248EXPORT_SYMBOL_GPL(round_jiffies_relative);
245 249
250/**
251 * __round_jiffies_up - function to round jiffies up to a full second
252 * @j: the time in (absolute) jiffies that should be rounded
253 * @cpu: the processor number on which the timeout will happen
254 *
255 * This is the same as __round_jiffies() except that it will never
256 * round down. This is useful for timeouts for which the exact time
257 * of firing does not matter too much, as long as they don't fire too
258 * early.
259 */
260unsigned long __round_jiffies_up(unsigned long j, int cpu)
261{
262 return round_jiffies_common(j, cpu, true);
263}
264EXPORT_SYMBOL_GPL(__round_jiffies_up);
265
266/**
267 * __round_jiffies_up_relative - function to round jiffies up to a full second
268 * @j: the time in (relative) jiffies that should be rounded
269 * @cpu: the processor number on which the timeout will happen
270 *
271 * This is the same as __round_jiffies_relative() except that it will never
272 * round down. This is useful for timeouts for which the exact time
273 * of firing does not matter too much, as long as they don't fire too
274 * early.
275 */
276unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
277{
278 unsigned long j0 = jiffies;
279
280 /* Use j0 because jiffies might change while we run */
281 return round_jiffies_common(j + j0, cpu, true) - j0;
282}
283EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
284
285/**
286 * round_jiffies_up - function to round jiffies up to a full second
287 * @j: the time in (absolute) jiffies that should be rounded
288 *
289 * This is the same as round_jiffies() except that it will never
290 * round down. This is useful for timeouts for which the exact time
291 * of firing does not matter too much, as long as they don't fire too
292 * early.
293 */
294unsigned long round_jiffies_up(unsigned long j)
295{
296 return round_jiffies_common(j, raw_smp_processor_id(), true);
297}
298EXPORT_SYMBOL_GPL(round_jiffies_up);
299
300/**
301 * round_jiffies_up_relative - function to round jiffies up to a full second
302 * @j: the time in (relative) jiffies that should be rounded
303 *
304 * This is the same as round_jiffies_relative() except that it will never
305 * round down. This is useful for timeouts for which the exact time
306 * of firing does not matter too much, as long as they don't fire too
307 * early.
308 */
309unsigned long round_jiffies_up_relative(unsigned long j)
310{
311 return __round_jiffies_up_relative(j, raw_smp_processor_id());
312}
313EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
314
246 315
247static inline void set_running_timer(struct tvec_base *base, 316static inline void set_running_timer(struct tvec_base *base,
248 struct timer_list *timer) 317 struct timer_list *timer)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b58f43bec36..33dbefd471e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -25,7 +25,7 @@ config TRACING
25 bool 25 bool
26 select DEBUG_FS 26 select DEBUG_FS
27 select RING_BUFFER 27 select RING_BUFFER
28 select STACKTRACE 28 select STACKTRACE if STACKTRACE_SUPPORT
29 select TRACEPOINTS 29 select TRACEPOINTS
30 select NOP_TRACER 30 select NOP_TRACER
31 31
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cedf4e26828..3f338063864 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1022,8 +1022,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1022 struct ring_buffer_event *event; 1022 struct ring_buffer_event *event;
1023 u64 ts, delta; 1023 u64 ts, delta;
1024 int commit = 0; 1024 int commit = 0;
1025 int nr_loops = 0;
1025 1026
1026 again: 1027 again:
1028 /*
1029 * We allow for interrupts to reenter here and do a trace.
1030 * If one does, it will cause this original code to loop
1031 * back here. Even with heavy interrupts happening, this
1032 * should only happen a few times in a row. If this happens
1033 * 1000 times in a row, there must be either an interrupt
1034 * storm or we have something buggy.
1035 * Bail!
1036 */
1037 if (unlikely(++nr_loops > 1000)) {
1038 RB_WARN_ON(cpu_buffer, 1);
1039 return NULL;
1040 }
1041
1027 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1042 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1028 1043
1029 /* 1044 /*
@@ -1532,10 +1547,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1532{ 1547{
1533 struct buffer_page *reader = NULL; 1548 struct buffer_page *reader = NULL;
1534 unsigned long flags; 1549 unsigned long flags;
1550 int nr_loops = 0;
1535 1551
1536 spin_lock_irqsave(&cpu_buffer->lock, flags); 1552 spin_lock_irqsave(&cpu_buffer->lock, flags);
1537 1553
1538 again: 1554 again:
1555 /*
1556 * This should normally only loop twice. But because the
1557 * start of the reader inserts an empty page, it causes
1558 * a case where we will loop three times. There should be no
1559 * reason to loop four times (that I know of).
1560 */
1561 if (unlikely(++nr_loops > 3)) {
1562 RB_WARN_ON(cpu_buffer, 1);
1563 reader = NULL;
1564 goto out;
1565 }
1566
1539 reader = cpu_buffer->reader_page; 1567 reader = cpu_buffer->reader_page;
1540 1568
1541 /* If there's more to read, return this page */ 1569 /* If there's more to read, return this page */
@@ -1665,6 +1693,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1665 struct ring_buffer_per_cpu *cpu_buffer; 1693 struct ring_buffer_per_cpu *cpu_buffer;
1666 struct ring_buffer_event *event; 1694 struct ring_buffer_event *event;
1667 struct buffer_page *reader; 1695 struct buffer_page *reader;
1696 int nr_loops = 0;
1668 1697
1669 if (!cpu_isset(cpu, buffer->cpumask)) 1698 if (!cpu_isset(cpu, buffer->cpumask))
1670 return NULL; 1699 return NULL;
@@ -1672,6 +1701,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1672 cpu_buffer = buffer->buffers[cpu]; 1701 cpu_buffer = buffer->buffers[cpu];
1673 1702
1674 again: 1703 again:
1704 /*
1705 * We repeat when a timestamp is encountered. It is possible
1706 * to get multiple timestamps from an interrupt entering just
1707 * as one timestamp is about to be written. The maximum number of
1708 * times this can happen is the number of nested interrupts we
1709 * can have. Nesting 10 deep of interrupts is clearly
1710 * an anomaly.
1711 */
1712 if (unlikely(++nr_loops > 10)) {
1713 RB_WARN_ON(cpu_buffer, 1);
1714 return NULL;
1715 }
1716
1675 reader = rb_get_reader_page(cpu_buffer); 1717 reader = rb_get_reader_page(cpu_buffer);
1676 if (!reader) 1718 if (!reader)
1677 return NULL; 1719 return NULL;
@@ -1722,6 +1764,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1722 struct ring_buffer *buffer; 1764 struct ring_buffer *buffer;
1723 struct ring_buffer_per_cpu *cpu_buffer; 1765 struct ring_buffer_per_cpu *cpu_buffer;
1724 struct ring_buffer_event *event; 1766 struct ring_buffer_event *event;
1767 int nr_loops = 0;
1725 1768
1726 if (ring_buffer_iter_empty(iter)) 1769 if (ring_buffer_iter_empty(iter))
1727 return NULL; 1770 return NULL;
@@ -1730,6 +1773,19 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1730 buffer = cpu_buffer->buffer; 1773 buffer = cpu_buffer->buffer;
1731 1774
1732 again: 1775 again:
1776 /*
1777 * We repeat when a timestamp is encountered. It is possible
1778 * to get multiple timestamps from an interrupt entering just
1779 * as one timestamp is about to be written. The maximum number of
1780 * times this can happen is the number of nested interrupts we
1781 * can have. Nesting 10 deep of interrupts is clearly
1782 * an anomaly.
1783 */
1784 if (unlikely(++nr_loops > 10)) {
1785 RB_WARN_ON(cpu_buffer, 1);
1786 return NULL;
1787 }
1788
1733 if (rb_per_cpu_empty(cpu_buffer)) 1789 if (rb_per_cpu_empty(cpu_buffer))
1734 return NULL; 1790 return NULL;
1735 1791
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a499e2adae..9f3b478f917 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -705,6 +705,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
705 unsigned long flags, 705 unsigned long flags,
706 int skip, int pc) 706 int skip, int pc)
707{ 707{
708#ifdef CONFIG_STACKTRACE
708 struct ring_buffer_event *event; 709 struct ring_buffer_event *event;
709 struct stack_entry *entry; 710 struct stack_entry *entry;
710 struct stack_trace trace; 711 struct stack_trace trace;
@@ -730,6 +731,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
730 731
731 save_stack_trace(&trace); 732 save_stack_trace(&trace);
732 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 733 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
734#endif
733} 735}
734 736
735void __trace_stack(struct trace_array *tr, 737void __trace_stack(struct trace_array *tr,
@@ -1086,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p)
1086 mutex_unlock(&trace_types_lock); 1088 mutex_unlock(&trace_types_lock);
1087} 1089}
1088 1090
1089#define KRETPROBE_MSG "[unknown/kretprobe'd]"
1090
1091#ifdef CONFIG_KRETPROBES 1091#ifdef CONFIG_KRETPROBES
1092static inline int kretprobed(unsigned long addr) 1092static inline const char *kretprobed(const char *name)
1093{ 1093{
1094 return addr == (unsigned long)kretprobe_trampoline; 1094 static const char tramp_name[] = "kretprobe_trampoline";
1095 int size = sizeof(tramp_name);
1096
1097 if (strncmp(tramp_name, name, size) == 0)
1098 return "[unknown/kretprobe'd]";
1099 return name;
1095} 1100}
1096#else 1101#else
1097static inline int kretprobed(unsigned long addr) 1102static inline const char *kretprobed(const char *name)
1098{ 1103{
1099 return 0; 1104 return name;
1100} 1105}
1101#endif /* CONFIG_KRETPROBES */ 1106#endif /* CONFIG_KRETPROBES */
1102 1107
@@ -1105,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1105{ 1110{
1106#ifdef CONFIG_KALLSYMS 1111#ifdef CONFIG_KALLSYMS
1107 char str[KSYM_SYMBOL_LEN]; 1112 char str[KSYM_SYMBOL_LEN];
1113 const char *name;
1108 1114
1109 kallsyms_lookup(address, NULL, NULL, NULL, str); 1115 kallsyms_lookup(address, NULL, NULL, NULL, str);
1110 1116
1111 return trace_seq_printf(s, fmt, str); 1117 name = kretprobed(str);
1118
1119 return trace_seq_printf(s, fmt, name);
1112#endif 1120#endif
1113 return 1; 1121 return 1;
1114} 1122}
@@ -1119,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1119{ 1127{
1120#ifdef CONFIG_KALLSYMS 1128#ifdef CONFIG_KALLSYMS
1121 char str[KSYM_SYMBOL_LEN]; 1129 char str[KSYM_SYMBOL_LEN];
1130 const char *name;
1122 1131
1123 sprint_symbol(str, address); 1132 sprint_symbol(str, address);
1124 return trace_seq_printf(s, fmt, str); 1133 name = kretprobed(str);
1134
1135 return trace_seq_printf(s, fmt, name);
1125#endif 1136#endif
1126 return 1; 1137 return 1;
1127} 1138}
@@ -1375,10 +1386,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1375 1386
1376 seq_print_ip_sym(s, field->ip, sym_flags); 1387 seq_print_ip_sym(s, field->ip, sym_flags);
1377 trace_seq_puts(s, " ("); 1388 trace_seq_puts(s, " (");
1378 if (kretprobed(field->parent_ip)) 1389 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1379 trace_seq_puts(s, KRETPROBE_MSG);
1380 else
1381 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1382 trace_seq_puts(s, ")\n"); 1390 trace_seq_puts(s, ")\n");
1383 break; 1391 break;
1384 } 1392 }
@@ -1494,12 +1502,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1494 ret = trace_seq_printf(s, " <-"); 1502 ret = trace_seq_printf(s, " <-");
1495 if (!ret) 1503 if (!ret)
1496 return TRACE_TYPE_PARTIAL_LINE; 1504 return TRACE_TYPE_PARTIAL_LINE;
1497 if (kretprobed(field->parent_ip)) 1505 ret = seq_print_ip_sym(s,
1498 ret = trace_seq_puts(s, KRETPROBE_MSG); 1506 field->parent_ip,
1499 else 1507 sym_flags);
1500 ret = seq_print_ip_sym(s,
1501 field->parent_ip,
1502 sym_flags);
1503 if (!ret) 1508 if (!ret)
1504 return TRACE_TYPE_PARTIAL_LINE; 1509 return TRACE_TYPE_PARTIAL_LINE;
1505 } 1510 }