author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-03-30 13:07:02 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-04-06 03:30:36 -0400
commit	925d519ab82b6dd7aca9420d809ee83819c08db2 (patch)
tree	aa05bd7eb607915aa691d5434ec74521b487b466
parent	53cfbf593758916aac41db728f029986a62f1254 (diff)
perf_counter: unify and fix delayed counter wakeup
While going over the wakeup code I noticed delayed wakeups only work for hardware counters, but basically all software counters rely on them.

This patch unifies and generalizes the delayed wakeup to fix this issue.

Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups.

[ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ]

Furthermore, the x86 method of using TIF flags was flawed in that it's quite possible to end up setting the bit on the idle task, losing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.153932974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
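The core trick in this patch is the lock-free pending list: the NMI-time producer publishes a counter with a single cmpxchg() on its embedded list entry (which doubles as an "already queued" marker) and a cmpxchg() loop on a per-cpu list head, while the consumer drains the whole list with one xchg(). The user-space sketch below models that pattern with C11 atomics; the names (pending_entry, pending_push, pending_drain) and the single global head are illustrative only and are not part of the patch, which keeps a per-cpu head and uses the kernel's cmpxchg()/xchg() primitives.

/*
 * Minimal user-space sketch of the cmpxchg()-based pending list.
 * Illustrative names only; the kernel code keeps a per-cpu head and
 * uses cmpxchg()/xchg(), modelled here with C11 atomics.
 */
#include <stdatomic.h>
#include <stdio.h>

struct pending_entry {
	_Atomic(struct pending_entry *) next;
};

/* Sentinel marking both "end of list" and "entry already queued". */
#define PENDING_TAIL ((struct pending_entry *)-1UL)

static _Atomic(struct pending_entry *) pending_head = PENDING_TAIL;

/* Producer side: lock-free push, safe to call from an interrupt-like context. */
static void pending_push(struct pending_entry *e)
{
	struct pending_entry *expected = NULL;

	/* If ->next is already non-NULL the entry is queued; do nothing. */
	if (!atomic_compare_exchange_strong(&e->next, &expected, PENDING_TAIL))
		return;

	/* Push onto the head with a cmpxchg() retry loop. */
	struct pending_entry *old = atomic_load(&pending_head);
	do {
		atomic_store(&e->next, old);
	} while (!atomic_compare_exchange_weak(&pending_head, &old, e));
}

/* Consumer side: grab the whole list with one xchg(), then walk it. */
static int pending_drain(void)
{
	struct pending_entry *list = atomic_exchange(&pending_head, PENDING_TAIL);
	int nr = 0;

	while (list != PENDING_TAIL) {
		struct pending_entry *e = list;

		list = atomic_load(&e->next);
		atomic_store(&e->next, NULL);	/* entry may be re-queued now */
		/* ... issue the actual wakeup for 'e' here ... */
		nr++;
	}
	return nr;
}

int main(void)
{
	struct pending_entry a = { NULL }, b = { NULL };

	pending_push(&a);
	pending_push(&b);
	pending_push(&a);	/* duplicate push is ignored */

	printf("drained %d entries\n", pending_drain());	/* prints 2 */
	return 0;
}

The -1 sentinel does double duty, just like PENDING_TAIL in the patch: it terminates the list (so NULL can mean "not queued"), and once the first cmpxchg() writes it into ->next, re-queuing an already pending entry becomes a no-op.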
-rw-r--r--	arch/powerpc/include/asm/hw_irq.h	4
-rw-r--r--	arch/powerpc/kernel/irq.c	2
-rw-r--r--	arch/powerpc/kernel/perf_counter.c	22
-rw-r--r--	arch/x86/include/asm/perf_counter.h	5
-rw-r--r--	arch/x86/include/asm/thread_info.h	4
-rw-r--r--	arch/x86/kernel/cpu/perf_counter.c	29
-rw-r--r--	arch/x86/kernel/signal.c	6
-rw-r--r--	include/linux/perf_counter.h	15
-rw-r--r--	kernel/perf_counter.c	128
-rw-r--r--	kernel/timer.c	3
10 files changed, 142 insertions, 76 deletions
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index cb32d571c9c7..20a44d0c9fdd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags)
 struct irq_chip;
 
 #ifdef CONFIG_PERF_COUNTERS
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	unsigned long x;
 
@@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void);
 
 #else
 
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	return 0;
 }
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 469e9635ff04..2cd471f92fe6 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en)
 		iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending()) {
+	if (test_perf_counter_pending()) {
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
 	}
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df007fe0cc0b..cde720fc495c 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -650,24 +650,6 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 /*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
-	int i;
-	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-	struct perf_counter *counter;
-
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (counter && counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-}
-
-/*
  * A counter has overflowed; update its count and record
  * things if requested. Note that interrupts are hard-disabled
  * here so there is no possibility of being interrupted.
@@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 	long val;
-	int need_wakeup = 0, found = 0;
+	int found = 0;
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * immediately; otherwise we'll have do the wakeup when interrupts
 	 * get soft-enabled.
 	 */
-	if (get_perf_counter_pending() && regs->softe) {
+	if (test_perf_counter_pending() && regs->softe) {
 		irq_enter();
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 1662043b340f..e2b0e66b2353 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,8 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2		0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES		(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending() \
-	set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+#define set_perf_counter_pending()	do { } while (0)
+#define clear_perf_counter_pending()	do { } while (0)
+#define test_perf_counter_pending()	(0)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ffd5d2a3676..8820a73ae090 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,7 +83,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,7 +106,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
-#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -141,7 +139,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW \
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f95b0cdc550..7aab177fb566 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
		 */
		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
	}
-	counter->wakeup_pending = 0;
 
 	return 0;
 }
@@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-/*
- * This handler is triggered by NMI contexts:
- */
-void perf_counter_notify(struct pt_regs *regs)
-{
-	struct cpu_hw_counters *cpuc;
-	unsigned long flags;
-	int bit, cpu;
-
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
-		struct perf_counter *counter = cpuc->counters[bit];
-
-		if (!counter)
-			continue;
-
-		if (counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-
-	local_irq_restore(flags);
-}
-
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 611615a92c90..0a813b17b172 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
-	if (thread_info_flags & _TIF_PERF_COUNTERS) {
-		clear_thread_flag(TIF_PERF_COUNTERS);
-		perf_counter_notify(regs);
-	}
-
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bf67ce17625..0d833228eee5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -275,6 +275,10 @@ struct perf_mmap_data {
 	void				*data_pages[0];
 };
 
+struct perf_wakeup_entry {
+	struct perf_wakeup_entry *next;
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -350,7 +354,7 @@ struct perf_counter {
 	/* poll related */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
-	int				wakeup_pending;
+	struct perf_wakeup_entry	wakeup;
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
@@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
-extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void perf_counter_unthrottle(void);
 extern u64 hw_perf_save_disable(void);
@@ -461,7 +465,7 @@ static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
-static inline void perf_counter_notify(struct pt_regs *regs)		{ }
+static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)				{ }
@@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void) { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
-static inline void perf_swcounter_event(u32 event, u64 nr,
-					int nmi, struct pt_regs *regs)	{ }
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b862a7988cd..f70ff80e79d7 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head)
 	kfree(counter);
 }
 
+static void perf_pending_sync(struct perf_counter *counter);
+
 static void free_counter(struct perf_counter *counter)
 {
+	perf_pending_sync(counter);
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1529,6 +1533,118 @@ static const struct file_operations perf_fops = {
 };
 
 /*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data) {
+		(void)atomic_xchg(&data->wakeup, POLL_IN);
+		__perf_counter_update_userpage(counter, data);
+	}
+	rcu_read_unlock();
+
+	wake_up_all(&counter->waitq);
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+	PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_counter *counter)
+{
+	struct perf_wakeup_entry **head;
+	struct perf_wakeup_entry *prev, *next;
+
+	if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+		return;
+
+	head = &get_cpu_var(perf_wakeup_head);
+
+	do {
+		prev = counter->wakeup.next = *head;
+		next = &counter->wakeup;
+	} while (cmpxchg(head, prev, next) != prev);
+
+	set_perf_counter_pending();
+
+	put_cpu_var(perf_wakeup_head);
+}
+
+static int __perf_pending_run(void)
+{
+	struct perf_wakeup_entry *list;
+	int nr = 0;
+
+	list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+	while (list != PENDING_TAIL) {
+		struct perf_counter *counter = container_of(list,
+				struct perf_counter, wakeup);
+
+		list = list->next;
+
+		counter->wakeup.next = NULL;
+		/*
+		 * Ensure we observe the unqueue before we issue the wakeup,
+		 * so that we won't be waiting forever.
+		 * -- see perf_not_pending().
+		 */
+		smp_wmb();
+
+		perf_counter_wakeup(counter);
+		nr++;
+	}
+
+	return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+	/*
+	 * If we flush on whatever cpu we run, there is a chance we don't
+	 * need to wait.
+	 */
+	get_cpu();
+	__perf_pending_run();
+	put_cpu();
+
+	/*
+	 * Ensure we see the proper queue state before going to sleep
+	 * so that we do not miss the wakeup. -- see perf_pending_handle()
+	 */
+	smp_rmb();
+	return counter->wakeup.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+	wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+	__perf_pending_run();
+}
+
+/*
  * Output
  */
 
@@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
 	if (handle->wakeup) {
-		(void)atomic_xchg(&handle->data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(handle->counter, handle->data);
-		if (nmi) {
-			handle->counter->wakeup_pending = 1;
-			set_perf_counter_pending();
-		} else
-			wake_up(&handle->counter->waitq);
+		if (nmi)
+			perf_pending_queue(handle->counter);
+		else
+			perf_counter_wakeup(handle->counter);
 	}
 	rcu_read_unlock();
 }
@@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
-	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 	counter->ctx			= ctx;
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4..672ca25fbc43 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+	perf_counter_do_pending();
+
 	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))