aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2010-10-14 02:01:34 -0400
committerIngo Molnar <mingo@elte.hu>2010-10-18 13:58:50 -0400
commite360adbe29241a0194e10e20595360dd7b98a2b3 (patch)
treeef5fa5f50a895096bfb25bc11b25949603158238 /kernel
parent8e5fc1a7320baf6076391607515dceb61319b36a (diff)
irq_work: Add generic hardirq context callbacks
Provide a mechanism that allows running code in IRQ context. It is most useful for NMI code that needs to interact with the rest of the system -- like wakeup a task to drain buffers. Perf currently has such a mechanism, so extract that and provide it as a generic feature, independent of perf so that others may also benefit. The IRQ context callback is generated through self-IPIs where possible, or on architectures like powerpc the decrementer (the built-in timer facility) is set to generate an interrupt immediately. Architectures that don't have anything like this get to do with a callback from the timer tick. These architectures can call irq_work_run() at the tail of any IRQ handlers that might enqueue such work (like the perf IRQ handler) to avoid undue latencies in processing the work. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Kyle McMartin <kyle@mcmartin.ca> Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> [ various fixes ] Signed-off-by: Huang Ying <ying.huang@intel.com> LKML-Reference: <1287036094.7768.291.camel@yhuang-dev> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/irq_work.c164
-rw-r--r--kernel/perf_event.c104
-rw-r--r--kernel/timer.c7
4 files changed, 176 insertions, 101 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d52b473c99a1..4d9bf5f8531f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 23CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 24CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 25CFLAGS_REMOVE_perf_event.o = -pg
26CFLAGS_REMOVE_irq_work.o = -pg
26endif 27endif
27 28
28obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
@@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
100obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
101obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 103obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105obj-$(CONFIG_PERF_EVENTS) += perf_event.o
104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
/* Pack @flags into the low bits of the pointer value @entry. */
static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
{
	return (struct irq_work *)((unsigned long)entry | flags);
}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
/*
 * Claim the entry so that no one else will poke at it.
 *
 * Atomically sets PENDING|BUSY on @entry->next with a cmpxchg() loop,
 * which is what makes enqueueing NMI-safe: whoever flips PENDING first
 * owns the entry.  Returns false when PENDING is already set, i.e. the
 * entry is (or is about to be) queued by somebody else.
 */
static bool irq_work_claim(struct irq_work *entry)
{
	struct irq_work *next, *nflags;

	do {
		next = entry->next;
		/* Already claimed by a concurrent queuer -- give up. */
		if ((unsigned long)next & IRQ_WORK_PENDING)
			return false;
		/* Preserve the current list linkage, add both flag bits. */
		nflags = next_flags(next, IRQ_WORK_FLAGS);
	} while (cmpxchg(&entry->next, next, nflags) != next);

	return true;
}
66
67
/*
 * Weak hook used to kick off processing of queued work: architectures
 * override this to raise a self-interrupt (or, per the changelog, set a
 * timer facility to fire immediately).
 */
void __weak arch_irq_work_raise(void)
{
	/*
	 * Lame architectures will get the timer tick callback
	 */
}
74
/*
 * Queue the entry and raise the IPI if needed.
 *
 * Lock-free push of the (already claimed) @entry onto this CPU's
 * irq_work_list via cmpxchg(), so an NMI hitting mid-update cannot
 * corrupt the list.  Caller must hold the claim (PENDING|BUSY set
 * by irq_work_claim()).
 */
static void __irq_work_queue(struct irq_work *entry)
{
	struct irq_work **head, *next;

	/* get_cpu_var() pins us to this CPU's list until put_cpu_var(). */
	head = &get_cpu_var(irq_work_list);

	do {
		next = *head;
		/* Can assign non-atomic because we keep the flags set. */
		entry->next = next_flags(next, IRQ_WORK_FLAGS);
	} while (cmpxchg(head, next, entry) != next);

	/* The list was empty, raise self-interrupt to start processing. */
	if (!irq_work_next(entry))
		arch_irq_work_raise();

	put_cpu_var(irq_work_list);
}
96
/*
 * Enqueue the irq_work @entry on the current CPU.
 *
 * Returns true on success, false when the @entry was already enqueued
 * by someone else.  An entry whose callback is still in progress may be
 * re-enqueued.
 */
bool irq_work_queue(struct irq_work *entry)
{
	bool claimed = irq_work_claim(entry);

	/* Lost the race: already enqueued, nothing to do. */
	if (claimed)
		__irq_work_queue(entry);

	return claimed;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
116
/*
 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
 * context with local IRQs disabled.
 */
void irq_work_run(void)
{
	struct irq_work *list, **head;

	head = &__get_cpu_var(irq_work_list);
	/* Cheap unlocked peek: empty list means nothing to process. */
	if (*head == NULL)
		return;

	/* Enforce the documented calling context. */
	BUG_ON(!in_irq());
	BUG_ON(!irqs_disabled());

	/* Atomically steal the whole list; new entries restart from NULL. */
	list = xchg(head, NULL);
	while (list != NULL) {
		struct irq_work *entry = list;

		/* Fetch the successor before the entry becomes reusable. */
		list = irq_work_next(list);

		/*
		 * Clear the PENDING bit, after this point the @entry
		 * can be re-used.
		 */
		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
		entry->func(entry);
		/*
		 * Clear the BUSY bit and return to the free state if
		 * no-one else claimed it meanwhile.
		 */
		cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
	}
}
EXPORT_SYMBOL_GPL(irq_work_run);
152
/*
 * Synchronize against the irq_work @entry, ensures the entry is not
 * currently in use.
 *
 * Busy-waits until the BUSY flag drops, i.e. until any in-flight
 * callback has completed.  The WARN enforces that callers have IRQs
 * enabled while spinning.
 */
void irq_work_sync(struct irq_work *entry)
{
	WARN_ON_ONCE(irqs_disabled());

	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
		cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 634f86a4b2f9..99b9700e74d0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2206,12 +2206,11 @@ static void free_event_rcu(struct rcu_head *head)
2206 kfree(event); 2206 kfree(event);
2207} 2207}
2208 2208
2209static void perf_pending_sync(struct perf_event *event);
2210static void perf_buffer_put(struct perf_buffer *buffer); 2209static void perf_buffer_put(struct perf_buffer *buffer);
2211 2210
2212static void free_event(struct perf_event *event) 2211static void free_event(struct perf_event *event)
2213{ 2212{
2214 perf_pending_sync(event); 2213 irq_work_sync(&event->pending);
2215 2214
2216 if (!event->parent) { 2215 if (!event->parent) {
2217 atomic_dec(&nr_events); 2216 atomic_dec(&nr_events);
@@ -3162,16 +3161,7 @@ void perf_event_wakeup(struct perf_event *event)
3162 } 3161 }
3163} 3162}
3164 3163
3165/* 3164static void perf_pending_event(struct irq_work *entry)
3166 * Pending wakeups
3167 *
3168 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
3169 *
3170 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
3171 * single linked list and use cmpxchg() to add entries lockless.
3172 */
3173
3174static void perf_pending_event(struct perf_pending_entry *entry)
3175{ 3165{
3176 struct perf_event *event = container_of(entry, 3166 struct perf_event *event = container_of(entry,
3177 struct perf_event, pending); 3167 struct perf_event, pending);
@@ -3187,89 +3177,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
3187 } 3177 }
3188} 3178}
3189 3179
3190#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
3191
3192static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
3193 PENDING_TAIL,
3194};
3195
3196static void perf_pending_queue(struct perf_pending_entry *entry,
3197 void (*func)(struct perf_pending_entry *))
3198{
3199 struct perf_pending_entry **head;
3200
3201 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
3202 return;
3203
3204 entry->func = func;
3205
3206 head = &get_cpu_var(perf_pending_head);
3207
3208 do {
3209 entry->next = *head;
3210 } while (cmpxchg(head, entry->next, entry) != entry->next);
3211
3212 set_perf_event_pending();
3213
3214 put_cpu_var(perf_pending_head);
3215}
3216
3217static int __perf_pending_run(void)
3218{
3219 struct perf_pending_entry *list;
3220 int nr = 0;
3221
3222 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
3223 while (list != PENDING_TAIL) {
3224 void (*func)(struct perf_pending_entry *);
3225 struct perf_pending_entry *entry = list;
3226
3227 list = list->next;
3228
3229 func = entry->func;
3230 entry->next = NULL;
3231 /*
3232 * Ensure we observe the unqueue before we issue the wakeup,
3233 * so that we won't be waiting forever.
3234 * -- see perf_not_pending().
3235 */
3236 smp_wmb();
3237
3238 func(entry);
3239 nr++;
3240 }
3241
3242 return nr;
3243}
3244
3245static inline int perf_not_pending(struct perf_event *event)
3246{
3247 /*
3248 * If we flush on whatever cpu we run, there is a chance we don't
3249 * need to wait.
3250 */
3251 get_cpu();
3252 __perf_pending_run();
3253 put_cpu();
3254
3255 /*
3256 * Ensure we see the proper queue state before going to sleep
3257 * so that we do not miss the wakeup. -- see perf_pending_handle()
3258 */
3259 smp_rmb();
3260 return event->pending.next == NULL;
3261}
3262
3263static void perf_pending_sync(struct perf_event *event)
3264{
3265 wait_event(event->waitq, perf_not_pending(event));
3266}
3267
3268void perf_event_do_pending(void)
3269{
3270 __perf_pending_run();
3271}
3272
3273/* 3180/*
3274 * We assume there is only KVM supporting the callbacks. 3181 * We assume there is only KVM supporting the callbacks.
3275 * Later on, we might change it to a list if there is 3182 * Later on, we might change it to a list if there is
@@ -3319,8 +3226,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3319 3226
3320 if (handle->nmi) { 3227 if (handle->nmi) {
3321 handle->event->pending_wakeup = 1; 3228 handle->event->pending_wakeup = 1;
3322 perf_pending_queue(&handle->event->pending, 3229 irq_work_queue(&handle->event->pending);
3323 perf_pending_event);
3324 } else 3230 } else
3325 perf_event_wakeup(handle->event); 3231 perf_event_wakeup(handle->event);
3326} 3232}
@@ -4356,8 +4262,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4356 event->pending_kill = POLL_HUP; 4262 event->pending_kill = POLL_HUP;
4357 if (nmi) { 4263 if (nmi) {
4358 event->pending_disable = 1; 4264 event->pending_disable = 1;
4359 perf_pending_queue(&event->pending, 4265 irq_work_queue(&event->pending);
4360 perf_pending_event);
4361 } else 4266 } else
4362 perf_event_disable(event); 4267 perf_event_disable(event);
4363 } 4268 }
@@ -5374,6 +5279,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5374 INIT_LIST_HEAD(&event->event_entry); 5279 INIT_LIST_HEAD(&event->event_entry);
5375 INIT_LIST_HEAD(&event->sibling_list); 5280 INIT_LIST_HEAD(&event->sibling_list);
5376 init_waitqueue_head(&event->waitq); 5281 init_waitqueue_head(&event->waitq);
5282 init_irq_work(&event->pending, perf_pending_event);
5377 5283
5378 mutex_init(&event->mmap_mutex); 5284 mutex_init(&event->mmap_mutex);
5379 5285
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
1279 run_local_timers(); 1279 run_local_timers();
1280 rcu_check_callbacks(cpu, user_tick); 1280 rcu_check_callbacks(cpu, user_tick);
1281 printk_tick(); 1281 printk_tick();
1282 perf_event_do_pending(); 1282#ifdef CONFIG_IRQ_WORK
1283 if (in_irq())
1284 irq_work_run();
1285#endif
1283 scheduler_tick(); 1286 scheduler_tick();
1284 run_posix_cpu_timers(p); 1287 run_posix_cpu_timers(p);
1285} 1288}