aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJens Axboe <jens.axboe@oracle.com>2009-08-05 03:07:21 -0400
committerJens Axboe <jens.axboe@oracle.com>2009-09-11 08:33:31 -0400
commit5e605b64a183a6c0e84cdb99a6f8acb1f8200437 (patch)
tree1133a343bea602cb1bd8ee744c5997ce42a69b54
parentfb1e75389bd06fd5987e9cda1b4e0305c782f854 (diff)
block: add blk-iopoll, a NAPI like approach for block devices
This borrows some code from NAPI and implements a polled completion mode for block devices. The idea is the same as NAPI - instead of doing the command completion when the irq occurs, schedule a dedicated softirq in the hopes that we will complete more IO when the iopoll handler is invoked. Devices have a budget of commands assigned, and will stay in polled mode as long as they continue to consume their budget from the iopoll softirq handler. If they do not, the device is set back to interrupt completion mode. This patch holds the core bits for blk-iopoll, device driver support sold separately. Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
-rw-r--r--block/Makefile2
-rw-r--r--block/blk-iopoll.c220
-rw-r--r--include/linux/blk-iopoll.h41
-rw-r--r--include/linux/interrupt.h1
-rw-r--r--kernel/sysctl.c10
5 files changed, 272 insertions, 2 deletions
diff --git a/block/Makefile b/block/Makefile
index 6c54ed0ff755..ba74ca6bfa14 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
new file mode 100644
index 000000000000..566db1e7c1c7
--- /dev/null
+++ b/block/blk-iopoll.c
@@ -0,0 +1,220 @@
1/*
2 * Functions related to interrupt-poll handling in the block layer. This
3 * is similar to NAPI for network devices.
4 */
5#include <linux/kernel.h>
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/bio.h>
9#include <linux/blkdev.h>
10#include <linux/interrupt.h>
11#include <linux/cpu.h>
12#include <linux/blk-iopoll.h>
13#include <linux/delay.h>
14
15#include "blk.h"
16
/*
 * Global on/off value for blk-iopoll, initially 1. Nothing in this file
 * reads it; it is exported and is the backing variable of the "blk_iopoll"
 * entry added to kern_table in kernel/sysctl.c.
 */
17int blk_iopoll_enabled = 1;
18EXPORT_SYMBOL(blk_iopoll_enabled);
19
/* Per-CPU list of blk_iopoll instances waiting to be polled on that CPU. */
20static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
21
22/**
23 * blk_iopoll_sched - Schedule a run of the iopoll handler
24 * @iop: The parent iopoll structure
25 *
26 * Description:
27 * Add this blk_iopoll structure to the tail of the current CPU's pending poll
28 * list and raise the blk iopoll softirq. The driver must already have gotten a
29 * successful (non-zero) return from blk_iopoll_sched_prep() before calling this.
30 **/
31void blk_iopoll_sched(struct blk_iopoll *iop)
32{
33 unsigned long flags;
34
 /* The list update and the softirq raise are done with local irqs off. */
35 local_irq_save(flags);
36 list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
37 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
38 local_irq_restore(flags);
39}
40EXPORT_SYMBOL(blk_iopoll_sched);
41
42/**
43 * __blk_iopoll_complete - Mark this @iop as un-polled again
44 * @iop: The parent iopoll structure
45 *
46 * Description:
47 * See blk_iopoll_complete(). This function must be called with interrupts disabled.
48 * It unlinks @iop from the list it is on and then clears IOPOLL_F_SCHED.
49 **/
49void __blk_iopoll_complete(struct blk_iopoll *iop)
50{
51 list_del(&iop->list);
 /*
  * Barrier, then release the SCHED bit. Once the bit is clear,
  * blk_iopoll_sched_prep() can set it again for this instance.
  */
52 smp_mb__before_clear_bit();
53 clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
54}
55EXPORT_SYMBOL(__blk_iopoll_complete);
56
57/**
58 * blk_iopoll_complete - Mark this @iopoll as un-polled again
59 * @iopoll: The parent iopoll structure
60 *
61 * Description:
62 * If a driver consumes less than the assigned budget in its run of the iopoll
63 * handler, it'll end the polled mode by calling this function. The iopoll handler
64 * will not be invoked again before blk_iopoll_sched_prep() is called.
65 **/
66void blk_iopoll_complete(struct blk_iopoll *iopoll)
67{
68 unsigned long flags;
69
 /* __blk_iopoll_complete() requires interrupts to be disabled. */
70 local_irq_save(flags);
71 __blk_iopoll_complete(iopoll);
72 local_irq_restore(flags);
73}
74EXPORT_SYMBOL(blk_iopoll_complete);
75
/*
 * Handler for BLOCK_IOPOLL_SOFTIRQ (installed by blk_iopoll_setup()).
 *
 * Repeatedly takes the entry at the head of this CPU's pending list and
 * calls its ->poll() handler with the entry's weight. The value ->poll()
 * returns is subtracted from a total budget of 64. The loop ends when the
 * list is empty, the budget is used up, or jiffies has advanced past the
 * value sampled on entry; in the last two cases the softirq is raised
 * again so the remaining entries are handled in a later run.
 */
76static void blk_iopoll_softirq(struct softirq_action *h)
77{
78 struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
79 unsigned long start_time = jiffies;
80 int rearm = 0, budget = 64;
81
82 local_irq_disable();
83
84 while (!list_empty(list)) {
85 struct blk_iopoll *iop;
86 int work, weight;
87
88 /*
89 * If softirq window is exhausted then punt.
90 */
91 if (budget <= 0 || time_after(jiffies, start_time)) {
92 rearm = 1;
93 break;
94 }
95
96 local_irq_enable();
97
98 /* Even though interrupts have been re-enabled, this
99 * access is safe because interrupts can only add new
100 * entries to the tail of this list, and only ->poll()
101 * calls can remove this head entry from the list.
102 */
103 iop = list_entry(list->next, struct blk_iopoll, list);
104
105 weight = iop->weight;
106 work = 0;
 /*
  * ->poll() is only called while the instance is still marked
  * scheduled.
  * NOTE(review): if SCHED is clear here, work stays 0 and the
  * entry is neither removed nor moved, so it remains at the head
  * for the next iteration - confirm this cannot happen.
  */
107 if (test_bit(IOPOLL_F_SCHED, &iop->state))
108 work = iop->poll(iop, weight);
109
110 budget -= work;
111
112 local_irq_disable();
113
114 /* Drivers must not modify the iopoll state if they
115 * consume the entire weight. In such cases this code
116 * still "owns" the iopoll instance and therefore can
117 * move the instance around on the list at-will.
118 */
119 if (work >= weight) {
 /*
  * A pending blk_iopoll_disable() is waiting for SCHED to
  * clear, so complete the instance instead of requeueing
  * it; otherwise move it to the tail of the list.
  */
120 if (blk_iopoll_disable_pending(iop))
121 __blk_iopoll_complete(iop);
122 else
123 list_move_tail(&iop->list, list);
124 }
125 }
126
127 if (rearm)
128 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
129
130 local_irq_enable();
131}
132
133/**
134 * blk_iopoll_disable - Disable iopoll on this @iop
135 * @iop: The parent iopoll structure
136 *
137 * Description:
138 * Disable io polling and wait for any pending callbacks to have completed.
139 * May sleep. Returns with IOPOLL_F_SCHED set; blk_iopoll_enable() clears it.
140 **/
140void blk_iopoll_disable(struct blk_iopoll *iop)
141{
 /* While DISABLE is set, blk_iopoll_sched_prep() returns 0. */
142 set_bit(IOPOLL_F_DISABLE, &iop->state);
 /* Sleep in 1 ms steps until this caller is the one that sets SCHED. */
143 while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
144 msleep(1);
145 clear_bit(IOPOLL_F_DISABLE, &iop->state);
146}
147EXPORT_SYMBOL(blk_iopoll_disable);
148
149/**
150 * blk_iopoll_enable - Enable iopoll on this @iop
151 * @iop: The parent iopoll structure
152 *
153 * Description:
154 * Enable iopoll on this @iop. Note that the handler run will not be scheduled, it
155 * will only mark it as active.
156 **/
157void blk_iopoll_enable(struct blk_iopoll *iop)
158{
 /*
  * SCHED is expected to be set on entry, as blk_iopoll_init() and
  * blk_iopoll_disable() leave it; anything else is treated as a bug.
  */
159 BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
 /* Clearing SCHED lets blk_iopoll_sched_prep() succeed from now on. */
160 smp_mb__before_clear_bit();
161 clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
162}
163EXPORT_SYMBOL(blk_iopoll_enable);
164
165/**
166 * blk_iopoll_init - Initialize this @iop
167 * @iop: The parent iopoll structure
168 * @weight: The default weight (or command completion budget)
169 * @poll_fn: The handler to invoke
170 *
171 * Description:
172 * Initialize this blk_iopoll structure. Before being actively used, the driver
173 * must call blk_iopoll_enable().
174 **/
175void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
176{
177 memset(iop, 0, sizeof(*iop));
178 INIT_LIST_HEAD(&iop->list);
179 iop->weight = weight;
180 iop->poll = poll_fn;
 /*
  * Start with SCHED set, so blk_iopoll_sched_prep() returns 0 until
  * blk_iopoll_enable() clears the bit.
  */
181 set_bit(IOPOLL_F_SCHED, &iop->state);
182}
183EXPORT_SYMBOL(blk_iopoll_init);
184
/*
 * CPU hotplug notifier callback. @hcpu carries the CPU number the event
 * refers to. Only CPU_DEAD and CPU_DEAD_FROZEN are acted upon; every call
 * returns NOTIFY_OK.
 */
185static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
186 unsigned long action, void *hcpu)
187{
188 /*
189 * If a CPU goes away, splice its entries to the current CPU
190 * and trigger a run of the softirq
191 */
192 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
193 int cpu = (unsigned long) hcpu;
194
 /* The dead CPU's list is left empty (re-initialised) by the splice. */
195 local_irq_disable();
196 list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
197 &__get_cpu_var(blk_cpu_iopoll));
198 raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
199 local_irq_enable();
200 }
201
202 return NOTIFY_OK;
203}
204
/* Hotplug notifier block; registered in blk_iopoll_setup(). */
205static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
206 .notifier_call = blk_iopoll_cpu_notify,
207};
208
/*
 * Boot-time setup, run as a subsys_initcall: initialise the pending list of
 * every possible CPU, install blk_iopoll_softirq() as the handler for
 * BLOCK_IOPOLL_SOFTIRQ and register the CPU hotplug notifier.
 * Always returns 0.
 */
209static __init int blk_iopoll_setup(void)
210{
211 int i;
212
213 for_each_possible_cpu(i)
214 INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
215
216 open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
217 register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
218 return 0;
219}
220subsys_initcall(blk_iopoll_setup);
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
new file mode 100644
index 000000000000..b2e1739a2e7b
--- /dev/null
+++ b/include/linux/blk-iopoll.h
@@ -0,0 +1,41 @@
1#ifndef BLK_IOPOLL_H
2#define BLK_IOPOLL_H
3
4struct blk_iopoll;
/*
 * Poll handler type. It is called from the iopoll softirq with the instance
 * and that instance's weight; the int it returns is treated as the amount of
 * work done and is subtracted from the softirq's budget.
 */
5typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
6
/* Per-driver polling context; set up with blk_iopoll_init(). */
7struct blk_iopoll {
8 struct list_head list; /* link on a per-CPU pending poll list */
9 unsigned long state; /* bit flags, see IOPOLL_F_* */
10 unsigned long data; /* not referenced by the core code shown here */
11 int weight; /* value handed to ->poll() on each call */
12 int max; /* not referenced by the core code shown here */
13 blk_iopoll_fn *poll; /* handler invoked from the iopoll softirq */
14};
15
/* Bit numbers within blk_iopoll.state. */
16enum {
17 IOPOLL_F_SCHED = 0, /* set by blk_iopoll_init(), blk_iopoll_sched_prep() and blk_iopoll_disable() */
18 IOPOLL_F_DISABLE = 1, /* set while blk_iopoll_disable() waits to take SCHED */
19};
20
/*
 * Try to claim @iop for scheduling. Returns non-zero if IOPOLL_F_DISABLE is
 * not set and this call changed IOPOLL_F_SCHED from clear to set; returns 0
 * if a disable is in progress or SCHED was already set. SCHED is left
 * untouched when DISABLE is set, because the && short-circuits.
 */
21static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
22{
23 return !test_bit(IOPOLL_F_DISABLE, &iop->state) &&
24 !test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
25}
26
/* Returns non-zero while IOPOLL_F_DISABLE is set in @iop->state. */
27static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
28{
29 return test_bit(IOPOLL_F_DISABLE, &iop->state);
30}
31
/* Implemented and exported by block/blk-iopoll.c. */
32extern void blk_iopoll_sched(struct blk_iopoll *);
33extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
34extern void blk_iopoll_complete(struct blk_iopoll *);
35extern void __blk_iopoll_complete(struct blk_iopoll *);
36extern void blk_iopoll_enable(struct blk_iopoll *);
37extern void blk_iopoll_disable(struct blk_iopoll *);
38
/* Defined in block/blk-iopoll.c, where it is initialised to 1. */
39extern int blk_iopoll_enabled;
40
41#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35e7df1e9f30..edd8d5c90394 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -344,6 +344,7 @@ enum
344 NET_TX_SOFTIRQ, 344 NET_TX_SOFTIRQ,
345 NET_RX_SOFTIRQ, 345 NET_RX_SOFTIRQ,
346 BLOCK_SOFTIRQ, 346 BLOCK_SOFTIRQ,
347 BLOCK_IOPOLL_SOFTIRQ,
347 TASKLET_SOFTIRQ, 348 TASKLET_SOFTIRQ,
348 SCHED_SOFTIRQ, 349 SCHED_SOFTIRQ,
349 HRTIMER_SOFTIRQ, 350 HRTIMER_SOFTIRQ,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd0..0ed9fa6f322e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,6 +92,7 @@ extern int sysctl_nr_trim_pages;
92#ifdef CONFIG_RCU_TORTURE_TEST 92#ifdef CONFIG_RCU_TORTURE_TEST
93extern int rcutorture_runnable; 93extern int rcutorture_runnable;
94#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 94#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
95extern int blk_iopoll_enabled;
95 96
96/* Constants used for minimum and maximum */ 97/* Constants used for minimum and maximum */
97#ifdef CONFIG_DETECT_SOFTLOCKUP 98#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -990,7 +991,14 @@ static struct ctl_table kern_table[] = {
990 .proc_handler = &proc_dointvec, 991 .proc_handler = &proc_dointvec,
991 }, 992 },
992#endif 993#endif
993 994 {
995 .ctl_name = CTL_UNNUMBERED,
996 .procname = "blk_iopoll",
997 .data = &blk_iopoll_enabled,
998 .maxlen = sizeof(int),
999 .mode = 0644,
1000 .proc_handler = &proc_dointvec,
1001 },
994/* 1002/*
995 * NOTE: do not add new entries to this table unless you have read 1003 * NOTE: do not add new entries to this table unless you have read
996 * Documentation/sysctl/ctl_unnumbered.txt 1004 * Documentation/sysctl/ctl_unnumbered.txt