author		Ed Cashin <ecashin@coraid.com>	2012-10-04 20:16:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-05 14:05:25 -0400
commit		69cf2d85de773d998798e47e3335b85e5645d157 (patch)
tree		765eb2be45726e7e098fe73b7f368239c0461342 /drivers/block/aoe/aoedev.c
parent		896831f5909e2733c13c9cb13a1a215f10c3eaa8 (diff)
aoe: become I/O request queue handler for increased user control
To allow users to choose an elevator algorithm for their particular
workloads, change from a make_request-style driver to an
I/O-request-queue-handler-style driver.
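For context, here is a minimal sketch of the two styles against the 3.x-era
block-layer API. The example_* names are illustrative, not this patch's code,
and error handling is omitted.

#include <linux/blkdev.h>

/* Before: a make_request-style driver consumes raw bios and bypasses
 * the elevator entirely.
 */
static void example_make_request(struct request_queue *q, struct bio *bio)
{
	/* handle the bio directly; no I/O scheduler is involved */
}

/* After: a request_fn-style driver pulls requests that the elevator has
 * already sorted and merged.  The block layer calls this with the queue
 * lock held.
 */
static void example_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_peek_request(q))) {
		blk_start_request(rq);
		/* hand the request's bios to the transmit path */
	}
}

/* The setup call is what selects the style. */
static struct request_queue *example_setup(spinlock_t *lock)
{
	return blk_init_queue(example_request_fn, lock);
	/*
	 * The make_request style would instead be:
	 *	q = blk_alloc_queue(GFP_KERNEL);
	 *	blk_queue_make_request(q, example_make_request);
	 */
}

With blk_init_queue(), the user-selected elevator (noop, deadline, cfq) sits
between submitters and the driver, which is the point of the conversion.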
We have to do a couple of things that might be surprising. We manipulate
the page _count directly on the assumption that we still have no guarantee
that users of the block layer are prohibited from submitting bios
containing pages with zero reference counts.[1] If such a prohibition now
exists, I can get rid of the _count manipulation.
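As a hedged sketch, ignoring compound-page details, that manipulation amounts
to pinning every page of a bio while transmit skbs may still reference it.
get_page() would be the idiomatic call, but it asserts a nonzero refcount,
which is exactly what cannot be assumed here. The example_* helpers are
illustrative; the patch's equivalents are outside this file's diff.

#include <linux/bio.h>
#include <linux/mm.h>

static void example_bio_pageinc(struct bio *bio)
{
	struct bio_vec *bv;
	int i;

	/* take a raw reference on each page; get_page() would BUG on a
	 * page that arrived with _count == 0
	 */
	bio_for_each_segment(bv, bio, i)
		atomic_inc(&bv->bv_page->_count);
}

static void example_bio_pagedec(struct bio *bio)
{
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i)
		atomic_dec(&bv->bv_page->_count);
}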
Just as before this patch, we keep track of the sk_buffs that the network
layer has not yet finished with, and we cap the resources we use with a
"pool" of skbs.[2]
Now that the block layer maintains the disk stats, the aoe driver's
diskstats function can go away.
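A sketch of why the driver-side counters are now redundant: completing a
request through the block layer's helpers performs the accounting behind
/proc/diskstats. The example_complete wrapper below is illustrative and is
not this patch's aoe_end_request.

#include <linux/blkdev.h>

static void example_complete(struct request *rq, int error)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_end_request_all(rq, error);	/* updates disk stats too */
	spin_unlock_irqrestore(q->queue_lock, flags);
}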
1. https://lkml.org/lkml/2007/3/1/374
2. https://lkml.org/lkml/2007/7/6/241
Signed-off-by: Ed Cashin <ecashin@coraid.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/block/aoe/aoedev.c')
-rw-r--r--	drivers/block/aoe/aoedev.c	93
1 file changed, 64 insertions, 29 deletions
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 40bae1a1ff1e..635dc986cf77 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d);
 static struct aoedev *devlist;
 static DEFINE_SPINLOCK(devlist_lock);
 
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr or
+ * aoedev_by_sysminor_m automatically get a reference count and must
+ * be responsible for performing a aoedev_put.  With the addition of
+ * async kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq.  When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
 struct aoedev *
 aoedev_by_aoeaddr(int maj, int min)
 {
@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min)
 	spin_lock_irqsave(&devlist_lock, flags);
 
 	for (d=devlist; d; d=d->next)
-		if (d->aoemajor == maj && d->aoeminor == min)
+		if (d->aoemajor == maj && d->aoeminor == min) {
+			d->ref++;
 			break;
+		}
 
 	spin_unlock_irqrestore(&devlist_lock, flags);
 	return d;
 }
 
+void
+aoedev_put(struct aoedev *d)
+{
+	ulong flags;
+
+	spin_lock_irqsave(&devlist_lock, flags);
+	d->ref--;
+	spin_unlock_irqrestore(&devlist_lock, flags);
+}
+
 static void
 dummy_timer(ulong vp)
 {
@@ -47,21 +70,26 @@ dummy_timer(ulong vp)
 	add_timer(&d->timer);
 }
 
-void
-aoe_failbuf(struct aoedev *d, struct buf *buf)
+static void
+aoe_failip(struct aoedev *d)
 {
+	struct request *rq;
 	struct bio *bio;
+	unsigned long n;
+
+	aoe_failbuf(d, d->ip.buf);
 
-	if (buf == NULL)
+	rq = d->ip.rq;
+	if (rq == NULL)
 		return;
-	buf->flags |= BUFFL_FAIL;
-	if (buf->nframesout == 0) {
-		if (buf == d->inprocess) /* ensure we only process this once */
-			d->inprocess = NULL;
-		bio = buf->bio;
-		mempool_free(buf, d->bufpool);
-		bio_endio(bio, -EIO);
+	while ((bio = d->ip.nxbio)) {
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		d->ip.nxbio = bio->bi_next;
+		n = (unsigned long) rq->special;
+		rq->special = (void *) --n;
 	}
+	if ((unsigned long) rq->special == 0)
+		aoe_end_request(d, rq, 0);
 }
 
 void
@@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d)
 	struct aoetgt *t, **tt, **te;
 	struct frame *f;
 	struct list_head *head, *pos, *nx;
+	struct request *rq;
 	int i;
 
+	d->flags &= ~DEVFL_UP;
+
 	/* clean out active buffers on all targets */
 	tt = d->targets;
 	te = tt + NTARGETS;
@@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d)
 		t->nout = 0;
 	}
 
-	/* clean out the in-process buffer (if any) */
-	aoe_failbuf(d, d->inprocess);
-	d->inprocess = NULL;
+	/* clean out the in-process request (if any) */
+	aoe_failip(d);
 	d->htgt = NULL;
 
-	/* clean out all pending I/O */
-	while (!list_empty(&d->bufq)) {
-		struct buf *buf = container_of(d->bufq.next, struct buf, bufs);
-		list_del(d->bufq.next);
-		aoe_failbuf(d, buf);
+	/* fast fail all pending I/O */
+	if (d->blkq) {
+		while ((rq = blk_peek_request(d->blkq))) {
+			blk_start_request(rq);
+			aoe_end_request(d, rq, 1);
+		}
 	}
 
 	if (d->gd)
 		set_capacity(d->gd, 0);
-
-	d->flags &= ~DEVFL_UP;
 }
 
 static void
@@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d)
 		aoedisk_rm_sysfs(d);
 		del_gendisk(d->gd);
 		put_disk(d->gd);
+		blk_cleanup_queue(d->blkq);
 	}
 	t = d->targets;
 	e = t + NTARGETS;
@@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d)
 	if (d->bufpool)
 		mempool_destroy(d->bufpool);
 	skbpoolfree(d);
-	blk_cleanup_queue(d->blkq);
 	kfree(d);
 }
 
@@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt)
 		spin_lock(&d->lock);
 		if ((!all && (d->flags & DEVFL_UP))
 		|| (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
-		|| d->nopen) {
+		|| d->nopen
+		|| d->ref) {
 			spin_unlock(&d->lock);
 			dd = &d->next;
 			continue;
@@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt)
 	return 0;
 }
 
-/* I'm not really sure that this is a realistic problem, but if the
-network driver goes gonzo let's just leak memory after complaining. */
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring.  The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
 static void
 skbfree(struct sk_buff *skb)
 {
-	enum { Sms = 100, Tms = 3*1000};
+	enum { Sms = 250, Tms = 30 * 1000};
 	int i = Tms / Sms;
 
 	if (skb == NULL)
@@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor)
 	spin_lock_irqsave(&devlist_lock, flags);
 
 	for (d=devlist; d; d=d->next)
-		if (d->sysminor == sysminor)
+		if (d->sysminor == sysminor) {
+			d->ref++;
 			break;
+		}
 	if (d)
 		goto out;
 	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
@@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor)
 		goto out;
 	INIT_WORK(&d->work, aoecmd_sleepwork);
 	spin_lock_init(&d->lock);
-	skb_queue_head_init(&d->sendq);
 	skb_queue_head_init(&d->skbpool);
 	init_timer(&d->timer);
 	d->timer.data = (ulong) d;
@@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor)
 	add_timer(&d->timer);
 	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
 	d->tgt = d->targets;
-	INIT_LIST_HEAD(&d->bufq);
+	d->ref = 1;
 	d->sysminor = sysminor;
 	d->aoemajor = AOEMAJOR(sysminor);
 	d->aoeminor = AOEMINOR(sysminor);
@@ -274,6 +308,7 @@ aoedev_exit(void)
 	struct aoedev *d;
 	ulong flags;
 
+	aoe_flush_iocq();
 	while ((d = devlist)) {
 		devlist = d->next;
 
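A hedged usage sketch of the reference counting this patch introduces: each
successful lookup takes d->ref under devlist_lock and must be paired with
aoedev_put() once the caller, such as a kthread draining the iocq, is done;
aoedev_flush() skips any device whose ref is still raised. The surrounding
function is illustrative.

static void example_use_device(int maj, int min)
{
	struct aoedev *d;

	d = aoedev_by_aoeaddr(maj, min);	/* lookup takes a reference */
	if (d == NULL)
		return;
	/* ... process frames for d; a concurrent flush leaves it alone ... */
	aoedev_put(d);				/* drop the reference */
}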