author    Ed Cashin <ecashin@coraid.com>  2012-10-04 20:16:23 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-10-05 14:05:25 -0400
commit    69cf2d85de773d998798e47e3335b85e5645d157 (patch)
tree      765eb2be45726e7e098fe73b7f368239c0461342 /drivers/block/aoe/aoedev.c
parent    896831f5909e2733c13c9cb13a1a215f10c3eaa8 (diff)
aoe: become I/O request queue handler for increased user control
To allow users to choose an elevator algorithm for their particular
workloads, change from a make_request-style driver to an
I/O-request-queue-handler-style driver.

We have to do a couple of things that might be surprising. We manipulate
the page _count directly on the assumption that we still have no
guarantee that users of the block layer are prohibited from submitting
bios containing pages with zero reference counts.[1] If such a
prohibition now exists, I can get rid of the _count manipulation.

Just as before this patch, we still keep track of the sk_buffs that the
network layer hasn't finished with yet, and cap the resources we use
with a "pool" of skbs.[2]

Now that the block layer maintains the disk stats, the aoe driver's
diskstats function can go away.

1. https://lkml.org/lkml/2007/3/1/374
2. https://lkml.org/lkml/2007/7/6/241

Signed-off-by: Ed Cashin <ecashin@coraid.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
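[Editor's note: for orientation, here is a minimal sketch of the request-queue-handler style this patch adopts, using the pre-blk-mq block API of this era. Names prefixed "example_" are hypothetical; blk_peek_request() and blk_start_request() are the real calls used in the aoedev_downdev() hunk below, and blk_init_queue() is their standard setup pairing (done in aoeblk.c, not shown in this file).]

#include <linux/blkdev.h>

/*
 * Request-queue-handler style: the block layer (and the user-selected
 * elevator) queues and sorts requests; the driver drains them in its
 * request_fn, which the block layer calls with the queue lock held.
 */
static void example_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_peek_request(q)) != NULL) {
		blk_start_request(rq);	/* dequeue; driver now owns rq */
		/* ... map rq's bios into AoE frames and transmit ... */
	}
}

/* Queue setup, normally done when the gendisk is allocated: */
static struct request_queue *example_setup(spinlock_t *lock)
{
	/* request_fn drivers get an elevator for free */
	return blk_init_queue(example_request_fn, lock);
}

[By contrast, a make_request-style driver registers its own handler with blk_queue_make_request() and consumes raw bios before any elevator can reorder them, which is why the old driver offered users no scheduler choice.]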
Diffstat (limited to 'drivers/block/aoe/aoedev.c')
-rw-r--r--  drivers/block/aoe/aoedev.c | 93
1 file changed, 64 insertions(+), 29 deletions(-)
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 40bae1a1ff1e..635dc986cf77 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d);
 static struct aoedev *devlist;
 static DEFINE_SPINLOCK(devlist_lock);
 
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr or
+ * aoedev_by_sysminor_m automatically get a reference count and must
+ * be responsible for performing a aoedev_put. With the addition of
+ * async kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq. When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
 struct aoedev *
 aoedev_by_aoeaddr(int maj, int min)
 {
@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min)
 	spin_lock_irqsave(&devlist_lock, flags);
 
 	for (d=devlist; d; d=d->next)
-		if (d->aoemajor == maj && d->aoeminor == min)
+		if (d->aoemajor == maj && d->aoeminor == min) {
+			d->ref++;
 			break;
+		}
 
 	spin_unlock_irqrestore(&devlist_lock, flags);
 	return d;
 }
 
+void
+aoedev_put(struct aoedev *d)
+{
+	ulong flags;
+
+	spin_lock_irqsave(&devlist_lock, flags);
+	d->ref--;
+	spin_unlock_irqrestore(&devlist_lock, flags);
+}
+
 static void
 dummy_timer(ulong vp)
 {
@@ -47,21 +70,26 @@ dummy_timer(ulong vp)
 	add_timer(&d->timer);
 }
 
-void
-aoe_failbuf(struct aoedev *d, struct buf *buf)
+static void
+aoe_failip(struct aoedev *d)
 {
+	struct request *rq;
 	struct bio *bio;
+	unsigned long n;
+
+	aoe_failbuf(d, d->ip.buf);
 
-	if (buf == NULL)
+	rq = d->ip.rq;
+	if (rq == NULL)
 		return;
-	buf->flags |= BUFFL_FAIL;
-	if (buf->nframesout == 0) {
-		if (buf == d->inprocess) /* ensure we only process this once */
-			d->inprocess = NULL;
-		bio = buf->bio;
-		mempool_free(buf, d->bufpool);
-		bio_endio(bio, -EIO);
+	while ((bio = d->ip.nxbio)) {
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		d->ip.nxbio = bio->bi_next;
+		n = (unsigned long) rq->special;
+		rq->special = (void *) --n;
 	}
+	if ((unsigned long) rq->special == 0)
+		aoe_end_request(d, rq, 0);
 }
 
 void
@@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d)
 	struct aoetgt *t, **tt, **te;
 	struct frame *f;
 	struct list_head *head, *pos, *nx;
+	struct request *rq;
 	int i;
 
+	d->flags &= ~DEVFL_UP;
+
 	/* clean out active buffers on all targets */
 	tt = d->targets;
 	te = tt + NTARGETS;
@@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d)
 		t->nout = 0;
 	}
 
-	/* clean out the in-process buffer (if any) */
-	aoe_failbuf(d, d->inprocess);
-	d->inprocess = NULL;
+	/* clean out the in-process request (if any) */
+	aoe_failip(d);
 	d->htgt = NULL;
 
-	/* clean out all pending I/O */
-	while (!list_empty(&d->bufq)) {
-		struct buf *buf = container_of(d->bufq.next, struct buf, bufs);
-		list_del(d->bufq.next);
-		aoe_failbuf(d, buf);
+	/* fast fail all pending I/O */
+	if (d->blkq) {
+		while ((rq = blk_peek_request(d->blkq))) {
+			blk_start_request(rq);
+			aoe_end_request(d, rq, 1);
+		}
 	}
 
 	if (d->gd)
 		set_capacity(d->gd, 0);
-
-	d->flags &= ~DEVFL_UP;
 }
 
 static void
@@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d)
 		aoedisk_rm_sysfs(d);
 		del_gendisk(d->gd);
 		put_disk(d->gd);
+		blk_cleanup_queue(d->blkq);
 	}
 	t = d->targets;
 	e = t + NTARGETS;
@@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d)
 	if (d->bufpool)
 		mempool_destroy(d->bufpool);
 	skbpoolfree(d);
-	blk_cleanup_queue(d->blkq);
 	kfree(d);
 }
 
@@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt)
 		spin_lock(&d->lock);
 		if ((!all && (d->flags & DEVFL_UP))
 		|| (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
-		|| d->nopen) {
+		|| d->nopen
+		|| d->ref) {
 			spin_unlock(&d->lock);
 			dd = &d->next;
 			continue;
@@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt)
 	return 0;
 }
 
-/* I'm not really sure that this is a realistic problem, but if the
-network driver goes gonzo let's just leak memory after complaining. */
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring.  The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
 static void
 skbfree(struct sk_buff *skb)
 {
-	enum { Sms = 100, Tms = 3*1000};
+	enum { Sms = 250, Tms = 30 * 1000};
 	int i = Tms / Sms;
 
 	if (skb == NULL)
@@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor)
 	spin_lock_irqsave(&devlist_lock, flags);
 
 	for (d=devlist; d; d=d->next)
-		if (d->sysminor == sysminor)
+		if (d->sysminor == sysminor) {
+			d->ref++;
 			break;
+		}
 	if (d)
 		goto out;
 	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
@@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor)
 		goto out;
 	INIT_WORK(&d->work, aoecmd_sleepwork);
 	spin_lock_init(&d->lock);
-	skb_queue_head_init(&d->sendq);
 	skb_queue_head_init(&d->skbpool);
 	init_timer(&d->timer);
 	d->timer.data = (ulong) d;
@@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor)
 	add_timer(&d->timer);
 	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
 	d->tgt = d->targets;
-	INIT_LIST_HEAD(&d->bufq);
+	d->ref = 1;
 	d->sysminor = sysminor;
 	d->aoemajor = AOEMAJOR(sysminor);
 	d->aoeminor = AOEMINOR(sysminor);
@@ -274,6 +308,7 @@ aoedev_exit(void)
 	struct aoedev *d;
 	ulong flags;
 
+	aoe_flush_iocq();
 	while ((d = devlist)) {
 		devlist = d->next;
 
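[Editor's note: taken together, the aoedev_by_aoeaddr()/aoedev_put() hunks above establish a lookup-takes-a-reference discipline. A hedged sketch of the intended usage, with the frame-processing body elided; the caller below is hypothetical, but the refcount behavior matches the patch: lookups bump d->ref under devlist_lock, and aoedev_flush() skips any device whose ref is nonzero.]

/* Hypothetical user of the new reference discipline. */
void example_use_device(int maj, int min)
{
	struct aoedev *d;

	d = aoedev_by_aoeaddr(maj, min);	/* takes a reference */
	if (d == NULL)
		return;
	/* ... complete frames from the iocq against d; the device
	 * cannot be flushed while the reference is held ... */
	aoedev_put(d);				/* drop the reference */
}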