author    Ed Cashin <ecashin@coraid.com>    2012-10-04 20:16:23 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2012-10-05 14:05:25 -0400
commit    69cf2d85de773d998798e47e3335b85e5645d157 (patch)
tree      765eb2be45726e7e098fe73b7f368239c0461342
parent    896831f5909e2733c13c9cb13a1a215f10c3eaa8 (diff)
aoe: become I/O request queue handler for increased user control
To allow users to choose an elevator algorithm for their particular workloads, change from a make_request-style driver to an I/O-request-queue-handler-style driver.

We have to do a couple of things that might be surprising. We manipulate the page _count directly on the assumption that we still have no guarantee that users of the block layer are prohibited from submitting bios containing pages with zero reference counts.[1] If such a prohibition now exists, I can get rid of the _count manipulation.

Just as before this patch, we still keep track of the sk_buffs that the network layer still hasn't finished yet and cap the resources we use with a "pool" of skbs.[2]

Now that the block layer maintains the disk stats, the aoe driver's diskstats function can go away.

1. https://lkml.org/lkml/2007/3/1/374
2. https://lkml.org/lkml/2007/7/6/241

Signed-off-by: Ed Cashin <ecashin@coraid.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  drivers/block/aoe/aoe.h      26
-rw-r--r--  drivers/block/aoe/aoeblk.c   88
-rw-r--r--  drivers/block/aoe/aoechr.c    1
-rw-r--r--  drivers/block/aoe/aoecmd.c  282
-rw-r--r--  drivers/block/aoe/aoedev.c   93
5 files changed, 308 insertions, 182 deletions
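
The heart of the change is the switch from a make_request_fn, which receives individual bios straight from submitters, to a request_fn that pulls struct request items the block layer's elevator has already merged and sorted. Below is a minimal, hedged sketch of that handler shape against the pre-blk-mq block API of this era; struct sketch_dev, sketch_issue_all(), and the "up" flag are illustrative stand-ins rather than the aoe driver's actual names. aoeblk_request() in the diff below follows the same pattern, with aoecmd_work() doing the real work.

#include <linux/blkdev.h>

struct sketch_dev {
	spinlock_t lock;	/* passed to blk_init_queue(); guards the queue */
	int up;			/* whether the backing device is reachable */
};

/* Illustrative stand-in for the driver's real issue path (aoecmd_work()
 * in aoe): drain whatever the elevator has queued.  A real driver would
 * build and transmit frames here instead of completing immediately. */
static void sketch_issue_all(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_peek_request(q))) {
		blk_start_request(rq);
		__blk_end_request_all(rq, 0);
	}
}

/* request_fn: called by the block layer with the queue lock held. */
static void sketch_request_fn(struct request_queue *q)
{
	struct sketch_dev *d = q->queuedata;	/* set at blk_init_queue() time */
	struct request *rq;

	if (!d->up) {
		/* device is down: fast-fail everything that was queued */
		while ((rq = blk_peek_request(q))) {
			blk_start_request(rq);
			__blk_end_request_all(rq, -EIO);
		}
		return;
	}
	sketch_issue_all(q);
}

Because the elevator now owns merging and ordering, users can pick a scheduler per device (for an aoe disk, roughly /sys/block/etherd!e1.1/queue/scheduler) instead of the driver seeing raw bios.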
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 0cd6c0f7a535..8c4f6d942e05 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -90,7 +90,7 @@ enum {
90 MIN_BUFS = 16, 90 MIN_BUFS = 16,
91 NTARGETS = 8, 91 NTARGETS = 8,
92 NAOEIFS = 8, 92 NAOEIFS = 8,
93 NSKBPOOLMAX = 128, 93 NSKBPOOLMAX = 256,
94 NFACTIVE = 17, 94 NFACTIVE = 17,
95 95
96 TIMERTICK = HZ / 10, 96 TIMERTICK = HZ / 10,
@@ -100,30 +100,26 @@ enum {
100}; 100};
101 101
102struct buf { 102struct buf {
103 struct list_head bufs;
104 ulong stime; /* for disk stats */
105 ulong flags;
106 ulong nframesout; 103 ulong nframesout;
107 ulong resid; 104 ulong resid;
108 ulong bv_resid; 105 ulong bv_resid;
109 ulong bv_off;
110 sector_t sector; 106 sector_t sector;
111 struct bio *bio; 107 struct bio *bio;
112 struct bio_vec *bv; 108 struct bio_vec *bv;
109 struct request *rq;
113}; 110};
114 111
115struct frame { 112struct frame {
116 struct list_head head; 113 struct list_head head;
117 u32 tag; 114 u32 tag;
118 ulong waited; 115 ulong waited;
119 struct buf *buf;
120 struct aoetgt *t; /* parent target I belong to */ 116 struct aoetgt *t; /* parent target I belong to */
121 char *bufaddr;
122 ulong bcnt;
123 sector_t lba; 117 sector_t lba;
124 struct sk_buff *skb; /* command skb freed on module exit */ 118 struct sk_buff *skb; /* command skb freed on module exit */
125 struct sk_buff *r_skb; /* response skb for async processing */ 119 struct sk_buff *r_skb; /* response skb for async processing */
120 struct buf *buf;
126 struct bio_vec *bv; 121 struct bio_vec *bv;
122 ulong bcnt;
127 ulong bv_off; 123 ulong bv_off;
128}; 124};
129 125
@@ -161,6 +157,7 @@ struct aoedev {
161 u16 rttavg; /* round trip average of requests/responses */ 157 u16 rttavg; /* round trip average of requests/responses */
162 u16 mintimer; 158 u16 mintimer;
163 u16 fw_ver; /* version of blade's firmware */ 159 u16 fw_ver; /* version of blade's firmware */
160 ulong ref;
164 struct work_struct work;/* disk create work struct */ 161 struct work_struct work;/* disk create work struct */
165 struct gendisk *gd; 162 struct gendisk *gd;
166 struct request_queue *blkq; 163 struct request_queue *blkq;
@@ -168,11 +165,13 @@ struct aoedev {
168 sector_t ssize; 165 sector_t ssize;
169 struct timer_list timer; 166 struct timer_list timer;
170 spinlock_t lock; 167 spinlock_t lock;
171 struct sk_buff_head sendq;
172 struct sk_buff_head skbpool; 168 struct sk_buff_head skbpool;
173 mempool_t *bufpool; /* for deadlock-free Buf allocation */ 169 mempool_t *bufpool; /* for deadlock-free Buf allocation */
174 struct list_head bufq; /* queue of bios to work on */ 170 struct { /* pointers to work in progress */
175 struct buf *inprocess; /* the one we're currently working on */ 171 struct buf *buf;
172 struct bio *nxbio;
173 struct request *rq;
174 } ip;
176 struct aoetgt *targets[NTARGETS]; 175 struct aoetgt *targets[NTARGETS];
177 struct aoetgt **tgt; /* target in use when working */ 176 struct aoetgt **tgt; /* target in use when working */
178 struct aoetgt *htgt; /* target needing rexmit assistance */ 177 struct aoetgt *htgt; /* target needing rexmit assistance */
@@ -209,6 +208,8 @@ void aoecmd_exit(void);
209int aoecmd_init(void); 208int aoecmd_init(void);
210struct sk_buff *aoecmd_ata_id(struct aoedev *); 209struct sk_buff *aoecmd_ata_id(struct aoedev *);
211void aoe_freetframe(struct frame *); 210void aoe_freetframe(struct frame *);
211void aoe_flush_iocq(void);
212void aoe_end_request(struct aoedev *, struct request *, int);
212 213
213int aoedev_init(void); 214int aoedev_init(void);
214void aoedev_exit(void); 215void aoedev_exit(void);
@@ -216,7 +217,8 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min);
216struct aoedev *aoedev_by_sysminor_m(ulong sysminor); 217struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
217void aoedev_downdev(struct aoedev *d); 218void aoedev_downdev(struct aoedev *d);
218int aoedev_flush(const char __user *str, size_t size); 219int aoedev_flush(const char __user *str, size_t size);
219void aoe_failbuf(struct aoedev *d, struct buf *buf); 220void aoe_failbuf(struct aoedev *, struct buf *);
221void aoedev_put(struct aoedev *);
220 222
221int aoenet_init(void); 223int aoenet_init(void);
222void aoenet_exit(void); 224void aoenet_exit(void);
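
The NSKBPOOLMAX increase above (128 to 256) sizes the per-device skb "pool" the commit message refers to: transmit skbs the network layer may still hold are parked on a sk_buff_head for reuse, but only up to a fixed cap so a misbehaving interface cannot pin unbounded memory. A hedged sketch of that capped-pool idea using stock skb-queue primitives (the helper names are illustrative, not the driver's):

#include <linux/skbuff.h>

enum { SKETCH_SKBPOOL_MAX = 256 };	/* cf. NSKBPOOLMAX above */

/* Return an skb to the pool unless the cap has been reached, in which
 * case it is freed.  'pool' plays the role of d->skbpool; the caller is
 * assumed to hold the device lock, as aoe does. */
static void sketch_skbpool_put(struct sk_buff_head *pool, struct sk_buff *skb)
{
	if (skb_queue_len(pool) < SKETCH_SKBPOOL_MAX)
		__skb_queue_tail(pool, skb);
	else
		kfree_skb(skb);
}

/* Take a pooled skb if one is available; returns NULL when the pool is
 * empty, and the caller then falls back to a fresh allocation. */
static struct sk_buff *sketch_skbpool_get(struct sk_buff_head *pool)
{
	return __skb_dequeue(pool);
}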
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 3a8f0933cc7d..7ec4b8fa28fd 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
161} 161}
162 162
163static void 163static void
164aoeblk_make_request(struct request_queue *q, struct bio *bio) 164aoeblk_request(struct request_queue *q)
165{ 165{
166 struct sk_buff_head queue;
167 struct aoedev *d; 166 struct aoedev *d;
168 struct buf *buf; 167 struct request *rq;
169 ulong flags;
170
171 blk_queue_bounce(q, &bio);
172
173 if (bio == NULL) {
174 printk(KERN_ERR "aoe: bio is NULL\n");
175 BUG();
176 return;
177 }
178 d = bio->bi_bdev->bd_disk->private_data;
179 if (d == NULL) {
180 printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
181 BUG();
182 bio_endio(bio, -ENXIO);
183 return;
184 } else if (bio->bi_io_vec == NULL) {
185 printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
186 BUG();
187 bio_endio(bio, -ENXIO);
188 return;
189 }
190 buf = mempool_alloc(d->bufpool, GFP_NOIO);
191 if (buf == NULL) {
192 printk(KERN_INFO "aoe: buf allocation failure\n");
193 bio_endio(bio, -ENOMEM);
194 return;
195 }
196 memset(buf, 0, sizeof(*buf));
197 INIT_LIST_HEAD(&buf->bufs);
198 buf->stime = jiffies;
199 buf->bio = bio;
200 buf->resid = bio->bi_size;
201 buf->sector = bio->bi_sector;
202 buf->bv = &bio->bi_io_vec[bio->bi_idx];
203 buf->bv_resid = buf->bv->bv_len;
204 WARN_ON(buf->bv_resid == 0);
205 buf->bv_off = buf->bv->bv_offset;
206
207 spin_lock_irqsave(&d->lock, flags);
208 168
169 d = q->queuedata;
209 if ((d->flags & DEVFL_UP) == 0) { 170 if ((d->flags & DEVFL_UP) == 0) {
210 pr_info_ratelimited("aoe: device %ld.%d is not up\n", 171 pr_info_ratelimited("aoe: device %ld.%d is not up\n",
211 d->aoemajor, d->aoeminor); 172 d->aoemajor, d->aoeminor);
212 spin_unlock_irqrestore(&d->lock, flags); 173 while ((rq = blk_peek_request(q))) {
213 mempool_free(buf, d->bufpool); 174 blk_start_request(rq);
214 bio_endio(bio, -ENXIO); 175 aoe_end_request(d, rq, 1);
176 }
215 return; 177 return;
216 } 178 }
217
218 list_add_tail(&buf->bufs, &d->bufq);
219
220 aoecmd_work(d); 179 aoecmd_work(d);
221 __skb_queue_head_init(&queue);
222 skb_queue_splice_init(&d->sendq, &queue);
223
224 spin_unlock_irqrestore(&d->lock, flags);
225 aoenet_xmit(&queue);
226} 180}
227 181
228static int 182static int
@@ -254,34 +208,46 @@ aoeblk_gdalloc(void *vp)
254{ 208{
255 struct aoedev *d = vp; 209 struct aoedev *d = vp;
256 struct gendisk *gd; 210 struct gendisk *gd;
257 enum { KB = 1024, MB = KB * KB, READ_AHEAD = MB, }; 211 mempool_t *mp;
212 struct request_queue *q;
213 enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
258 ulong flags; 214 ulong flags;
259 215
260 gd = alloc_disk(AOE_PARTITIONS); 216 gd = alloc_disk(AOE_PARTITIONS);
261 if (gd == NULL) { 217 if (gd == NULL) {
262 printk(KERN_ERR 218 pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
263 "aoe: cannot allocate disk structure for %ld.%d\n",
264 d->aoemajor, d->aoeminor); 219 d->aoemajor, d->aoeminor);
265 goto err; 220 goto err;
266 } 221 }
267 222
268 d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); 223 mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
269 if (d->bufpool == NULL) { 224 buf_pool_cache);
225 if (mp == NULL) {
270 printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", 226 printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
271 d->aoemajor, d->aoeminor); 227 d->aoemajor, d->aoeminor);
272 goto err_disk; 228 goto err_disk;
273 } 229 }
230 q = blk_init_queue(aoeblk_request, &d->lock);
231 if (q == NULL) {
232 pr_err("aoe: cannot allocate block queue for %ld.%d\n",
233 d->aoemajor, d->aoeminor);
234 mempool_destroy(mp);
235 goto err_disk;
236 }
274 237
275 d->blkq = blk_alloc_queue(GFP_KERNEL); 238 d->blkq = blk_alloc_queue(GFP_KERNEL);
276 if (!d->blkq) 239 if (!d->blkq)
277 goto err_mempool; 240 goto err_mempool;
278 blk_queue_make_request(d->blkq, aoeblk_make_request);
279 d->blkq->backing_dev_info.name = "aoe"; 241 d->blkq->backing_dev_info.name = "aoe";
280 if (bdi_init(&d->blkq->backing_dev_info)) 242 if (bdi_init(&d->blkq->backing_dev_info))
281 goto err_blkq; 243 goto err_blkq;
282 spin_lock_irqsave(&d->lock, flags); 244 spin_lock_irqsave(&d->lock, flags);
283 blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS); 245 blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
284 d->blkq->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; 246 q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
247 d->bufpool = mp;
248 d->blkq = gd->queue = q;
249 q->queuedata = d;
250 d->gd = gd;
285 gd->major = AOE_MAJOR; 251 gd->major = AOE_MAJOR;
286 gd->first_minor = d->sysminor * AOE_PARTITIONS; 252 gd->first_minor = d->sysminor * AOE_PARTITIONS;
287 gd->fops = &aoe_bdops; 253 gd->fops = &aoe_bdops;
@@ -290,8 +256,6 @@ aoeblk_gdalloc(void *vp)
290 snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", 256 snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
291 d->aoemajor, d->aoeminor); 257 d->aoemajor, d->aoeminor);
292 258
293 gd->queue = d->blkq;
294 d->gd = gd;
295 d->flags &= ~DEVFL_GDALLOC; 259 d->flags &= ~DEVFL_GDALLOC;
296 d->flags |= DEVFL_UP; 260 d->flags |= DEVFL_UP;
297 261
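
The aoeblk.c hunks above replace blk_alloc_queue() plus blk_queue_make_request() with blk_init_queue(), which is what puts an elevator in front of the driver, and they point q->queuedata at the aoedev so the request handler can find it. A hedged sketch of that wiring in the same request_fn style, reusing struct sketch_dev and sketch_request_fn from the sketch after the diffstat (error paths trimmed; everything outside the block-layer calls is illustrative):

#include <linux/blkdev.h>
#include <linux/genhd.h>

static int sketch_attach_disk(struct sketch_dev *d)
{
	struct gendisk *gd;
	struct request_queue *q;

	gd = alloc_disk(1);			/* one minor is enough for a sketch */
	if (gd == NULL)
		return -ENOMEM;

	q = blk_init_queue(sketch_request_fn, &d->lock);
	if (q == NULL) {
		put_disk(gd);
		return -ENOMEM;
	}
	q->queuedata = d;			/* how the request_fn finds its device */
	blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);

	gd->queue = q;				/* the elevator now feeds this queue */
	gd->private_data = d;
	/* major, first_minor, fops, disk_name and capacity set as usual */
	add_disk(gd);
	return 0;
}

blk_init_queue() also attaches the system's default elevator (typically CFQ at the time) to the new queue, which the old make_request path bypassed entirely.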
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index f145388cb94a..3557f0d04b46 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -106,6 +106,7 @@ loop:
106 spin_lock_irqsave(&d->lock, flags); 106 spin_lock_irqsave(&d->lock, flags);
107 goto loop; 107 goto loop;
108 } 108 }
109 aoedev_put(d);
109 if (skb) { 110 if (skb) {
110 struct sk_buff_head queue; 111 struct sk_buff_head queue;
111 __skb_queue_head_init(&queue); 112 __skb_queue_head_init(&queue);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 59b333c902a6..5928a08c1f3f 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -23,6 +23,8 @@
23 23
24static void ktcomplete(struct frame *, struct sk_buff *); 24static void ktcomplete(struct frame *, struct sk_buff *);
25 25
26static struct buf *nextbuf(struct aoedev *);
27
26static int aoe_deadsecs = 60 * 3; 28static int aoe_deadsecs = 60 * 3;
27module_param(aoe_deadsecs, int, 0644); 29module_param(aoe_deadsecs, int, 0644);
28MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); 30MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
@@ -283,17 +285,20 @@ aoecmd_ata_rw(struct aoedev *d)
283 struct bio_vec *bv; 285 struct bio_vec *bv;
284 struct aoetgt *t; 286 struct aoetgt *t;
285 struct sk_buff *skb; 287 struct sk_buff *skb;
288 struct sk_buff_head queue;
286 ulong bcnt, fbcnt; 289 ulong bcnt, fbcnt;
287 char writebit, extbit; 290 char writebit, extbit;
288 291
289 writebit = 0x10; 292 writebit = 0x10;
290 extbit = 0x4; 293 extbit = 0x4;
291 294
295 buf = nextbuf(d);
296 if (buf == NULL)
297 return 0;
292 f = newframe(d); 298 f = newframe(d);
293 if (f == NULL) 299 if (f == NULL)
294 return 0; 300 return 0;
295 t = *d->tgt; 301 t = *d->tgt;
296 buf = d->inprocess;
297 bv = buf->bv; 302 bv = buf->bv;
298 bcnt = t->ifp->maxbcnt; 303 bcnt = t->ifp->maxbcnt;
299 if (bcnt == 0) 304 if (bcnt == 0)
@@ -312,7 +317,7 @@ aoecmd_ata_rw(struct aoedev *d)
312 fbcnt -= buf->bv_resid; 317 fbcnt -= buf->bv_resid;
313 buf->resid -= buf->bv_resid; 318 buf->resid -= buf->bv_resid;
314 if (buf->resid == 0) { 319 if (buf->resid == 0) {
315 d->inprocess = NULL; 320 d->ip.buf = NULL;
316 break; 321 break;
317 } 322 }
318 buf->bv++; 323 buf->bv++;
@@ -364,8 +369,11 @@ aoecmd_ata_rw(struct aoedev *d)
364 369
365 skb->dev = t->ifp->nd; 370 skb->dev = t->ifp->nd;
366 skb = skb_clone(skb, GFP_ATOMIC); 371 skb = skb_clone(skb, GFP_ATOMIC);
367 if (skb) 372 if (skb) {
368 __skb_queue_tail(&d->sendq, skb); 373 __skb_queue_head_init(&queue);
374 __skb_queue_tail(&queue, skb);
375 aoenet_xmit(&queue);
376 }
369 return 1; 377 return 1;
370} 378}
371 379
@@ -415,6 +423,7 @@ static void
415resend(struct aoedev *d, struct frame *f) 423resend(struct aoedev *d, struct frame *f)
416{ 424{
417 struct sk_buff *skb; 425 struct sk_buff *skb;
426 struct sk_buff_head queue;
418 struct aoe_hdr *h; 427 struct aoe_hdr *h;
419 struct aoe_atahdr *ah; 428 struct aoe_atahdr *ah;
420 struct aoetgt *t; 429 struct aoetgt *t;
@@ -444,7 +453,9 @@ resend(struct aoedev *d, struct frame *f)
444 skb = skb_clone(skb, GFP_ATOMIC); 453 skb = skb_clone(skb, GFP_ATOMIC);
445 if (skb == NULL) 454 if (skb == NULL)
446 return; 455 return;
447 __skb_queue_tail(&d->sendq, skb); 456 __skb_queue_head_init(&queue);
457 __skb_queue_tail(&queue, skb);
458 aoenet_xmit(&queue);
448} 459}
449 460
450static int 461static int
@@ -554,7 +565,6 @@ ata_scnt(unsigned char *packet) {
554static void 565static void
555rexmit_timer(ulong vp) 566rexmit_timer(ulong vp)
556{ 567{
557 struct sk_buff_head queue;
558 struct aoedev *d; 568 struct aoedev *d;
559 struct aoetgt *t, **tt, **te; 569 struct aoetgt *t, **tt, **te;
560 struct aoeif *ifp; 570 struct aoeif *ifp;
@@ -603,6 +613,12 @@ rexmit_timer(ulong vp)
603 } 613 }
604 } 614 }
605 615
616 if (!list_empty(&flist)) { /* retransmissions necessary */
617 n = d->rttavg <<= 1;
618 if (n > MAXTIMER)
619 d->rttavg = MAXTIMER;
620 }
621
606 /* process expired frames */ 622 /* process expired frames */
607 while (!list_empty(&flist)) { 623 while (!list_empty(&flist)) {
608 pos = flist.next; 624 pos = flist.next;
@@ -641,45 +657,131 @@ rexmit_timer(ulong vp)
641 resend(d, f); 657 resend(d, f);
642 } 658 }
643 659
644 if (!skb_queue_empty(&d->sendq)) { 660 if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
645 n = d->rttavg <<= 1;
646 if (n > MAXTIMER)
647 d->rttavg = MAXTIMER;
648 }
649
650 if (d->flags & DEVFL_KICKME || d->htgt) {
651 d->flags &= ~DEVFL_KICKME; 661 d->flags &= ~DEVFL_KICKME;
652 aoecmd_work(d); 662 d->blkq->request_fn(d->blkq);
653 } 663 }
654 664
655 __skb_queue_head_init(&queue);
656 skb_queue_splice_init(&d->sendq, &queue);
657
658 d->timer.expires = jiffies + TIMERTICK; 665 d->timer.expires = jiffies + TIMERTICK;
659 add_timer(&d->timer); 666 add_timer(&d->timer);
660 667
661 spin_unlock_irqrestore(&d->lock, flags); 668 spin_unlock_irqrestore(&d->lock, flags);
669}
662 670
663 aoenet_xmit(&queue); 671static unsigned long
672rqbiocnt(struct request *r)
673{
674 struct bio *bio;
675 unsigned long n = 0;
676
677 __rq_for_each_bio(bio, r)
678 n++;
679 return n;
680}
681
682/* This can be removed if we are certain that no users of the block
683 * layer will ever use zero-count pages in bios. Otherwise we have to
684 * protect against the put_page sometimes done by the network layer.
685 *
686 * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
687 * discussion.
688 *
689 * We cannot use get_page in the workaround, because it insists on a
690 * positive page count as a precondition. So we use _count directly.
691 */
692static void
693bio_pageinc(struct bio *bio)
694{
695 struct bio_vec *bv;
696 struct page *page;
697 int i;
698
699 bio_for_each_segment(bv, bio, i) {
700 page = bv->bv_page;
701 /* Non-zero page count for non-head members of
702 * compound pages is no longer allowed by the kernel,
703 * but this has never been seen here.
704 */
705 if (unlikely(PageCompound(page)))
706 if (compound_trans_head(page) != page) {
707 pr_crit("page tail used for block I/O\n");
708 BUG();
709 }
710 atomic_inc(&page->_count);
711 }
712}
713
714static void
715bio_pagedec(struct bio *bio)
716{
717 struct bio_vec *bv;
718 int i;
719
720 bio_for_each_segment(bv, bio, i)
721 atomic_dec(&bv->bv_page->_count);
722}
723
724static void
725bufinit(struct buf *buf, struct request *rq, struct bio *bio)
726{
727 struct bio_vec *bv;
728
729 memset(buf, 0, sizeof(*buf));
730 buf->rq = rq;
731 buf->bio = bio;
732 buf->resid = bio->bi_size;
733 buf->sector = bio->bi_sector;
734 bio_pageinc(bio);
735 buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
736 buf->bv_resid = bv->bv_len;
737 WARN_ON(buf->bv_resid == 0);
738}
739
740static struct buf *
741nextbuf(struct aoedev *d)
742{
743 struct request *rq;
744 struct request_queue *q;
745 struct buf *buf;
746 struct bio *bio;
747
748 q = d->blkq;
749 if (q == NULL)
750 return NULL; /* initializing */
751 if (d->ip.buf)
752 return d->ip.buf;
753 rq = d->ip.rq;
754 if (rq == NULL) {
755 rq = blk_peek_request(q);
756 if (rq == NULL)
757 return NULL;
758 blk_start_request(rq);
759 d->ip.rq = rq;
760 d->ip.nxbio = rq->bio;
761 rq->special = (void *) rqbiocnt(rq);
762 }
763 buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
764 if (buf == NULL) {
765 pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
766 return NULL;
767 }
768 bio = d->ip.nxbio;
769 bufinit(buf, rq, bio);
770 bio = bio->bi_next;
771 d->ip.nxbio = bio;
772 if (bio == NULL)
773 d->ip.rq = NULL;
774 return d->ip.buf = buf;
664} 775}
665 776
666/* enters with d->lock held */ 777/* enters with d->lock held */
667void 778void
668aoecmd_work(struct aoedev *d) 779aoecmd_work(struct aoedev *d)
669{ 780{
670 struct buf *buf;
671loop:
672 if (d->htgt && !sthtith(d)) 781 if (d->htgt && !sthtith(d))
673 return; 782 return;
674 if (d->inprocess == NULL) { 783 while (aoecmd_ata_rw(d))
675 if (list_empty(&d->bufq)) 784 ;
676 return;
677 buf = container_of(d->bufq.next, struct buf, bufs);
678 list_del(d->bufq.next);
679 d->inprocess = buf;
680 }
681 if (aoecmd_ata_rw(d))
682 goto loop;
683} 785}
684 786
685/* this function performs work that has been deferred until sleeping is OK 787/* this function performs work that has been deferred until sleeping is OK
@@ -802,25 +904,6 @@ gettgt(struct aoedev *d, char *addr)
802 return NULL; 904 return NULL;
803} 905}
804 906
805static inline void
806diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
807{
808 unsigned long n_sect = bio->bi_size >> 9;
809 const int rw = bio_data_dir(bio);
810 struct hd_struct *part;
811 int cpu;
812
813 cpu = part_stat_lock();
814 part = disk_map_sector_rcu(disk, sector);
815
816 part_stat_inc(cpu, part, ios[rw]);
817 part_stat_add(cpu, part, ticks[rw], duration);
818 part_stat_add(cpu, part, sectors[rw], n_sect);
819 part_stat_add(cpu, part, io_ticks, duration);
820
821 part_stat_unlock();
822}
823
824static void 907static void
825bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt) 908bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
826{ 909{
@@ -842,6 +925,43 @@ loop:
842 goto loop; 925 goto loop;
843} 926}
844 927
928void
929aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
930{
931 struct bio *bio;
932 int bok;
933 struct request_queue *q;
934
935 q = d->blkq;
936 if (rq == d->ip.rq)
937 d->ip.rq = NULL;
938 do {
939 bio = rq->bio;
940 bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
941 } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
942
943 /* cf. http://lkml.org/lkml/2006/10/31/28 */
944 if (!fastfail)
945 q->request_fn(q);
946}
947
948static void
949aoe_end_buf(struct aoedev *d, struct buf *buf)
950{
951 struct request *rq;
952 unsigned long n;
953
954 if (buf == d->ip.buf)
955 d->ip.buf = NULL;
956 rq = buf->rq;
957 bio_pagedec(buf->bio);
958 mempool_free(buf, d->bufpool);
959 n = (unsigned long) rq->special;
960 rq->special = (void *) --n;
961 if (n == 0)
962 aoe_end_request(d, rq, 0);
963}
964
845static void 965static void
846ktiocomplete(struct frame *f) 966ktiocomplete(struct frame *f)
847{ 967{
@@ -876,7 +996,7 @@ ktiocomplete(struct frame *f)
876 ahout->cmdstat, ahin->cmdstat, 996 ahout->cmdstat, ahin->cmdstat,
877 d->aoemajor, d->aoeminor); 997 d->aoemajor, d->aoeminor);
878noskb: if (buf) 998noskb: if (buf)
879 buf->flags |= BUFFL_FAIL; 999 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
880 goto badrsp; 1000 goto badrsp;
881 } 1001 }
882 1002
@@ -887,7 +1007,7 @@ noskb: if (buf)
887 if (skb->len < n) { 1007 if (skb->len < n) {
888 pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n", 1008 pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n",
889 skb->len, n); 1009 skb->len, n);
890 buf->flags |= BUFFL_FAIL; 1010 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
891 break; 1011 break;
892 } 1012 }
893 bvcpy(f->bv, f->bv_off, skb, n); 1013 bvcpy(f->bv, f->bv_off, skb, n);
@@ -927,18 +1047,13 @@ badrsp:
927 1047
928 aoe_freetframe(f); 1048 aoe_freetframe(f);
929 1049
930 if (buf && --buf->nframesout == 0 && buf->resid == 0) { 1050 if (buf && --buf->nframesout == 0 && buf->resid == 0)
931 struct bio *bio = buf->bio; 1051 aoe_end_buf(d, buf);
932 1052
933 diskstats(d->gd, bio, jiffies - buf->stime, buf->sector); 1053 aoecmd_work(d);
934 n = (buf->flags & BUFFL_FAIL) ? -EIO : 0; 1054
935 mempool_free(buf, d->bufpool); 1055 spin_unlock_irq(&d->lock);
936 spin_unlock_irq(&d->lock); 1056 aoedev_put(d);
937 if (n != -EIO)
938 bio_flush_dcache_pages(buf->bio);
939 bio_endio(bio, n);
940 } else
941 spin_unlock_irq(&d->lock);
942 dev_kfree_skb(skb); 1057 dev_kfree_skb(skb);
943} 1058}
944 1059
@@ -1061,12 +1176,14 @@ aoecmd_ata_rsp(struct sk_buff *skb)
1061 printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", 1176 printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
1062 d->aoemajor, d->aoeminor, h->src); 1177 d->aoemajor, d->aoeminor, h->src);
1063 spin_unlock_irqrestore(&d->lock, flags); 1178 spin_unlock_irqrestore(&d->lock, flags);
1179 aoedev_put(d);
1064 return skb; 1180 return skb;
1065 } 1181 }
1066 f = getframe(t, n); 1182 f = getframe(t, n);
1067 if (f == NULL) { 1183 if (f == NULL) {
1068 calc_rttavg(d, -tsince(n)); 1184 calc_rttavg(d, -tsince(n));
1069 spin_unlock_irqrestore(&d->lock, flags); 1185 spin_unlock_irqrestore(&d->lock, flags);
1186 aoedev_put(d);
1070 snprintf(ebuf, sizeof ebuf, 1187 snprintf(ebuf, sizeof ebuf,
1071 "%15s e%d.%d tag=%08x@%08lx\n", 1188 "%15s e%d.%d tag=%08x@%08lx\n",
1072 "unexpected rsp", 1189 "unexpected rsp",
@@ -1185,8 +1302,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
1185 struct aoeif *ifp; 1302 struct aoeif *ifp;
1186 ulong flags, sysminor, aoemajor; 1303 ulong flags, sysminor, aoemajor;
1187 struct sk_buff *sl; 1304 struct sk_buff *sl;
1305 struct sk_buff_head queue;
1188 u16 n; 1306 u16 n;
1189 1307
1308 sl = NULL;
1190 h = (struct aoe_hdr *) skb_mac_header(skb); 1309 h = (struct aoe_hdr *) skb_mac_header(skb);
1191 ch = (struct aoe_cfghdr *) (h+1); 1310 ch = (struct aoe_cfghdr *) (h+1);
1192 1311
@@ -1223,10 +1342,8 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
1223 t = gettgt(d, h->src); 1342 t = gettgt(d, h->src);
1224 if (!t) { 1343 if (!t) {
1225 t = addtgt(d, h->src, n); 1344 t = addtgt(d, h->src, n);
1226 if (!t) { 1345 if (!t)
1227 spin_unlock_irqrestore(&d->lock, flags); 1346 goto bail;
1228 return;
1229 }
1230 } 1347 }
1231 ifp = getif(t, skb->dev); 1348 ifp = getif(t, skb->dev);
1232 if (!ifp) { 1349 if (!ifp) {
@@ -1235,8 +1352,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
1235 printk(KERN_INFO 1352 printk(KERN_INFO
1236 "aoe: device addif failure; " 1353 "aoe: device addif failure; "
1237 "too many interfaces?\n"); 1354 "too many interfaces?\n");
1238 spin_unlock_irqrestore(&d->lock, flags); 1355 goto bail;
1239 return;
1240 } 1356 }
1241 } 1357 }
1242 if (ifp->maxbcnt) { 1358 if (ifp->maxbcnt) {
@@ -1257,18 +1373,14 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
1257 } 1373 }
1258 1374
1259 /* don't change users' perspective */ 1375 /* don't change users' perspective */
1260 if (d->nopen) { 1376 if (d->nopen == 0) {
1261 spin_unlock_irqrestore(&d->lock, flags); 1377 d->fw_ver = be16_to_cpu(ch->fwver);
1262 return; 1378 sl = aoecmd_ata_id(d);
1263 } 1379 }
1264 d->fw_ver = be16_to_cpu(ch->fwver); 1380bail:
1265
1266 sl = aoecmd_ata_id(d);
1267
1268 spin_unlock_irqrestore(&d->lock, flags); 1381 spin_unlock_irqrestore(&d->lock, flags);
1269 1382 aoedev_put(d);
1270 if (sl) { 1383 if (sl) {
1271 struct sk_buff_head queue;
1272 __skb_queue_head_init(&queue); 1384 __skb_queue_head_init(&queue);
1273 __skb_queue_tail(&queue, sl); 1385 __skb_queue_tail(&queue, sl);
1274 aoenet_xmit(&queue); 1386 aoenet_xmit(&queue);
@@ -1297,8 +1409,19 @@ aoecmd_cleanslate(struct aoedev *d)
1297 } 1409 }
1298} 1410}
1299 1411
1300static void 1412void
1301flush_iocq(void) 1413aoe_failbuf(struct aoedev *d, struct buf *buf)
1414{
1415 if (buf == NULL)
1416 return;
1417 buf->resid = 0;
1418 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1419 if (buf->nframesout == 0)
1420 aoe_end_buf(d, buf);
1421}
1422
1423void
1424aoe_flush_iocq(void)
1302{ 1425{
1303 struct frame *f; 1426 struct frame *f;
1304 struct aoedev *d; 1427 struct aoedev *d;
@@ -1324,6 +1447,7 @@ flush_iocq(void)
1324 aoe_freetframe(f); 1447 aoe_freetframe(f);
1325 spin_unlock_irqrestore(&d->lock, flags); 1448 spin_unlock_irqrestore(&d->lock, flags);
1326 dev_kfree_skb(skb); 1449 dev_kfree_skb(skb);
1450 aoedev_put(d);
1327 } 1451 }
1328} 1452}
1329 1453
@@ -1344,5 +1468,5 @@ void
1344aoecmd_exit(void) 1468aoecmd_exit(void)
1345{ 1469{
1346 aoe_ktstop(&kts); 1470 aoe_ktstop(&kts);
1347 flush_iocq(); 1471 aoe_flush_iocq();
1348} 1472}
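
aoe_end_buf() and aoe_end_request() above complete a struct request piecewise: nextbuf() stashes the request's bio count in rq->special, each finished buf decrements it, and the request is ended only when the count reaches zero, with -EIO if any bio lost its BIO_UPTODATE bit along the way. A hedged, isolated sketch of that counting scheme (the sketch_* names are illustrative; the real driver also handles the fast-fail path and mempool bookkeeping):

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Count the bios in a request up front so completion can wait for the
 * last one (the patch stores this count in rq->special via rqbiocnt()). */
static unsigned long sketch_rq_bio_count(struct request *rq)
{
	struct bio *bio;
	unsigned long n = 0;

	__rq_for_each_bio(bio, rq)
		n++;
	return n;
}

/* Called once per completed bio.  When the last bio finishes, end the
 * whole request, propagating -EIO for any bio marked not up to date. */
static void sketch_bio_done(struct request *rq)
{
	unsigned long n = (unsigned long) rq->special;
	struct bio *bio;
	int err;

	rq->special = (void *) --n;
	if (n)
		return;
	do {
		bio = rq->bio;
		err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
	} while (__blk_end_request(rq, err, bio->bi_size));
}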
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 40bae1a1ff1e..635dc986cf77 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d);
19static struct aoedev *devlist; 19static struct aoedev *devlist;
20static DEFINE_SPINLOCK(devlist_lock); 20static DEFINE_SPINLOCK(devlist_lock);
21 21
22/*
23 * Users who grab a pointer to the device with aoedev_by_aoeaddr or
24 * aoedev_by_sysminor_m automatically get a reference count and must
25 * be responsible for performing a aoedev_put. With the addition of
26 * async kthread processing I'm no longer confident that we can
27 * guarantee consistency in the face of device flushes.
28 *
29 * For the time being, we only bother to add extra references for
30 * frames sitting on the iocq. When the kthreads finish processing
31 * these frames, they will aoedev_put the device.
32 */
22struct aoedev * 33struct aoedev *
23aoedev_by_aoeaddr(int maj, int min) 34aoedev_by_aoeaddr(int maj, int min)
24{ 35{
@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min)
28 spin_lock_irqsave(&devlist_lock, flags); 39 spin_lock_irqsave(&devlist_lock, flags);
29 40
30 for (d=devlist; d; d=d->next) 41 for (d=devlist; d; d=d->next)
31 if (d->aoemajor == maj && d->aoeminor == min) 42 if (d->aoemajor == maj && d->aoeminor == min) {
43 d->ref++;
32 break; 44 break;
45 }
33 46
34 spin_unlock_irqrestore(&devlist_lock, flags); 47 spin_unlock_irqrestore(&devlist_lock, flags);
35 return d; 48 return d;
36} 49}
37 50
51void
52aoedev_put(struct aoedev *d)
53{
54 ulong flags;
55
56 spin_lock_irqsave(&devlist_lock, flags);
57 d->ref--;
58 spin_unlock_irqrestore(&devlist_lock, flags);
59}
60
38static void 61static void
39dummy_timer(ulong vp) 62dummy_timer(ulong vp)
40{ 63{
@@ -47,21 +70,26 @@ dummy_timer(ulong vp)
47 add_timer(&d->timer); 70 add_timer(&d->timer);
48} 71}
49 72
50void 73static void
51aoe_failbuf(struct aoedev *d, struct buf *buf) 74aoe_failip(struct aoedev *d)
52{ 75{
76 struct request *rq;
53 struct bio *bio; 77 struct bio *bio;
78 unsigned long n;
79
80 aoe_failbuf(d, d->ip.buf);
54 81
55 if (buf == NULL) 82 rq = d->ip.rq;
83 if (rq == NULL)
56 return; 84 return;
57 buf->flags |= BUFFL_FAIL; 85 while ((bio = d->ip.nxbio)) {
58 if (buf->nframesout == 0) { 86 clear_bit(BIO_UPTODATE, &bio->bi_flags);
59 if (buf == d->inprocess) /* ensure we only process this once */ 87 d->ip.nxbio = bio->bi_next;
60 d->inprocess = NULL; 88 n = (unsigned long) rq->special;
61 bio = buf->bio; 89 rq->special = (void *) --n;
62 mempool_free(buf, d->bufpool);
63 bio_endio(bio, -EIO);
64 } 90 }
91 if ((unsigned long) rq->special == 0)
92 aoe_end_request(d, rq, 0);
65} 93}
66 94
67void 95void
@@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d)
70 struct aoetgt *t, **tt, **te; 98 struct aoetgt *t, **tt, **te;
71 struct frame *f; 99 struct frame *f;
72 struct list_head *head, *pos, *nx; 100 struct list_head *head, *pos, *nx;
101 struct request *rq;
73 int i; 102 int i;
74 103
104 d->flags &= ~DEVFL_UP;
105
75 /* clean out active buffers on all targets */ 106 /* clean out active buffers on all targets */
76 tt = d->targets; 107 tt = d->targets;
77 te = tt + NTARGETS; 108 te = tt + NTARGETS;
@@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d)
92 t->nout = 0; 123 t->nout = 0;
93 } 124 }
94 125
95 /* clean out the in-process buffer (if any) */ 126 /* clean out the in-process request (if any) */
96 aoe_failbuf(d, d->inprocess); 127 aoe_failip(d);
97 d->inprocess = NULL;
98 d->htgt = NULL; 128 d->htgt = NULL;
99 129
100 /* clean out all pending I/O */ 130 /* fast fail all pending I/O */
101 while (!list_empty(&d->bufq)) { 131 if (d->blkq) {
102 struct buf *buf = container_of(d->bufq.next, struct buf, bufs); 132 while ((rq = blk_peek_request(d->blkq))) {
103 list_del(d->bufq.next); 133 blk_start_request(rq);
104 aoe_failbuf(d, buf); 134 aoe_end_request(d, rq, 1);
135 }
105 } 136 }
106 137
107 if (d->gd) 138 if (d->gd)
108 set_capacity(d->gd, 0); 139 set_capacity(d->gd, 0);
109
110 d->flags &= ~DEVFL_UP;
111} 140}
112 141
113static void 142static void
@@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d)
120 aoedisk_rm_sysfs(d); 149 aoedisk_rm_sysfs(d);
121 del_gendisk(d->gd); 150 del_gendisk(d->gd);
122 put_disk(d->gd); 151 put_disk(d->gd);
152 blk_cleanup_queue(d->blkq);
123 } 153 }
124 t = d->targets; 154 t = d->targets;
125 e = t + NTARGETS; 155 e = t + NTARGETS;
@@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d)
128 if (d->bufpool) 158 if (d->bufpool)
129 mempool_destroy(d->bufpool); 159 mempool_destroy(d->bufpool);
130 skbpoolfree(d); 160 skbpoolfree(d);
131 blk_cleanup_queue(d->blkq);
132 kfree(d); 161 kfree(d);
133} 162}
134 163
@@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt)
155 spin_lock(&d->lock); 184 spin_lock(&d->lock);
156 if ((!all && (d->flags & DEVFL_UP)) 185 if ((!all && (d->flags & DEVFL_UP))
157 || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) 186 || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
158 || d->nopen) { 187 || d->nopen
188 || d->ref) {
159 spin_unlock(&d->lock); 189 spin_unlock(&d->lock);
160 dd = &d->next; 190 dd = &d->next;
161 continue; 191 continue;
@@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt)
176 return 0; 206 return 0;
177} 207}
178 208
179/* I'm not really sure that this is a realistic problem, but if the 209/* This has been confirmed to occur once with Tms=3*1000 due to the
180network driver goes gonzo let's just leak memory after complaining. */ 210 * driver changing link and not processing its transmit ring. The
211 * problem is hard enough to solve by returning an error that I'm
212 * still punting on "solving" this.
213 */
181static void 214static void
182skbfree(struct sk_buff *skb) 215skbfree(struct sk_buff *skb)
183{ 216{
184 enum { Sms = 100, Tms = 3*1000}; 217 enum { Sms = 250, Tms = 30 * 1000};
185 int i = Tms / Sms; 218 int i = Tms / Sms;
186 219
187 if (skb == NULL) 220 if (skb == NULL)
@@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor)
222 spin_lock_irqsave(&devlist_lock, flags); 255 spin_lock_irqsave(&devlist_lock, flags);
223 256
224 for (d=devlist; d; d=d->next) 257 for (d=devlist; d; d=d->next)
225 if (d->sysminor == sysminor) 258 if (d->sysminor == sysminor) {
259 d->ref++;
226 break; 260 break;
261 }
227 if (d) 262 if (d)
228 goto out; 263 goto out;
229 d = kcalloc(1, sizeof *d, GFP_ATOMIC); 264 d = kcalloc(1, sizeof *d, GFP_ATOMIC);
@@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor)
231 goto out; 266 goto out;
232 INIT_WORK(&d->work, aoecmd_sleepwork); 267 INIT_WORK(&d->work, aoecmd_sleepwork);
233 spin_lock_init(&d->lock); 268 spin_lock_init(&d->lock);
234 skb_queue_head_init(&d->sendq);
235 skb_queue_head_init(&d->skbpool); 269 skb_queue_head_init(&d->skbpool);
236 init_timer(&d->timer); 270 init_timer(&d->timer);
237 d->timer.data = (ulong) d; 271 d->timer.data = (ulong) d;
@@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor)
240 add_timer(&d->timer); 274 add_timer(&d->timer);
241 d->bufpool = NULL; /* defer to aoeblk_gdalloc */ 275 d->bufpool = NULL; /* defer to aoeblk_gdalloc */
242 d->tgt = d->targets; 276 d->tgt = d->targets;
243 INIT_LIST_HEAD(&d->bufq); 277 d->ref = 1;
244 d->sysminor = sysminor; 278 d->sysminor = sysminor;
245 d->aoemajor = AOEMAJOR(sysminor); 279 d->aoemajor = AOEMAJOR(sysminor);
246 d->aoeminor = AOEMINOR(sysminor); 280 d->aoeminor = AOEMINOR(sysminor);
@@ -274,6 +308,7 @@ aoedev_exit(void)
274 struct aoedev *d; 308 struct aoedev *d;
275 ulong flags; 309 ulong flags;
276 310
311 aoe_flush_iocq();
277 while ((d = devlist)) { 312 while ((d = devlist)) {
278 devlist = d->next; 313 devlist = d->next;
279 314