Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/Kconfig               |   12
-rw-r--r--  drivers/block/Makefile              |    1
-rw-r--r--  drivers/block/aoe/aoe.h             |   93
-rw-r--r--  drivers/block/aoe/aoeblk.c          |   91
-rw-r--r--  drivers/block/aoe/aoechr.c          |   13
-rw-r--r--  drivers/block/aoe/aoecmd.c          | 1233
-rw-r--r--  drivers/block/aoe/aoedev.c          |  265
-rw-r--r--  drivers/block/aoe/aoemain.c         |   10
-rw-r--r--  drivers/block/aoe/aoenet.c          |   61
-rw-r--r--  drivers/block/cciss_scsi.c          |    1
-rw-r--r--  drivers/block/floppy.c              |    5
-rw-r--r--  drivers/block/loop.c                |    4
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c   |   38
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.h   |   10
-rw-r--r--  drivers/block/nbd.c                 |   32
-rw-r--r--  drivers/block/nvme.c                |  155
-rw-r--r--  drivers/block/rbd.c                 | 1789
-rw-r--r--  drivers/block/rbd_types.h           |   27
-rw-r--r--  drivers/block/ub.c                  | 2474
-rw-r--r--  drivers/block/virtio_blk.c          |  306
-rw-r--r--  drivers/block/xen-blkback/blkback.c |    3
-rw-r--r--  drivers/block/xen-blkfront.c        |    4
22 files changed, 2802 insertions, 3825 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a796407123c7..f529407db93f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -353,18 +353,6 @@ config BLK_DEV_SX8
353 353
354 Use devices /dev/sx8/$N and /dev/sx8/$Np$M. 354 Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
355 355
356config BLK_DEV_UB
357 tristate "Low Performance USB Block driver (deprecated)"
358 depends on USB
359 help
360 This driver supports certain USB attached storage devices
361 such as flash keys.
362
363 If you enable this driver, it is recommended to avoid conflicts
364 with usb-storage by enabling USB_LIBUSUAL.
365
366 If unsure, say N.
367
368config BLK_DEV_RAM 356config BLK_DEV_RAM
369 tristate "RAM block device support" 357 tristate "RAM block device support"
370 ---help--- 358 ---help---
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 5b795059f8fb..17e82df3df74 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
33 33
34obj-$(CONFIG_VIODASD) += viodasd.o 34obj-$(CONFIG_VIODASD) += viodasd.o
35obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 35obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
36obj-$(CONFIG_BLK_DEV_UB) += ub.o
37obj-$(CONFIG_BLK_DEV_HD) += hd.o 36obj-$(CONFIG_BLK_DEV_HD) += hd.o
38 37
39obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index db195abad698..d2ed7f18d1ac 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,5 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2#define VERSION "47" 2#define VERSION "50"
3#define AOE_MAJOR 152 3#define AOE_MAJOR 152
4#define DEVICE_NAME "aoe" 4#define DEVICE_NAME "aoe"
5 5
@@ -10,9 +10,6 @@
10#define AOE_PARTITIONS (16) 10#define AOE_PARTITIONS (16)
11#endif 11#endif
12 12
13#define SYSMINOR(aoemajor, aoeminor) ((aoemajor) * NPERSHELF + (aoeminor))
14#define AOEMAJOR(sysminor) ((sysminor) / NPERSHELF)
15#define AOEMINOR(sysminor) ((sysminor) % NPERSHELF)
16#define WHITESPACE " \t\v\f\n" 13#define WHITESPACE " \t\v\f\n"
17 14
18enum { 15enum {
@@ -75,72 +72,67 @@ enum {
75 DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */ 72 DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */
76 DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ 73 DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */
77 DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ 74 DEVFL_EXT = (1<<2), /* device accepts lba48 commands */
78 DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */ 75 DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */
79 DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */ 76 DEVFL_KICKME = (1<<4), /* slow polling network card catch */
80 DEVFL_KICKME = (1<<5), /* slow polling network card catch */ 77 DEVFL_NEWSIZE = (1<<5), /* need to update dev size in block layer */
81 DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
82
83 BUFFL_FAIL = 1,
84}; 78};
85 79
86enum { 80enum {
87 DEFAULTBCNT = 2 * 512, /* 2 sectors */ 81 DEFAULTBCNT = 2 * 512, /* 2 sectors */
88 NPERSHELF = 16, /* number of slots per shelf address */
89 FREETAG = -1,
90 MIN_BUFS = 16, 82 MIN_BUFS = 16,
91 NTARGETS = 8, 83 NTARGETS = 8,
92 NAOEIFS = 8, 84 NAOEIFS = 8,
93 NSKBPOOLMAX = 128, 85 NSKBPOOLMAX = 256,
86 NFACTIVE = 61,
94 87
95 TIMERTICK = HZ / 10, 88 TIMERTICK = HZ / 10,
96 MINTIMER = HZ >> 2, 89 MINTIMER = HZ >> 2,
97 MAXTIMER = HZ << 1, 90 MAXTIMER = HZ << 1,
98 HELPWAIT = 20,
99}; 91};
100 92
101struct buf { 93struct buf {
102 struct list_head bufs;
103 ulong stime; /* for disk stats */
104 ulong flags;
105 ulong nframesout; 94 ulong nframesout;
106 ulong resid; 95 ulong resid;
107 ulong bv_resid; 96 ulong bv_resid;
108 ulong bv_off;
109 sector_t sector; 97 sector_t sector;
110 struct bio *bio; 98 struct bio *bio;
111 struct bio_vec *bv; 99 struct bio_vec *bv;
100 struct request *rq;
112}; 101};
113 102
114struct frame { 103struct frame {
115 int tag; 104 struct list_head head;
105 u32 tag;
116 ulong waited; 106 ulong waited;
107 struct aoetgt *t; /* parent target I belong to */
108 sector_t lba;
109 struct sk_buff *skb; /* command skb freed on module exit */
110 struct sk_buff *r_skb; /* response skb for async processing */
117 struct buf *buf; 111 struct buf *buf;
118 char *bufaddr; 112 struct bio_vec *bv;
119 ulong bcnt; 113 ulong bcnt;
120 sector_t lba; 114 ulong bv_off;
121 struct sk_buff *skb;
122}; 115};
123 116
124struct aoeif { 117struct aoeif {
125 struct net_device *nd; 118 struct net_device *nd;
126 unsigned char lost; 119 ulong lost;
127 unsigned char lostjumbo; 120 int bcnt;
128 ushort maxbcnt;
129}; 121};
130 122
131struct aoetgt { 123struct aoetgt {
132 unsigned char addr[6]; 124 unsigned char addr[6];
133 ushort nframes; 125 ushort nframes;
134 struct frame *frames; 126 struct aoedev *d; /* parent device I belong to */
127 struct list_head ffree; /* list of free frames */
135 struct aoeif ifs[NAOEIFS]; 128 struct aoeif ifs[NAOEIFS];
136 struct aoeif *ifp; /* current aoeif in use */ 129 struct aoeif *ifp; /* current aoeif in use */
137 ushort nout; 130 ushort nout;
138 ushort maxout; 131 ushort maxout;
139 u16 lasttag; /* last tag sent */ 132 ulong falloc;
140 u16 useme;
141 ulong lastwadj; /* last window adjustment */ 133 ulong lastwadj; /* last window adjustment */
134 int minbcnt;
142 int wpkts, rpkts; 135 int wpkts, rpkts;
143 int dataref;
144}; 136};
145 137
146struct aoedev { 138struct aoedev {
@@ -153,6 +145,9 @@ struct aoedev {
153 u16 rttavg; /* round trip average of requests/responses */ 145 u16 rttavg; /* round trip average of requests/responses */
154 u16 mintimer; 146 u16 mintimer;
155 u16 fw_ver; /* version of blade's firmware */ 147 u16 fw_ver; /* version of blade's firmware */
148 u16 lasttag; /* last tag sent */
149 u16 useme;
150 ulong ref;
156 struct work_struct work;/* disk create work struct */ 151 struct work_struct work;/* disk create work struct */
157 struct gendisk *gd; 152 struct gendisk *gd;
158 struct request_queue *blkq; 153 struct request_queue *blkq;
@@ -160,16 +155,31 @@ struct aoedev {
160 sector_t ssize; 155 sector_t ssize;
161 struct timer_list timer; 156 struct timer_list timer;
162 spinlock_t lock; 157 spinlock_t lock;
163 struct sk_buff_head sendq;
164 struct sk_buff_head skbpool; 158 struct sk_buff_head skbpool;
165 mempool_t *bufpool; /* for deadlock-free Buf allocation */ 159 mempool_t *bufpool; /* for deadlock-free Buf allocation */
166 struct list_head bufq; /* queue of bios to work on */ 160 struct { /* pointers to work in progress */
167 struct buf *inprocess; /* the one we're currently working on */ 161 struct buf *buf;
162 struct bio *nxbio;
163 struct request *rq;
164 } ip;
165 ulong maxbcnt;
166 struct list_head factive[NFACTIVE]; /* hash of active frames */
168 struct aoetgt *targets[NTARGETS]; 167 struct aoetgt *targets[NTARGETS];
169 struct aoetgt **tgt; /* target in use when working */ 168 struct aoetgt **tgt; /* target in use when working */
170 struct aoetgt **htgt; /* target needing rexmit assistance */ 169 struct aoetgt *htgt; /* target needing rexmit assistance */
170 ulong ntargets;
171 ulong kicked;
171}; 172};
172 173
174/* kthread tracking */
175struct ktstate {
176 struct completion rendez;
177 struct task_struct *task;
178 wait_queue_head_t *waitq;
179 int (*fn) (void);
180 char *name;
181 spinlock_t *lock;
182};
173 183
174int aoeblk_init(void); 184int aoeblk_init(void);
175void aoeblk_exit(void); 185void aoeblk_exit(void);
@@ -182,22 +192,29 @@ void aoechr_error(char *);
182 192
183void aoecmd_work(struct aoedev *d); 193void aoecmd_work(struct aoedev *d);
184void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor); 194void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
185void aoecmd_ata_rsp(struct sk_buff *); 195struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
186void aoecmd_cfg_rsp(struct sk_buff *); 196void aoecmd_cfg_rsp(struct sk_buff *);
187void aoecmd_sleepwork(struct work_struct *); 197void aoecmd_sleepwork(struct work_struct *);
188void aoecmd_cleanslate(struct aoedev *); 198void aoecmd_cleanslate(struct aoedev *);
199void aoecmd_exit(void);
200int aoecmd_init(void);
189struct sk_buff *aoecmd_ata_id(struct aoedev *); 201struct sk_buff *aoecmd_ata_id(struct aoedev *);
202void aoe_freetframe(struct frame *);
203void aoe_flush_iocq(void);
204void aoe_end_request(struct aoedev *, struct request *, int);
205int aoe_ktstart(struct ktstate *k);
206void aoe_ktstop(struct ktstate *k);
190 207
191int aoedev_init(void); 208int aoedev_init(void);
192void aoedev_exit(void); 209void aoedev_exit(void);
193struct aoedev *aoedev_by_aoeaddr(int maj, int min); 210struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc);
194struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
195void aoedev_downdev(struct aoedev *d); 211void aoedev_downdev(struct aoedev *d);
196int aoedev_flush(const char __user *str, size_t size); 212int aoedev_flush(const char __user *str, size_t size);
213void aoe_failbuf(struct aoedev *, struct buf *);
214void aoedev_put(struct aoedev *);
197 215
198int aoenet_init(void); 216int aoenet_init(void);
199void aoenet_exit(void); 217void aoenet_exit(void);
200void aoenet_xmit(struct sk_buff_head *); 218void aoenet_xmit(struct sk_buff_head *);
201int is_aoe_netif(struct net_device *ifp); 219int is_aoe_netif(struct net_device *ifp);
202int set_aoe_iflist(const char __user *str, size_t size); 220int set_aoe_iflist(const char __user *str, size_t size);
203
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 321de7b6c442..00dfc5008ad4 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoeblk.c 3 * aoeblk.c
4 * block device routines 4 * block device routines
@@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
161} 161}
162 162
163static void 163static void
164aoeblk_make_request(struct request_queue *q, struct bio *bio) 164aoeblk_request(struct request_queue *q)
165{ 165{
166 struct sk_buff_head queue;
167 struct aoedev *d; 166 struct aoedev *d;
168 struct buf *buf; 167 struct request *rq;
169 ulong flags;
170
171 blk_queue_bounce(q, &bio);
172
173 if (bio == NULL) {
174 printk(KERN_ERR "aoe: bio is NULL\n");
175 BUG();
176 return;
177 }
178 d = bio->bi_bdev->bd_disk->private_data;
179 if (d == NULL) {
180 printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
181 BUG();
182 bio_endio(bio, -ENXIO);
183 return;
184 } else if (bio->bi_io_vec == NULL) {
185 printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
186 BUG();
187 bio_endio(bio, -ENXIO);
188 return;
189 }
190 buf = mempool_alloc(d->bufpool, GFP_NOIO);
191 if (buf == NULL) {
192 printk(KERN_INFO "aoe: buf allocation failure\n");
193 bio_endio(bio, -ENOMEM);
194 return;
195 }
196 memset(buf, 0, sizeof(*buf));
197 INIT_LIST_HEAD(&buf->bufs);
198 buf->stime = jiffies;
199 buf->bio = bio;
200 buf->resid = bio->bi_size;
201 buf->sector = bio->bi_sector;
202 buf->bv = &bio->bi_io_vec[bio->bi_idx];
203 buf->bv_resid = buf->bv->bv_len;
204 WARN_ON(buf->bv_resid == 0);
205 buf->bv_off = buf->bv->bv_offset;
206
207 spin_lock_irqsave(&d->lock, flags);
208 168
169 d = q->queuedata;
209 if ((d->flags & DEVFL_UP) == 0) { 170 if ((d->flags & DEVFL_UP) == 0) {
210 pr_info_ratelimited("aoe: device %ld.%d is not up\n", 171 pr_info_ratelimited("aoe: device %ld.%d is not up\n",
211 d->aoemajor, d->aoeminor); 172 d->aoemajor, d->aoeminor);
212 spin_unlock_irqrestore(&d->lock, flags); 173 while ((rq = blk_peek_request(q))) {
213 mempool_free(buf, d->bufpool); 174 blk_start_request(rq);
214 bio_endio(bio, -ENXIO); 175 aoe_end_request(d, rq, 1);
176 }
215 return; 177 return;
216 } 178 }
217
218 list_add_tail(&buf->bufs, &d->bufq);
219
220 aoecmd_work(d); 179 aoecmd_work(d);
221 __skb_queue_head_init(&queue);
222 skb_queue_splice_init(&d->sendq, &queue);
223
224 spin_unlock_irqrestore(&d->lock, flags);
225 aoenet_xmit(&queue);
226} 180}
227 181
228static int 182static int
@@ -254,41 +208,54 @@ aoeblk_gdalloc(void *vp)
254{ 208{
255 struct aoedev *d = vp; 209 struct aoedev *d = vp;
256 struct gendisk *gd; 210 struct gendisk *gd;
211 mempool_t *mp;
212 struct request_queue *q;
213 enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
257 ulong flags; 214 ulong flags;
258 215
259 gd = alloc_disk(AOE_PARTITIONS); 216 gd = alloc_disk(AOE_PARTITIONS);
260 if (gd == NULL) { 217 if (gd == NULL) {
261 printk(KERN_ERR 218 pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
262 "aoe: cannot allocate disk structure for %ld.%d\n",
263 d->aoemajor, d->aoeminor); 219 d->aoemajor, d->aoeminor);
264 goto err; 220 goto err;
265 } 221 }
266 222
267 d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); 223 mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
268 if (d->bufpool == NULL) { 224 buf_pool_cache);
225 if (mp == NULL) {
269 printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", 226 printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
270 d->aoemajor, d->aoeminor); 227 d->aoemajor, d->aoeminor);
271 goto err_disk; 228 goto err_disk;
272 } 229 }
230 q = blk_init_queue(aoeblk_request, &d->lock);
231 if (q == NULL) {
232 pr_err("aoe: cannot allocate block queue for %ld.%d\n",
233 d->aoemajor, d->aoeminor);
234 mempool_destroy(mp);
235 goto err_disk;
236 }
273 237
274 d->blkq = blk_alloc_queue(GFP_KERNEL); 238 d->blkq = blk_alloc_queue(GFP_KERNEL);
275 if (!d->blkq) 239 if (!d->blkq)
276 goto err_mempool; 240 goto err_mempool;
277 blk_queue_make_request(d->blkq, aoeblk_make_request);
278 d->blkq->backing_dev_info.name = "aoe"; 241 d->blkq->backing_dev_info.name = "aoe";
279 if (bdi_init(&d->blkq->backing_dev_info)) 242 if (bdi_init(&d->blkq->backing_dev_info))
280 goto err_blkq; 243 goto err_blkq;
281 spin_lock_irqsave(&d->lock, flags); 244 spin_lock_irqsave(&d->lock, flags);
245 blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
246 q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
247 d->bufpool = mp;
248 d->blkq = gd->queue = q;
249 q->queuedata = d;
250 d->gd = gd;
282 gd->major = AOE_MAJOR; 251 gd->major = AOE_MAJOR;
283 gd->first_minor = d->sysminor * AOE_PARTITIONS; 252 gd->first_minor = d->sysminor;
284 gd->fops = &aoe_bdops; 253 gd->fops = &aoe_bdops;
285 gd->private_data = d; 254 gd->private_data = d;
286 set_capacity(gd, d->ssize); 255 set_capacity(gd, d->ssize);
287 snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", 256 snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
288 d->aoemajor, d->aoeminor); 257 d->aoemajor, d->aoeminor);
289 258
290 gd->queue = d->blkq;
291 d->gd = gd;
292 d->flags &= ~DEVFL_GDALLOC; 259 d->flags &= ~DEVFL_GDALLOC;
293 d->flags |= DEVFL_UP; 260 d->flags |= DEVFL_UP;
294 261
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index e86d2062a164..ed57a890c643 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoechr.c 3 * aoechr.c
4 * AoE character device driver 4 * AoE character device driver
@@ -86,34 +86,34 @@ revalidate(const char __user *str, size_t size)
86 if (copy_from_user(buf, str, size)) 86 if (copy_from_user(buf, str, size))
87 return -EFAULT; 87 return -EFAULT;
88 88
89 /* should be e%d.%d format */
90 n = sscanf(buf, "e%d.%d", &major, &minor); 89 n = sscanf(buf, "e%d.%d", &major, &minor);
91 if (n != 2) { 90 if (n != 2) {
92 printk(KERN_ERR "aoe: invalid device specification\n"); 91 pr_err("aoe: invalid device specification %s\n", buf);
93 return -EINVAL; 92 return -EINVAL;
94 } 93 }
95 d = aoedev_by_aoeaddr(major, minor); 94 d = aoedev_by_aoeaddr(major, minor, 0);
96 if (!d) 95 if (!d)
97 return -EINVAL; 96 return -EINVAL;
98 spin_lock_irqsave(&d->lock, flags); 97 spin_lock_irqsave(&d->lock, flags);
99 aoecmd_cleanslate(d); 98 aoecmd_cleanslate(d);
99 aoecmd_cfg(major, minor);
100loop: 100loop:
101 skb = aoecmd_ata_id(d); 101 skb = aoecmd_ata_id(d);
102 spin_unlock_irqrestore(&d->lock, flags); 102 spin_unlock_irqrestore(&d->lock, flags);
103 /* try again if we are able to sleep a bit, 103 /* try again if we are able to sleep a bit,
104 * otherwise give up this revalidation 104 * otherwise give up this revalidation
105 */ 105 */
106 if (!skb && !msleep_interruptible(200)) { 106 if (!skb && !msleep_interruptible(250)) {
107 spin_lock_irqsave(&d->lock, flags); 107 spin_lock_irqsave(&d->lock, flags);
108 goto loop; 108 goto loop;
109 } 109 }
110 aoedev_put(d);
110 if (skb) { 111 if (skb) {
111 struct sk_buff_head queue; 112 struct sk_buff_head queue;
112 __skb_queue_head_init(&queue); 113 __skb_queue_head_init(&queue);
113 __skb_queue_tail(&queue, skb); 114 __skb_queue_tail(&queue, skb);
114 aoenet_xmit(&queue); 115 aoenet_xmit(&queue);
115 } 116 }
116 aoecmd_cfg(major, minor);
117 return 0; 117 return 0;
118} 118}
119 119
@@ -174,6 +174,7 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp
174 break; 174 break;
175 case MINOR_FLUSH: 175 case MINOR_FLUSH:
176 ret = aoedev_flush(buf, cnt); 176 ret = aoedev_flush(buf, cnt);
177 break;
177 } 178 }
178 if (ret == 0) 179 if (ret == 0)
179 ret = cnt; 180 ret = cnt;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index de0435e63b02..3804a0af3ef1 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoecmd.c 3 * aoecmd.c
4 * Filesystem request handling methods 4 * Filesystem request handling methods
@@ -12,10 +12,19 @@
12#include <linux/netdevice.h> 12#include <linux/netdevice.h>
13#include <linux/genhd.h> 13#include <linux/genhd.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/workqueue.h>
16#include <linux/kthread.h>
15#include <net/net_namespace.h> 17#include <net/net_namespace.h>
16#include <asm/unaligned.h> 18#include <asm/unaligned.h>
19#include <linux/uio.h>
17#include "aoe.h" 20#include "aoe.h"
18 21
22#define MAXIOC (8192) /* default meant to avoid most soft lockups */
23
24static void ktcomplete(struct frame *, struct sk_buff *);
25
26static struct buf *nextbuf(struct aoedev *);
27
19static int aoe_deadsecs = 60 * 3; 28static int aoe_deadsecs = 60 * 3;
20module_param(aoe_deadsecs, int, 0644); 29module_param(aoe_deadsecs, int, 0644);
21MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); 30MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
@@ -25,6 +34,15 @@ module_param(aoe_maxout, int, 0644);
25MODULE_PARM_DESC(aoe_maxout, 34MODULE_PARM_DESC(aoe_maxout,
26 "Only aoe_maxout outstanding packets for every MAC on eX.Y."); 35 "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
27 36
37static wait_queue_head_t ktiowq;
38static struct ktstate kts;
39
40/* io completion queue */
41static struct {
42 struct list_head head;
43 spinlock_t lock;
44} iocq;
45
28static struct sk_buff * 46static struct sk_buff *
29new_skb(ulong len) 47new_skb(ulong len)
30{ 48{
@@ -35,20 +53,27 @@ new_skb(ulong len)
35 skb_reset_mac_header(skb); 53 skb_reset_mac_header(skb);
36 skb_reset_network_header(skb); 54 skb_reset_network_header(skb);
37 skb->protocol = __constant_htons(ETH_P_AOE); 55 skb->protocol = __constant_htons(ETH_P_AOE);
56 skb_checksum_none_assert(skb);
38 } 57 }
39 return skb; 58 return skb;
40} 59}
41 60
42static struct frame * 61static struct frame *
43getframe(struct aoetgt *t, int tag) 62getframe(struct aoedev *d, u32 tag)
44{ 63{
45 struct frame *f, *e; 64 struct frame *f;
65 struct list_head *head, *pos, *nx;
66 u32 n;
46 67
47 f = t->frames; 68 n = tag % NFACTIVE;
48 e = f + t->nframes; 69 head = &d->factive[n];
49 for (; f<e; f++) 70 list_for_each_safe(pos, nx, head) {
50 if (f->tag == tag) 71 f = list_entry(pos, struct frame, head);
72 if (f->tag == tag) {
73 list_del(pos);
51 return f; 74 return f;
75 }
76 }
52 return NULL; 77 return NULL;
53} 78}
54 79
@@ -58,18 +83,18 @@ getframe(struct aoetgt *t, int tag)
58 * This driver reserves tag -1 to mean "unused frame." 83 * This driver reserves tag -1 to mean "unused frame."
59 */ 84 */
60static int 85static int
61newtag(struct aoetgt *t) 86newtag(struct aoedev *d)
62{ 87{
63 register ulong n; 88 register ulong n;
64 89
65 n = jiffies & 0xffff; 90 n = jiffies & 0xffff;
66 return n |= (++t->lasttag & 0x7fff) << 16; 91 return n |= (++d->lasttag & 0x7fff) << 16;
67} 92}
68 93
69static int 94static u32
70aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) 95aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
71{ 96{
72 u32 host_tag = newtag(t); 97 u32 host_tag = newtag(d);
73 98
74 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); 99 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
75 memcpy(h->dst, t->addr, sizeof h->dst); 100 memcpy(h->dst, t->addr, sizeof h->dst);
@@ -94,16 +119,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba)
94 ah->lba5 = lba >>= 8; 119 ah->lba5 = lba >>= 8;
95} 120}
96 121
97static void 122static struct aoeif *
98ifrotate(struct aoetgt *t) 123ifrotate(struct aoetgt *t)
99{ 124{
100 t->ifp++; 125 struct aoeif *ifp;
101 if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL) 126
102 t->ifp = t->ifs; 127 ifp = t->ifp;
103 if (t->ifp->nd == NULL) { 128 ifp++;
104 printk(KERN_INFO "aoe: no interface to rotate to\n"); 129 if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
105 BUG(); 130 ifp = t->ifs;
106 } 131 if (ifp->nd == NULL)
132 return NULL;
133 return t->ifp = ifp;
107} 134}
108 135
109static void 136static void
@@ -128,78 +155,128 @@ skb_pool_get(struct aoedev *d)
128 return NULL; 155 return NULL;
129} 156}
130 157
131/* freeframe is where we do our load balancing so it's a little hairy. */ 158void
159aoe_freetframe(struct frame *f)
160{
161 struct aoetgt *t;
162
163 t = f->t;
164 f->buf = NULL;
165 f->bv = NULL;
166 f->r_skb = NULL;
167 list_add(&f->head, &t->ffree);
168}
169
132static struct frame * 170static struct frame *
133freeframe(struct aoedev *d) 171newtframe(struct aoedev *d, struct aoetgt *t)
134{ 172{
135 struct frame *f, *e, *rf; 173 struct frame *f;
136 struct aoetgt **t;
137 struct sk_buff *skb; 174 struct sk_buff *skb;
175 struct list_head *pos;
176
177 if (list_empty(&t->ffree)) {
178 if (t->falloc >= NSKBPOOLMAX*2)
179 return NULL;
180 f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
181 if (f == NULL)
182 return NULL;
183 t->falloc++;
184 f->t = t;
185 } else {
186 pos = t->ffree.next;
187 list_del(pos);
188 f = list_entry(pos, struct frame, head);
189 }
190
191 skb = f->skb;
192 if (skb == NULL) {
193 f->skb = skb = new_skb(ETH_ZLEN);
194 if (!skb) {
195bail: aoe_freetframe(f);
196 return NULL;
197 }
198 }
199
200 if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
201 skb = skb_pool_get(d);
202 if (skb == NULL)
203 goto bail;
204 skb_pool_put(d, f->skb);
205 f->skb = skb;
206 }
207
208 skb->truesize -= skb->data_len;
209 skb_shinfo(skb)->nr_frags = skb->data_len = 0;
210 skb_trim(skb, 0);
211 return f;
212}
213
214static struct frame *
215newframe(struct aoedev *d)
216{
217 struct frame *f;
218 struct aoetgt *t, **tt;
219 int totout = 0;
138 220
139 if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */ 221 if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
140 printk(KERN_ERR "aoe: NULL TARGETS!\n"); 222 printk(KERN_ERR "aoe: NULL TARGETS!\n");
141 return NULL; 223 return NULL;
142 } 224 }
143 t = d->tgt; 225 tt = d->tgt; /* last used target */
144 t++;
145 if (t >= &d->targets[NTARGETS] || !*t)
146 t = d->targets;
147 for (;;) { 226 for (;;) {
148 if ((*t)->nout < (*t)->maxout 227 tt++;
228 if (tt >= &d->targets[NTARGETS] || !*tt)
229 tt = d->targets;
230 t = *tt;
231 totout += t->nout;
232 if (t->nout < t->maxout
149 && t != d->htgt 233 && t != d->htgt
150 && (*t)->ifp->nd) { 234 && t->ifp->nd) {
151 rf = NULL; 235 f = newtframe(d, t);
152 f = (*t)->frames; 236 if (f) {
153 e = f + (*t)->nframes; 237 ifrotate(t);
154 for (; f < e; f++) { 238 d->tgt = tt;
155 if (f->tag != FREETAG)
156 continue;
157 skb = f->skb;
158 if (!skb
159 && !(f->skb = skb = new_skb(ETH_ZLEN)))
160 continue;
161 if (atomic_read(&skb_shinfo(skb)->dataref)
162 != 1) {
163 if (!rf)
164 rf = f;
165 continue;
166 }
167gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0;
168 skb_trim(skb, 0);
169 d->tgt = t;
170 ifrotate(*t);
171 return f; 239 return f;
172 } 240 }
173 /* Work can be done, but the network layer is
174 holding our precious packets. Try to grab
175 one from the pool. */
176 f = rf;
177 if (f == NULL) { /* more paranoia */
178 printk(KERN_ERR
179 "aoe: freeframe: %s.\n",
180 "unexpected null rf");
181 d->flags |= DEVFL_KICKME;
182 return NULL;
183 }
184 skb = skb_pool_get(d);
185 if (skb) {
186 skb_pool_put(d, f->skb);
187 f->skb = skb;
188 goto gotone;
189 }
190 (*t)->dataref++;
191 if ((*t)->nout == 0)
192 d->flags |= DEVFL_KICKME;
193 } 241 }
194 if (t == d->tgt) /* we've looped and found nada */ 242 if (tt == d->tgt) /* we've looped and found nada */
195 break; 243 break;
196 t++; 244 }
197 if (t >= &d->targets[NTARGETS] || !*t) 245 if (totout == 0) {
198 t = d->targets; 246 d->kicked++;
247 d->flags |= DEVFL_KICKME;
199 } 248 }
200 return NULL; 249 return NULL;
201} 250}
202 251
252static void
253skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
254{
255 int frag = 0;
256 ulong fcnt;
257loop:
258 fcnt = bv->bv_len - (off - bv->bv_offset);
259 if (fcnt > cnt)
260 fcnt = cnt;
261 skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
262 cnt -= fcnt;
263 if (cnt <= 0)
264 return;
265 bv++;
266 off = bv->bv_offset;
267 goto loop;
268}
269
270static void
271fhash(struct frame *f)
272{
273 struct aoedev *d = f->t->d;
274 u32 n;
275
276 n = f->tag % NFACTIVE;
277 list_add_tail(&f->head, &d->factive[n]);
278}
279
203static int 280static int
204aoecmd_ata_rw(struct aoedev *d) 281aoecmd_ata_rw(struct aoedev *d)
205{ 282{
@@ -207,26 +284,47 @@ aoecmd_ata_rw(struct aoedev *d)
207 struct aoe_hdr *h; 284 struct aoe_hdr *h;
208 struct aoe_atahdr *ah; 285 struct aoe_atahdr *ah;
209 struct buf *buf; 286 struct buf *buf;
210 struct bio_vec *bv;
211 struct aoetgt *t; 287 struct aoetgt *t;
212 struct sk_buff *skb; 288 struct sk_buff *skb;
213 ulong bcnt; 289 struct sk_buff_head queue;
290 ulong bcnt, fbcnt;
214 char writebit, extbit; 291 char writebit, extbit;
215 292
216 writebit = 0x10; 293 writebit = 0x10;
217 extbit = 0x4; 294 extbit = 0x4;
218 295
219 f = freeframe(d); 296 buf = nextbuf(d);
297 if (buf == NULL)
298 return 0;
299 f = newframe(d);
220 if (f == NULL) 300 if (f == NULL)
221 return 0; 301 return 0;
222 t = *d->tgt; 302 t = *d->tgt;
223 buf = d->inprocess; 303 bcnt = d->maxbcnt;
224 bv = buf->bv;
225 bcnt = t->ifp->maxbcnt;
226 if (bcnt == 0) 304 if (bcnt == 0)
227 bcnt = DEFAULTBCNT; 305 bcnt = DEFAULTBCNT;
228 if (bcnt > buf->bv_resid) 306 if (bcnt > buf->resid)
229 bcnt = buf->bv_resid; 307 bcnt = buf->resid;
308 fbcnt = bcnt;
309 f->bv = buf->bv;
310 f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
311 do {
312 if (fbcnt < buf->bv_resid) {
313 buf->bv_resid -= fbcnt;
314 buf->resid -= fbcnt;
315 break;
316 }
317 fbcnt -= buf->bv_resid;
318 buf->resid -= buf->bv_resid;
319 if (buf->resid == 0) {
320 d->ip.buf = NULL;
321 break;
322 }
323 buf->bv++;
324 buf->bv_resid = buf->bv->bv_len;
325 WARN_ON(buf->bv_resid == 0);
326 } while (fbcnt);
327
230 /* initialize the headers & frame */ 328 /* initialize the headers & frame */
231 skb = f->skb; 329 skb = f->skb;
232 h = (struct aoe_hdr *) skb_mac_header(skb); 330 h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -234,10 +332,10 @@ aoecmd_ata_rw(struct aoedev *d)
234 skb_put(skb, sizeof *h + sizeof *ah); 332 skb_put(skb, sizeof *h + sizeof *ah);
235 memset(h, 0, skb->len); 333 memset(h, 0, skb->len);
236 f->tag = aoehdr_atainit(d, t, h); 334 f->tag = aoehdr_atainit(d, t, h);
335 fhash(f);
237 t->nout++; 336 t->nout++;
238 f->waited = 0; 337 f->waited = 0;
239 f->buf = buf; 338 f->buf = buf;
240 f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
241 f->bcnt = bcnt; 339 f->bcnt = bcnt;
242 f->lba = buf->sector; 340 f->lba = buf->sector;
243 341
@@ -252,10 +350,11 @@ aoecmd_ata_rw(struct aoedev *d)
252 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ 350 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
253 } 351 }
254 if (bio_data_dir(buf->bio) == WRITE) { 352 if (bio_data_dir(buf->bio) == WRITE) {
255 skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt); 353 skb_fillup(skb, f->bv, f->bv_off, bcnt);
256 ah->aflags |= AOEAFL_WRITE; 354 ah->aflags |= AOEAFL_WRITE;
257 skb->len += bcnt; 355 skb->len += bcnt;
258 skb->data_len = bcnt; 356 skb->data_len = bcnt;
357 skb->truesize += bcnt;
259 t->wpkts++; 358 t->wpkts++;
260 } else { 359 } else {
261 t->rpkts++; 360 t->rpkts++;
@@ -266,23 +365,15 @@ aoecmd_ata_rw(struct aoedev *d)
266 365
267 /* mark all tracking fields and load out */ 366 /* mark all tracking fields and load out */
268 buf->nframesout += 1; 367 buf->nframesout += 1;
269 buf->bv_off += bcnt;
270 buf->bv_resid -= bcnt;
271 buf->resid -= bcnt;
272 buf->sector += bcnt >> 9; 368 buf->sector += bcnt >> 9;
273 if (buf->resid == 0) {
274 d->inprocess = NULL;
275 } else if (buf->bv_resid == 0) {
276 buf->bv = ++bv;
277 buf->bv_resid = bv->bv_len;
278 WARN_ON(buf->bv_resid == 0);
279 buf->bv_off = bv->bv_offset;
280 }
281 369
282 skb->dev = t->ifp->nd; 370 skb->dev = t->ifp->nd;
283 skb = skb_clone(skb, GFP_ATOMIC); 371 skb = skb_clone(skb, GFP_ATOMIC);
284 if (skb) 372 if (skb) {
285 __skb_queue_tail(&d->sendq, skb); 373 __skb_queue_head_init(&queue);
374 __skb_queue_tail(&queue, skb);
375 aoenet_xmit(&queue);
376 }
286 return 1; 377 return 1;
287} 378}
288 379
@@ -329,17 +420,25 @@ cont:
329} 420}
330 421
331static void 422static void
332resend(struct aoedev *d, struct aoetgt *t, struct frame *f) 423resend(struct aoedev *d, struct frame *f)
333{ 424{
334 struct sk_buff *skb; 425 struct sk_buff *skb;
426 struct sk_buff_head queue;
335 struct aoe_hdr *h; 427 struct aoe_hdr *h;
336 struct aoe_atahdr *ah; 428 struct aoe_atahdr *ah;
429 struct aoetgt *t;
337 char buf[128]; 430 char buf[128];
338 u32 n; 431 u32 n;
339 432
340 ifrotate(t); 433 t = f->t;
341 n = newtag(t); 434 n = newtag(d);
342 skb = f->skb; 435 skb = f->skb;
436 if (ifrotate(t) == NULL) {
437 /* probably can't happen, but set it up to fail anyway */
438 pr_info("aoe: resend: no interfaces to rotate to.\n");
439 ktcomplete(f, NULL);
440 return;
441 }
343 h = (struct aoe_hdr *) skb_mac_header(skb); 442 h = (struct aoe_hdr *) skb_mac_header(skb);
344 ah = (struct aoe_atahdr *) (h+1); 443 ah = (struct aoe_atahdr *) (h+1);
345 444
@@ -350,39 +449,22 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
350 aoechr_error(buf); 449 aoechr_error(buf);
351 450
352 f->tag = n; 451 f->tag = n;
452 fhash(f);
353 h->tag = cpu_to_be32(n); 453 h->tag = cpu_to_be32(n);
354 memcpy(h->dst, t->addr, sizeof h->dst); 454 memcpy(h->dst, t->addr, sizeof h->dst);
355 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); 455 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
356 456
357 switch (ah->cmdstat) {
358 default:
359 break;
360 case ATA_CMD_PIO_READ:
361 case ATA_CMD_PIO_READ_EXT:
362 case ATA_CMD_PIO_WRITE:
363 case ATA_CMD_PIO_WRITE_EXT:
364 put_lba(ah, f->lba);
365
366 n = f->bcnt;
367 if (n > DEFAULTBCNT)
368 n = DEFAULTBCNT;
369 ah->scnt = n >> 9;
370 if (ah->aflags & AOEAFL_WRITE) {
371 skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
372 offset_in_page(f->bufaddr), n);
373 skb->len = sizeof *h + sizeof *ah + n;
374 skb->data_len = n;
375 }
376 }
377 skb->dev = t->ifp->nd; 457 skb->dev = t->ifp->nd;
378 skb = skb_clone(skb, GFP_ATOMIC); 458 skb = skb_clone(skb, GFP_ATOMIC);
379 if (skb == NULL) 459 if (skb == NULL)
380 return; 460 return;
381 __skb_queue_tail(&d->sendq, skb); 461 __skb_queue_head_init(&queue);
462 __skb_queue_tail(&queue, skb);
463 aoenet_xmit(&queue);
382} 464}
383 465
384static int 466static int
385tsince(int tag) 467tsince(u32 tag)
386{ 468{
387 int n; 469 int n;
388 470
@@ -406,58 +488,65 @@ getif(struct aoetgt *t, struct net_device *nd)
406 return NULL; 488 return NULL;
407} 489}
408 490
409static struct aoeif *
410addif(struct aoetgt *t, struct net_device *nd)
411{
412 struct aoeif *p;
413
414 p = getif(t, NULL);
415 if (!p)
416 return NULL;
417 p->nd = nd;
418 p->maxbcnt = DEFAULTBCNT;
419 p->lost = 0;
420 p->lostjumbo = 0;
421 return p;
422}
423
424static void 491static void
425ejectif(struct aoetgt *t, struct aoeif *ifp) 492ejectif(struct aoetgt *t, struct aoeif *ifp)
426{ 493{
427 struct aoeif *e; 494 struct aoeif *e;
495 struct net_device *nd;
428 ulong n; 496 ulong n;
429 497
498 nd = ifp->nd;
430 e = t->ifs + NAOEIFS - 1; 499 e = t->ifs + NAOEIFS - 1;
431 n = (e - ifp) * sizeof *ifp; 500 n = (e - ifp) * sizeof *ifp;
432 memmove(ifp, ifp+1, n); 501 memmove(ifp, ifp+1, n);
433 e->nd = NULL; 502 e->nd = NULL;
503 dev_put(nd);
434} 504}
435 505
436static int 506static int
437sthtith(struct aoedev *d) 507sthtith(struct aoedev *d)
438{ 508{
439 struct frame *f, *e, *nf; 509 struct frame *f, *nf;
510 struct list_head *nx, *pos, *head;
440 struct sk_buff *skb; 511 struct sk_buff *skb;
441 struct aoetgt *ht = *d->htgt; 512 struct aoetgt *ht = d->htgt;
442 513 int i;
443 f = ht->frames; 514
444 e = f + ht->nframes; 515 for (i = 0; i < NFACTIVE; i++) {
445 for (; f < e; f++) { 516 head = &d->factive[i];
446 if (f->tag == FREETAG) 517 list_for_each_safe(pos, nx, head) {
447 continue; 518 f = list_entry(pos, struct frame, head);
448 nf = freeframe(d); 519 if (f->t != ht)
449 if (!nf) 520 continue;
450 return 0; 521
451 skb = nf->skb; 522 nf = newframe(d);
452 *nf = *f; 523 if (!nf)
453 f->skb = skb; 524 return 0;
454 f->tag = FREETAG; 525
455 nf->waited = 0; 526 /* remove frame from active list */
456 ht->nout--; 527 list_del(pos);
457 (*d->tgt)->nout++; 528
458 resend(d, *d->tgt, nf); 529 /* reassign all pertinent bits to new outbound frame */
530 skb = nf->skb;
531 nf->skb = f->skb;
532 nf->buf = f->buf;
533 nf->bcnt = f->bcnt;
534 nf->lba = f->lba;
535 nf->bv = f->bv;
536 nf->bv_off = f->bv_off;
537 nf->waited = 0;
538 f->skb = skb;
539 aoe_freetframe(f);
540 ht->nout--;
541 nf->t->nout++;
542 resend(d, nf);
543 }
459 } 544 }
460 /* he's clean, he's useless. take away his interfaces */ 545 /* We've cleaned up the outstanding so take away his
546 * interfaces so he won't be used. We should remove him from
547 * the target array here, but cleaning up a target is
548 * involved. PUNT!
549 */
461 memset(ht->ifs, 0, sizeof ht->ifs); 550 memset(ht->ifs, 0, sizeof ht->ifs);
462 d->htgt = NULL; 551 d->htgt = NULL;
463 return 1; 552 return 1;
@@ -476,13 +565,15 @@ ata_scnt(unsigned char *packet) {
476static void 565static void
477rexmit_timer(ulong vp) 566rexmit_timer(ulong vp)
478{ 567{
479 struct sk_buff_head queue;
480 struct aoedev *d; 568 struct aoedev *d;
481 struct aoetgt *t, **tt, **te; 569 struct aoetgt *t, **tt, **te;
482 struct aoeif *ifp; 570 struct aoeif *ifp;
483 struct frame *f, *e; 571 struct frame *f;
572 struct list_head *head, *pos, *nx;
573 LIST_HEAD(flist);
484 register long timeout; 574 register long timeout;
485 ulong flags, n; 575 ulong flags, n;
576 int i;
486 577
487 d = (struct aoedev *) vp; 578 d = (struct aoedev *) vp;
488 579
@@ -496,58 +587,22 @@ rexmit_timer(ulong vp)
496 spin_unlock_irqrestore(&d->lock, flags); 587 spin_unlock_irqrestore(&d->lock, flags);
497 return; 588 return;
498 } 589 }
499 tt = d->targets;
500 te = tt + NTARGETS;
501 for (; tt < te && *tt; tt++) {
502 t = *tt;
503 f = t->frames;
504 e = f + t->nframes;
505 for (; f < e; f++) {
506 if (f->tag == FREETAG
507 || tsince(f->tag) < timeout)
508 continue;
509 n = f->waited += timeout;
510 n /= HZ;
511 if (n > aoe_deadsecs) {
512 /* waited too long. device failure. */
513 aoedev_downdev(d);
514 break;
515 }
516
517 if (n > HELPWAIT /* see if another target can help */
518 && (tt != d->targets || d->targets[1]))
519 d->htgt = tt;
520
521 if (t->nout == t->maxout) {
522 if (t->maxout > 1)
523 t->maxout--;
524 t->lastwadj = jiffies;
525 }
526
527 ifp = getif(t, f->skb->dev);
528 if (ifp && ++ifp->lost > (t->nframes << 1)
529 && (ifp != t->ifs || t->ifs[1].nd)) {
530 ejectif(t, ifp);
531 ifp = NULL;
532 }
533 590
534 if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512 591 /* collect all frames to rexmit into flist */
535 && ifp && ++ifp->lostjumbo > (t->nframes << 1) 592 for (i = 0; i < NFACTIVE; i++) {
536 && ifp->maxbcnt != DEFAULTBCNT) { 593 head = &d->factive[i];
537 printk(KERN_INFO 594 list_for_each_safe(pos, nx, head) {
538 "aoe: e%ld.%d: " 595 f = list_entry(pos, struct frame, head);
539 "too many lost jumbo on " 596 if (tsince(f->tag) < timeout)
540 "%s:%pm - " 597 break; /* end of expired frames */
541 "falling back to %d frames.\n", 598 /* move to flist for later processing */
542 d->aoemajor, d->aoeminor, 599 list_move_tail(pos, &flist);
543 ifp->nd->name, t->addr,
544 DEFAULTBCNT);
545 ifp->maxbcnt = 0;
546 }
547 resend(d, t, f);
548 } 600 }
549 601 }
550 /* window check */ 602 /* window check */
603 tt = d->targets;
604 te = tt + d->ntargets;
605 for (; tt < te && (t = *tt); tt++) {
551 if (t->nout == t->maxout 606 if (t->nout == t->maxout
552 && t->maxout < t->nframes 607 && t->maxout < t->nframes
553 && (jiffies - t->lastwadj)/HZ > 10) { 608 && (jiffies - t->lastwadj)/HZ > 10) {
@@ -556,45 +611,173 @@ rexmit_timer(ulong vp)
556 } 611 }
557 } 612 }
558 613
559 if (!skb_queue_empty(&d->sendq)) { 614 if (!list_empty(&flist)) { /* retransmissions necessary */
560 n = d->rttavg <<= 1; 615 n = d->rttavg <<= 1;
561 if (n > MAXTIMER) 616 if (n > MAXTIMER)
562 d->rttavg = MAXTIMER; 617 d->rttavg = MAXTIMER;
563 } 618 }
564 619
565 if (d->flags & DEVFL_KICKME || d->htgt) { 620 /* process expired frames */
566 d->flags &= ~DEVFL_KICKME; 621 while (!list_empty(&flist)) {
567 aoecmd_work(d); 622 pos = flist.next;
623 f = list_entry(pos, struct frame, head);
624 n = f->waited += timeout;
625 n /= HZ;
626 if (n > aoe_deadsecs) {
627 /* Waited too long. Device failure.
628 * Hang all frames on first hash bucket for downdev
629 * to clean up.
630 */
631 list_splice(&flist, &d->factive[0]);
632 aoedev_downdev(d);
633 break;
634 }
635 list_del(pos);
636
637 t = f->t;
638 if (n > aoe_deadsecs/2)
639 d->htgt = t; /* see if another target can help */
640
641 if (t->nout == t->maxout) {
642 if (t->maxout > 1)
643 t->maxout--;
644 t->lastwadj = jiffies;
645 }
646
647 ifp = getif(t, f->skb->dev);
648 if (ifp && ++ifp->lost > (t->nframes << 1)
649 && (ifp != t->ifs || t->ifs[1].nd)) {
650 ejectif(t, ifp);
651 ifp = NULL;
652 }
653 resend(d, f);
568 } 654 }
569 655
570 __skb_queue_head_init(&queue); 656 if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
571 skb_queue_splice_init(&d->sendq, &queue); 657 d->flags &= ~DEVFL_KICKME;
658 d->blkq->request_fn(d->blkq);
659 }
572 660
573 d->timer.expires = jiffies + TIMERTICK; 661 d->timer.expires = jiffies + TIMERTICK;
574 add_timer(&d->timer); 662 add_timer(&d->timer);
575 663
576 spin_unlock_irqrestore(&d->lock, flags); 664 spin_unlock_irqrestore(&d->lock, flags);
665}
577 666
578 aoenet_xmit(&queue); 667static unsigned long
668rqbiocnt(struct request *r)
669{
670 struct bio *bio;
671 unsigned long n = 0;
672
673 __rq_for_each_bio(bio, r)
674 n++;
675 return n;
676}
677
678/* This can be removed if we are certain that no users of the block
679 * layer will ever use zero-count pages in bios. Otherwise we have to
680 * protect against the put_page sometimes done by the network layer.
681 *
682 * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
683 * discussion.
684 *
685 * We cannot use get_page in the workaround, because it insists on a
686 * positive page count as a precondition. So we use _count directly.
687 */
688static void
689bio_pageinc(struct bio *bio)
690{
691 struct bio_vec *bv;
692 struct page *page;
693 int i;
694
695 bio_for_each_segment(bv, bio, i) {
696 page = bv->bv_page;
697 /* Non-zero page count for non-head members of
698 * compound pages is no longer allowed by the kernel,
699 * but this has never been seen here.
700 */
701 if (unlikely(PageCompound(page)))
702 if (compound_trans_head(page) != page) {
703 pr_crit("page tail used for block I/O\n");
704 BUG();
705 }
706 atomic_inc(&page->_count);
707 }
708}
709
710static void
711bio_pagedec(struct bio *bio)
712{
713 struct bio_vec *bv;
714 int i;
715
716 bio_for_each_segment(bv, bio, i)
717 atomic_dec(&bv->bv_page->_count);
718}
719
720static void
721bufinit(struct buf *buf, struct request *rq, struct bio *bio)
722{
723 struct bio_vec *bv;
724
725 memset(buf, 0, sizeof(*buf));
726 buf->rq = rq;
727 buf->bio = bio;
728 buf->resid = bio->bi_size;
729 buf->sector = bio->bi_sector;
730 bio_pageinc(bio);
731 buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
732 buf->bv_resid = bv->bv_len;
733 WARN_ON(buf->bv_resid == 0);
734}
735
736static struct buf *
737nextbuf(struct aoedev *d)
738{
739 struct request *rq;
740 struct request_queue *q;
741 struct buf *buf;
742 struct bio *bio;
743
744 q = d->blkq;
745 if (q == NULL)
746 return NULL; /* initializing */
747 if (d->ip.buf)
748 return d->ip.buf;
749 rq = d->ip.rq;
750 if (rq == NULL) {
751 rq = blk_peek_request(q);
752 if (rq == NULL)
753 return NULL;
754 blk_start_request(rq);
755 d->ip.rq = rq;
756 d->ip.nxbio = rq->bio;
757 rq->special = (void *) rqbiocnt(rq);
758 }
759 buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
760 if (buf == NULL) {
761 pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
762 return NULL;
763 }
764 bio = d->ip.nxbio;
765 bufinit(buf, rq, bio);
766 bio = bio->bi_next;
767 d->ip.nxbio = bio;
768 if (bio == NULL)
769 d->ip.rq = NULL;
770 return d->ip.buf = buf;
579} 771}
580 772
581/* enters with d->lock held */ 773/* enters with d->lock held */
582void 774void
583aoecmd_work(struct aoedev *d) 775aoecmd_work(struct aoedev *d)
584{ 776{
585 struct buf *buf;
586loop:
587 if (d->htgt && !sthtith(d)) 777 if (d->htgt && !sthtith(d))
588 return; 778 return;
589 if (d->inprocess == NULL) { 779 while (aoecmd_ata_rw(d))
590 if (list_empty(&d->bufq)) 780 ;
591 return;
592 buf = container_of(d->bufq.next, struct buf, bufs);
593 list_del(d->bufq.next);
594 d->inprocess = buf;
595 }
596 if (aoecmd_ata_rw(d))
597 goto loop;
598} 781}
599 782
600/* this function performs work that has been deferred until sleeping is OK 783/* this function performs work that has been deferred until sleeping is OK
@@ -603,28 +786,25 @@ void
603aoecmd_sleepwork(struct work_struct *work) 786aoecmd_sleepwork(struct work_struct *work)
604{ 787{
605 struct aoedev *d = container_of(work, struct aoedev, work); 788 struct aoedev *d = container_of(work, struct aoedev, work);
789 struct block_device *bd;
790 u64 ssize;
606 791
607 if (d->flags & DEVFL_GDALLOC) 792 if (d->flags & DEVFL_GDALLOC)
608 aoeblk_gdalloc(d); 793 aoeblk_gdalloc(d);
609 794
610 if (d->flags & DEVFL_NEWSIZE) { 795 if (d->flags & DEVFL_NEWSIZE) {
611 struct block_device *bd;
612 unsigned long flags;
613 u64 ssize;
614
615 ssize = get_capacity(d->gd); 796 ssize = get_capacity(d->gd);
616 bd = bdget_disk(d->gd, 0); 797 bd = bdget_disk(d->gd, 0);
617
618 if (bd) { 798 if (bd) {
619 mutex_lock(&bd->bd_inode->i_mutex); 799 mutex_lock(&bd->bd_inode->i_mutex);
620 i_size_write(bd->bd_inode, (loff_t)ssize<<9); 800 i_size_write(bd->bd_inode, (loff_t)ssize<<9);
621 mutex_unlock(&bd->bd_inode->i_mutex); 801 mutex_unlock(&bd->bd_inode->i_mutex);
622 bdput(bd); 802 bdput(bd);
623 } 803 }
624 spin_lock_irqsave(&d->lock, flags); 804 spin_lock_irq(&d->lock);
625 d->flags |= DEVFL_UP; 805 d->flags |= DEVFL_UP;
626 d->flags &= ~DEVFL_NEWSIZE; 806 d->flags &= ~DEVFL_NEWSIZE;
627 spin_unlock_irqrestore(&d->lock, flags); 807 spin_unlock_irq(&d->lock);
628 } 808 }
629} 809}
630 810
@@ -717,163 +897,299 @@ gettgt(struct aoedev *d, char *addr)
717 return NULL; 897 return NULL;
718} 898}
719 899
720static inline void 900static void
721diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector) 901bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
902{
903 ulong fcnt;
904 char *p;
905 int soff = 0;
906loop:
907 fcnt = bv->bv_len - (off - bv->bv_offset);
908 if (fcnt > cnt)
909 fcnt = cnt;
910 p = page_address(bv->bv_page) + off;
911 skb_copy_bits(skb, soff, p, fcnt);
912 soff += fcnt;
913 cnt -= fcnt;
914 if (cnt <= 0)
915 return;
916 bv++;
917 off = bv->bv_offset;
918 goto loop;
919}
920
921void
922aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
923{
924 struct bio *bio;
925 int bok;
926 struct request_queue *q;
927
928 q = d->blkq;
929 if (rq == d->ip.rq)
930 d->ip.rq = NULL;
931 do {
932 bio = rq->bio;
933 bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
934 } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
935
936 /* cf. http://lkml.org/lkml/2006/10/31/28 */
937 if (!fastfail)
938 q->request_fn(q);
939}
940
941static void
942aoe_end_buf(struct aoedev *d, struct buf *buf)
943{
944 struct request *rq;
945 unsigned long n;
946
947 if (buf == d->ip.buf)
948 d->ip.buf = NULL;
949 rq = buf->rq;
950 bio_pagedec(buf->bio);
951 mempool_free(buf, d->bufpool);
952 n = (unsigned long) rq->special;
953 rq->special = (void *) --n;
954 if (n == 0)
955 aoe_end_request(d, rq, 0);
956}
957
958static void
959ktiocomplete(struct frame *f)
722{ 960{
723 unsigned long n_sect = bio->bi_size >> 9; 961 struct aoe_hdr *hin, *hout;
724 const int rw = bio_data_dir(bio); 962 struct aoe_atahdr *ahin, *ahout;
725 struct hd_struct *part; 963 struct buf *buf;
726 int cpu; 964 struct sk_buff *skb;
965 struct aoetgt *t;
966 struct aoeif *ifp;
967 struct aoedev *d;
968 long n;
969
970 if (f == NULL)
971 return;
972
973 t = f->t;
974 d = t->d;
975
976 hout = (struct aoe_hdr *) skb_mac_header(f->skb);
977 ahout = (struct aoe_atahdr *) (hout+1);
978 buf = f->buf;
979 skb = f->r_skb;
980 if (skb == NULL)
981 goto noskb; /* just fail the buf. */
982
983 hin = (struct aoe_hdr *) skb->data;
984 skb_pull(skb, sizeof(*hin));
985 ahin = (struct aoe_atahdr *) skb->data;
986 skb_pull(skb, sizeof(*ahin));
987 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
988 pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
989 ahout->cmdstat, ahin->cmdstat,
990 d->aoemajor, d->aoeminor);
991noskb: if (buf)
992 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
993 goto badrsp;
994 }
727 995
728 cpu = part_stat_lock(); 996 n = ahout->scnt << 9;
729 part = disk_map_sector_rcu(disk, sector); 997 switch (ahout->cmdstat) {
998 case ATA_CMD_PIO_READ:
999 case ATA_CMD_PIO_READ_EXT:
1000 if (skb->len < n) {
1001 pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n",
1002 skb->len, n);
1003 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1004 break;
1005 }
1006 bvcpy(f->bv, f->bv_off, skb, n);
1007 case ATA_CMD_PIO_WRITE:
1008 case ATA_CMD_PIO_WRITE_EXT:
1009 spin_lock_irq(&d->lock);
1010 ifp = getif(t, skb->dev);
1011 if (ifp)
1012 ifp->lost = 0;
1013 if (d->htgt == t) /* I'll help myself, thank you. */
1014 d->htgt = NULL;
1015 spin_unlock_irq(&d->lock);
1016 break;
1017 case ATA_CMD_ID_ATA:
1018 if (skb->len < 512) {
1019 pr_info("aoe: runt data size in ataid. skb->len=%d\n",
1020 skb->len);
1021 break;
1022 }
1023 if (skb_linearize(skb))
1024 break;
1025 spin_lock_irq(&d->lock);
1026 ataid_complete(d, t, skb->data);
1027 spin_unlock_irq(&d->lock);
1028 break;
1029 default:
1030 pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
1031 ahout->cmdstat,
1032 be16_to_cpu(get_unaligned(&hin->major)),
1033 hin->minor);
1034 }
1035badrsp:
1036 spin_lock_irq(&d->lock);
1037
1038 aoe_freetframe(f);
1039
1040 if (buf && --buf->nframesout == 0 && buf->resid == 0)
1041 aoe_end_buf(d, buf);
1042
1043 aoecmd_work(d);
1044
1045 spin_unlock_irq(&d->lock);
1046 aoedev_put(d);
1047 dev_kfree_skb(skb);
1048}
1049
1050/* Enters with iocq.lock held.
1051 * Returns true iff responses needing processing remain.
1052 */
1053static int
1054ktio(void)
1055{
1056 struct frame *f;
1057 struct list_head *pos;
1058 int i;
730 1059
731 part_stat_inc(cpu, part, ios[rw]); 1060 for (i = 0; ; ++i) {
732 part_stat_add(cpu, part, ticks[rw], duration); 1061 if (i == MAXIOC)
733 part_stat_add(cpu, part, sectors[rw], n_sect); 1062 return 1;
734 part_stat_add(cpu, part, io_ticks, duration); 1063 if (list_empty(&iocq.head))
1064 return 0;
1065 pos = iocq.head.next;
1066 list_del(pos);
1067 spin_unlock_irq(&iocq.lock);
1068 f = list_entry(pos, struct frame, head);
1069 ktiocomplete(f);
1070 spin_lock_irq(&iocq.lock);
1071 }
1072}
735 1073
736 part_stat_unlock(); 1074static int
1075kthread(void *vp)
1076{
1077 struct ktstate *k;
1078 DECLARE_WAITQUEUE(wait, current);
1079 int more;
1080
1081 k = vp;
1082 current->flags |= PF_NOFREEZE;
1083 set_user_nice(current, -10);
1084 complete(&k->rendez); /* tell spawner we're running */
1085 do {
1086 spin_lock_irq(k->lock);
1087 more = k->fn();
1088 if (!more) {
1089 add_wait_queue(k->waitq, &wait);
1090 __set_current_state(TASK_INTERRUPTIBLE);
1091 }
1092 spin_unlock_irq(k->lock);
1093 if (!more) {
1094 schedule();
1095 remove_wait_queue(k->waitq, &wait);
1096 } else
1097 cond_resched();
1098 } while (!kthread_should_stop());
1099 complete(&k->rendez); /* tell spawner we're stopping */
1100 return 0;
737} 1101}
738 1102
739void 1103void
1104aoe_ktstop(struct ktstate *k)
1105{
1106 kthread_stop(k->task);
1107 wait_for_completion(&k->rendez);
1108}
1109
1110int
1111aoe_ktstart(struct ktstate *k)
1112{
1113 struct task_struct *task;
1114
1115 init_completion(&k->rendez);
1116 task = kthread_run(kthread, k, k->name);
1117 if (task == NULL || IS_ERR(task))
1118 return -ENOMEM;
1119 k->task = task;
1120 wait_for_completion(&k->rendez); /* allow kthread to start */
1121 init_completion(&k->rendez); /* for waiting for exit later */
1122 return 0;
1123}
1124
1125/* pass it off to kthreads for processing */
1126static void
1127ktcomplete(struct frame *f, struct sk_buff *skb)
1128{
1129 ulong flags;
1130
1131 f->r_skb = skb;
1132 spin_lock_irqsave(&iocq.lock, flags);
1133 list_add_tail(&f->head, &iocq.head);
1134 spin_unlock_irqrestore(&iocq.lock, flags);
1135 wake_up(&ktiowq);
1136}
1137
1138struct sk_buff *
740aoecmd_ata_rsp(struct sk_buff *skb) 1139aoecmd_ata_rsp(struct sk_buff *skb)
741{ 1140{
742 struct sk_buff_head queue;
743 struct aoedev *d; 1141 struct aoedev *d;
744 struct aoe_hdr *hin, *hout; 1142 struct aoe_hdr *h;
745 struct aoe_atahdr *ahin, *ahout;
746 struct frame *f; 1143 struct frame *f;
747 struct buf *buf;
748 struct aoetgt *t; 1144 struct aoetgt *t;
749 struct aoeif *ifp; 1145 u32 n;
750 register long n;
751 ulong flags; 1146 ulong flags;
752 char ebuf[128]; 1147 char ebuf[128];
753 u16 aoemajor; 1148 u16 aoemajor;
754 1149
755 hin = (struct aoe_hdr *) skb_mac_header(skb); 1150 h = (struct aoe_hdr *) skb->data;
756 aoemajor = get_unaligned_be16(&hin->major); 1151 aoemajor = be16_to_cpu(get_unaligned(&h->major));
757 d = aoedev_by_aoeaddr(aoemajor, hin->minor); 1152 d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
758 if (d == NULL) { 1153 if (d == NULL) {
759 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " 1154 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
760 "for unknown device %d.%d\n", 1155 "for unknown device %d.%d\n",
761 aoemajor, hin->minor); 1156 aoemajor, h->minor);
762 aoechr_error(ebuf); 1157 aoechr_error(ebuf);
763 return; 1158 return skb;
764 } 1159 }
765 1160
766 spin_lock_irqsave(&d->lock, flags); 1161 spin_lock_irqsave(&d->lock, flags);
767 1162
768 n = get_unaligned_be32(&hin->tag); 1163 n = be32_to_cpu(get_unaligned(&h->tag));
769 t = gettgt(d, hin->src); 1164 f = getframe(d, n);
770 if (t == NULL) {
771 printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
772 d->aoemajor, d->aoeminor, hin->src);
773 spin_unlock_irqrestore(&d->lock, flags);
774 return;
775 }
776 f = getframe(t, n);
777 if (f == NULL) { 1165 if (f == NULL) {
778 calc_rttavg(d, -tsince(n)); 1166 calc_rttavg(d, -tsince(n));
779 spin_unlock_irqrestore(&d->lock, flags); 1167 spin_unlock_irqrestore(&d->lock, flags);
1168 aoedev_put(d);
780 snprintf(ebuf, sizeof ebuf, 1169 snprintf(ebuf, sizeof ebuf,
781 "%15s e%d.%d tag=%08x@%08lx\n", 1170 "%15s e%d.%d tag=%08x@%08lx\n",
782 "unexpected rsp", 1171 "unexpected rsp",
783 get_unaligned_be16(&hin->major), 1172 get_unaligned_be16(&h->major),
784 hin->minor, 1173 h->minor,
785 get_unaligned_be32(&hin->tag), 1174 get_unaligned_be32(&h->tag),
786 jiffies); 1175 jiffies);
787 aoechr_error(ebuf); 1176 aoechr_error(ebuf);
788 return; 1177 return skb;
789 } 1178 }
790 1179 t = f->t;
791 calc_rttavg(d, tsince(f->tag)); 1180 calc_rttavg(d, tsince(f->tag));
792
793 ahin = (struct aoe_atahdr *) (hin+1);
794 hout = (struct aoe_hdr *) skb_mac_header(f->skb);
795 ahout = (struct aoe_atahdr *) (hout+1);
796 buf = f->buf;
797
798 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
799 printk(KERN_ERR
800 "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
801 ahout->cmdstat, ahin->cmdstat,
802 d->aoemajor, d->aoeminor);
803 if (buf)
804 buf->flags |= BUFFL_FAIL;
805 } else {
806 if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
807 d->htgt = NULL;
808 n = ahout->scnt << 9;
809 switch (ahout->cmdstat) {
810 case ATA_CMD_PIO_READ:
811 case ATA_CMD_PIO_READ_EXT:
812 if (skb->len - sizeof *hin - sizeof *ahin < n) {
813 printk(KERN_ERR
814 "aoe: %s. skb->len=%d need=%ld\n",
815 "runt data size in read", skb->len, n);
816 /* fail frame f? just returning will rexmit. */
817 spin_unlock_irqrestore(&d->lock, flags);
818 return;
819 }
820 memcpy(f->bufaddr, ahin+1, n);
821 case ATA_CMD_PIO_WRITE:
822 case ATA_CMD_PIO_WRITE_EXT:
823 ifp = getif(t, skb->dev);
824 if (ifp) {
825 ifp->lost = 0;
826 if (n > DEFAULTBCNT)
827 ifp->lostjumbo = 0;
828 }
829 if (f->bcnt -= n) {
830 f->lba += n >> 9;
831 f->bufaddr += n;
832 resend(d, t, f);
833 goto xmit;
834 }
835 break;
836 case ATA_CMD_ID_ATA:
837 if (skb->len - sizeof *hin - sizeof *ahin < 512) {
838 printk(KERN_INFO
839 "aoe: runt data size in ataid. skb->len=%d\n",
840 skb->len);
841 spin_unlock_irqrestore(&d->lock, flags);
842 return;
843 }
844 ataid_complete(d, t, (char *) (ahin+1));
845 break;
846 default:
847 printk(KERN_INFO
848 "aoe: unrecognized ata command %2.2Xh for %d.%d\n",
849 ahout->cmdstat,
850 get_unaligned_be16(&hin->major),
851 hin->minor);
852 }
853 }
854
855 if (buf && --buf->nframesout == 0 && buf->resid == 0) {
856 diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
857 if (buf->flags & BUFFL_FAIL)
858 bio_endio(buf->bio, -EIO);
859 else {
860 bio_flush_dcache_pages(buf->bio);
861 bio_endio(buf->bio, 0);
862 }
863 mempool_free(buf, d->bufpool);
864 }
865
866 f->buf = NULL;
867 f->tag = FREETAG;
868 t->nout--; 1181 t->nout--;
869
870 aoecmd_work(d); 1182 aoecmd_work(d);
871xmit:
872 __skb_queue_head_init(&queue);
873 skb_queue_splice_init(&d->sendq, &queue);
874 1183
875 spin_unlock_irqrestore(&d->lock, flags); 1184 spin_unlock_irqrestore(&d->lock, flags);
876 aoenet_xmit(&queue); 1185
1186 ktcomplete(f, skb);
1187
1188 /*
1189 * Note here that we do not perform an aoedev_put, as we are
1190 * leaving this reference for the ktio to release.
1191 */
1192 return NULL;
877} 1193}
878 1194
879void 1195void
@@ -895,7 +1211,7 @@ aoecmd_ata_id(struct aoedev *d)
895 struct sk_buff *skb; 1211 struct sk_buff *skb;
896 struct aoetgt *t; 1212 struct aoetgt *t;
897 1213
898 f = freeframe(d); 1214 f = newframe(d);
899 if (f == NULL) 1215 if (f == NULL)
900 return NULL; 1216 return NULL;
901 1217
@@ -908,6 +1224,7 @@ aoecmd_ata_id(struct aoedev *d)
908 skb_put(skb, sizeof *h + sizeof *ah); 1224 skb_put(skb, sizeof *h + sizeof *ah);
909 memset(h, 0, skb->len); 1225 memset(h, 0, skb->len);
910 f->tag = aoehdr_atainit(d, t, h); 1226 f->tag = aoehdr_atainit(d, t, h);
1227 fhash(f);
911 t->nout++; 1228 t->nout++;
912 f->waited = 0; 1229 f->waited = 0;
913 1230
@@ -928,7 +1245,6 @@ static struct aoetgt *
928addtgt(struct aoedev *d, char *addr, ulong nframes) 1245addtgt(struct aoedev *d, char *addr, ulong nframes)
929{ 1246{
930 struct aoetgt *t, **tt, **te; 1247 struct aoetgt *t, **tt, **te;
931 struct frame *f, *e;
932 1248
933 tt = d->targets; 1249 tt = d->targets;
934 te = tt + NTARGETS; 1250 te = tt + NTARGETS;
@@ -940,26 +1256,73 @@ addtgt(struct aoedev *d, char *addr, ulong nframes)
940 "aoe: device addtgt failure; too many targets\n"); 1256 "aoe: device addtgt failure; too many targets\n");
941 return NULL; 1257 return NULL;
942 } 1258 }
943 t = kcalloc(1, sizeof *t, GFP_ATOMIC); 1259 t = kzalloc(sizeof(*t), GFP_ATOMIC);
944 f = kcalloc(nframes, sizeof *f, GFP_ATOMIC); 1260 if (!t) {
945 if (!t || !f) {
946 kfree(f);
947 kfree(t);
948 printk(KERN_INFO "aoe: cannot allocate memory to add target\n"); 1261 printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
949 return NULL; 1262 return NULL;
950 } 1263 }
951 1264
1265 d->ntargets++;
952 t->nframes = nframes; 1266 t->nframes = nframes;
953 t->frames = f; 1267 t->d = d;
954 e = f + nframes;
955 for (; f < e; f++)
956 f->tag = FREETAG;
957 memcpy(t->addr, addr, sizeof t->addr); 1268 memcpy(t->addr, addr, sizeof t->addr);
958 t->ifp = t->ifs; 1269 t->ifp = t->ifs;
959 t->maxout = t->nframes; 1270 t->maxout = t->nframes;
1271 INIT_LIST_HEAD(&t->ffree);
960 return *tt = t; 1272 return *tt = t;
961} 1273}
962 1274
1275static void
1276setdbcnt(struct aoedev *d)
1277{
1278 struct aoetgt **t, **e;
1279 int bcnt = 0;
1280
1281 t = d->targets;
1282 e = t + NTARGETS;
1283 for (; t < e && *t; t++)
1284 if (bcnt == 0 || bcnt > (*t)->minbcnt)
1285 bcnt = (*t)->minbcnt;
1286 if (bcnt != d->maxbcnt) {
1287 d->maxbcnt = bcnt;
1288 pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
1289 d->aoemajor, d->aoeminor, bcnt);
1290 }
1291}
1292
1293static void
1294setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
1295{
1296 struct aoedev *d;
1297 struct aoeif *p, *e;
1298 int minbcnt;
1299
1300 d = t->d;
1301 minbcnt = bcnt;
1302 p = t->ifs;
1303 e = p + NAOEIFS;
1304 for (; p < e; p++) {
1305 if (p->nd == NULL)
1306 break; /* end of the valid interfaces */
1307 if (p->nd == nd) {
1308 p->bcnt = bcnt; /* we're updating */
1309 nd = NULL;
1310 } else if (minbcnt > p->bcnt)
1311 minbcnt = p->bcnt; /* find the min interface */
1312 }
1313 if (nd) {
1314 if (p == e) {
1315 pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
1316 return;
1317 }
1318 dev_hold(nd);
1319 p->nd = nd;
1320 p->bcnt = bcnt;
1321 }
1322 t->minbcnt = minbcnt;
1323 setdbcnt(d);
1324}
1325
963void 1326void
964aoecmd_cfg_rsp(struct sk_buff *skb) 1327aoecmd_cfg_rsp(struct sk_buff *skb)
965{ 1328{
@@ -967,11 +1330,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
967 struct aoe_hdr *h; 1330 struct aoe_hdr *h;
968 struct aoe_cfghdr *ch; 1331 struct aoe_cfghdr *ch;
969 struct aoetgt *t; 1332 struct aoetgt *t;
970 struct aoeif *ifp; 1333 ulong flags, aoemajor;
971 ulong flags, sysminor, aoemajor;
972 struct sk_buff *sl; 1334 struct sk_buff *sl;
1335 struct sk_buff_head queue;
973 u16 n; 1336 u16 n;
974 1337
1338 sl = NULL;
975 h = (struct aoe_hdr *) skb_mac_header(skb); 1339 h = (struct aoe_hdr *) skb_mac_header(skb);
976 ch = (struct aoe_cfghdr *) (h+1); 1340 ch = (struct aoe_cfghdr *) (h+1);
977 1341
@@ -985,10 +1349,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
985 "Check shelf dip switches.\n"); 1349 "Check shelf dip switches.\n");
986 return; 1350 return;
987 } 1351 }
988 1352 if (aoemajor == 0xffff) {
989 sysminor = SYSMINOR(aoemajor, h->minor); 1353 pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
990 if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) { 1354 aoemajor, (int) h->minor);
991 printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n", 1355 return;
1356 }
1357 if (h->minor == 0xff) {
1358 pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
992 aoemajor, (int) h->minor); 1359 aoemajor, (int) h->minor);
993 return; 1360 return;
994 } 1361 }
@@ -997,9 +1364,9 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
997 if (n > aoe_maxout) /* keep it reasonable */ 1364 if (n > aoe_maxout) /* keep it reasonable */
998 n = aoe_maxout; 1365 n = aoe_maxout;
999 1366
1000 d = aoedev_by_sysminor_m(sysminor); 1367 d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
1001 if (d == NULL) { 1368 if (d == NULL) {
1002 printk(KERN_INFO "aoe: device sysminor_m failure\n"); 1369 pr_info("aoe: device allocation failure\n");
1003 return; 1370 return;
1004 } 1371 }
1005 1372
@@ -1008,52 +1375,26 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
1008 t = gettgt(d, h->src); 1375 t = gettgt(d, h->src);
1009 if (!t) { 1376 if (!t) {
1010 t = addtgt(d, h->src, n); 1377 t = addtgt(d, h->src, n);
1011 if (!t) { 1378 if (!t)
1012 spin_unlock_irqrestore(&d->lock, flags); 1379 goto bail;
1013 return;
1014 }
1015 }
1016 ifp = getif(t, skb->dev);
1017 if (!ifp) {
1018 ifp = addif(t, skb->dev);
1019 if (!ifp) {
1020 printk(KERN_INFO
1021 "aoe: device addif failure; "
1022 "too many interfaces?\n");
1023 spin_unlock_irqrestore(&d->lock, flags);
1024 return;
1025 }
1026 }
1027 if (ifp->maxbcnt) {
1028 n = ifp->nd->mtu;
1029 n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
1030 n /= 512;
1031 if (n > ch->scnt)
1032 n = ch->scnt;
1033 n = n ? n * 512 : DEFAULTBCNT;
1034 if (n != ifp->maxbcnt) {
1035 printk(KERN_INFO
1036 "aoe: e%ld.%d: setting %d%s%s:%pm\n",
1037 d->aoemajor, d->aoeminor, n,
1038 " byte data frames on ", ifp->nd->name,
1039 t->addr);
1040 ifp->maxbcnt = n;
1041 }
1042 } 1380 }
1381 n = skb->dev->mtu;
1382 n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
1383 n /= 512;
1384 if (n > ch->scnt)
1385 n = ch->scnt;
1386 n = n ? n * 512 : DEFAULTBCNT;
1387 setifbcnt(t, skb->dev, n);
1043 1388
1044 /* don't change users' perspective */ 1389 /* don't change users' perspective */
1045 if (d->nopen) { 1390 if (d->nopen == 0) {
1046 spin_unlock_irqrestore(&d->lock, flags); 1391 d->fw_ver = be16_to_cpu(ch->fwver);
1047 return; 1392 sl = aoecmd_ata_id(d);
1048 } 1393 }
1049 d->fw_ver = be16_to_cpu(ch->fwver); 1394bail:
1050
1051 sl = aoecmd_ata_id(d);
1052
1053 spin_unlock_irqrestore(&d->lock, flags); 1395 spin_unlock_irqrestore(&d->lock, flags);
1054 1396 aoedev_put(d);
1055 if (sl) { 1397 if (sl) {
1056 struct sk_buff_head queue;
1057 __skb_queue_head_init(&queue); 1398 __skb_queue_head_init(&queue);
1058 __skb_queue_tail(&queue, sl); 1399 __skb_queue_tail(&queue, sl);
1059 aoenet_xmit(&queue); 1400 aoenet_xmit(&queue);
@@ -1064,20 +1405,74 @@ void
1064aoecmd_cleanslate(struct aoedev *d) 1405aoecmd_cleanslate(struct aoedev *d)
1065{ 1406{
1066 struct aoetgt **t, **te; 1407 struct aoetgt **t, **te;
1067 struct aoeif *p, *e;
1068 1408
1069 d->mintimer = MINTIMER; 1409 d->mintimer = MINTIMER;
1410 d->maxbcnt = 0;
1070 1411
1071 t = d->targets; 1412 t = d->targets;
1072 te = t + NTARGETS; 1413 te = t + NTARGETS;
1073 for (; t < te && *t; t++) { 1414 for (; t < te && *t; t++)
1074 (*t)->maxout = (*t)->nframes; 1415 (*t)->maxout = (*t)->nframes;
1075 p = (*t)->ifs; 1416}
1076 e = p + NAOEIFS; 1417
1077 for (; p < e; p++) { 1418void
1078 p->lostjumbo = 0; 1419aoe_failbuf(struct aoedev *d, struct buf *buf)
1079 p->lost = 0; 1420{
1080 p->maxbcnt = DEFAULTBCNT; 1421 if (buf == NULL)
1422 return;
1423 buf->resid = 0;
1424 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1425 if (buf->nframesout == 0)
1426 aoe_end_buf(d, buf);
1427}
1428
1429void
1430aoe_flush_iocq(void)
1431{
1432 struct frame *f;
1433 struct aoedev *d;
1434 LIST_HEAD(flist);
1435 struct list_head *pos;
1436 struct sk_buff *skb;
1437 ulong flags;
1438
1439 spin_lock_irqsave(&iocq.lock, flags);
1440 list_splice_init(&iocq.head, &flist);
1441 spin_unlock_irqrestore(&iocq.lock, flags);
1442 while (!list_empty(&flist)) {
1443 pos = flist.next;
1444 list_del(pos);
1445 f = list_entry(pos, struct frame, head);
1446 d = f->t->d;
1447 skb = f->r_skb;
1448 spin_lock_irqsave(&d->lock, flags);
1449 if (f->buf) {
1450 f->buf->nframesout--;
1451 aoe_failbuf(d, f->buf);
1081 } 1452 }
1453 aoe_freetframe(f);
1454 spin_unlock_irqrestore(&d->lock, flags);
1455 dev_kfree_skb(skb);
1456 aoedev_put(d);
1082 } 1457 }
1083} 1458}
1459
1460int __init
1461aoecmd_init(void)
1462{
1463 INIT_LIST_HEAD(&iocq.head);
1464 spin_lock_init(&iocq.lock);
1465 init_waitqueue_head(&ktiowq);
1466 kts.name = "aoe_ktio";
1467 kts.fn = ktio;
1468 kts.waitq = &ktiowq;
1469 kts.lock = &iocq.lock;
1470 return aoe_ktstart(&kts);
1471}
1472
1473void
1474aoecmd_exit(void)
1475{
1476 aoe_ktstop(&kts);
1477 aoe_flush_iocq();
1478}
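The aoecmd.c hunks above stop completing I/O directly from the network receive path: aoecmd_ata_rsp() now only matches the incoming skb to its frame and hands the frame off via ktcomplete(), while aoecmd_init()/aoecmd_exit() start and stop the "aoe_ktio" kernel thread that drains the iocq list. A minimal sketch of the underlying pattern (hypothetical names; the driver keeps this state in struct ktstate and uses aoe_ktstart()/aoe_ktstop(), which are not shown in these hunks):

    /* Sketch only: a locked list fed by producers and drained by one kthread. */
    #include <linux/kthread.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/wait.h>

    struct work_item {
            struct list_head head;
    };

    static LIST_HEAD(workq);                        /* stands in for iocq.head */
    static DEFINE_SPINLOCK(workq_lock);             /* stands in for iocq.lock */
    static DECLARE_WAIT_QUEUE_HEAD(workq_wait);     /* stands in for ktiowq */

    static int worker(void *unused)                 /* stands in for ktio() */
    {
            struct work_item *w;
            unsigned long flags;

            while (!kthread_should_stop()) {
                    wait_event_interruptible(workq_wait,
                            !list_empty(&workq) || kthread_should_stop());
                    spin_lock_irqsave(&workq_lock, flags);
                    while (!list_empty(&workq)) {
                            w = list_first_entry(&workq, struct work_item, head);
                            list_del(&w->head);
                            spin_unlock_irqrestore(&workq_lock, flags);
                            /* handle w here, outside the lock */
                            spin_lock_irqsave(&workq_lock, flags);
                    }
                    spin_unlock_irqrestore(&workq_lock, flags);
            }
            return 0;
    }

Producers take the lock, append their item, drop the lock and wake the wait queue. The device reference taken in the response path is only dropped once the thread has processed the frame, which is what the comment before "return NULL" above refers to.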
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 6b5110a47458..90e5b537f94b 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoedev.c 3 * aoedev.c
4 * AoE device utility functions; maintains device list. 4 * AoE device utility functions; maintains device list.
@@ -9,6 +9,9 @@
9#include <linux/netdevice.h> 9#include <linux/netdevice.h>
10#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/bitmap.h>
13#include <linux/kdev_t.h>
14#include <linux/moduleparam.h>
12#include "aoe.h" 15#include "aoe.h"
13 16
14static void dummy_timer(ulong); 17static void dummy_timer(ulong);
@@ -16,23 +19,121 @@ static void aoedev_freedev(struct aoedev *);
16static void freetgt(struct aoedev *d, struct aoetgt *t); 19static void freetgt(struct aoedev *d, struct aoetgt *t);
17static void skbpoolfree(struct aoedev *d); 20static void skbpoolfree(struct aoedev *d);
18 21
22static int aoe_dyndevs = 1;
23module_param(aoe_dyndevs, int, 0644);
24MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
25
19static struct aoedev *devlist; 26static struct aoedev *devlist;
20static DEFINE_SPINLOCK(devlist_lock); 27static DEFINE_SPINLOCK(devlist_lock);
21 28
22struct aoedev * 29/* Because some systems will have one, many, or no
23aoedev_by_aoeaddr(int maj, int min) 30 * - partitions,
31 * - slots per shelf,
32 * - or shelves,
33 * we need some flexibility in the way the minor numbers
34 * are allocated. So they are dynamic.
35 */
36#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
37
38static DEFINE_SPINLOCK(used_minors_lock);
39static DECLARE_BITMAP(used_minors, N_DEVS);
40
41static int
42minor_get_dyn(ulong *sysminor)
24{ 43{
25 struct aoedev *d;
26 ulong flags; 44 ulong flags;
45 ulong n;
46 int error = 0;
47
48 spin_lock_irqsave(&used_minors_lock, flags);
49 n = find_first_zero_bit(used_minors, N_DEVS);
50 if (n < N_DEVS)
51 set_bit(n, used_minors);
52 else
53 error = -1;
54 spin_unlock_irqrestore(&used_minors_lock, flags);
55
56 *sysminor = n * AOE_PARTITIONS;
57 return error;
58}
27 59
28 spin_lock_irqsave(&devlist_lock, flags); 60static int
61minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
62{
63 ulong flags;
64 ulong n;
65 int error = 0;
66 enum {
67 /* for backwards compatibility when !aoe_dyndevs,
68 * a static number of supported slots per shelf */
69 NPERSHELF = 16,
70 };
71
72 n = aoemaj * NPERSHELF + aoemin;
73 if (aoemin >= NPERSHELF || n >= N_DEVS) {
74 pr_err("aoe: %s with e%ld.%d\n",
75 "cannot use static minor device numbers",
76 aoemaj, aoemin);
77 error = -1;
78 } else {
79 spin_lock_irqsave(&used_minors_lock, flags);
80 if (test_bit(n, used_minors)) {
81 pr_err("aoe: %s %lu\n",
82 "existing device already has static minor number",
83 n);
84 error = -1;
85 } else
86 set_bit(n, used_minors);
87 spin_unlock_irqrestore(&used_minors_lock, flags);
88 }
29 89
30 for (d=devlist; d; d=d->next) 90 *sysminor = n;
31 if (d->aoemajor == maj && d->aoeminor == min) 91 return error;
32 break; 92}
93
94static int
95minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
96{
97 if (aoe_dyndevs)
98 return minor_get_dyn(sysminor);
99 else
100 return minor_get_static(sysminor, aoemaj, aoemin);
101}
102
103static void
104minor_free(ulong minor)
105{
106 ulong flags;
107
108 minor /= AOE_PARTITIONS;
109 BUG_ON(minor >= N_DEVS);
110
111 spin_lock_irqsave(&used_minors_lock, flags);
112 BUG_ON(!test_bit(minor, used_minors));
113 clear_bit(minor, used_minors);
114 spin_unlock_irqrestore(&used_minors_lock, flags);
115}
116
117/*
118 * Users who grab a pointer to the device with aoedev_by_aoeaddr
119 * automatically get a reference count and must be responsible
120 * for performing a aoedev_put. With the addition of async
121 * kthread processing I'm no longer confident that we can
122 * guarantee consistency in the face of device flushes.
123 *
124 * For the time being, we only bother to add extra references for
125 * frames sitting on the iocq. When the kthreads finish processing
126 * these frames, they will aoedev_put the device.
127 */
128
129void
130aoedev_put(struct aoedev *d)
131{
132 ulong flags;
33 133
134 spin_lock_irqsave(&devlist_lock, flags);
135 d->ref--;
34 spin_unlock_irqrestore(&devlist_lock, flags); 136 spin_unlock_irqrestore(&devlist_lock, flags);
35 return d;
36} 137}
37 138
38static void 139static void
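The comment above spells out the new lookup contract: aoedev_by_aoeaddr() takes a reference (d->ref) on success, and the caller owes an aoedev_put(). A hedged sketch of a caller (illustrative only; the real callers in this series are aoecmd_cfg_rsp() and the response path, which hands its reference to the ktio thread instead of dropping it inline):

    /* Illustrative caller of the refcounted lookup added above. */
    static void example_use(ulong aoemajor, int aoeminor)
    {
            struct aoedev *d;
            ulong flags;

            d = aoedev_by_aoeaddr(aoemajor, aoeminor, 0);   /* 0: do not allocate */
            if (d == NULL)
                    return;
            spin_lock_irqsave(&d->lock, flags);
            /* ... inspect or update the device ... */
            spin_unlock_irqrestore(&d->lock, flags);
            aoedev_put(d);          /* drop the reference taken by the lookup */
    }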
@@ -47,54 +148,74 @@ dummy_timer(ulong vp)
47 add_timer(&d->timer); 148 add_timer(&d->timer);
48} 149}
49 150
151static void
152aoe_failip(struct aoedev *d)
153{
154 struct request *rq;
155 struct bio *bio;
156 unsigned long n;
157
158 aoe_failbuf(d, d->ip.buf);
159
160 rq = d->ip.rq;
161 if (rq == NULL)
162 return;
163 while ((bio = d->ip.nxbio)) {
164 clear_bit(BIO_UPTODATE, &bio->bi_flags);
165 d->ip.nxbio = bio->bi_next;
166 n = (unsigned long) rq->special;
167 rq->special = (void *) --n;
168 }
169 if ((unsigned long) rq->special == 0)
170 aoe_end_request(d, rq, 0);
171}
172
50void 173void
51aoedev_downdev(struct aoedev *d) 174aoedev_downdev(struct aoedev *d)
52{ 175{
53 struct aoetgt **t, **te; 176 struct aoetgt *t, **tt, **te;
54 struct frame *f, *e; 177 struct frame *f;
55 struct buf *buf; 178 struct list_head *head, *pos, *nx;
56 struct bio *bio; 179 struct request *rq;
180 int i;
57 181
58 t = d->targets; 182 d->flags &= ~DEVFL_UP;
59 te = t + NTARGETS; 183
60 for (; t < te && *t; t++) { 184 /* clean out active buffers */
61 f = (*t)->frames; 185 for (i = 0; i < NFACTIVE; i++) {
62 e = f + (*t)->nframes; 186 head = &d->factive[i];
63 for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) { 187 list_for_each_safe(pos, nx, head) {
64 if (f->tag == FREETAG || f->buf == NULL) 188 f = list_entry(pos, struct frame, head);
65 continue; 189 list_del(pos);
66 buf = f->buf; 190 if (f->buf) {
67 bio = buf->bio; 191 f->buf->nframesout--;
68 if (--buf->nframesout == 0 192 aoe_failbuf(d, f->buf);
69 && buf != d->inprocess) {
70 mempool_free(buf, d->bufpool);
71 bio_endio(bio, -EIO);
72 } 193 }
194 aoe_freetframe(f);
73 } 195 }
74 (*t)->maxout = (*t)->nframes;
75 (*t)->nout = 0;
76 } 196 }
77 buf = d->inprocess; 197 /* reset window dressings */
78 if (buf) { 198 tt = d->targets;
79 bio = buf->bio; 199 te = tt + NTARGETS;
80 mempool_free(buf, d->bufpool); 200 for (; tt < te && (t = *tt); tt++) {
81 bio_endio(bio, -EIO); 201 t->maxout = t->nframes;
202 t->nout = 0;
82 } 203 }
83 d->inprocess = NULL; 204
205 /* clean out the in-process request (if any) */
206 aoe_failip(d);
84 d->htgt = NULL; 207 d->htgt = NULL;
85 208
86 while (!list_empty(&d->bufq)) { 209 /* fast fail all pending I/O */
87 buf = container_of(d->bufq.next, struct buf, bufs); 210 if (d->blkq) {
88 list_del(d->bufq.next); 211 while ((rq = blk_peek_request(d->blkq))) {
89 bio = buf->bio; 212 blk_start_request(rq);
90 mempool_free(buf, d->bufpool); 213 aoe_end_request(d, rq, 1);
91 bio_endio(bio, -EIO); 214 }
92 } 215 }
93 216
94 if (d->gd) 217 if (d->gd)
95 set_capacity(d->gd, 0); 218 set_capacity(d->gd, 0);
96
97 d->flags &= ~DEVFL_UP;
98} 219}
99 220
100static void 221static void
@@ -107,6 +228,7 @@ aoedev_freedev(struct aoedev *d)
107 aoedisk_rm_sysfs(d); 228 aoedisk_rm_sysfs(d);
108 del_gendisk(d->gd); 229 del_gendisk(d->gd);
109 put_disk(d->gd); 230 put_disk(d->gd);
231 blk_cleanup_queue(d->blkq);
110 } 232 }
111 t = d->targets; 233 t = d->targets;
112 e = t + NTARGETS; 234 e = t + NTARGETS;
@@ -115,7 +237,7 @@ aoedev_freedev(struct aoedev *d)
115 if (d->bufpool) 237 if (d->bufpool)
116 mempool_destroy(d->bufpool); 238 mempool_destroy(d->bufpool);
117 skbpoolfree(d); 239 skbpoolfree(d);
118 blk_cleanup_queue(d->blkq); 240 minor_free(d->sysminor);
119 kfree(d); 241 kfree(d);
120} 242}
121 243
@@ -142,7 +264,8 @@ aoedev_flush(const char __user *str, size_t cnt)
142 spin_lock(&d->lock); 264 spin_lock(&d->lock);
143 if ((!all && (d->flags & DEVFL_UP)) 265 if ((!all && (d->flags & DEVFL_UP))
144 || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) 266 || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
145 || d->nopen) { 267 || d->nopen
268 || d->ref) {
146 spin_unlock(&d->lock); 269 spin_unlock(&d->lock);
147 dd = &d->next; 270 dd = &d->next;
148 continue; 271 continue;
@@ -163,12 +286,15 @@ aoedev_flush(const char __user *str, size_t cnt)
163 return 0; 286 return 0;
164} 287}
165 288
166/* I'm not really sure that this is a realistic problem, but if the 289/* This has been confirmed to occur once with Tms=3*1000 due to the
167network driver goes gonzo let's just leak memory after complaining. */ 290 * driver changing link and not processing its transmit ring. The
291 * problem is hard enough to solve by returning an error that I'm
292 * still punting on "solving" this.
293 */
168static void 294static void
169skbfree(struct sk_buff *skb) 295skbfree(struct sk_buff *skb)
170{ 296{
171 enum { Sms = 100, Tms = 3*1000}; 297 enum { Sms = 250, Tms = 30 * 1000};
172 int i = Tms / Sms; 298 int i = Tms / Sms;
173 299
174 if (skb == NULL) 300 if (skb == NULL)
@@ -182,6 +308,7 @@ skbfree(struct sk_buff *skb)
182 "cannot free skb -- memory leaked."); 308 "cannot free skb -- memory leaked.");
183 return; 309 return;
184 } 310 }
311 skb->truesize -= skb->data_len;
185 skb_shinfo(skb)->nr_frags = skb->data_len = 0; 312 skb_shinfo(skb)->nr_frags = skb->data_len = 0;
186 skb_trim(skb, 0); 313 skb_trim(skb, 0);
187 dev_kfree_skb(skb); 314 dev_kfree_skb(skb);
@@ -198,26 +325,29 @@ skbpoolfree(struct aoedev *d)
198 __skb_queue_head_init(&d->skbpool); 325 __skb_queue_head_init(&d->skbpool);
199} 326}
200 327
201/* find it or malloc it */ 328/* find it or allocate it */
202struct aoedev * 329struct aoedev *
203aoedev_by_sysminor_m(ulong sysminor) 330aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
204{ 331{
205 struct aoedev *d; 332 struct aoedev *d;
333 int i;
206 ulong flags; 334 ulong flags;
335 ulong sysminor;
207 336
208 spin_lock_irqsave(&devlist_lock, flags); 337 spin_lock_irqsave(&devlist_lock, flags);
209 338
210 for (d=devlist; d; d=d->next) 339 for (d=devlist; d; d=d->next)
211 if (d->sysminor == sysminor) 340 if (d->aoemajor == maj && d->aoeminor == min) {
341 d->ref++;
212 break; 342 break;
213 if (d) 343 }
344 if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
214 goto out; 345 goto out;
215 d = kcalloc(1, sizeof *d, GFP_ATOMIC); 346 d = kcalloc(1, sizeof *d, GFP_ATOMIC);
216 if (!d) 347 if (!d)
217 goto out; 348 goto out;
218 INIT_WORK(&d->work, aoecmd_sleepwork); 349 INIT_WORK(&d->work, aoecmd_sleepwork);
219 spin_lock_init(&d->lock); 350 spin_lock_init(&d->lock);
220 skb_queue_head_init(&d->sendq);
221 skb_queue_head_init(&d->skbpool); 351 skb_queue_head_init(&d->skbpool);
222 init_timer(&d->timer); 352 init_timer(&d->timer);
223 d->timer.data = (ulong) d; 353 d->timer.data = (ulong) d;
@@ -226,10 +356,12 @@ aoedev_by_sysminor_m(ulong sysminor)
226 add_timer(&d->timer); 356 add_timer(&d->timer);
227 d->bufpool = NULL; /* defer to aoeblk_gdalloc */ 357 d->bufpool = NULL; /* defer to aoeblk_gdalloc */
228 d->tgt = d->targets; 358 d->tgt = d->targets;
229 INIT_LIST_HEAD(&d->bufq); 359 d->ref = 1;
360 for (i = 0; i < NFACTIVE; i++)
361 INIT_LIST_HEAD(&d->factive[i]);
230 d->sysminor = sysminor; 362 d->sysminor = sysminor;
231 d->aoemajor = AOEMAJOR(sysminor); 363 d->aoemajor = maj;
232 d->aoeminor = AOEMINOR(sysminor); 364 d->aoeminor = min;
233 d->mintimer = MINTIMER; 365 d->mintimer = MINTIMER;
234 d->next = devlist; 366 d->next = devlist;
235 devlist = d; 367 devlist = d;
@@ -241,13 +373,23 @@ aoedev_by_sysminor_m(ulong sysminor)
241static void 373static void
242freetgt(struct aoedev *d, struct aoetgt *t) 374freetgt(struct aoedev *d, struct aoetgt *t)
243{ 375{
244 struct frame *f, *e; 376 struct frame *f;
377 struct list_head *pos, *nx, *head;
378 struct aoeif *ifp;
245 379
246 f = t->frames; 380 for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
247 e = f + t->nframes; 381 if (!ifp->nd)
248 for (; f < e; f++) 382 break;
383 dev_put(ifp->nd);
384 }
385
386 head = &t->ffree;
387 list_for_each_safe(pos, nx, head) {
388 list_del(pos);
389 f = list_entry(pos, struct frame, head);
249 skbfree(f->skb); 390 skbfree(f->skb);
250 kfree(t->frames); 391 kfree(f);
392 }
251 kfree(t); 393 kfree(t);
252} 394}
253 395
@@ -257,6 +399,7 @@ aoedev_exit(void)
257 struct aoedev *d; 399 struct aoedev *d;
258 ulong flags; 400 ulong flags;
259 401
402 aoe_flush_iocq();
260 while ((d = devlist)) { 403 while ((d = devlist)) {
261 devlist = d->next; 404 devlist = d->next;
262 405
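With these aoedev.c changes, system minor numbers are handed out from a bitmap rather than derived from the shelf/slot address; the aoe_dyndevs module parameter (default 1) selects dynamic allocation, and the old 16-slots-per-shelf mapping is kept only for backwards compatibility via minor_get_static(). A simplified sketch of the dynamic side (locking with used_minors_lock and the static fallback omitted; names prefixed example_ are not the driver's):

    /* Sketch: first-fit allocation of a whole-disk minor from a bitmap. */
    #define EXAMPLE_N_DEVS  ((1U << MINORBITS) / AOE_PARTITIONS)
    static DECLARE_BITMAP(example_minors, EXAMPLE_N_DEVS);

    static int example_minor_get_dyn(ulong *sysminor)
    {
            ulong n;

            n = find_first_zero_bit(example_minors, EXAMPLE_N_DEVS);
            if (n >= EXAMPLE_N_DEVS)
                    return -1;                      /* no free slots */
            set_bit(n, example_minors);
            *sysminor = n * AOE_PARTITIONS;         /* leave room for partitions */
            return 0;
    }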
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index 7f83ad90e76f..04793c2c701b 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoemain.c 3 * aoemain.c
4 * Module initialization routines, discover timer 4 * Module initialization routines, discover timer
@@ -61,6 +61,7 @@ aoe_exit(void)
61 61
62 aoenet_exit(); 62 aoenet_exit();
63 unregister_blkdev(AOE_MAJOR, DEVICE_NAME); 63 unregister_blkdev(AOE_MAJOR, DEVICE_NAME);
64 aoecmd_exit();
64 aoechr_exit(); 65 aoechr_exit();
65 aoedev_exit(); 66 aoedev_exit();
66 aoeblk_exit(); /* free cache after de-allocating bufs */ 67 aoeblk_exit(); /* free cache after de-allocating bufs */
@@ -83,17 +84,20 @@ aoe_init(void)
83 ret = aoenet_init(); 84 ret = aoenet_init();
84 if (ret) 85 if (ret)
85 goto net_fail; 86 goto net_fail;
87 ret = aoecmd_init();
88 if (ret)
89 goto cmd_fail;
86 ret = register_blkdev(AOE_MAJOR, DEVICE_NAME); 90 ret = register_blkdev(AOE_MAJOR, DEVICE_NAME);
87 if (ret < 0) { 91 if (ret < 0) {
88 printk(KERN_ERR "aoe: can't register major\n"); 92 printk(KERN_ERR "aoe: can't register major\n");
89 goto blkreg_fail; 93 goto blkreg_fail;
90 } 94 }
91
92 printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); 95 printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION);
93 discover_timer(TINIT); 96 discover_timer(TINIT);
94 return 0; 97 return 0;
95
96 blkreg_fail: 98 blkreg_fail:
99 aoecmd_exit();
100 cmd_fail:
97 aoenet_exit(); 101 aoenet_exit();
98 net_fail: 102 net_fail:
99 aoeblk_exit(); 103 aoeblk_exit();
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 4d3bc0d49df5..162c6471275c 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -1,4 +1,4 @@
1/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ 1/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2/* 2/*
3 * aoenet.c 3 * aoenet.c
4 * Ethernet portion of AoE driver 4 * Ethernet portion of AoE driver
@@ -33,6 +33,9 @@ static char aoe_iflist[IFLISTSZ];
33module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); 33module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600);
34MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\""); 34MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\"");
35 35
36static wait_queue_head_t txwq;
37static struct ktstate kts;
38
36#ifndef MODULE 39#ifndef MODULE
37static int __init aoe_iflist_setup(char *str) 40static int __init aoe_iflist_setup(char *str)
38{ 41{
@@ -44,6 +47,23 @@ static int __init aoe_iflist_setup(char *str)
44__setup("aoe_iflist=", aoe_iflist_setup); 47__setup("aoe_iflist=", aoe_iflist_setup);
45#endif 48#endif
46 49
50static spinlock_t txlock;
51static struct sk_buff_head skbtxq;
52
53/* enters with txlock held */
54static int
55tx(void)
56{
57 struct sk_buff *skb;
58
59 while ((skb = skb_dequeue(&skbtxq))) {
60 spin_unlock_irq(&txlock);
61 dev_queue_xmit(skb);
62 spin_lock_irq(&txlock);
63 }
64 return 0;
65}
66
47int 67int
48is_aoe_netif(struct net_device *ifp) 68is_aoe_netif(struct net_device *ifp)
49{ 69{
@@ -88,10 +108,14 @@ void
88aoenet_xmit(struct sk_buff_head *queue) 108aoenet_xmit(struct sk_buff_head *queue)
89{ 109{
90 struct sk_buff *skb, *tmp; 110 struct sk_buff *skb, *tmp;
111 ulong flags;
91 112
92 skb_queue_walk_safe(queue, skb, tmp) { 113 skb_queue_walk_safe(queue, skb, tmp) {
93 __skb_unlink(skb, queue); 114 __skb_unlink(skb, queue);
94 dev_queue_xmit(skb); 115 spin_lock_irqsave(&txlock, flags);
116 skb_queue_tail(&skbtxq, skb);
117 spin_unlock_irqrestore(&txlock, flags);
118 wake_up(&txwq);
95 } 119 }
96} 120}
97 121
@@ -102,7 +126,9 @@ static int
102aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) 126aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
103{ 127{
104 struct aoe_hdr *h; 128 struct aoe_hdr *h;
129 struct aoe_atahdr *ah;
105 u32 n; 130 u32 n;
131 int sn;
106 132
107 if (dev_net(ifp) != &init_net) 133 if (dev_net(ifp) != &init_net)
108 goto exit; 134 goto exit;
@@ -110,13 +136,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
110 skb = skb_share_check(skb, GFP_ATOMIC); 136 skb = skb_share_check(skb, GFP_ATOMIC);
111 if (skb == NULL) 137 if (skb == NULL)
112 return 0; 138 return 0;
113 if (skb_linearize(skb))
114 goto exit;
115 if (!is_aoe_netif(ifp)) 139 if (!is_aoe_netif(ifp))
116 goto exit; 140 goto exit;
117 skb_push(skb, ETH_HLEN); /* (1) */ 141 skb_push(skb, ETH_HLEN); /* (1) */
118 142 sn = sizeof(*h) + sizeof(*ah);
119 h = (struct aoe_hdr *) skb_mac_header(skb); 143 if (skb->len >= sn) {
144 sn -= skb_headlen(skb);
145 if (sn > 0 && !__pskb_pull_tail(skb, sn))
146 goto exit;
147 }
148 h = (struct aoe_hdr *) skb->data;
120 n = get_unaligned_be32(&h->tag); 149 n = get_unaligned_be32(&h->tag);
121 if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) 150 if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
122 goto exit; 151 goto exit;
@@ -137,7 +166,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
137 166
138 switch (h->cmd) { 167 switch (h->cmd) {
139 case AOECMD_ATA: 168 case AOECMD_ATA:
140 aoecmd_ata_rsp(skb); 169 /* ata_rsp may keep skb for later processing or give it back */
170 skb = aoecmd_ata_rsp(skb);
141 break; 171 break;
142 case AOECMD_CFG: 172 case AOECMD_CFG:
143 aoecmd_cfg_rsp(skb); 173 aoecmd_cfg_rsp(skb);
@@ -145,8 +175,12 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
145 default: 175 default:
146 if (h->cmd >= AOECMD_VEND_MIN) 176 if (h->cmd >= AOECMD_VEND_MIN)
147 break; /* don't complain about vendor commands */ 177 break; /* don't complain about vendor commands */
148 printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd); 178 pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd);
179 break;
149 } 180 }
181
182 if (!skb)
183 return 0;
150exit: 184exit:
151 dev_kfree_skb(skb); 185 dev_kfree_skb(skb);
152 return 0; 186 return 0;
@@ -160,6 +194,15 @@ static struct packet_type aoe_pt __read_mostly = {
160int __init 194int __init
161aoenet_init(void) 195aoenet_init(void)
162{ 196{
197 skb_queue_head_init(&skbtxq);
198 init_waitqueue_head(&txwq);
199 spin_lock_init(&txlock);
200 kts.lock = &txlock;
201 kts.fn = tx;
202 kts.waitq = &txwq;
203 kts.name = "aoe_tx";
204 if (aoe_ktstart(&kts))
205 return -EAGAIN;
163 dev_add_pack(&aoe_pt); 206 dev_add_pack(&aoe_pt);
164 return 0; 207 return 0;
165} 208}
@@ -167,6 +210,8 @@ aoenet_init(void)
167void 210void
168aoenet_exit(void) 211aoenet_exit(void)
169{ 212{
213 aoe_ktstop(&kts);
214 skb_queue_purge(&skbtxq);
170 dev_remove_pack(&aoe_pt); 215 dev_remove_pack(&aoe_pt);
171} 216}
172 217
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 38aa6dda6b81..da3311129a0c 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -795,6 +795,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
795 } 795 }
796 break; 796 break;
797 case CMD_PROTOCOL_ERR: 797 case CMD_PROTOCOL_ERR:
798 cmd->result = DID_ERROR << 16;
798 dev_warn(&h->pdev->dev, 799 dev_warn(&h->pdev->dev,
799 "%p has protocol error\n", c); 800 "%p has protocol error\n", c);
800 break; 801 break;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a7d6347aaa79..17c675c52295 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -672,7 +672,6 @@ static void __reschedule_timeout(int drive, const char *message)
672 672
673 if (drive == current_reqD) 673 if (drive == current_reqD)
674 drive = current_drive; 674 drive = current_drive;
675 __cancel_delayed_work(&fd_timeout);
676 675
677 if (drive < 0 || drive >= N_DRIVE) { 676 if (drive < 0 || drive >= N_DRIVE) {
678 delay = 20UL * HZ; 677 delay = 20UL * HZ;
@@ -680,7 +679,7 @@ static void __reschedule_timeout(int drive, const char *message)
680 } else 679 } else
681 delay = UDP->timeout; 680 delay = UDP->timeout;
682 681
683 queue_delayed_work(floppy_wq, &fd_timeout, delay); 682 mod_delayed_work(floppy_wq, &fd_timeout, delay);
684 if (UDP->flags & FD_DEBUG) 683 if (UDP->flags & FD_DEBUG)
685 DPRINT("reschedule timeout %s\n", message); 684 DPRINT("reschedule timeout %s\n", message);
686 timeout_message = message; 685 timeout_message = message;
@@ -891,7 +890,7 @@ static void unlock_fdc(void)
891 890
892 raw_cmd = NULL; 891 raw_cmd = NULL;
893 command_status = FD_COMMAND_NONE; 892 command_status = FD_COMMAND_NONE;
894 __cancel_delayed_work(&fd_timeout); 893 cancel_delayed_work(&fd_timeout);
895 do_floppy = NULL; 894 do_floppy = NULL;
896 cont = NULL; 895 cont = NULL;
897 clear_bit(0, &fdc_busy); 896 clear_bit(0, &fdc_busy);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3bba65510d23..e9d594fd12cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1038,10 +1038,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
1038{ 1038{
1039 int err; 1039 int err;
1040 struct loop_func_table *xfer; 1040 struct loop_func_table *xfer;
1041 uid_t uid = current_uid(); 1041 kuid_t uid = current_uid();
1042 1042
1043 if (lo->lo_encrypt_key_size && 1043 if (lo->lo_encrypt_key_size &&
1044 lo->lo_key_owner != uid && 1044 !uid_eq(lo->lo_key_owner, uid) &&
1045 !capable(CAP_SYS_ADMIN)) 1045 !capable(CAP_SYS_ADMIN))
1046 return -EPERM; 1046 return -EPERM;
1047 if (lo->lo_state != Lo_bound) 1047 if (lo->lo_state != Lo_bound)
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a8fddeb3d638..f946d31d6917 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1148,11 +1148,15 @@ static bool mtip_pause_ncq(struct mtip_port *port,
1148 reply = port->rxfis + RX_FIS_D2H_REG; 1148 reply = port->rxfis + RX_FIS_D2H_REG;
1149 task_file_data = readl(port->mmio+PORT_TFDATA); 1149 task_file_data = readl(port->mmio+PORT_TFDATA);
1150 1150
1151 if ((task_file_data & 1) || (fis->command == ATA_CMD_SEC_ERASE_UNIT)) 1151 if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
1152 clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
1153
1154 if ((task_file_data & 1))
1152 return false; 1155 return false;
1153 1156
1154 if (fis->command == ATA_CMD_SEC_ERASE_PREP) { 1157 if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
1155 set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); 1158 set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
1159 set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
1156 port->ic_pause_timer = jiffies; 1160 port->ic_pause_timer = jiffies;
1157 return true; 1161 return true;
1158 } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) && 1162 } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
@@ -1900,7 +1904,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
1900 int rv = 0, xfer_sz = command[3]; 1904 int rv = 0, xfer_sz = command[3];
1901 1905
1902 if (xfer_sz) { 1906 if (xfer_sz) {
1903 if (user_buffer) 1907 if (!user_buffer)
1904 return -EFAULT; 1908 return -EFAULT;
1905 1909
1906 buf = dmam_alloc_coherent(&port->dd->pdev->dev, 1910 buf = dmam_alloc_coherent(&port->dd->pdev->dev,
@@ -2043,7 +2047,7 @@ static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout)
2043 *timeout = 240000; /* 4 minutes */ 2047 *timeout = 240000; /* 4 minutes */
2044 break; 2048 break;
2045 case ATA_CMD_STANDBYNOW1: 2049 case ATA_CMD_STANDBYNOW1:
2046 *timeout = 10000; /* 10 seconds */ 2050 *timeout = 120000; /* 2 minutes */
2047 break; 2051 break;
2048 case 0xF7: 2052 case 0xF7:
2049 case 0xFA: 2053 case 0xFA:
@@ -2588,9 +2592,6 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
2588 if (!len || size) 2592 if (!len || size)
2589 return 0; 2593 return 0;
2590 2594
2591 if (size < 0)
2592 return -EINVAL;
2593
2594 size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); 2595 size += sprintf(&buf[size], "H/ S ACTive : [ 0x");
2595 2596
2596 for (n = dd->slot_groups-1; n >= 0; n--) 2597 for (n = dd->slot_groups-1; n >= 0; n--)
@@ -2660,9 +2661,6 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
2660 if (!len || size) 2661 if (!len || size)
2661 return 0; 2662 return 0;
2662 2663
2663 if (size < 0)
2664 return -EINVAL;
2665
2666 size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", 2664 size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
2667 dd->port->flags); 2665 dd->port->flags);
2668 size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", 2666 size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n",
@@ -3214,8 +3212,8 @@ static int mtip_hw_init(struct driver_data *dd)
3214 "Unable to check write protect progress\n"); 3212 "Unable to check write protect progress\n");
3215 else 3213 else
3216 dev_info(&dd->pdev->dev, 3214 dev_info(&dd->pdev->dev,
3217 "Write protect progress: %d%% (%d blocks)\n", 3215 "Write protect progress: %u%% (%u blocks)\n",
3218 attr242.cur, attr242.data); 3216 attr242.cur, le32_to_cpu(attr242.data));
3219 return rv; 3217 return rv;
3220 3218
3221out3: 3219out3:
@@ -3619,6 +3617,10 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
3619 bio_endio(bio, -ENODATA); 3617 bio_endio(bio, -ENODATA);
3620 return; 3618 return;
3621 } 3619 }
3620 if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) {
3621 bio_endio(bio, -ENODATA);
3622 return;
3623 }
3622 } 3624 }
3623 3625
3624 if (unlikely(!bio_has_data(bio))) { 3626 if (unlikely(!bio_has_data(bio))) {
@@ -4168,7 +4170,13 @@ static void mtip_pci_shutdown(struct pci_dev *pdev)
4168 4170
4169/* Table of device ids supported by this driver. */ 4171/* Table of device ids supported by this driver. */
4170static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = { 4172static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = {
4171 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320_DEVICE_ID) }, 4173 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) },
4174 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) },
4175 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) },
4176 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) },
4177 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) },
4178 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) },
4179 { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) },
4172 { 0 } 4180 { 0 }
4173}; 4181};
4174 4182
@@ -4199,12 +4207,12 @@ static int __init mtip_init(void)
4199{ 4207{
4200 int error; 4208 int error;
4201 4209
4202 printk(KERN_INFO MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); 4210 pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
4203 4211
4204 /* Allocate a major block device number to use with this driver. */ 4212 /* Allocate a major block device number to use with this driver. */
4205 error = register_blkdev(0, MTIP_DRV_NAME); 4213 error = register_blkdev(0, MTIP_DRV_NAME);
4206 if (error <= 0) { 4214 if (error <= 0) {
4207 printk(KERN_ERR "Unable to register block device (%d)\n", 4215 pr_err("Unable to register block device (%d)\n",
4208 error); 4216 error);
4209 return -EBUSY; 4217 return -EBUSY;
4210 } 4218 }
@@ -4213,7 +4221,7 @@ static int __init mtip_init(void)
4213 if (!dfs_parent) { 4221 if (!dfs_parent) {
4214 dfs_parent = debugfs_create_dir("rssd", NULL); 4222 dfs_parent = debugfs_create_dir("rssd", NULL);
4215 if (IS_ERR_OR_NULL(dfs_parent)) { 4223 if (IS_ERR_OR_NULL(dfs_parent)) {
4216 printk(KERN_WARNING "Error creating debugfs parent\n"); 4224 pr_warn("Error creating debugfs parent\n");
4217 dfs_parent = NULL; 4225 dfs_parent = NULL;
4218 } 4226 }
4219 } 4227 }
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index f51fc23d17bb..18627a1d04c5 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -76,7 +76,13 @@
76 76
77/* Micron Vendor ID & P320x SSD Device ID */ 77/* Micron Vendor ID & P320x SSD Device ID */
78#define PCI_VENDOR_ID_MICRON 0x1344 78#define PCI_VENDOR_ID_MICRON 0x1344
79#define P320_DEVICE_ID 0x5150 79#define P320H_DEVICE_ID 0x5150
80#define P320M_DEVICE_ID 0x5151
81#define P320S_DEVICE_ID 0x5152
82#define P325M_DEVICE_ID 0x5153
83#define P420H_DEVICE_ID 0x5160
84#define P420M_DEVICE_ID 0x5161
85#define P425M_DEVICE_ID 0x5163
80 86
81/* Driver name and version strings */ 87/* Driver name and version strings */
82#define MTIP_DRV_NAME "mtip32xx" 88#define MTIP_DRV_NAME "mtip32xx"
@@ -131,10 +137,12 @@ enum {
131 MTIP_PF_SVC_THD_STOP_BIT = 8, 137 MTIP_PF_SVC_THD_STOP_BIT = 8,
132 138
133 /* below are bit numbers in 'dd_flag' defined in driver_data */ 139 /* below are bit numbers in 'dd_flag' defined in driver_data */
140 MTIP_DDF_SEC_LOCK_BIT = 0,
134 MTIP_DDF_REMOVE_PENDING_BIT = 1, 141 MTIP_DDF_REMOVE_PENDING_BIT = 1,
135 MTIP_DDF_OVER_TEMP_BIT = 2, 142 MTIP_DDF_OVER_TEMP_BIT = 2,
136 MTIP_DDF_WRITE_PROTECT_BIT = 3, 143 MTIP_DDF_WRITE_PROTECT_BIT = 3,
137 MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \ 144 MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \
145 (1 << MTIP_DDF_SEC_LOCK_BIT) | \
138 (1 << MTIP_DDF_OVER_TEMP_BIT) | \ 146 (1 << MTIP_DDF_OVER_TEMP_BIT) | \
139 (1 << MTIP_DDF_WRITE_PROTECT_BIT)), 147 (1 << MTIP_DDF_WRITE_PROTECT_BIT)),
140 148
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d07c9f7fded6..043ddcca4abf 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -78,6 +78,8 @@ static const char *ioctl_cmd_to_ascii(int cmd)
78 case NBD_SET_SOCK: return "set-sock"; 78 case NBD_SET_SOCK: return "set-sock";
79 case NBD_SET_BLKSIZE: return "set-blksize"; 79 case NBD_SET_BLKSIZE: return "set-blksize";
80 case NBD_SET_SIZE: return "set-size"; 80 case NBD_SET_SIZE: return "set-size";
81 case NBD_SET_TIMEOUT: return "set-timeout";
82 case NBD_SET_FLAGS: return "set-flags";
81 case NBD_DO_IT: return "do-it"; 83 case NBD_DO_IT: return "do-it";
82 case NBD_CLEAR_SOCK: return "clear-sock"; 84 case NBD_CLEAR_SOCK: return "clear-sock";
83 case NBD_CLEAR_QUE: return "clear-que"; 85 case NBD_CLEAR_QUE: return "clear-que";
@@ -96,6 +98,7 @@ static const char *nbdcmd_to_ascii(int cmd)
96 case NBD_CMD_READ: return "read"; 98 case NBD_CMD_READ: return "read";
97 case NBD_CMD_WRITE: return "write"; 99 case NBD_CMD_WRITE: return "write";
98 case NBD_CMD_DISC: return "disconnect"; 100 case NBD_CMD_DISC: return "disconnect";
101 case NBD_CMD_TRIM: return "trim/discard";
99 } 102 }
100 return "invalid"; 103 return "invalid";
101} 104}
@@ -449,6 +452,14 @@ static void nbd_clear_que(struct nbd_device *nbd)
449 req->errors++; 452 req->errors++;
450 nbd_end_request(req); 453 nbd_end_request(req);
451 } 454 }
455
456 while (!list_empty(&nbd->waiting_queue)) {
457 req = list_entry(nbd->waiting_queue.next, struct request,
458 queuelist);
459 list_del_init(&req->queuelist);
460 req->errors++;
461 nbd_end_request(req);
462 }
452} 463}
453 464
454 465
@@ -459,8 +470,12 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
459 470
460 nbd_cmd(req) = NBD_CMD_READ; 471 nbd_cmd(req) = NBD_CMD_READ;
461 if (rq_data_dir(req) == WRITE) { 472 if (rq_data_dir(req) == WRITE) {
462 nbd_cmd(req) = NBD_CMD_WRITE; 473 if ((req->cmd_flags & REQ_DISCARD)) {
463 if (nbd->flags & NBD_READ_ONLY) { 474 WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM));
475 nbd_cmd(req) = NBD_CMD_TRIM;
476 } else
477 nbd_cmd(req) = NBD_CMD_WRITE;
478 if (nbd->flags & NBD_FLAG_READ_ONLY) {
464 dev_err(disk_to_dev(nbd->disk), 479 dev_err(disk_to_dev(nbd->disk),
465 "Write on read-only\n"); 480 "Write on read-only\n");
466 goto error_out; 481 goto error_out;
@@ -598,6 +613,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
598 nbd->file = NULL; 613 nbd->file = NULL;
599 nbd_clear_que(nbd); 614 nbd_clear_que(nbd);
600 BUG_ON(!list_empty(&nbd->queue_head)); 615 BUG_ON(!list_empty(&nbd->queue_head));
616 BUG_ON(!list_empty(&nbd->waiting_queue));
601 if (file) 617 if (file)
602 fput(file); 618 fput(file);
603 return 0; 619 return 0;
@@ -642,6 +658,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
642 nbd->xmit_timeout = arg * HZ; 658 nbd->xmit_timeout = arg * HZ;
643 return 0; 659 return 0;
644 660
661 case NBD_SET_FLAGS:
662 nbd->flags = arg;
663 return 0;
664
645 case NBD_SET_SIZE_BLOCKS: 665 case NBD_SET_SIZE_BLOCKS:
646 nbd->bytesize = ((u64) arg) * nbd->blksize; 666 nbd->bytesize = ((u64) arg) * nbd->blksize;
647 bdev->bd_inode->i_size = nbd->bytesize; 667 bdev->bd_inode->i_size = nbd->bytesize;
@@ -661,6 +681,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
661 681
662 mutex_unlock(&nbd->tx_lock); 682 mutex_unlock(&nbd->tx_lock);
663 683
684 if (nbd->flags & NBD_FLAG_SEND_TRIM)
685 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
686 nbd->disk->queue);
687
664 thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); 688 thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
665 if (IS_ERR(thread)) { 689 if (IS_ERR(thread)) {
666 mutex_lock(&nbd->tx_lock); 690 mutex_lock(&nbd->tx_lock);
@@ -678,6 +702,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
678 nbd->file = NULL; 702 nbd->file = NULL;
679 nbd_clear_que(nbd); 703 nbd_clear_que(nbd);
680 dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); 704 dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
705 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
681 if (file) 706 if (file)
682 fput(file); 707 fput(file);
683 nbd->bytesize = 0; 708 nbd->bytesize = 0;
@@ -796,6 +821,9 @@ static int __init nbd_init(void)
796 * Tell the block layer that we are not a rotational device 821 * Tell the block layer that we are not a rotational device
797 */ 822 */
798 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); 823 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
824 disk->queue->limits.discard_granularity = 512;
825 disk->queue->limits.max_discard_sectors = UINT_MAX;
826 disk->queue->limits.discard_zeroes_data = 0;
799 } 827 }
800 828
801 if (register_blkdev(NBD_MAJOR, "nbd")) { 829 if (register_blkdev(NBD_MAJOR, "nbd")) {
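The nbd changes wire discard support end to end: userspace announces trim support with NBD_SET_FLAGS before NBD_DO_IT, the driver then sets QUEUE_FLAG_DISCARD and the discard limits above, and incoming REQ_DISCARD requests go to the server as NBD_CMD_TRIM. A hypothetical userspace sketch, not taken from nbd-client, assuming the flag value arrives in the server's negotiation:

    /* Hypothetical client-side snippet; nbd-client would normally do this. */
    #include <linux/nbd.h>
    #include <sys/ioctl.h>

    static int nbd_enable_trim(int nbd_fd, unsigned long server_flags)
    {
            if (!(server_flags & NBD_FLAG_SEND_TRIM))
                    return 0;                       /* server cannot trim */
            /* must happen before NBD_DO_IT so QUEUE_FLAG_DISCARD gets set */
            return ioctl(nbd_fd, NBD_SET_FLAGS, server_flags);
    }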
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 38a2d0631882..931769e133e5 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -79,6 +79,7 @@ struct nvme_dev {
79 char serial[20]; 79 char serial[20];
80 char model[40]; 80 char model[40];
81 char firmware_rev[8]; 81 char firmware_rev[8];
82 u32 max_hw_sectors;
82}; 83};
83 84
84/* 85/*
@@ -835,15 +836,15 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
835} 836}
836 837
837static int nvme_get_features(struct nvme_dev *dev, unsigned fid, 838static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
838 unsigned dword11, dma_addr_t dma_addr) 839 unsigned nsid, dma_addr_t dma_addr)
839{ 840{
840 struct nvme_command c; 841 struct nvme_command c;
841 842
842 memset(&c, 0, sizeof(c)); 843 memset(&c, 0, sizeof(c));
843 c.features.opcode = nvme_admin_get_features; 844 c.features.opcode = nvme_admin_get_features;
845 c.features.nsid = cpu_to_le32(nsid);
844 c.features.prp1 = cpu_to_le64(dma_addr); 846 c.features.prp1 = cpu_to_le64(dma_addr);
845 c.features.fid = cpu_to_le32(fid); 847 c.features.fid = cpu_to_le32(fid);
846 c.features.dword11 = cpu_to_le32(dword11);
847 848
848 return nvme_submit_admin_cmd(dev, &c, NULL); 849 return nvme_submit_admin_cmd(dev, &c, NULL);
849} 850}
@@ -862,11 +863,51 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
862 return nvme_submit_admin_cmd(dev, &c, result); 863 return nvme_submit_admin_cmd(dev, &c, result);
863} 864}
864 865
866/**
867 * nvme_cancel_ios - Cancel outstanding I/Os
 868 * @nvmeq: The queue to cancel I/Os on
869 * @timeout: True to only cancel I/Os which have timed out
870 */
871static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
872{
873 int depth = nvmeq->q_depth - 1;
874 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
875 unsigned long now = jiffies;
876 int cmdid;
877
878 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
879 void *ctx;
880 nvme_completion_fn fn;
881 static struct nvme_completion cqe = {
882 .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
883 };
884
885 if (timeout && !time_after(now, info[cmdid].timeout))
886 continue;
887 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
888 ctx = cancel_cmdid(nvmeq, cmdid, &fn);
889 fn(nvmeq->dev, ctx, &cqe);
890 }
891}
892
893static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
894{
895 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
896 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
897 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
898 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
899 kfree(nvmeq);
900}
901
865static void nvme_free_queue(struct nvme_dev *dev, int qid) 902static void nvme_free_queue(struct nvme_dev *dev, int qid)
866{ 903{
867 struct nvme_queue *nvmeq = dev->queues[qid]; 904 struct nvme_queue *nvmeq = dev->queues[qid];
868 int vector = dev->entry[nvmeq->cq_vector].vector; 905 int vector = dev->entry[nvmeq->cq_vector].vector;
869 906
907 spin_lock_irq(&nvmeq->q_lock);
908 nvme_cancel_ios(nvmeq, false);
909 spin_unlock_irq(&nvmeq->q_lock);
910
870 irq_set_affinity_hint(vector, NULL); 911 irq_set_affinity_hint(vector, NULL);
871 free_irq(vector, nvmeq); 912 free_irq(vector, nvmeq);
872 913
@@ -876,18 +917,15 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
876 adapter_delete_cq(dev, qid); 917 adapter_delete_cq(dev, qid);
877 } 918 }
878 919
879 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 920 nvme_free_queue_mem(nvmeq);
880 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
881 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
882 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
883 kfree(nvmeq);
884} 921}
885 922
886static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 923static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
887 int depth, int vector) 924 int depth, int vector)
888{ 925{
889 struct device *dmadev = &dev->pci_dev->dev; 926 struct device *dmadev = &dev->pci_dev->dev;
890 unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info)); 927 unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
928 sizeof(struct nvme_cmd_info));
891 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); 929 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
892 if (!nvmeq) 930 if (!nvmeq)
893 return NULL; 931 return NULL;
@@ -975,7 +1013,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
975 1013
976static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) 1014static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
977{ 1015{
978 int result; 1016 int result = 0;
979 u32 aqa; 1017 u32 aqa;
980 u64 cap; 1018 u64 cap;
981 unsigned long timeout; 1019 unsigned long timeout;
@@ -1005,17 +1043,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
1005 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1043 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1006 dev->db_stride = NVME_CAP_STRIDE(cap); 1044 dev->db_stride = NVME_CAP_STRIDE(cap);
1007 1045
1008 while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { 1046 while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
1009 msleep(100); 1047 msleep(100);
1010 if (fatal_signal_pending(current)) 1048 if (fatal_signal_pending(current))
1011 return -EINTR; 1049 result = -EINTR;
1012 if (time_after(jiffies, timeout)) { 1050 if (time_after(jiffies, timeout)) {
1013 dev_err(&dev->pci_dev->dev, 1051 dev_err(&dev->pci_dev->dev,
1014 "Device not ready; aborting initialisation\n"); 1052 "Device not ready; aborting initialisation\n");
1015 return -ENODEV; 1053 result = -ENODEV;
1016 } 1054 }
1017 } 1055 }
1018 1056
1057 if (result) {
1058 nvme_free_queue_mem(nvmeq);
1059 return result;
1060 }
1061
1019 result = queue_request_irq(dev, nvmeq, "nvme admin"); 1062 result = queue_request_irq(dev, nvmeq, "nvme admin");
1020 dev->queues[0] = nvmeq; 1063 dev->queues[0] = nvmeq;
1021 return result; 1064 return result;
@@ -1037,6 +1080,8 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
1037 offset = offset_in_page(addr); 1080 offset = offset_in_page(addr);
1038 count = DIV_ROUND_UP(offset + length, PAGE_SIZE); 1081 count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
1039 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); 1082 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
1083 if (!pages)
1084 return ERR_PTR(-ENOMEM);
1040 1085
1041 err = get_user_pages_fast(addr, count, 1, pages); 1086 err = get_user_pages_fast(addr, count, 1, pages);
1042 if (err < count) { 1087 if (err < count) {
@@ -1146,14 +1191,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1146 return status; 1191 return status;
1147} 1192}
1148 1193
1149static int nvme_user_admin_cmd(struct nvme_ns *ns, 1194static int nvme_user_admin_cmd(struct nvme_dev *dev,
1150 struct nvme_admin_cmd __user *ucmd) 1195 struct nvme_admin_cmd __user *ucmd)
1151{ 1196{
1152 struct nvme_dev *dev = ns->dev;
1153 struct nvme_admin_cmd cmd; 1197 struct nvme_admin_cmd cmd;
1154 struct nvme_command c; 1198 struct nvme_command c;
1155 int status, length; 1199 int status, length;
1156 struct nvme_iod *iod; 1200 struct nvme_iod *uninitialized_var(iod);
1157 1201
1158 if (!capable(CAP_SYS_ADMIN)) 1202 if (!capable(CAP_SYS_ADMIN))
1159 return -EACCES; 1203 return -EACCES;
@@ -1204,7 +1248,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1204 case NVME_IOCTL_ID: 1248 case NVME_IOCTL_ID:
1205 return ns->ns_id; 1249 return ns->ns_id;
1206 case NVME_IOCTL_ADMIN_CMD: 1250 case NVME_IOCTL_ADMIN_CMD:
1207 return nvme_user_admin_cmd(ns, (void __user *)arg); 1251 return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
1208 case NVME_IOCTL_SUBMIT_IO: 1252 case NVME_IOCTL_SUBMIT_IO:
1209 return nvme_submit_io(ns, (void __user *)arg); 1253 return nvme_submit_io(ns, (void __user *)arg);
1210 default: 1254 default:
@@ -1218,26 +1262,6 @@ static const struct block_device_operations nvme_fops = {
1218 .compat_ioctl = nvme_ioctl, 1262 .compat_ioctl = nvme_ioctl,
1219}; 1263};
1220 1264
1221static void nvme_timeout_ios(struct nvme_queue *nvmeq)
1222{
1223 int depth = nvmeq->q_depth - 1;
1224 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
1225 unsigned long now = jiffies;
1226 int cmdid;
1227
1228 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
1229 void *ctx;
1230 nvme_completion_fn fn;
1231 static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
1232
1233 if (!time_after(now, info[cmdid].timeout))
1234 continue;
1235 dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
1236 ctx = cancel_cmdid(nvmeq, cmdid, &fn);
1237 fn(nvmeq->dev, ctx, &cqe);
1238 }
1239}
1240
1241static void nvme_resubmit_bios(struct nvme_queue *nvmeq) 1265static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
1242{ 1266{
1243 while (bio_list_peek(&nvmeq->sq_cong)) { 1267 while (bio_list_peek(&nvmeq->sq_cong)) {
@@ -1269,7 +1293,7 @@ static int nvme_kthread(void *data)
1269 spin_lock_irq(&nvmeq->q_lock); 1293 spin_lock_irq(&nvmeq->q_lock);
1270 if (nvme_process_cq(nvmeq)) 1294 if (nvme_process_cq(nvmeq))
1271 printk("process_cq did something\n"); 1295 printk("process_cq did something\n");
1272 nvme_timeout_ios(nvmeq); 1296 nvme_cancel_ios(nvmeq, true);
1273 nvme_resubmit_bios(nvmeq); 1297 nvme_resubmit_bios(nvmeq);
1274 spin_unlock_irq(&nvmeq->q_lock); 1298 spin_unlock_irq(&nvmeq->q_lock);
1275 } 1299 }
@@ -1339,6 +1363,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
1339 ns->disk = disk; 1363 ns->disk = disk;
1340 lbaf = id->flbas & 0xf; 1364 lbaf = id->flbas & 0xf;
1341 ns->lba_shift = id->lbaf[lbaf].ds; 1365 ns->lba_shift = id->lbaf[lbaf].ds;
1366 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1367 if (dev->max_hw_sectors)
1368 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
1342 1369
1343 disk->major = nvme_major; 1370 disk->major = nvme_major;
1344 disk->minors = NVME_MINORS; 1371 disk->minors = NVME_MINORS;
@@ -1383,7 +1410,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
1383 1410
1384static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) 1411static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
1385{ 1412{
1386 int result, cpu, i, nr_io_queues, db_bar_size; 1413 int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
1387 1414
1388 nr_io_queues = num_online_cpus(); 1415 nr_io_queues = num_online_cpus();
1389 result = set_queue_count(dev, nr_io_queues); 1416 result = set_queue_count(dev, nr_io_queues);
@@ -1429,9 +1456,10 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
1429 cpu = cpumask_next(cpu, cpu_online_mask); 1456 cpu = cpumask_next(cpu, cpu_online_mask);
1430 } 1457 }
1431 1458
1459 q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
1460 NVME_Q_DEPTH);
1432 for (i = 0; i < nr_io_queues; i++) { 1461 for (i = 0; i < nr_io_queues; i++) {
1433 dev->queues[i + 1] = nvme_create_queue(dev, i + 1, 1462 dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
1434 NVME_Q_DEPTH, i);
1435 if (IS_ERR(dev->queues[i + 1])) 1463 if (IS_ERR(dev->queues[i + 1]))
1436 return PTR_ERR(dev->queues[i + 1]); 1464 return PTR_ERR(dev->queues[i + 1]);
1437 dev->queue_count++; 1465 dev->queue_count++;
@@ -1480,6 +1508,10 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev)
1480 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 1508 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
1481 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 1509 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
1482 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 1510 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
1511 if (ctrl->mdts) {
1512 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
1513 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
1514 }
1483 1515
1484 id_ns = mem; 1516 id_ns = mem;
1485 for (i = 1; i <= nn; i++) { 1517 for (i = 1; i <= nn; i++) {
@@ -1523,8 +1555,6 @@ static int nvme_dev_remove(struct nvme_dev *dev)
1523 list_del(&dev->node); 1555 list_del(&dev->node);
1524 spin_unlock(&dev_list_lock); 1556 spin_unlock(&dev_list_lock);
1525 1557
1526 /* TODO: wait all I/O finished or cancel them */
1527
1528 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 1558 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1529 list_del(&ns->list); 1559 list_del(&ns->list);
1530 del_gendisk(ns->disk); 1560 del_gendisk(ns->disk);
@@ -1560,15 +1590,33 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
1560 dma_pool_destroy(dev->prp_small_pool); 1590 dma_pool_destroy(dev->prp_small_pool);
1561} 1591}
1562 1592
1563/* XXX: Use an ida or something to let remove / add work correctly */ 1593static DEFINE_IDA(nvme_instance_ida);
1564static void nvme_set_instance(struct nvme_dev *dev) 1594
1595static int nvme_set_instance(struct nvme_dev *dev)
1565{ 1596{
1566 static int instance; 1597 int instance, error;
1567 dev->instance = instance++; 1598
1599 do {
1600 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1601 return -ENODEV;
1602
1603 spin_lock(&dev_list_lock);
1604 error = ida_get_new(&nvme_instance_ida, &instance);
1605 spin_unlock(&dev_list_lock);
1606 } while (error == -EAGAIN);
1607
1608 if (error)
1609 return -ENODEV;
1610
1611 dev->instance = instance;
1612 return 0;
1568} 1613}
1569 1614
1570static void nvme_release_instance(struct nvme_dev *dev) 1615static void nvme_release_instance(struct nvme_dev *dev)
1571{ 1616{
1617 spin_lock(&dev_list_lock);
1618 ida_remove(&nvme_instance_ida, dev->instance);
1619 spin_unlock(&dev_list_lock);
1572} 1620}
1573 1621
1574static int __devinit nvme_probe(struct pci_dev *pdev, 1622static int __devinit nvme_probe(struct pci_dev *pdev,
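The ida_pre_get()/ida_get_new() pairing above is the classic IDA allocation idiom: preallocation happens in sleeping context before the spinlock is taken, and ida_get_new() hands back the lowest free id or -EAGAIN if the preallocated memory was consumed by another caller, in which case the loop retries. A schematic of the same pattern for a hypothetical driver (kernel context assumed, error handling trimmed):

	#include <linux/idr.h>
	#include <linux/spinlock.h>
	#include <linux/gfp.h>
	#include <linux/errno.h>

	static DEFINE_IDA(example_ida);
	static DEFINE_SPINLOCK(example_lock);

	/* Allocate the lowest free instance number, retrying on -EAGAIN. */
	static int example_get_instance(int *instance)
	{
		int error;

		do {
			if (!ida_pre_get(&example_ida, GFP_KERNEL))
				return -ENOMEM;          /* could not preallocate */

			spin_lock(&example_lock);
			error = ida_get_new(&example_ida, instance);
			spin_unlock(&example_lock);
		} while (error == -EAGAIN);              /* preallocation stolen; retry */

		return error;                            /* 0 on success, -ENOSPC if exhausted */
	}

Releasing an id mirrors this with ida_remove() under the same lock, as the nvme_release_instance() hunk shows.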
@@ -1601,7 +1649,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
1601 pci_set_drvdata(pdev, dev); 1649 pci_set_drvdata(pdev, dev);
1602 dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); 1650 dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
1603 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); 1651 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
1604 nvme_set_instance(dev); 1652 result = nvme_set_instance(dev);
1653 if (result)
1654 goto disable;
1655
1605 dev->entry[0].vector = pdev->irq; 1656 dev->entry[0].vector = pdev->irq;
1606 1657
1607 result = nvme_setup_prp_pools(dev); 1658 result = nvme_setup_prp_pools(dev);
@@ -1675,7 +1726,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev)
1675#define nvme_suspend NULL 1726#define nvme_suspend NULL
1676#define nvme_resume NULL 1727#define nvme_resume NULL
1677 1728
1678static struct pci_error_handlers nvme_err_handler = { 1729static const struct pci_error_handlers nvme_err_handler = {
1679 .error_detected = nvme_error_detected, 1730 .error_detected = nvme_error_detected,
1680 .mmio_enabled = nvme_dump_registers, 1731 .mmio_enabled = nvme_dump_registers,
1681 .link_reset = nvme_link_reset, 1732 .link_reset = nvme_link_reset,
@@ -1704,15 +1755,17 @@ static struct pci_driver nvme_driver = {
1704 1755
1705static int __init nvme_init(void) 1756static int __init nvme_init(void)
1706{ 1757{
1707 int result = -EBUSY; 1758 int result;
1708 1759
1709 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 1760 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
1710 if (IS_ERR(nvme_thread)) 1761 if (IS_ERR(nvme_thread))
1711 return PTR_ERR(nvme_thread); 1762 return PTR_ERR(nvme_thread);
1712 1763
1713 nvme_major = register_blkdev(nvme_major, "nvme"); 1764 result = register_blkdev(nvme_major, "nvme");
1714 if (nvme_major <= 0) 1765 if (result < 0)
1715 goto kill_kthread; 1766 goto kill_kthread;
1767 else if (result > 0)
1768 nvme_major = result;
1716 1769
1717 result = pci_register_driver(&nvme_driver); 1770 result = pci_register_driver(&nvme_driver);
1718 if (result) 1771 if (result)
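The nvme_init() rework leans on register_blkdev()'s return convention: asking for major 0 yields a dynamically assigned major as a positive return value, asking for a specific major returns 0 on success, and a negative value is an errno in either case. A sketch of the same pattern for a hypothetical "foo" block driver:

	#include <linux/fs.h>
	#include <linux/init.h>

	static int foo_major;           /* 0 means "assign one dynamically" */

	static int __init foo_init(void)
	{
		int result = register_blkdev(foo_major, "foo");

		if (result < 0)         /* negative errno: registration failed */
			return result;
		if (result > 0)         /* dynamic assignment: remember what we got */
			foo_major = result;
		/* result == 0: the specific major requested was granted */
		return 0;
	}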
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9917943a3572..bb3d9be3b1b4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -41,6 +41,8 @@
41 41
42#include "rbd_types.h" 42#include "rbd_types.h"
43 43
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
44/* 46/*
45 * The basic unit of block I/O is a sector. It is interpreted in a 47 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is 48 * number of contexts in Linux (blk, bio, genhd), but the default is
@@ -50,16 +52,24 @@
50#define SECTOR_SHIFT 9 52#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52 54
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
53#define RBD_DRV_NAME "rbd" 59#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)" 60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
55 61
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57 63
58#define RBD_MAX_SNAP_NAME_LEN 32 64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
59#define RBD_MAX_OPT_LEN 1024 66#define RBD_MAX_OPT_LEN 1024
60 67
61#define RBD_SNAP_HEAD_NAME "-" 68#define RBD_SNAP_HEAD_NAME "-"
62 69
70#define RBD_IMAGE_ID_LEN_MAX 64
71#define RBD_OBJ_PREFIX_LEN_MAX 64
72
63/* 73/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from 74 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier. 75 * RBD_DRV_NAME above, and # is a unique integer identifier.
@@ -69,21 +79,22 @@
69#define DEV_NAME_LEN 32 79#define DEV_NAME_LEN 32
70#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 80#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
71 81
72#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 82#define RBD_READ_ONLY_DEFAULT false
73 83
74/* 84/*
75 * block device image metadata (in-memory version) 85 * block device image metadata (in-memory version)
76 */ 86 */
77struct rbd_image_header { 87struct rbd_image_header {
78 u64 image_size; 88 /* These four fields never change for a given rbd image */
79 char *object_prefix; 89 char *object_prefix;
90 u64 features;
80 __u8 obj_order; 91 __u8 obj_order;
81 __u8 crypt_type; 92 __u8 crypt_type;
82 __u8 comp_type; 93 __u8 comp_type;
83 struct ceph_snap_context *snapc;
84 size_t snap_names_len;
85 u32 total_snaps;
86 94
95 /* The remaining fields need to be updated occasionally */
96 u64 image_size;
97 struct ceph_snap_context *snapc;
87 char *snap_names; 98 char *snap_names;
88 u64 *snap_sizes; 99 u64 *snap_sizes;
89 100
@@ -91,7 +102,7 @@ struct rbd_image_header {
91}; 102};
92 103
93struct rbd_options { 104struct rbd_options {
94 int notify_timeout; 105 bool read_only;
95}; 106};
96 107
97/* 108/*
@@ -99,7 +110,6 @@ struct rbd_options {
99 */ 110 */
100struct rbd_client { 111struct rbd_client {
101 struct ceph_client *client; 112 struct ceph_client *client;
102 struct rbd_options *rbd_opts;
103 struct kref kref; 113 struct kref kref;
104 struct list_head node; 114 struct list_head node;
105}; 115};
@@ -141,6 +151,16 @@ struct rbd_snap {
141 u64 size; 151 u64 size;
142 struct list_head node; 152 struct list_head node;
143 u64 id; 153 u64 id;
154 u64 features;
155};
156
157struct rbd_mapping {
158 char *snap_name;
159 u64 snap_id;
160 u64 size;
161 u64 features;
162 bool snap_exists;
163 bool read_only;
144}; 164};
145 165
146/* 166/*
@@ -151,8 +171,9 @@ struct rbd_device {
151 171
152 int major; /* blkdev assigned major */ 172 int major; /* blkdev assigned major */
153 struct gendisk *disk; /* blkdev's gendisk and rq */ 173 struct gendisk *disk; /* blkdev's gendisk and rq */
154 struct request_queue *q;
155 174
175 u32 image_format; /* Either 1 or 2 */
176 struct rbd_options rbd_opts;
156 struct rbd_client *rbd_client; 177 struct rbd_client *rbd_client;
157 178
158 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 179 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -160,6 +181,8 @@ struct rbd_device {
160 spinlock_t lock; /* queue lock */ 181 spinlock_t lock; /* queue lock */
161 182
162 struct rbd_image_header header; 183 struct rbd_image_header header;
184 char *image_id;
185 size_t image_id_len;
163 char *image_name; 186 char *image_name;
164 size_t image_name_len; 187 size_t image_name_len;
165 char *header_name; 188 char *header_name;
@@ -171,13 +194,8 @@ struct rbd_device {
171 194
172 /* protects updating the header */ 195 /* protects updating the header */
173 struct rw_semaphore header_rwsem; 196 struct rw_semaphore header_rwsem;
174 /* name of the snapshot this device reads from */ 197
175 char *snap_name; 198 struct rbd_mapping mapping;
176 /* id of the snapshot this device reads from */
177 u64 snap_id; /* current snapshot id */
178 /* whether the snap_id this device reads from still exists */
179 bool snap_exists;
180 int read_only;
181 199
182 struct list_head node; 200 struct list_head node;
183 201
@@ -196,12 +214,10 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock);
196static LIST_HEAD(rbd_client_list); /* clients */ 214static LIST_HEAD(rbd_client_list); /* clients */
197static DEFINE_SPINLOCK(rbd_client_list_lock); 215static DEFINE_SPINLOCK(rbd_client_list_lock);
198 216
199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); 217static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
218static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
219
200static void rbd_dev_release(struct device *dev); 220static void rbd_dev_release(struct device *dev);
201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
205static void __rbd_remove_snap_dev(struct rbd_snap *snap); 221static void __rbd_remove_snap_dev(struct rbd_snap *snap);
206 222
207static ssize_t rbd_add(struct bus_type *bus, const char *buf, 223static ssize_t rbd_add(struct bus_type *bus, const char *buf,
@@ -229,6 +245,18 @@ static struct device rbd_root_dev = {
229 .release = rbd_root_dev_release, 245 .release = rbd_root_dev_release,
230}; 246};
231 247
248#ifdef RBD_DEBUG
249#define rbd_assert(expr) \
250 if (unlikely(!(expr))) { \
251 printk(KERN_ERR "\nAssertion failure in %s() " \
252 "at line %d:\n\n" \
253 "\trbd_assert(%s);\n\n", \
254 __func__, __LINE__, #expr); \
255 BUG(); \
256 }
257#else /* !RBD_DEBUG */
258# define rbd_assert(expr) ((void) 0)
259#endif /* !RBD_DEBUG */
232 260
233static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 261static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234{ 262{
@@ -246,13 +274,12 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
246{ 274{
247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 275 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
248 276
249 rbd_get_dev(rbd_dev); 277 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
250
251 set_device_ro(bdev, rbd_dev->read_only);
252
253 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
254 return -EROFS; 278 return -EROFS;
255 279
280 rbd_get_dev(rbd_dev);
281 set_device_ro(bdev, rbd_dev->mapping.read_only);
282
256 return 0; 283 return 0;
257} 284}
258 285
@@ -275,8 +302,7 @@ static const struct block_device_operations rbd_bd_ops = {
275 * Initialize an rbd client instance. 302 * Initialize an rbd client instance.
276 * We own *ceph_opts. 303 * We own *ceph_opts.
277 */ 304 */
278static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, 305static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
279 struct rbd_options *rbd_opts)
280{ 306{
281 struct rbd_client *rbdc; 307 struct rbd_client *rbdc;
282 int ret = -ENOMEM; 308 int ret = -ENOMEM;
@@ -300,8 +326,6 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
300 if (ret < 0) 326 if (ret < 0)
301 goto out_err; 327 goto out_err;
302 328
303 rbdc->rbd_opts = rbd_opts;
304
305 spin_lock(&rbd_client_list_lock); 329 spin_lock(&rbd_client_list_lock);
306 list_add_tail(&rbdc->node, &rbd_client_list); 330 list_add_tail(&rbdc->node, &rbd_client_list);
307 spin_unlock(&rbd_client_list_lock); 331 spin_unlock(&rbd_client_list_lock);
@@ -323,36 +347,52 @@ out_opt:
323} 347}
324 348
325/* 349/*
326 * Find a ceph client with specific addr and configuration. 350 * Find a ceph client with specific addr and configuration. If
351 * found, bump its reference count.
327 */ 352 */
328static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) 353static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
329{ 354{
330 struct rbd_client *client_node; 355 struct rbd_client *client_node;
356 bool found = false;
331 357
332 if (ceph_opts->flags & CEPH_OPT_NOSHARE) 358 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
333 return NULL; 359 return NULL;
334 360
335 list_for_each_entry(client_node, &rbd_client_list, node) 361 spin_lock(&rbd_client_list_lock);
336 if (!ceph_compare_options(ceph_opts, client_node->client)) 362 list_for_each_entry(client_node, &rbd_client_list, node) {
337 return client_node; 363 if (!ceph_compare_options(ceph_opts, client_node->client)) {
338 return NULL; 364 kref_get(&client_node->kref);
365 found = true;
366 break;
367 }
368 }
369 spin_unlock(&rbd_client_list_lock);
370
371 return found ? client_node : NULL;
339} 372}
340 373
341/* 374/*
342 * mount options 375 * mount options
343 */ 376 */
344enum { 377enum {
345 Opt_notify_timeout,
346 Opt_last_int, 378 Opt_last_int,
347 /* int args above */ 379 /* int args above */
348 Opt_last_string, 380 Opt_last_string,
349 /* string args above */ 381 /* string args above */
382 Opt_read_only,
383 Opt_read_write,
384 /* Boolean args above */
385 Opt_last_bool,
350}; 386};
351 387
352static match_table_t rbd_opts_tokens = { 388static match_table_t rbd_opts_tokens = {
353 {Opt_notify_timeout, "notify_timeout=%d"},
354 /* int args above */ 389 /* int args above */
355 /* string args above */ 390 /* string args above */
391 {Opt_read_only, "read_only"},
392 {Opt_read_only, "ro"}, /* Alternate spelling */
393 {Opt_read_write, "read_write"},
394 {Opt_read_write, "rw"}, /* Alternate spelling */
395 /* Boolean args above */
356 {-1, NULL} 396 {-1, NULL}
357}; 397};
358 398
@@ -377,16 +417,22 @@ static int parse_rbd_opts_token(char *c, void *private)
377 } else if (token > Opt_last_int && token < Opt_last_string) { 417 } else if (token > Opt_last_int && token < Opt_last_string) {
378 dout("got string token %d val %s\n", token, 418 dout("got string token %d val %s\n", token,
379 argstr[0].from); 419 argstr[0].from);
420 } else if (token > Opt_last_string && token < Opt_last_bool) {
421 dout("got Boolean token %d\n", token);
380 } else { 422 } else {
381 dout("got token %d\n", token); 423 dout("got token %d\n", token);
382 } 424 }
383 425
384 switch (token) { 426 switch (token) {
385 case Opt_notify_timeout: 427 case Opt_read_only:
386 rbd_opts->notify_timeout = intval; 428 rbd_opts->read_only = true;
429 break;
430 case Opt_read_write:
431 rbd_opts->read_only = false;
387 break; 432 break;
388 default: 433 default:
389 BUG_ON(token); 434 rbd_assert(false);
435 break;
390 } 436 }
391 return 0; 437 return 0;
392} 438}
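The token table relies on the Opt_last_int/Opt_last_string/Opt_last_bool sentinels so parse_rbd_opts_token() can tell, from a token's position relative to them, whether it carries an integer, a string, or no argument at all; the actual pattern matching is match_token() from <linux/parser.h>. A cut-down sketch of that layout for a hypothetical pair of options (kernel context assumed):

	#include <linux/parser.h>
	#include <linux/types.h>
	#include <linux/errno.h>

	enum {
		Opt_queue_depth,
		Opt_last_int,       /* int args above */
		Opt_last_string,    /* string args above */
		Opt_ro,
		Opt_last_bool,      /* Boolean args above */
	};

	static match_table_t example_tokens = {
		{Opt_queue_depth, "queue_depth=%d"},
		{Opt_ro, "ro"},
		{-1, NULL}
	};

	static int example_parse(char *opt, int *queue_depth, bool *ro)
	{
		substring_t argstr[MAX_OPT_ARGS];
		int token = match_token(opt, example_tokens, argstr);

		if (token < 0)
			return -EINVAL;                  /* unrecognized option */
		if (token < Opt_last_int) {
			if (match_int(&argstr[0], queue_depth))
				return -EINVAL;          /* bad integer argument */
		} else if (token > Opt_last_string && token < Opt_last_bool) {
			*ro = (token == Opt_ro);         /* Boolean: presence is the value */
		}
		return 0;
	}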
@@ -395,48 +441,33 @@ static int parse_rbd_opts_token(char *c, void *private)
395 * Get a ceph client with specific addr and configuration, if one does 441 * Get a ceph client with specific addr and configuration, if one does
396 * not exist create it. 442 * not exist create it.
397 */ 443 */
398static struct rbd_client *rbd_get_client(const char *mon_addr, 444static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
399 size_t mon_addr_len, 445 size_t mon_addr_len, char *options)
400 char *options)
401{ 446{
402 struct rbd_client *rbdc; 447 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
403 struct ceph_options *ceph_opts; 448 struct ceph_options *ceph_opts;
404 struct rbd_options *rbd_opts; 449 struct rbd_client *rbdc;
405
406 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
407 if (!rbd_opts)
408 return ERR_PTR(-ENOMEM);
409 450
410 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; 451 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
411 452
412 ceph_opts = ceph_parse_options(options, mon_addr, 453 ceph_opts = ceph_parse_options(options, mon_addr,
413 mon_addr + mon_addr_len, 454 mon_addr + mon_addr_len,
414 parse_rbd_opts_token, rbd_opts); 455 parse_rbd_opts_token, rbd_opts);
415 if (IS_ERR(ceph_opts)) { 456 if (IS_ERR(ceph_opts))
416 kfree(rbd_opts); 457 return PTR_ERR(ceph_opts);
417 return ERR_CAST(ceph_opts);
418 }
419 458
420 spin_lock(&rbd_client_list_lock); 459 rbdc = rbd_client_find(ceph_opts);
421 rbdc = __rbd_client_find(ceph_opts);
422 if (rbdc) { 460 if (rbdc) {
423 /* using an existing client */ 461 /* using an existing client */
424 kref_get(&rbdc->kref);
425 spin_unlock(&rbd_client_list_lock);
426
427 ceph_destroy_options(ceph_opts); 462 ceph_destroy_options(ceph_opts);
428 kfree(rbd_opts); 463 } else {
429 464 rbdc = rbd_client_create(ceph_opts);
430 return rbdc; 465 if (IS_ERR(rbdc))
466 return PTR_ERR(rbdc);
431 } 467 }
432 spin_unlock(&rbd_client_list_lock); 468 rbd_dev->rbd_client = rbdc;
433
434 rbdc = rbd_client_create(ceph_opts, rbd_opts);
435 469
436 if (IS_ERR(rbdc)) 470 return 0;
437 kfree(rbd_opts);
438
439 return rbdc;
440} 471}
441 472
442/* 473/*
@@ -454,7 +485,6 @@ static void rbd_client_release(struct kref *kref)
454 spin_unlock(&rbd_client_list_lock); 485 spin_unlock(&rbd_client_list_lock);
455 486
456 ceph_destroy_client(rbdc->client); 487 ceph_destroy_client(rbdc->client);
457 kfree(rbdc->rbd_opts);
458 kfree(rbdc); 488 kfree(rbdc);
459} 489}
460 490
@@ -480,10 +510,38 @@ static void rbd_coll_release(struct kref *kref)
480 kfree(coll); 510 kfree(coll);
481} 511}
482 512
513static bool rbd_image_format_valid(u32 image_format)
514{
515 return image_format == 1 || image_format == 2;
516}
517
483static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 518static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
484{ 519{
485 return !memcmp(&ondisk->text, 520 size_t size;
486 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); 521 u32 snap_count;
522
523 /* The header has to start with the magic rbd header text */
524 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
525 return false;
526
527 /*
528 * The size of a snapshot header has to fit in a size_t, and
529 * that limits the number of snapshots.
530 */
531 snap_count = le32_to_cpu(ondisk->snap_count);
532 size = SIZE_MAX - sizeof (struct ceph_snap_context);
533 if (snap_count > size / sizeof (__le64))
534 return false;
535
536 /*
537 * Not only that, but the size of the entire snapshot
538 * header must also be representable in a size_t.
539 */
540 size -= snap_count * sizeof (__le64);
541 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
542 return false;
543
544 return true;
487} 545}
488 546
489/* 547/*
@@ -491,179 +549,203 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
491 * header. 549 * header.
492 */ 550 */
493static int rbd_header_from_disk(struct rbd_image_header *header, 551static int rbd_header_from_disk(struct rbd_image_header *header,
494 struct rbd_image_header_ondisk *ondisk, 552 struct rbd_image_header_ondisk *ondisk)
495 u32 allocated_snaps)
496{ 553{
497 u32 snap_count; 554 u32 snap_count;
555 size_t len;
556 size_t size;
557 u32 i;
498 558
499 if (!rbd_dev_ondisk_valid(ondisk)) 559 memset(header, 0, sizeof (*header));
500 return -ENXIO;
501 560
502 snap_count = le32_to_cpu(ondisk->snap_count); 561 snap_count = le32_to_cpu(ondisk->snap_count);
503 if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) 562
504 / sizeof (u64)) 563 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
505 return -EINVAL; 564 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
506 header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 565 if (!header->object_prefix)
507 snap_count * sizeof(u64),
508 GFP_KERNEL);
509 if (!header->snapc)
510 return -ENOMEM; 566 return -ENOMEM;
567 memcpy(header->object_prefix, ondisk->object_prefix, len);
568 header->object_prefix[len] = '\0';
511 569
512 if (snap_count) { 570 if (snap_count) {
513 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 571 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
514 header->snap_names = kmalloc(header->snap_names_len, 572
515 GFP_KERNEL); 573 /* Save a copy of the snapshot names */
574
575 if (snap_names_len > (u64) SIZE_MAX)
576 return -EIO;
577 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
516 if (!header->snap_names) 578 if (!header->snap_names)
517 goto err_snapc; 579 goto out_err;
518 header->snap_sizes = kmalloc(snap_count * sizeof(u64), 580 /*
519 GFP_KERNEL); 581 * Note that rbd_dev_v1_header_read() guarantees
582 * the ondisk buffer we're working with has
583 * snap_names_len bytes beyond the end of the
584 * snapshot id array, so this memcpy() is safe.
585 */
586 memcpy(header->snap_names, &ondisk->snaps[snap_count],
587 snap_names_len);
588
589 /* Record each snapshot's size */
590
591 size = snap_count * sizeof (*header->snap_sizes);
592 header->snap_sizes = kmalloc(size, GFP_KERNEL);
520 if (!header->snap_sizes) 593 if (!header->snap_sizes)
521 goto err_names; 594 goto out_err;
595 for (i = 0; i < snap_count; i++)
596 header->snap_sizes[i] =
597 le64_to_cpu(ondisk->snaps[i].image_size);
522 } else { 598 } else {
523 WARN_ON(ondisk->snap_names_len); 599 WARN_ON(ondisk->snap_names_len);
524 header->snap_names_len = 0;
525 header->snap_names = NULL; 600 header->snap_names = NULL;
526 header->snap_sizes = NULL; 601 header->snap_sizes = NULL;
527 } 602 }
528 603
529 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, 604 header->features = 0; /* No features support in v1 images */
530 GFP_KERNEL);
531 if (!header->object_prefix)
532 goto err_sizes;
533
534 memcpy(header->object_prefix, ondisk->block_name,
535 sizeof(ondisk->block_name));
536 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
537
538 header->image_size = le64_to_cpu(ondisk->image_size);
539 header->obj_order = ondisk->options.order; 605 header->obj_order = ondisk->options.order;
540 header->crypt_type = ondisk->options.crypt_type; 606 header->crypt_type = ondisk->options.crypt_type;
541 header->comp_type = ondisk->options.comp_type; 607 header->comp_type = ondisk->options.comp_type;
542 608
609 /* Allocate and fill in the snapshot context */
610
611 header->image_size = le64_to_cpu(ondisk->image_size);
612 size = sizeof (struct ceph_snap_context);
613 size += snap_count * sizeof (header->snapc->snaps[0]);
614 header->snapc = kzalloc(size, GFP_KERNEL);
615 if (!header->snapc)
616 goto out_err;
617
543 atomic_set(&header->snapc->nref, 1); 618 atomic_set(&header->snapc->nref, 1);
544 header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 619 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
545 header->snapc->num_snaps = snap_count; 620 header->snapc->num_snaps = snap_count;
546 header->total_snaps = snap_count; 621 for (i = 0; i < snap_count; i++)
547 622 header->snapc->snaps[i] =
548 if (snap_count && allocated_snaps == snap_count) { 623 le64_to_cpu(ondisk->snaps[i].id);
549 int i;
550
551 for (i = 0; i < snap_count; i++) {
552 header->snapc->snaps[i] =
553 le64_to_cpu(ondisk->snaps[i].id);
554 header->snap_sizes[i] =
555 le64_to_cpu(ondisk->snaps[i].image_size);
556 }
557
558 /* copy snapshot names */
559 memcpy(header->snap_names, &ondisk->snaps[snap_count],
560 header->snap_names_len);
561 }
562 624
563 return 0; 625 return 0;
564 626
565err_sizes: 627out_err:
566 kfree(header->snap_sizes); 628 kfree(header->snap_sizes);
567 header->snap_sizes = NULL; 629 header->snap_sizes = NULL;
568err_names:
569 kfree(header->snap_names); 630 kfree(header->snap_names);
570 header->snap_names = NULL; 631 header->snap_names = NULL;
571err_snapc: 632 kfree(header->object_prefix);
572 kfree(header->snapc); 633 header->object_prefix = NULL;
573 header->snapc = NULL;
574 634
575 return -ENOMEM; 635 return -ENOMEM;
576} 636}
577 637
578static int snap_by_name(struct rbd_image_header *header, const char *snap_name, 638static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
579 u64 *seq, u64 *size)
580{ 639{
581 int i;
582 char *p = header->snap_names;
583 640
584 for (i = 0; i < header->total_snaps; i++) { 641 struct rbd_snap *snap;
585 if (!strcmp(snap_name, p)) {
586 642
587 /* Found it. Pass back its id and/or size */ 643 list_for_each_entry(snap, &rbd_dev->snaps, node) {
644 if (!strcmp(snap_name, snap->name)) {
645 rbd_dev->mapping.snap_id = snap->id;
646 rbd_dev->mapping.size = snap->size;
647 rbd_dev->mapping.features = snap->features;
588 648
589 if (seq) 649 return 0;
590 *seq = header->snapc->snaps[i];
591 if (size)
592 *size = header->snap_sizes[i];
593 return i;
594 } 650 }
595 p += strlen(p) + 1; /* Skip ahead to the next name */
596 } 651 }
652
597 return -ENOENT; 653 return -ENOENT;
598} 654}
599 655
600static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) 656static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
601{ 657{
602 int ret; 658 int ret;
603 659
604 down_write(&rbd_dev->header_rwsem); 660 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
605
606 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
607 sizeof (RBD_SNAP_HEAD_NAME))) { 661 sizeof (RBD_SNAP_HEAD_NAME))) {
608 rbd_dev->snap_id = CEPH_NOSNAP; 662 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
609 rbd_dev->snap_exists = false; 663 rbd_dev->mapping.size = rbd_dev->header.image_size;
610 rbd_dev->read_only = 0; 664 rbd_dev->mapping.features = rbd_dev->header.features;
611 if (size) 665 rbd_dev->mapping.snap_exists = false;
612 *size = rbd_dev->header.image_size; 666 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
667 ret = 0;
613 } else { 668 } else {
614 u64 snap_id = 0; 669 ret = snap_by_name(rbd_dev, snap_name);
615
616 ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
617 &snap_id, size);
618 if (ret < 0) 670 if (ret < 0)
619 goto done; 671 goto done;
620 rbd_dev->snap_id = snap_id; 672 rbd_dev->mapping.snap_exists = true;
621 rbd_dev->snap_exists = true; 673 rbd_dev->mapping.read_only = true;
622 rbd_dev->read_only = 1;
623 } 674 }
624 675 rbd_dev->mapping.snap_name = snap_name;
625 ret = 0;
626done: 676done:
627 up_write(&rbd_dev->header_rwsem);
628 return ret; 677 return ret;
629} 678}
630 679
631static void rbd_header_free(struct rbd_image_header *header) 680static void rbd_header_free(struct rbd_image_header *header)
632{ 681{
633 kfree(header->object_prefix); 682 kfree(header->object_prefix);
683 header->object_prefix = NULL;
634 kfree(header->snap_sizes); 684 kfree(header->snap_sizes);
685 header->snap_sizes = NULL;
635 kfree(header->snap_names); 686 kfree(header->snap_names);
687 header->snap_names = NULL;
636 ceph_put_snap_context(header->snapc); 688 ceph_put_snap_context(header->snapc);
689 header->snapc = NULL;
637} 690}
638 691
639/* 692static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
640 * get the actual striped segment name, offset and length
641 */
642static u64 rbd_get_segment(struct rbd_image_header *header,
643 const char *object_prefix,
644 u64 ofs, u64 len,
645 char *seg_name, u64 *segofs)
646{ 693{
647 u64 seg = ofs >> header->obj_order; 694 char *name;
695 u64 segment;
696 int ret;
648 697
649 if (seg_name) 698 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
650 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, 699 if (!name)
651 "%s.%012llx", object_prefix, seg); 700 return NULL;
701 segment = offset >> rbd_dev->header.obj_order;
702 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
703 rbd_dev->header.object_prefix, segment);
704 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
705 pr_err("error formatting segment name for #%llu (%d)\n",
706 segment, ret);
707 kfree(name);
708 name = NULL;
709 }
652 710
653 ofs = ofs & ((1 << header->obj_order) - 1); 711 return name;
654 len = min_t(u64, len, (1 << header->obj_order) - ofs); 712}
655 713
656 if (segofs) 714static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
657 *segofs = ofs; 715{
716 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
717
718 return offset & (segment_size - 1);
719}
720
721static u64 rbd_segment_length(struct rbd_device *rbd_dev,
722 u64 offset, u64 length)
723{
724 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
658 725
659 return len; 726 offset &= segment_size - 1;
727
728 rbd_assert(length <= U64_MAX - offset);
729 if (offset + length > segment_size)
730 length = segment_size - offset;
731
732 return length;
660} 733}
661 734
662static int rbd_get_num_segments(struct rbd_image_header *header, 735static int rbd_get_num_segments(struct rbd_image_header *header,
663 u64 ofs, u64 len) 736 u64 ofs, u64 len)
664{ 737{
665 u64 start_seg = ofs >> header->obj_order; 738 u64 start_seg;
666 u64 end_seg = (ofs + len - 1) >> header->obj_order; 739 u64 end_seg;
740
741 if (!len)
742 return 0;
743 if (len - 1 > U64_MAX - ofs)
744 return -ERANGE;
745
746 start_seg = ofs >> header->obj_order;
747 end_seg = (ofs + len - 1) >> header->obj_order;
748
667 return end_seg - start_seg + 1; 749 return end_seg - start_seg + 1;
668} 750}
669 751
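The three new helpers split one image offset/length pair into per-object terms: the object index is the offset shifted down by obj_order, the offset within that object is the low obj_order bits, and the length is clamped so a request never crosses an object boundary. A worked userspace example with the usual rbd default of obj_order = 22 (4 MiB objects):

	#include <stdio.h>
	#include <stdint.h>

	/* Same arithmetic as rbd_segment_{name,offset,length}(), in userspace. */
	int main(void)
	{
		unsigned obj_order = 22;                        /* 4 MiB objects */
		uint64_t segment_size = (uint64_t) 1 << obj_order;

		uint64_t offset = 6 * 1024 * 1024;              /* 6 MiB into the image */
		uint64_t length = 4 * 1024 * 1024;              /* 4 MiB request */

		uint64_t segment = offset >> obj_order;         /* object index: 1 */
		uint64_t seg_ofs = offset & (segment_size - 1); /* 2 MiB into that object */
		uint64_t seg_len = length;

		if (seg_ofs + seg_len > segment_size)
			seg_len = segment_size - seg_ofs;       /* clamp to the object end */

		printf("object suffix %012llx, offset %llu, length %llu\n",
		       (unsigned long long) segment,
		       (unsigned long long) seg_ofs,
		       (unsigned long long) seg_len);
		return 0;
	}

For this 6 MiB offset the request lands 2 MiB into object 1, and the 4 MiB request is clamped to the 2 MiB remaining in that object; rbd_do_op() then issues one OSD op per such piece.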
@@ -725,7 +807,9 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
725 struct bio_pair **bp, 807 struct bio_pair **bp,
726 int len, gfp_t gfpmask) 808 int len, gfp_t gfpmask)
727{ 809{
728 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; 810 struct bio *old_chain = *old;
811 struct bio *new_chain = NULL;
812 struct bio *tail;
729 int total = 0; 813 int total = 0;
730 814
731 if (*bp) { 815 if (*bp) {
@@ -734,9 +818,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
734 } 818 }
735 819
736 while (old_chain && (total < len)) { 820 while (old_chain && (total < len)) {
821 struct bio *tmp;
822
737 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 823 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
738 if (!tmp) 824 if (!tmp)
739 goto err_out; 825 goto err_out;
826 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
740 827
741 if (total + old_chain->bi_size > len) { 828 if (total + old_chain->bi_size > len) {
742 struct bio_pair *bp; 829 struct bio_pair *bp;
@@ -764,24 +851,18 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
764 } 851 }
765 852
766 tmp->bi_bdev = NULL; 853 tmp->bi_bdev = NULL;
767 gfpmask &= ~__GFP_WAIT;
768 tmp->bi_next = NULL; 854 tmp->bi_next = NULL;
769 855 if (new_chain)
770 if (!new_chain) {
771 new_chain = tail = tmp;
772 } else {
773 tail->bi_next = tmp; 856 tail->bi_next = tmp;
774 tail = tmp; 857 else
775 } 858 new_chain = tmp;
859 tail = tmp;
776 old_chain = old_chain->bi_next; 860 old_chain = old_chain->bi_next;
777 861
778 total += tmp->bi_size; 862 total += tmp->bi_size;
779 } 863 }
780 864
781 BUG_ON(total < len); 865 rbd_assert(total == len);
782
783 if (tail)
784 tail->bi_next = NULL;
785 866
786 *old = old_chain; 867 *old = old_chain;
787 868
@@ -939,8 +1020,9 @@ static int rbd_do_request(struct request *rq,
939 layout->fl_stripe_count = cpu_to_le32(1); 1020 layout->fl_stripe_count = cpu_to_le32(1);
940 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1021 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
941 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); 1022 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
942 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 1023 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
943 req, ops); 1024 req, ops);
1025 rbd_assert(ret == 0);
944 1026
945 ceph_osdc_build_request(req, ofs, &len, 1027 ceph_osdc_build_request(req, ofs, &len,
946 ops, 1028 ops,
@@ -1031,8 +1113,8 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1031 int flags, 1113 int flags,
1032 struct ceph_osd_req_op *ops, 1114 struct ceph_osd_req_op *ops,
1033 const char *object_name, 1115 const char *object_name,
1034 u64 ofs, u64 len, 1116 u64 ofs, u64 inbound_size,
1035 char *buf, 1117 char *inbound,
1036 struct ceph_osd_request **linger_req, 1118 struct ceph_osd_request **linger_req,
1037 u64 *ver) 1119 u64 *ver)
1038{ 1120{
@@ -1040,15 +1122,15 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1040 struct page **pages; 1122 struct page **pages;
1041 int num_pages; 1123 int num_pages;
1042 1124
1043 BUG_ON(ops == NULL); 1125 rbd_assert(ops != NULL);
1044 1126
1045 num_pages = calc_pages_for(ofs , len); 1127 num_pages = calc_pages_for(ofs, inbound_size);
1046 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1128 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1047 if (IS_ERR(pages)) 1129 if (IS_ERR(pages))
1048 return PTR_ERR(pages); 1130 return PTR_ERR(pages);
1049 1131
1050 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1132 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1051 object_name, ofs, len, NULL, 1133 object_name, ofs, inbound_size, NULL,
1052 pages, num_pages, 1134 pages, num_pages,
1053 flags, 1135 flags,
1054 ops, 1136 ops,
@@ -1058,8 +1140,8 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1058 if (ret < 0) 1140 if (ret < 0)
1059 goto done; 1141 goto done;
1060 1142
1061 if ((flags & CEPH_OSD_FLAG_READ) && buf) 1143 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1062 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); 1144 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1063 1145
1064done: 1146done:
1065 ceph_release_page_vector(pages, num_pages); 1147 ceph_release_page_vector(pages, num_pages);
@@ -1086,14 +1168,11 @@ static int rbd_do_op(struct request *rq,
1086 struct ceph_osd_req_op *ops; 1168 struct ceph_osd_req_op *ops;
1087 u32 payload_len; 1169 u32 payload_len;
1088 1170
1089 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1171 seg_name = rbd_segment_name(rbd_dev, ofs);
1090 if (!seg_name) 1172 if (!seg_name)
1091 return -ENOMEM; 1173 return -ENOMEM;
1092 1174 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1093 seg_len = rbd_get_segment(&rbd_dev->header, 1175 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1094 rbd_dev->header.object_prefix,
1095 ofs, len,
1096 seg_name, &seg_ofs);
1097 1176
1098 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1177 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1099 1178
@@ -1105,7 +1184,7 @@ static int rbd_do_op(struct request *rq,
1105 /* we've taken care of segment sizes earlier when we 1184 /* we've taken care of segment sizes earlier when we
1106 cloned the bios. We should never have a segment 1185 cloned the bios. We should never have a segment
1107 truncated at this point */ 1186 truncated at this point */
1108 BUG_ON(seg_len < len); 1187 rbd_assert(seg_len == len);
1109 1188
1110 ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1189 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1111 seg_name, seg_ofs, seg_len, 1190 seg_name, seg_ofs, seg_len,
@@ -1307,89 +1386,36 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
1307 return ret; 1386 return ret;
1308} 1387}
1309 1388
1310struct rbd_notify_info {
1311 struct rbd_device *rbd_dev;
1312};
1313
1314static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1315{
1316 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1317 if (!rbd_dev)
1318 return;
1319
1320 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1321 rbd_dev->header_name, (unsigned long long) notify_id,
1322 (unsigned int) opcode);
1323}
1324
1325/*
1326 * Request sync osd notify
1327 */
1328static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
1329{
1330 struct ceph_osd_req_op *ops;
1331 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1332 struct ceph_osd_event *event;
1333 struct rbd_notify_info info;
1334 int payload_len = sizeof(u32) + sizeof(u32);
1335 int ret;
1336
1337 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1338 if (!ops)
1339 return -ENOMEM;
1340
1341 info.rbd_dev = rbd_dev;
1342
1343 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1344 (void *)&info, &event);
1345 if (ret < 0)
1346 goto fail;
1347
1348 ops[0].watch.ver = 1;
1349 ops[0].watch.flag = 1;
1350 ops[0].watch.cookie = event->cookie;
1351 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1352 ops[0].watch.timeout = 12;
1353
1354 ret = rbd_req_sync_op(rbd_dev, NULL,
1355 CEPH_NOSNAP,
1356 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1357 ops,
1358 rbd_dev->header_name,
1359 0, 0, NULL, NULL, NULL);
1360 if (ret < 0)
1361 goto fail_event;
1362
1363 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1364 dout("ceph_osdc_wait_event returned %d\n", ret);
1365 rbd_destroy_ops(ops);
1366 return 0;
1367
1368fail_event:
1369 ceph_osdc_cancel_event(event);
1370fail:
1371 rbd_destroy_ops(ops);
1372 return ret;
1373}
1374
1375/* 1389/*
1376 * Request sync osd read 1390 * Synchronous osd object method call
1377 */ 1391 */
1378static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1392static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1379 const char *object_name, 1393 const char *object_name,
1380 const char *class_name, 1394 const char *class_name,
1381 const char *method_name, 1395 const char *method_name,
1382 const char *data, 1396 const char *outbound,
1383 int len, 1397 size_t outbound_size,
1398 char *inbound,
1399 size_t inbound_size,
1400 int flags,
1384 u64 *ver) 1401 u64 *ver)
1385{ 1402{
1386 struct ceph_osd_req_op *ops; 1403 struct ceph_osd_req_op *ops;
1387 int class_name_len = strlen(class_name); 1404 int class_name_len = strlen(class_name);
1388 int method_name_len = strlen(method_name); 1405 int method_name_len = strlen(method_name);
1406 int payload_size;
1389 int ret; 1407 int ret;
1390 1408
1391 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, 1409 /*
1392 class_name_len + method_name_len + len); 1410 * Any input parameters required by the method we're calling
1411 * will be sent along with the class and method names as
1412 * part of the message payload. That data and its size are
1413 * supplied via the indata and indata_len fields (named from
1414 * the perspective of the server side) in the OSD request
1415 * operation.
1416 */
1417 payload_size = class_name_len + method_name_len + outbound_size;
1418 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
1393 if (!ops) 1419 if (!ops)
1394 return -ENOMEM; 1420 return -ENOMEM;
1395 1421
@@ -1398,14 +1424,14 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1398 ops[0].cls.method_name = method_name; 1424 ops[0].cls.method_name = method_name;
1399 ops[0].cls.method_len = (__u8) method_name_len; 1425 ops[0].cls.method_len = (__u8) method_name_len;
1400 ops[0].cls.argc = 0; 1426 ops[0].cls.argc = 0;
1401 ops[0].cls.indata = data; 1427 ops[0].cls.indata = outbound;
1402 ops[0].cls.indata_len = len; 1428 ops[0].cls.indata_len = outbound_size;
1403 1429
1404 ret = rbd_req_sync_op(rbd_dev, NULL, 1430 ret = rbd_req_sync_op(rbd_dev, NULL,
1405 CEPH_NOSNAP, 1431 CEPH_NOSNAP,
1406 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1432 flags, ops,
1407 ops, 1433 object_name, 0, inbound_size, inbound,
1408 object_name, 0, 0, NULL, NULL, ver); 1434 NULL, ver);
1409 1435
1410 rbd_destroy_ops(ops); 1436 rbd_destroy_ops(ops);
1411 1437
@@ -1447,10 +1473,6 @@ static void rbd_rq_fn(struct request_queue *q)
1447 struct rbd_req_coll *coll; 1473 struct rbd_req_coll *coll;
1448 struct ceph_snap_context *snapc; 1474 struct ceph_snap_context *snapc;
1449 1475
1450 /* peek at request from block layer */
1451 if (!rq)
1452 break;
1453
1454 dout("fetched request\n"); 1476 dout("fetched request\n");
1455 1477
1456 /* filter out block requests we don't understand */ 1478 /* filter out block requests we don't understand */
@@ -1465,7 +1487,7 @@ static void rbd_rq_fn(struct request_queue *q)
1465 size = blk_rq_bytes(rq); 1487 size = blk_rq_bytes(rq);
1466 ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1488 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1467 rq_bio = rq->bio; 1489 rq_bio = rq->bio;
1468 if (do_write && rbd_dev->read_only) { 1490 if (do_write && rbd_dev->mapping.read_only) {
1469 __blk_end_request_all(rq, -EROFS); 1491 __blk_end_request_all(rq, -EROFS);
1470 continue; 1492 continue;
1471 } 1493 }
@@ -1474,7 +1496,8 @@ static void rbd_rq_fn(struct request_queue *q)
1474 1496
1475 down_read(&rbd_dev->header_rwsem); 1497 down_read(&rbd_dev->header_rwsem);
1476 1498
1477 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { 1499 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1500 !rbd_dev->mapping.snap_exists) {
1478 up_read(&rbd_dev->header_rwsem); 1501 up_read(&rbd_dev->header_rwsem);
1479 dout("request for non-existent snapshot"); 1502 dout("request for non-existent snapshot");
1480 spin_lock_irq(q->queue_lock); 1503 spin_lock_irq(q->queue_lock);
@@ -1491,6 +1514,12 @@ static void rbd_rq_fn(struct request_queue *q)
1491 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1514 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1492 1515
1493 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1516 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1517 if (num_segs <= 0) {
1518 spin_lock_irq(q->queue_lock);
1519 __blk_end_request_all(rq, num_segs);
1520 ceph_put_snap_context(snapc);
1521 continue;
1522 }
1494 coll = rbd_alloc_coll(num_segs); 1523 coll = rbd_alloc_coll(num_segs);
1495 if (!coll) { 1524 if (!coll) {
1496 spin_lock_irq(q->queue_lock); 1525 spin_lock_irq(q->queue_lock);
@@ -1502,10 +1531,7 @@ static void rbd_rq_fn(struct request_queue *q)
1502 do { 1531 do {
1503 /* a bio clone to be passed down to OSD req */ 1532 /* a bio clone to be passed down to OSD req */
1504 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 1533 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1505 op_size = rbd_get_segment(&rbd_dev->header, 1534 op_size = rbd_segment_length(rbd_dev, ofs, size);
1506 rbd_dev->header.object_prefix,
1507 ofs, size,
1508 NULL, NULL);
1509 kref_get(&coll->kref); 1535 kref_get(&coll->kref);
1510 bio = bio_chain_clone(&rq_bio, &next_bio, &bp, 1536 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1511 op_size, GFP_ATOMIC); 1537 op_size, GFP_ATOMIC);
@@ -1525,7 +1551,7 @@ static void rbd_rq_fn(struct request_queue *q)
1525 coll, cur_seg); 1551 coll, cur_seg);
1526 else 1552 else
1527 rbd_req_read(rq, rbd_dev, 1553 rbd_req_read(rq, rbd_dev,
1528 rbd_dev->snap_id, 1554 rbd_dev->mapping.snap_id,
1529 ofs, 1555 ofs,
1530 op_size, bio, 1556 op_size, bio,
1531 coll, cur_seg); 1557 coll, cur_seg);
@@ -1581,8 +1607,6 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
1581 if (!disk) 1607 if (!disk)
1582 return; 1608 return;
1583 1609
1584 rbd_header_free(&rbd_dev->header);
1585
1586 if (disk->flags & GENHD_FL_UP) 1610 if (disk->flags & GENHD_FL_UP)
1587 del_gendisk(disk); 1611 del_gendisk(disk);
1588 if (disk->queue) 1612 if (disk->queue)
@@ -1591,105 +1615,96 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
1591} 1615}
1592 1616
1593/* 1617/*
1594 * reload the ondisk the header 1618 * Read the complete header for the given rbd device.
1619 *
1620 * Returns a pointer to a dynamically-allocated buffer containing
1621 * the complete and validated header. Caller can pass the address
1622 * of a variable that will be filled in with the version of the
1623 * header object at the time it was read.
1624 *
1625 * Returns a pointer-coded errno if a failure occurs.
1595 */ 1626 */
1596static int rbd_read_header(struct rbd_device *rbd_dev, 1627static struct rbd_image_header_ondisk *
1597 struct rbd_image_header *header) 1628rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1598{ 1629{
1599 ssize_t rc; 1630 struct rbd_image_header_ondisk *ondisk = NULL;
1600 struct rbd_image_header_ondisk *dh;
1601 u32 snap_count = 0; 1631 u32 snap_count = 0;
1602 u64 ver; 1632 u64 names_size = 0;
1603 size_t len; 1633 u32 want_count;
1634 int ret;
1604 1635
1605 /* 1636 /*
1606 * First reads the fixed-size header to determine the number 1637 * The complete header will include an array of its 64-bit
1607 * of snapshots, then re-reads it, along with all snapshot 1638 * snapshot ids, followed by the names of those snapshots as
1608 * records as well as their stored names. 1639 * a contiguous block of NUL-terminated strings. Note that
1640 * the number of snapshots could change by the time we read
1641 * it in, in which case we re-read it.
1609 */ 1642 */
1610 len = sizeof (*dh); 1643 do {
1611 while (1) { 1644 size_t size;
1612 dh = kmalloc(len, GFP_KERNEL); 1645
1613 if (!dh) 1646 kfree(ondisk);
1614 return -ENOMEM; 1647
1615 1648 size = sizeof (*ondisk);
1616 rc = rbd_req_sync_read(rbd_dev, 1649 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1617 CEPH_NOSNAP, 1650 size += names_size;
1651 ondisk = kmalloc(size, GFP_KERNEL);
1652 if (!ondisk)
1653 return ERR_PTR(-ENOMEM);
1654
1655 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1618 rbd_dev->header_name, 1656 rbd_dev->header_name,
1619 0, len, 1657 0, size,
1620 (char *)dh, &ver); 1658 (char *) ondisk, version);
1621 if (rc < 0) 1659
1622 goto out_dh; 1660 if (ret < 0)
1623 1661 goto out_err;
1624 rc = rbd_header_from_disk(header, dh, snap_count); 1662 if (WARN_ON((size_t) ret < size)) {
1625 if (rc < 0) { 1663 ret = -ENXIO;
1626 if (rc == -ENXIO) 1664 pr_warning("short header read for image %s"
1627 pr_warning("unrecognized header format" 1665 " (want %zd got %d)\n",
1628 " for image %s\n", 1666 rbd_dev->image_name, size, ret);
1629 rbd_dev->image_name); 1667 goto out_err;
1630 goto out_dh; 1668 }
1669 if (!rbd_dev_ondisk_valid(ondisk)) {
1670 ret = -ENXIO;
1671 pr_warning("invalid header for image %s\n",
1672 rbd_dev->image_name);
1673 goto out_err;
1631 } 1674 }
1632 1675
1633 if (snap_count == header->total_snaps) 1676 names_size = le64_to_cpu(ondisk->snap_names_len);
1634 break; 1677 want_count = snap_count;
1678 snap_count = le32_to_cpu(ondisk->snap_count);
1679 } while (snap_count != want_count);
1635 1680
1636 snap_count = header->total_snaps; 1681 return ondisk;
1637 len = sizeof (*dh) +
1638 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1639 header->snap_names_len;
1640 1682
1641 rbd_header_free(header); 1683out_err:
1642 kfree(dh); 1684 kfree(ondisk);
1643 }
1644 header->obj_version = ver;
1645 1685
1646out_dh: 1686 return ERR_PTR(ret);
1647 kfree(dh);
1648 return rc;
1649} 1687}
1650 1688
1651/* 1689/*
1652 * create a snapshot 1690 * reload the ondisk header
1653 */ 1691 */
1654static int rbd_header_add_snap(struct rbd_device *rbd_dev, 1692static int rbd_read_header(struct rbd_device *rbd_dev,
1655 const char *snap_name, 1693 struct rbd_image_header *header)
1656 gfp_t gfp_flags)
1657{ 1694{
1658 int name_len = strlen(snap_name); 1695 struct rbd_image_header_ondisk *ondisk;
1659 u64 new_snapid; 1696 u64 ver = 0;
1660 int ret; 1697 int ret;
1661 void *data, *p, *e;
1662 struct ceph_mon_client *monc;
1663 1698
1664 /* we should create a snapshot only if we're pointing at the head */ 1699 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1665 if (rbd_dev->snap_id != CEPH_NOSNAP) 1700 if (IS_ERR(ondisk))
1666 return -EINVAL; 1701 return PTR_ERR(ondisk);
1702 ret = rbd_header_from_disk(header, ondisk);
1703 if (ret >= 0)
1704 header->obj_version = ver;
1705 kfree(ondisk);
1667 1706
1668 monc = &rbd_dev->rbd_client->client->monc; 1707 return ret;
1669 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
1670 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
1671 if (ret < 0)
1672 return ret;
1673
1674 data = kmalloc(name_len + 16, gfp_flags);
1675 if (!data)
1676 return -ENOMEM;
1677
1678 p = data;
1679 e = data + name_len + 16;
1680
1681 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1682 ceph_encode_64_safe(&p, e, new_snapid, bad);
1683
1684 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
1685 "rbd", "snap_add",
1686 data, p - data, NULL);
1687
1688 kfree(data);
1689
1690 return ret < 0 ? ret : 0;
1691bad:
1692 return -ERANGE;
1693} 1708}
1694 1709
1695static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1710static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
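rbd_dev_v1_header_read() cannot know how big the header object is until it has read it, because the snapshot id array and the packed name strings follow the fixed part; it therefore sizes a buffer from the last snapshot count it saw, reads, and repeats whenever the count that comes back differs from the count the buffer was sized for. A stripped-down sketch of that read-until-consistent loop (read_object() is a stand-in for the synchronous OSD read, not a real function):

	#include <stdlib.h>

	struct ondisk_header { unsigned int snap_count; /* ids and names follow */ };

	/* Stand-in for the synchronous OSD read used by the driver. */
	extern int read_object(void *buf, size_t size);

	static struct ondisk_header *read_full_header(void)
	{
		struct ondisk_header *ondisk = NULL;
		unsigned int snap_count = 0;             /* best guess so far */
		unsigned int want_count;

		do {
			size_t size = sizeof (*ondisk) + snap_count * 8;  /* 8 bytes per snap id */

			free(ondisk);
			ondisk = malloc(size);
			if (!ondisk || read_object(ondisk, size) < 0) {
				free(ondisk);
				return NULL;
			}
			want_count = snap_count;          /* what the buffer was sized for */
			snap_count = ondisk->snap_count;  /* what the object now contains */
		} while (snap_count != want_count);       /* re-read if it changed under us */

		return ondisk;
	}

The driver's version also folds the previously seen snap_names_len into the buffer size and validates the result with rbd_dev_ondisk_valid() before trusting it.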
@@ -1716,11 +1731,15 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1716 down_write(&rbd_dev->header_rwsem); 1731 down_write(&rbd_dev->header_rwsem);
1717 1732
1718 /* resized? */ 1733 /* resized? */
1719 if (rbd_dev->snap_id == CEPH_NOSNAP) { 1734 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
1720 sector_t size = (sector_t) h.image_size / SECTOR_SIZE; 1735 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1721 1736
1722 dout("setting size to %llu sectors", (unsigned long long) size); 1737 if (size != (sector_t) rbd_dev->mapping.size) {
1723 set_capacity(rbd_dev->disk, size); 1738 dout("setting size to %llu sectors",
1739 (unsigned long long) size);
1740 rbd_dev->mapping.size = (u64) size;
1741 set_capacity(rbd_dev->disk, size);
1742 }
1724 } 1743 }
1725 1744
1726 /* rbd_dev->header.object_prefix shouldn't change */ 1745 /* rbd_dev->header.object_prefix shouldn't change */
@@ -1733,16 +1752,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1733 *hver = h.obj_version; 1752 *hver = h.obj_version;
1734 rbd_dev->header.obj_version = h.obj_version; 1753 rbd_dev->header.obj_version = h.obj_version;
1735 rbd_dev->header.image_size = h.image_size; 1754 rbd_dev->header.image_size = h.image_size;
1736 rbd_dev->header.total_snaps = h.total_snaps;
1737 rbd_dev->header.snapc = h.snapc; 1755 rbd_dev->header.snapc = h.snapc;
1738 rbd_dev->header.snap_names = h.snap_names; 1756 rbd_dev->header.snap_names = h.snap_names;
1739 rbd_dev->header.snap_names_len = h.snap_names_len;
1740 rbd_dev->header.snap_sizes = h.snap_sizes; 1757 rbd_dev->header.snap_sizes = h.snap_sizes;
1741 /* Free the extra copy of the object prefix */ 1758 /* Free the extra copy of the object prefix */
1742 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1759 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1743 kfree(h.object_prefix); 1760 kfree(h.object_prefix);
1744 1761
1745 ret = __rbd_init_snaps_header(rbd_dev); 1762 ret = rbd_dev_snaps_update(rbd_dev);
1763 if (!ret)
1764 ret = rbd_dev_snaps_register(rbd_dev);
1746 1765
1747 up_write(&rbd_dev->header_rwsem); 1766 up_write(&rbd_dev->header_rwsem);
1748 1767
@@ -1764,29 +1783,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1764{ 1783{
1765 struct gendisk *disk; 1784 struct gendisk *disk;
1766 struct request_queue *q; 1785 struct request_queue *q;
1767 int rc;
1768 u64 segment_size; 1786 u64 segment_size;
1769 u64 total_size = 0;
1770
1771 /* contact OSD, request size info about the object being mapped */
1772 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1773 if (rc)
1774 return rc;
1775
1776 /* no need to lock here, as rbd_dev is not registered yet */
1777 rc = __rbd_init_snaps_header(rbd_dev);
1778 if (rc)
1779 return rc;
1780
1781 rc = rbd_header_set_snap(rbd_dev, &total_size);
1782 if (rc)
1783 return rc;
1784 1787
1785 /* create gendisk info */ 1788 /* create gendisk info */
1786 rc = -ENOMEM;
1787 disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1789 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1788 if (!disk) 1790 if (!disk)
1789 goto out; 1791 return -ENOMEM;
1790 1792
1791 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1793 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1792 rbd_dev->dev_id); 1794 rbd_dev->dev_id);
@@ -1796,7 +1798,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1796 disk->private_data = rbd_dev; 1798 disk->private_data = rbd_dev;
1797 1799
1798 /* init rq */ 1800 /* init rq */
1799 rc = -ENOMEM;
1800 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1801 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1801 if (!q) 1802 if (!q)
1802 goto out_disk; 1803 goto out_disk;
@@ -1817,20 +1818,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1817 q->queuedata = rbd_dev; 1818 q->queuedata = rbd_dev;
1818 1819
1819 rbd_dev->disk = disk; 1820 rbd_dev->disk = disk;
1820 rbd_dev->q = q;
1821 1821
1822 /* finally, announce the disk to the world */ 1822 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1823 set_capacity(disk, total_size / SECTOR_SIZE);
1824 add_disk(disk);
1825 1823
1826 pr_info("%s: added with size 0x%llx\n",
1827 disk->disk_name, (unsigned long long)total_size);
1828 return 0; 1824 return 0;
1829
1830out_disk: 1825out_disk:
1831 put_disk(disk); 1826 put_disk(disk);
1832out: 1827
1833 return rc; 1828 return -ENOMEM;
1834} 1829}
1835 1830
1836/* 1831/*
@@ -1855,6 +1850,19 @@ static ssize_t rbd_size_show(struct device *dev,
1855 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1850 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1856} 1851}
1857 1852
1853/*
1854 * Note this shows the features for whatever's mapped, which is not
1855 * necessarily the base image.
1856 */
1857static ssize_t rbd_features_show(struct device *dev,
1858 struct device_attribute *attr, char *buf)
1859{
1860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1861
1862 return sprintf(buf, "0x%016llx\n",
1863 (unsigned long long) rbd_dev->mapping.features);
1864}
1865
1858static ssize_t rbd_major_show(struct device *dev, 1866static ssize_t rbd_major_show(struct device *dev,
1859 struct device_attribute *attr, char *buf) 1867 struct device_attribute *attr, char *buf)
1860{ 1868{
@@ -1896,13 +1904,25 @@ static ssize_t rbd_name_show(struct device *dev,
1896 return sprintf(buf, "%s\n", rbd_dev->image_name); 1904 return sprintf(buf, "%s\n", rbd_dev->image_name);
1897} 1905}
1898 1906
1907static ssize_t rbd_image_id_show(struct device *dev,
1908 struct device_attribute *attr, char *buf)
1909{
1910 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1911
1912 return sprintf(buf, "%s\n", rbd_dev->image_id);
1913}
1914
1915/*
1916 * Shows the name of the currently-mapped snapshot (or
1917 * RBD_SNAP_HEAD_NAME for the base image).
1918 */
1899static ssize_t rbd_snap_show(struct device *dev, 1919static ssize_t rbd_snap_show(struct device *dev,
1900 struct device_attribute *attr, 1920 struct device_attribute *attr,
1901 char *buf) 1921 char *buf)
1902{ 1922{
1903 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1923 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1904 1924
1905 return sprintf(buf, "%s\n", rbd_dev->snap_name); 1925 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
1906} 1926}
1907 1927
1908static ssize_t rbd_image_refresh(struct device *dev, 1928static ssize_t rbd_image_refresh(struct device *dev,
@@ -1919,25 +1939,27 @@ static ssize_t rbd_image_refresh(struct device *dev,
1919} 1939}
1920 1940
1921static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 1941static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1942static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
1922static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 1943static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1923static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 1944static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1924static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 1945static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
1925static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 1946static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
1926static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 1947static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1948static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
1927static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 1949static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1928static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 1950static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1929static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1930 1951
1931static struct attribute *rbd_attrs[] = { 1952static struct attribute *rbd_attrs[] = {
1932 &dev_attr_size.attr, 1953 &dev_attr_size.attr,
1954 &dev_attr_features.attr,
1933 &dev_attr_major.attr, 1955 &dev_attr_major.attr,
1934 &dev_attr_client_id.attr, 1956 &dev_attr_client_id.attr,
1935 &dev_attr_pool.attr, 1957 &dev_attr_pool.attr,
1936 &dev_attr_pool_id.attr, 1958 &dev_attr_pool_id.attr,
1937 &dev_attr_name.attr, 1959 &dev_attr_name.attr,
1960 &dev_attr_image_id.attr,
1938 &dev_attr_current_snap.attr, 1961 &dev_attr_current_snap.attr,
1939 &dev_attr_refresh.attr, 1962 &dev_attr_refresh.attr,
1940 &dev_attr_create_snap.attr,
1941 NULL 1963 NULL
1942}; 1964};
1943 1965
@@ -1983,12 +2005,24 @@ static ssize_t rbd_snap_id_show(struct device *dev,
1983 return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2005 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
1984} 2006}
1985 2007
2008static ssize_t rbd_snap_features_show(struct device *dev,
2009 struct device_attribute *attr,
2010 char *buf)
2011{
2012 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2013
2014 return sprintf(buf, "0x%016llx\n",
2015 (unsigned long long) snap->features);
2016}
2017
1986static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2018static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1987static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 2019static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2020static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
1988 2021
1989static struct attribute *rbd_snap_attrs[] = { 2022static struct attribute *rbd_snap_attrs[] = {
1990 &dev_attr_snap_size.attr, 2023 &dev_attr_snap_size.attr,
1991 &dev_attr_snap_id.attr, 2024 &dev_attr_snap_id.attr,
2025 &dev_attr_snap_features.attr,
1992 NULL, 2026 NULL,
1993}; 2027};
1994 2028
@@ -2013,10 +2047,21 @@ static struct device_type rbd_snap_device_type = {
2013 .release = rbd_snap_dev_release, 2047 .release = rbd_snap_dev_release,
2014}; 2048};
2015 2049
2050static bool rbd_snap_registered(struct rbd_snap *snap)
2051{
2052 bool ret = snap->dev.type == &rbd_snap_device_type;
2053 bool reg = device_is_registered(&snap->dev);
2054
2055 rbd_assert(!ret ^ reg);
2056
2057 return ret;
2058}
2059
2016static void __rbd_remove_snap_dev(struct rbd_snap *snap) 2060static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2017{ 2061{
2018 list_del(&snap->node); 2062 list_del(&snap->node);
2019 device_unregister(&snap->dev); 2063 if (device_is_registered(&snap->dev))
2064 device_unregister(&snap->dev);
2020} 2065}
2021 2066
2022static int rbd_register_snap_dev(struct rbd_snap *snap, 2067static int rbd_register_snap_dev(struct rbd_snap *snap,
@@ -2029,13 +2074,17 @@ static int rbd_register_snap_dev(struct rbd_snap *snap,
2029 dev->parent = parent; 2074 dev->parent = parent;
2030 dev->release = rbd_snap_dev_release; 2075 dev->release = rbd_snap_dev_release;
2031 dev_set_name(dev, "snap_%s", snap->name); 2076 dev_set_name(dev, "snap_%s", snap->name);
2077 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2078
2032 ret = device_register(dev); 2079 ret = device_register(dev);
2033 2080
2034 return ret; 2081 return ret;
2035} 2082}
2036 2083
2037static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2084static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2038 int i, const char *name) 2085 const char *snap_name,
2086 u64 snap_id, u64 snap_size,
2087 u64 snap_features)
2039{ 2088{
2040 struct rbd_snap *snap; 2089 struct rbd_snap *snap;
2041 int ret; 2090 int ret;
@@ -2045,17 +2094,13 @@ static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2045 return ERR_PTR(-ENOMEM); 2094 return ERR_PTR(-ENOMEM);
2046 2095
2047 ret = -ENOMEM; 2096 ret = -ENOMEM;
2048 snap->name = kstrdup(name, GFP_KERNEL); 2097 snap->name = kstrdup(snap_name, GFP_KERNEL);
2049 if (!snap->name) 2098 if (!snap->name)
2050 goto err; 2099 goto err;
2051 2100
2052 snap->size = rbd_dev->header.snap_sizes[i]; 2101 snap->id = snap_id;
2053 snap->id = rbd_dev->header.snapc->snaps[i]; 2102 snap->size = snap_size;
2054 if (device_is_registered(&rbd_dev->dev)) { 2103 snap->features = snap_features;
2055 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2056 if (ret < 0)
2057 goto err;
2058 }
2059 2104
2060 return snap; 2105 return snap;
2061 2106
@@ -2066,128 +2111,439 @@ err:
2066 return ERR_PTR(ret); 2111 return ERR_PTR(ret);
2067} 2112}
2068 2113
2114static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2115 u64 *snap_size, u64 *snap_features)
2116{
2117 char *snap_name;
2118
2119 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2120
2121 *snap_size = rbd_dev->header.snap_sizes[which];
2122 *snap_features = 0; /* No features for v1 */
2123
2124 /* Skip over names until we find the one we are looking for */
2125
2126 snap_name = rbd_dev->header.snap_names;
2127 while (which--)
2128 snap_name += strlen(snap_name) + 1;
2129
2130 return snap_name;
2131}
2132
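(Editorial note: the v1 lookup above indexes into header.snap_names, which packs all snapshot names back to back as NUL-terminated strings. A minimal standalone sketch of that walk, ordinary userspace C with made-up names, not driver code:)

#include <stdio.h>
#include <string.h>

/* Return the which'th name from a buffer of back-to-back C strings. */
static const char *nth_name(const char *names, unsigned int which)
{
	while (which--)
		names += strlen(names) + 1;
	return names;
}

int main(void)
{
	/* Stand-in for rbd_dev->header.snap_names with three snapshots. */
	static const char names[] = "alpha\0beta\0gamma\0";

	printf("%s %s %s\n", nth_name(names, 0),
	       nth_name(names, 1), nth_name(names, 2));
	return 0;
}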
2069/* 2133/*
2070 * search for the previous snap in a null delimited string list 2134 * Get the size and object order for an image snapshot, or if
2135 * snap_id is CEPH_NOSNAP, gets this information for the base
2136 * image.
2071 */ 2137 */
2072const char *rbd_prev_snap_name(const char *name, const char *start) 2138static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2139 u8 *order, u64 *snap_size)
2073{ 2140{
2074 if (name < start + 2) 2141 __le64 snapid = cpu_to_le64(snap_id);
2075 return NULL; 2142 int ret;
2143 struct {
2144 u8 order;
2145 __le64 size;
2146 } __attribute__ ((packed)) size_buf = { 0 };
2147
2148 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2149 "rbd", "get_size",
2150 (char *) &snapid, sizeof (snapid),
2151 (char *) &size_buf, sizeof (size_buf),
2152 CEPH_OSD_FLAG_READ, NULL);
2153 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2154 if (ret < 0)
2155 return ret;
2156
2157 *order = size_buf.order;
2158 *snap_size = le64_to_cpu(size_buf.size);
2076 2159
2077 name -= 2; 2160 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2078 while (*name) { 2161 (unsigned long long) snap_id, (unsigned int) *order,
2079 if (name == start) 2162 (unsigned long long) *snap_size);
2080 return start; 2163
2081 name--; 2164 return 0;
2165}
2166
2167static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2168{
2169 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2170 &rbd_dev->header.obj_order,
2171 &rbd_dev->header.image_size);
2172}
2173
2174static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2175{
2176 void *reply_buf;
2177 int ret;
2178 void *p;
2179
2180 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2181 if (!reply_buf)
2182 return -ENOMEM;
2183
2184 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2185 "rbd", "get_object_prefix",
2186 NULL, 0,
2187 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2188 CEPH_OSD_FLAG_READ, NULL);
2189 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2190 if (ret < 0)
2191 goto out;
2192
2193 p = reply_buf;
2194 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2195 p + RBD_OBJ_PREFIX_LEN_MAX,
2196 NULL, GFP_NOIO);
2197
2198 if (IS_ERR(rbd_dev->header.object_prefix)) {
2199 ret = PTR_ERR(rbd_dev->header.object_prefix);
2200 rbd_dev->header.object_prefix = NULL;
2201 } else {
2202 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2082 } 2203 }
2083 return name + 1; 2204
2205out:
2206 kfree(reply_buf);
2207
2208 return ret;
2084} 2209}
2085 2210
2086/* 2211static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2087 * compare the old list of snapshots that we have to what's in the header 2212 u64 *snap_features)
2088 * and update it accordingly. Note that the header holds the snapshots
2089 * in a reverse order (from newest to oldest) and we need to go from
2090 * older to new so that we don't get a duplicate snap name when
2091 * doing the process (e.g., removed snapshot and recreated a new
2092 * one with the same name.
2093 */
2094static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2095{ 2213{
2096 const char *name, *first_name; 2214 __le64 snapid = cpu_to_le64(snap_id);
2097 int i = rbd_dev->header.total_snaps; 2215 struct {
2098 struct rbd_snap *snap, *old_snap = NULL; 2216 __le64 features;
2099 struct list_head *p, *n; 2217 __le64 incompat;
2218 } features_buf = { 0 };
2219 int ret;
2100 2220
2101 first_name = rbd_dev->header.snap_names; 2221 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2102 name = first_name + rbd_dev->header.snap_names_len; 2222 "rbd", "get_features",
2223 (char *) &snapid, sizeof (snapid),
2224 (char *) &features_buf, sizeof (features_buf),
2225 CEPH_OSD_FLAG_READ, NULL);
2226 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2227 if (ret < 0)
2228 return ret;
2229 *snap_features = le64_to_cpu(features_buf.features);
2103 2230
2104 list_for_each_prev_safe(p, n, &rbd_dev->snaps) { 2231 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2105 u64 cur_id; 2232 (unsigned long long) snap_id,
2233 (unsigned long long) *snap_features,
2234 (unsigned long long) le64_to_cpu(features_buf.incompat));
2106 2235
2107 old_snap = list_entry(p, struct rbd_snap, node); 2236 return 0;
2237}
2108 2238
2109 if (i) 2239static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2110 cur_id = rbd_dev->header.snapc->snaps[i - 1]; 2240{
2241 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2242 &rbd_dev->header.features);
2243}
2111 2244
2112 if (!i || old_snap->id < cur_id) { 2245static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
2113 /* 2246{
2114 * old_snap->id was skipped, thus was 2247 size_t size;
2115 * removed. If this rbd_dev is mapped to 2248 int ret;
2116 * the removed snapshot, record that it no 2249 void *reply_buf;
2117 * longer exists, to prevent further I/O. 2250 void *p;
2118 */ 2251 void *end;
2119 if (rbd_dev->snap_id == old_snap->id) 2252 u64 seq;
2120 rbd_dev->snap_exists = false; 2253 u32 snap_count;
2121 __rbd_remove_snap_dev(old_snap); 2254 struct ceph_snap_context *snapc;
2122 continue; 2255 u32 i;
2123 } 2256
2124 if (old_snap->id == cur_id) { 2257 /*
2125 /* we have this snapshot already */ 2258 * We'll need room for the seq value (maximum snapshot id),
2126 i--; 2259 * snapshot count, and array of that many snapshot ids.
2127 name = rbd_prev_snap_name(name, first_name); 2260 * For now we have a fixed upper limit on the number we're
2261 * prepared to receive.
2262 */
2263 size = sizeof (__le64) + sizeof (__le32) +
2264 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2265 reply_buf = kzalloc(size, GFP_KERNEL);
2266 if (!reply_buf)
2267 return -ENOMEM;
2268
2269 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2270 "rbd", "get_snapcontext",
2271 NULL, 0,
2272 reply_buf, size,
2273 CEPH_OSD_FLAG_READ, ver);
2274 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2275 if (ret < 0)
2276 goto out;
2277
2278 ret = -ERANGE;
2279 p = reply_buf;
2280 end = (char *) reply_buf + size;
2281 ceph_decode_64_safe(&p, end, seq, out);
2282 ceph_decode_32_safe(&p, end, snap_count, out);
2283
2284 /*
2285 * Make sure the reported number of snapshot ids wouldn't go
2286 * beyond the end of our buffer. But before checking that,
2287 * make sure the computed size of the snapshot context we
2288 * allocate is representable in a size_t.
2289 */
2290 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2291 / sizeof (u64)) {
2292 ret = -EINVAL;
2293 goto out;
2294 }
2295 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2296 goto out;
2297
2298 size = sizeof (struct ceph_snap_context) +
2299 snap_count * sizeof (snapc->snaps[0]);
2300 snapc = kmalloc(size, GFP_KERNEL);
2301 if (!snapc) {
2302 ret = -ENOMEM;
2303 goto out;
2304 }
2305
2306 atomic_set(&snapc->nref, 1);
2307 snapc->seq = seq;
2308 snapc->num_snaps = snap_count;
2309 for (i = 0; i < snap_count; i++)
2310 snapc->snaps[i] = ceph_decode_64(&p);
2311
2312 rbd_dev->header.snapc = snapc;
2313
2314 dout(" snap context seq = %llu, snap_count = %u\n",
2315 (unsigned long long) seq, (unsigned int) snap_count);
2316
2317out:
2318 kfree(reply_buf);
2319
2320 return 0;
2321}
2322
2323static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2324{
2325 size_t size;
2326 void *reply_buf;
2327 __le64 snap_id;
2328 int ret;
2329 void *p;
2330 void *end;
2331 size_t snap_name_len;
2332 char *snap_name;
2333
2334 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2335 reply_buf = kmalloc(size, GFP_KERNEL);
2336 if (!reply_buf)
2337 return ERR_PTR(-ENOMEM);
2338
2339 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2340 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2341 "rbd", "get_snapshot_name",
2342 (char *) &snap_id, sizeof (snap_id),
2343 reply_buf, size,
2344 CEPH_OSD_FLAG_READ, NULL);
2345 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2346 if (ret < 0)
2347 goto out;
2348
2349 p = reply_buf;
2350 end = (char *) reply_buf + size;
2351 snap_name_len = 0;
2352 snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
2353 GFP_KERNEL);
2354 if (IS_ERR(snap_name)) {
2355 ret = PTR_ERR(snap_name);
2356 goto out;
2357 } else {
2358 dout(" snap_id 0x%016llx snap_name = %s\n",
2359 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2360 }
2361 kfree(reply_buf);
2362
2363 return snap_name;
2364out:
2365 kfree(reply_buf);
2366
2367 return ERR_PTR(ret);
2368}
2369
2370static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2371 u64 *snap_size, u64 *snap_features)
2372{
2373 __le64 snap_id;
2374 u8 order;
2375 int ret;
2376
2377 snap_id = rbd_dev->header.snapc->snaps[which];
2378 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2379 if (ret)
2380 return ERR_PTR(ret);
2381 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2382 if (ret)
2383 return ERR_PTR(ret);
2384
2385 return rbd_dev_v2_snap_name(rbd_dev, which);
2386}
2387
2388static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2389 u64 *snap_size, u64 *snap_features)
2390{
2391 if (rbd_dev->image_format == 1)
2392 return rbd_dev_v1_snap_info(rbd_dev, which,
2393 snap_size, snap_features);
2394 if (rbd_dev->image_format == 2)
2395 return rbd_dev_v2_snap_info(rbd_dev, which,
2396 snap_size, snap_features);
2397 return ERR_PTR(-EINVAL);
2398}
2399
2400/*
2401 * Scan the rbd device's current snapshot list and compare it to the
2402 * newly-received snapshot context. Remove any existing snapshots
2403 * not present in the new snapshot context. Add a new snapshot for
 2404 * any snapshots in the snapshot context not in the current list.
2405 * And verify there are no changes to snapshots we already know
2406 * about.
2407 *
2408 * Assumes the snapshots in the snapshot context are sorted by
2409 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2410 * are also maintained in that order.)
2411 */
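(Editorial note: because both lists are kept in the same descending-id order, the update below amounts to a single-pass merge. A simplified, standalone sketch of that comparison logic, plain userspace C with made-up ids, not the driver function that follows:)

#include <stdio.h>

int main(void)
{
	unsigned long long existing[] = { 9, 7, 4 };	/* current rbd_dev->snaps */
	unsigned long long context[]  = { 9, 8, 4 };	/* new snapc->snaps */
	unsigned int e = 0, c = 0;
	unsigned int ne = sizeof(existing) / sizeof(existing[0]);
	unsigned int nc = sizeof(context) / sizeof(context[0]);

	while (e < ne || c < nc) {
		if (c == nc || (e < ne && existing[e] > context[c]))
			printf("remove %llu\n", existing[e++]);	/* not in new context */
		else if (e == ne || existing[e] < context[c])
			printf("add %llu\n", context[c++]);	/* not seen before */
		else {						/* equal: already present */
			printf("keep %llu\n", existing[e++]);
			c++;
		}
	}
	return 0;
}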
2412static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2413{
2414 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2415 const u32 snap_count = snapc->num_snaps;
2416 struct list_head *head = &rbd_dev->snaps;
2417 struct list_head *links = head->next;
2418 u32 index = 0;
2419
2420 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
2421 while (index < snap_count || links != head) {
2422 u64 snap_id;
2423 struct rbd_snap *snap;
2424 char *snap_name;
2425 u64 snap_size = 0;
2426 u64 snap_features = 0;
2427
2428 snap_id = index < snap_count ? snapc->snaps[index]
2429 : CEPH_NOSNAP;
2430 snap = links != head ? list_entry(links, struct rbd_snap, node)
2431 : NULL;
2432 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2433
2434 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2435 struct list_head *next = links->next;
2436
2437 /* Existing snapshot not in the new snap context */
2438
2439 if (rbd_dev->mapping.snap_id == snap->id)
2440 rbd_dev->mapping.snap_exists = false;
2441 __rbd_remove_snap_dev(snap);
2442 dout("%ssnap id %llu has been removed\n",
2443 rbd_dev->mapping.snap_id == snap->id ?
2444 "mapped " : "",
2445 (unsigned long long) snap->id);
2446
2447 /* Done with this list entry; advance */
2448
2449 links = next;
2128 continue; 2450 continue;
2129 } 2451 }
2130 for (; i > 0; 2452
2131 i--, name = rbd_prev_snap_name(name, first_name)) { 2453 snap_name = rbd_dev_snap_info(rbd_dev, index,
2132 if (!name) { 2454 &snap_size, &snap_features);
2133 WARN_ON(1); 2455 if (IS_ERR(snap_name))
2134 return -EINVAL; 2456 return PTR_ERR(snap_name);
2457
2458 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2459 (unsigned long long) snap_id);
2460 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2461 struct rbd_snap *new_snap;
2462
2463 /* We haven't seen this snapshot before */
2464
2465 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2466 snap_id, snap_size, snap_features);
2467 if (IS_ERR(new_snap)) {
2468 int err = PTR_ERR(new_snap);
2469
2470 dout(" failed to add dev, error %d\n", err);
2471
2472 return err;
2135 } 2473 }
2136 cur_id = rbd_dev->header.snapc->snaps[i]; 2474
2137 /* snapshot removal? handle it above */ 2475 /* New goes before existing, or at end of list */
2138 if (cur_id >= old_snap->id) 2476
2139 break; 2477 dout(" added dev%s\n", snap ? "" : " at end\n");
2140 /* a new snapshot */ 2478 if (snap)
2141 snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); 2479 list_add_tail(&new_snap->node, &snap->node);
2142 if (IS_ERR(snap)) 2480 else
2143 return PTR_ERR(snap); 2481 list_add_tail(&new_snap->node, head);
2144 2482 } else {
2145 /* note that we add it backward so using n and not p */ 2483 /* Already have this one */
2146 list_add(&snap->node, n); 2484
2147 p = &snap->node; 2485 dout(" already present\n");
2486
2487 rbd_assert(snap->size == snap_size);
2488 rbd_assert(!strcmp(snap->name, snap_name));
2489 rbd_assert(snap->features == snap_features);
2490
2491 /* Done with this list entry; advance */
2492
2493 links = links->next;
2148 } 2494 }
2495
2496 /* Advance to the next entry in the snapshot context */
2497
2498 index++;
2149 } 2499 }
2150 /* we're done going over the old snap list, just add what's left */ 2500 dout("%s: done\n", __func__);
2151 for (; i > 0; i--) { 2501
2152 name = rbd_prev_snap_name(name, first_name); 2502 return 0;
2153 if (!name) { 2503}
2154 WARN_ON(1); 2504
2155 return -EINVAL; 2505/*
2506 * Scan the list of snapshots and register the devices for any that
2507 * have not already been registered.
2508 */
2509static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2510{
2511 struct rbd_snap *snap;
2512 int ret = 0;
2513
2514 dout("%s called\n", __func__);
2515 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2516 return -EIO;
2517
2518 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2519 if (!rbd_snap_registered(snap)) {
2520 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2521 if (ret < 0)
2522 break;
2156 } 2523 }
2157 snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
2158 if (IS_ERR(snap))
2159 return PTR_ERR(snap);
2160 list_add(&snap->node, &rbd_dev->snaps);
2161 } 2524 }
2525 dout("%s: returning %d\n", __func__, ret);
2162 2526
2163 return 0; 2527 return ret;
2164} 2528}
2165 2529
2166static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2530static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2167{ 2531{
2168 int ret;
2169 struct device *dev; 2532 struct device *dev;
2170 struct rbd_snap *snap; 2533 int ret;
2171 2534
2172 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2535 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2173 dev = &rbd_dev->dev;
2174 2536
2537 dev = &rbd_dev->dev;
2175 dev->bus = &rbd_bus_type; 2538 dev->bus = &rbd_bus_type;
2176 dev->type = &rbd_device_type; 2539 dev->type = &rbd_device_type;
2177 dev->parent = &rbd_root_dev; 2540 dev->parent = &rbd_root_dev;
2178 dev->release = rbd_dev_release; 2541 dev->release = rbd_dev_release;
2179 dev_set_name(dev, "%d", rbd_dev->dev_id); 2542 dev_set_name(dev, "%d", rbd_dev->dev_id);
2180 ret = device_register(dev); 2543 ret = device_register(dev);
2181 if (ret < 0)
2182 goto out;
2183 2544
2184 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2185 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2186 if (ret < 0)
2187 break;
2188 }
2189out:
2190 mutex_unlock(&ctl_mutex); 2545 mutex_unlock(&ctl_mutex);
2546
2191 return ret; 2547 return ret;
2192} 2548}
2193 2549
@@ -2212,33 +2568,37 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2212 return ret; 2568 return ret;
2213} 2569}
2214 2570
2215static atomic64_t rbd_id_max = ATOMIC64_INIT(0); 2571static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
2216 2572
2217/* 2573/*
2218 * Get a unique rbd identifier for the given new rbd_dev, and add 2574 * Get a unique rbd identifier for the given new rbd_dev, and add
2219 * the rbd_dev to the global list. The minimum rbd id is 1. 2575 * the rbd_dev to the global list. The minimum rbd id is 1.
2220 */ 2576 */
2221static void rbd_id_get(struct rbd_device *rbd_dev) 2577static void rbd_dev_id_get(struct rbd_device *rbd_dev)
2222{ 2578{
2223 rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); 2579 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
2224 2580
2225 spin_lock(&rbd_dev_list_lock); 2581 spin_lock(&rbd_dev_list_lock);
2226 list_add_tail(&rbd_dev->node, &rbd_dev_list); 2582 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2227 spin_unlock(&rbd_dev_list_lock); 2583 spin_unlock(&rbd_dev_list_lock);
2584 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2585 (unsigned long long) rbd_dev->dev_id);
2228} 2586}
2229 2587
2230/* 2588/*
2231 * Remove an rbd_dev from the global list, and record that its 2589 * Remove an rbd_dev from the global list, and record that its
2232 * identifier is no longer in use. 2590 * identifier is no longer in use.
2233 */ 2591 */
2234static void rbd_id_put(struct rbd_device *rbd_dev) 2592static void rbd_dev_id_put(struct rbd_device *rbd_dev)
2235{ 2593{
2236 struct list_head *tmp; 2594 struct list_head *tmp;
2237 int rbd_id = rbd_dev->dev_id; 2595 int rbd_id = rbd_dev->dev_id;
2238 int max_id; 2596 int max_id;
2239 2597
2240 BUG_ON(rbd_id < 1); 2598 rbd_assert(rbd_id > 0);
2241 2599
2600 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2601 (unsigned long long) rbd_dev->dev_id);
2242 spin_lock(&rbd_dev_list_lock); 2602 spin_lock(&rbd_dev_list_lock);
2243 list_del_init(&rbd_dev->node); 2603 list_del_init(&rbd_dev->node);
2244 2604
@@ -2246,7 +2606,7 @@ static void rbd_id_put(struct rbd_device *rbd_dev)
2246 * If the id being "put" is not the current maximum, there 2606 * If the id being "put" is not the current maximum, there
2247 * is nothing special we need to do. 2607 * is nothing special we need to do.
2248 */ 2608 */
2249 if (rbd_id != atomic64_read(&rbd_id_max)) { 2609 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2250 spin_unlock(&rbd_dev_list_lock); 2610 spin_unlock(&rbd_dev_list_lock);
2251 return; 2611 return;
2252 } 2612 }
@@ -2267,12 +2627,13 @@ static void rbd_id_put(struct rbd_device *rbd_dev)
2267 spin_unlock(&rbd_dev_list_lock); 2627 spin_unlock(&rbd_dev_list_lock);
2268 2628
2269 /* 2629 /*
2270 * The max id could have been updated by rbd_id_get(), in 2630 * The max id could have been updated by rbd_dev_id_get(), in
2271 * which case it now accurately reflects the new maximum. 2631 * which case it now accurately reflects the new maximum.
2272 * Be careful not to overwrite the maximum value in that 2632 * Be careful not to overwrite the maximum value in that
2273 * case. 2633 * case.
2274 */ 2634 */
2275 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); 2635 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2636 dout(" max dev id has been reset\n");
2276} 2637}
2277 2638
2278/* 2639/*
@@ -2361,28 +2722,31 @@ static inline char *dup_token(const char **buf, size_t *lenp)
2361} 2722}
2362 2723
2363/* 2724/*
2364 * This fills in the pool_name, image_name, image_name_len, snap_name, 2725 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2365 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based 2726 * rbd_md_name, and name fields of the given rbd_dev, based on the
2366 * on the list of monitor addresses and other options provided via 2727 * list of monitor addresses and other options provided via
2367 * /sys/bus/rbd/add. 2728 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2729 * copy of the snapshot name to map if successful, or a
2730 * pointer-coded error otherwise.
2368 * 2731 *
2369 * Note: rbd_dev is assumed to have been initially zero-filled. 2732 * Note: rbd_dev is assumed to have been initially zero-filled.
2370 */ 2733 */
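(Editorial note: in practice this means a mapping is created by writing a single line of the form "<mon_addrs> <options> <pool> <image> [<snapshot>]" to /sys/bus/rbd/add; for example, with all values hypothetical, "1.2.3.4:6789 name=admin rbd myimage mysnap". When the optional snapshot token is omitted, the default head name is substituted, as the new parsing code below shows.)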
2371static int rbd_add_parse_args(struct rbd_device *rbd_dev, 2734static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2372 const char *buf, 2735 const char *buf,
2373 const char **mon_addrs, 2736 const char **mon_addrs,
2374 size_t *mon_addrs_size, 2737 size_t *mon_addrs_size,
2375 char *options, 2738 char *options,
2376 size_t options_size) 2739 size_t options_size)
2377{ 2740{
2378 size_t len; 2741 size_t len;
2379 int ret; 2742 char *err_ptr = ERR_PTR(-EINVAL);
2743 char *snap_name;
2380 2744
2381 /* The first four tokens are required */ 2745 /* The first four tokens are required */
2382 2746
2383 len = next_token(&buf); 2747 len = next_token(&buf);
2384 if (!len) 2748 if (!len)
2385 return -EINVAL; 2749 return err_ptr;
2386 *mon_addrs_size = len + 1; 2750 *mon_addrs_size = len + 1;
2387 *mon_addrs = buf; 2751 *mon_addrs = buf;
2388 2752
@@ -2390,9 +2754,9 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2390 2754
2391 len = copy_token(&buf, options, options_size); 2755 len = copy_token(&buf, options, options_size);
2392 if (!len || len >= options_size) 2756 if (!len || len >= options_size)
2393 return -EINVAL; 2757 return err_ptr;
2394 2758
2395 ret = -ENOMEM; 2759 err_ptr = ERR_PTR(-ENOMEM);
2396 rbd_dev->pool_name = dup_token(&buf, NULL); 2760 rbd_dev->pool_name = dup_token(&buf, NULL);
2397 if (!rbd_dev->pool_name) 2761 if (!rbd_dev->pool_name)
2398 goto out_err; 2762 goto out_err;
@@ -2401,41 +2765,227 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2401 if (!rbd_dev->image_name) 2765 if (!rbd_dev->image_name)
2402 goto out_err; 2766 goto out_err;
2403 2767
2404 /* Create the name of the header object */ 2768 /* Snapshot name is optional */
2769 len = next_token(&buf);
2770 if (!len) {
2771 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2772 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
2773 }
2774 snap_name = kmalloc(len + 1, GFP_KERNEL);
2775 if (!snap_name)
2776 goto out_err;
2777 memcpy(snap_name, buf, len);
2778 *(snap_name + len) = '\0';
2405 2779
2406 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len 2780dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2407 + sizeof (RBD_SUFFIX), 2781
2408 GFP_KERNEL); 2782 return snap_name;
2409 if (!rbd_dev->header_name) 2783
2784out_err:
2785 kfree(rbd_dev->image_name);
2786 rbd_dev->image_name = NULL;
2787 rbd_dev->image_name_len = 0;
2788 kfree(rbd_dev->pool_name);
2789 rbd_dev->pool_name = NULL;
2790
2791 return err_ptr;
2792}
2793
2794/*
2795 * An rbd format 2 image has a unique identifier, distinct from the
2796 * name given to it by the user. Internally, that identifier is
2797 * what's used to specify the names of objects related to the image.
2798 *
2799 * A special "rbd id" object is used to map an rbd image name to its
2800 * id. If that object doesn't exist, then there is no v2 rbd image
2801 * with the supplied name.
2802 *
2803 * This function will record the given rbd_dev's image_id field if
2804 * it can be determined, and in that case will return 0. If any
2805 * errors occur a negative errno will be returned and the rbd_dev's
2806 * image_id field will be unchanged (and should be NULL).
2807 */
2808static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2809{
2810 int ret;
2811 size_t size;
2812 char *object_name;
2813 void *response;
2814 void *p;
2815
2816 /*
2817 * First, see if the format 2 image id file exists, and if
2818 * so, get the image's persistent id from it.
2819 */
2820 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2821 object_name = kmalloc(size, GFP_NOIO);
2822 if (!object_name)
2823 return -ENOMEM;
2824 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2825 dout("rbd id object name is %s\n", object_name);
2826
2827 /* Response will be an encoded string, which includes a length */
2828
2829 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2830 response = kzalloc(size, GFP_NOIO);
2831 if (!response) {
2832 ret = -ENOMEM;
2833 goto out;
2834 }
2835
2836 ret = rbd_req_sync_exec(rbd_dev, object_name,
2837 "rbd", "get_id",
2838 NULL, 0,
2839 response, RBD_IMAGE_ID_LEN_MAX,
2840 CEPH_OSD_FLAG_READ, NULL);
2841 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2842 if (ret < 0)
2843 goto out;
2844
2845 p = response;
2846 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2847 p + RBD_IMAGE_ID_LEN_MAX,
2848 &rbd_dev->image_id_len,
2849 GFP_NOIO);
2850 if (IS_ERR(rbd_dev->image_id)) {
2851 ret = PTR_ERR(rbd_dev->image_id);
2852 rbd_dev->image_id = NULL;
2853 } else {
2854 dout("image_id is %s\n", rbd_dev->image_id);
2855 }
2856out:
2857 kfree(response);
2858 kfree(object_name);
2859
2860 return ret;
2861}
2862
2863static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2864{
2865 int ret;
2866 size_t size;
2867
2868 /* Version 1 images have no id; empty string is used */
2869
2870 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2871 if (!rbd_dev->image_id)
2872 return -ENOMEM;
2873 rbd_dev->image_id_len = 0;
2874
2875 /* Record the header object name for this rbd image. */
2876
2877 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2878 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2879 if (!rbd_dev->header_name) {
2880 ret = -ENOMEM;
2410 goto out_err; 2881 goto out_err;
2882 }
2411 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2883 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2412 2884
2885 /* Populate rbd image metadata */
2886
2887 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2888 if (ret < 0)
2889 goto out_err;
2890 rbd_dev->image_format = 1;
2891
2892 dout("discovered version 1 image, header name is %s\n",
2893 rbd_dev->header_name);
2894
2895 return 0;
2896
2897out_err:
2898 kfree(rbd_dev->header_name);
2899 rbd_dev->header_name = NULL;
2900 kfree(rbd_dev->image_id);
2901 rbd_dev->image_id = NULL;
2902
2903 return ret;
2904}
2905
2906static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2907{
2908 size_t size;
2909 int ret;
2910 u64 ver = 0;
2911
2413 /* 2912 /*
 2414 * The snapshot name is optional. If none is supplied, 2913 * Image id was filled in by the caller. Record the header
2415 * we use the default value. 2914 * object name for this rbd image.
2416 */ 2915 */
2417 rbd_dev->snap_name = dup_token(&buf, &len); 2916 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
2418 if (!rbd_dev->snap_name) 2917 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2918 if (!rbd_dev->header_name)
2919 return -ENOMEM;
2920 sprintf(rbd_dev->header_name, "%s%s",
2921 RBD_HEADER_PREFIX, rbd_dev->image_id);
2922
2923 /* Get the size and object order for the image */
2924
2925 ret = rbd_dev_v2_image_size(rbd_dev);
2926 if (ret < 0)
2419 goto out_err; 2927 goto out_err;
2420 if (!len) {
2421 /* Replace the empty name with the default */
2422 kfree(rbd_dev->snap_name);
2423 rbd_dev->snap_name
2424 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2425 if (!rbd_dev->snap_name)
2426 goto out_err;
2427 2928
2428 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, 2929 /* Get the object prefix (a.k.a. block_name) for the image */
2429 sizeof (RBD_SNAP_HEAD_NAME));
2430 }
2431 2930
2432 return 0; 2931 ret = rbd_dev_v2_object_prefix(rbd_dev);
2932 if (ret < 0)
2933 goto out_err;
2934
2935 /* Get the features for the image */
2433 2936
2937 ret = rbd_dev_v2_features(rbd_dev);
2938 if (ret < 0)
2939 goto out_err;
2940
2941 /* crypto and compression type aren't (yet) supported for v2 images */
2942
2943 rbd_dev->header.crypt_type = 0;
2944 rbd_dev->header.comp_type = 0;
2945
2946 /* Get the snapshot context, plus the header version */
2947
2948 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
2949 if (ret)
2950 goto out_err;
2951 rbd_dev->header.obj_version = ver;
2952
2953 rbd_dev->image_format = 2;
2954
2955 dout("discovered version 2 image, header name is %s\n",
2956 rbd_dev->header_name);
2957
2958 return -ENOTSUPP;
2434out_err: 2959out_err:
2435 kfree(rbd_dev->header_name); 2960 kfree(rbd_dev->header_name);
2436 kfree(rbd_dev->image_name); 2961 rbd_dev->header_name = NULL;
2437 kfree(rbd_dev->pool_name); 2962 kfree(rbd_dev->header.object_prefix);
2438 rbd_dev->pool_name = NULL; 2963 rbd_dev->header.object_prefix = NULL;
2964
2965 return ret;
2966}
2967
2968/*
2969 * Probe for the existence of the header object for the given rbd
2970 * device. For format 2 images this includes determining the image
2971 * id.
2972 */
2973static int rbd_dev_probe(struct rbd_device *rbd_dev)
2974{
2975 int ret;
2976
2977 /*
2978 * Get the id from the image id object. If it's not a
2979 * format 2 image, we'll get ENOENT back, and we'll assume
2980 * it's a format 1 image.
2981 */
2982 ret = rbd_dev_image_id(rbd_dev);
2983 if (ret)
2984 ret = rbd_dev_v1_probe(rbd_dev);
2985 else
2986 ret = rbd_dev_v2_probe(rbd_dev);
2987 if (ret)
2988 dout("probe failed, returning %d\n", ret);
2439 2989
2440 return ret; 2990 return ret;
2441} 2991}
@@ -2450,16 +3000,17 @@ static ssize_t rbd_add(struct bus_type *bus,
2450 size_t mon_addrs_size = 0; 3000 size_t mon_addrs_size = 0;
2451 struct ceph_osd_client *osdc; 3001 struct ceph_osd_client *osdc;
2452 int rc = -ENOMEM; 3002 int rc = -ENOMEM;
3003 char *snap_name;
2453 3004
2454 if (!try_module_get(THIS_MODULE)) 3005 if (!try_module_get(THIS_MODULE))
2455 return -ENODEV; 3006 return -ENODEV;
2456 3007
2457 options = kmalloc(count, GFP_KERNEL); 3008 options = kmalloc(count, GFP_KERNEL);
2458 if (!options) 3009 if (!options)
2459 goto err_nomem; 3010 goto err_out_mem;
2460 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 3011 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2461 if (!rbd_dev) 3012 if (!rbd_dev)
2462 goto err_nomem; 3013 goto err_out_mem;
2463 3014
2464 /* static rbd_device initialization */ 3015 /* static rbd_device initialization */
2465 spin_lock_init(&rbd_dev->lock); 3016 spin_lock_init(&rbd_dev->lock);
@@ -2467,27 +3018,18 @@ static ssize_t rbd_add(struct bus_type *bus,
2467 INIT_LIST_HEAD(&rbd_dev->snaps); 3018 INIT_LIST_HEAD(&rbd_dev->snaps);
2468 init_rwsem(&rbd_dev->header_rwsem); 3019 init_rwsem(&rbd_dev->header_rwsem);
2469 3020
2470 /* generate unique id: find highest unique id, add one */
2471 rbd_id_get(rbd_dev);
2472
2473 /* Fill in the device name, now that we have its id. */
2474 BUILD_BUG_ON(DEV_NAME_LEN
2475 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2476 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2477
2478 /* parse add command */ 3021 /* parse add command */
2479 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, 3022 snap_name = rbd_add_parse_args(rbd_dev, buf,
2480 options, count); 3023 &mon_addrs, &mon_addrs_size, options, count);
2481 if (rc) 3024 if (IS_ERR(snap_name)) {
2482 goto err_put_id; 3025 rc = PTR_ERR(snap_name);
2483 3026 goto err_out_mem;
2484 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2485 options);
2486 if (IS_ERR(rbd_dev->rbd_client)) {
2487 rc = PTR_ERR(rbd_dev->rbd_client);
2488 goto err_put_id;
2489 } 3027 }
2490 3028
3029 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3030 if (rc < 0)
3031 goto err_out_args;
3032
2491 /* pick the pool */ 3033 /* pick the pool */
2492 osdc = &rbd_dev->rbd_client->client->osdc; 3034 osdc = &rbd_dev->rbd_client->client->osdc;
2493 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); 3035 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
@@ -2495,23 +3037,53 @@ static ssize_t rbd_add(struct bus_type *bus,
2495 goto err_out_client; 3037 goto err_out_client;
2496 rbd_dev->pool_id = rc; 3038 rbd_dev->pool_id = rc;
2497 3039
2498 /* register our block device */ 3040 rc = rbd_dev_probe(rbd_dev);
2499 rc = register_blkdev(0, rbd_dev->name);
2500 if (rc < 0) 3041 if (rc < 0)
2501 goto err_out_client; 3042 goto err_out_client;
3043 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3044
3045 /* no need to lock here, as rbd_dev is not registered yet */
3046 rc = rbd_dev_snaps_update(rbd_dev);
3047 if (rc)
3048 goto err_out_header;
3049
3050 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3051 if (rc)
3052 goto err_out_header;
3053
3054 /* generate unique id: find highest unique id, add one */
3055 rbd_dev_id_get(rbd_dev);
3056
3057 /* Fill in the device name, now that we have its id. */
3058 BUILD_BUG_ON(DEV_NAME_LEN
3059 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3060 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3061
3062 /* Get our block major device number. */
3063
3064 rc = register_blkdev(0, rbd_dev->name);
3065 if (rc < 0)
3066 goto err_out_id;
2502 rbd_dev->major = rc; 3067 rbd_dev->major = rc;
2503 3068
2504 rc = rbd_bus_add_dev(rbd_dev); 3069 /* Set up the blkdev mapping. */
3070
3071 rc = rbd_init_disk(rbd_dev);
2505 if (rc) 3072 if (rc)
2506 goto err_out_blkdev; 3073 goto err_out_blkdev;
2507 3074
3075 rc = rbd_bus_add_dev(rbd_dev);
3076 if (rc)
3077 goto err_out_disk;
3078
2508 /* 3079 /*
2509 * At this point cleanup in the event of an error is the job 3080 * At this point cleanup in the event of an error is the job
2510 * of the sysfs code (initiated by rbd_bus_del_dev()). 3081 * of the sysfs code (initiated by rbd_bus_del_dev()).
2511 *
2512 * Set up and announce blkdev mapping.
2513 */ 3082 */
2514 rc = rbd_init_disk(rbd_dev); 3083
3084 down_write(&rbd_dev->header_rwsem);
3085 rc = rbd_dev_snaps_register(rbd_dev);
3086 up_write(&rbd_dev->header_rwsem);
2515 if (rc) 3087 if (rc)
2516 goto err_out_bus; 3088 goto err_out_bus;
2517 3089
@@ -2519,6 +3091,13 @@ static ssize_t rbd_add(struct bus_type *bus,
2519 if (rc) 3091 if (rc)
2520 goto err_out_bus; 3092 goto err_out_bus;
2521 3093
3094 /* Everything's ready. Announce the disk to the world. */
3095
3096 add_disk(rbd_dev->disk);
3097
3098 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3099 (unsigned long long) rbd_dev->mapping.size);
3100
2522 return count; 3101 return count;
2523 3102
2524err_out_bus: 3103err_out_bus:
@@ -2528,19 +3107,23 @@ err_out_bus:
2528 kfree(options); 3107 kfree(options);
2529 return rc; 3108 return rc;
2530 3109
3110err_out_disk:
3111 rbd_free_disk(rbd_dev);
2531err_out_blkdev: 3112err_out_blkdev:
2532 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3113 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3114err_out_id:
3115 rbd_dev_id_put(rbd_dev);
3116err_out_header:
3117 rbd_header_free(&rbd_dev->header);
2533err_out_client: 3118err_out_client:
3119 kfree(rbd_dev->header_name);
2534 rbd_put_client(rbd_dev); 3120 rbd_put_client(rbd_dev);
2535err_put_id: 3121 kfree(rbd_dev->image_id);
2536 if (rbd_dev->pool_name) { 3122err_out_args:
2537 kfree(rbd_dev->snap_name); 3123 kfree(rbd_dev->mapping.snap_name);
2538 kfree(rbd_dev->header_name); 3124 kfree(rbd_dev->image_name);
2539 kfree(rbd_dev->image_name); 3125 kfree(rbd_dev->pool_name);
2540 kfree(rbd_dev->pool_name); 3126err_out_mem:
2541 }
2542 rbd_id_put(rbd_dev);
2543err_nomem:
2544 kfree(rbd_dev); 3127 kfree(rbd_dev);
2545 kfree(options); 3128 kfree(options);
2546 3129
@@ -2586,12 +3169,16 @@ static void rbd_dev_release(struct device *dev)
2586 rbd_free_disk(rbd_dev); 3169 rbd_free_disk(rbd_dev);
2587 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3170 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2588 3171
3172 /* release allocated disk header fields */
3173 rbd_header_free(&rbd_dev->header);
3174
2589 /* done with the id, and with the rbd_dev */ 3175 /* done with the id, and with the rbd_dev */
2590 kfree(rbd_dev->snap_name); 3176 kfree(rbd_dev->mapping.snap_name);
3177 kfree(rbd_dev->image_id);
2591 kfree(rbd_dev->header_name); 3178 kfree(rbd_dev->header_name);
2592 kfree(rbd_dev->pool_name); 3179 kfree(rbd_dev->pool_name);
2593 kfree(rbd_dev->image_name); 3180 kfree(rbd_dev->image_name);
2594 rbd_id_put(rbd_dev); 3181 rbd_dev_id_put(rbd_dev);
2595 kfree(rbd_dev); 3182 kfree(rbd_dev);
2596 3183
2597 /* release module ref */ 3184 /* release module ref */
@@ -2629,47 +3216,7 @@ static ssize_t rbd_remove(struct bus_type *bus,
2629 3216
2630done: 3217done:
2631 mutex_unlock(&ctl_mutex); 3218 mutex_unlock(&ctl_mutex);
2632 return ret;
2633}
2634 3219
2635static ssize_t rbd_snap_add(struct device *dev,
2636 struct device_attribute *attr,
2637 const char *buf,
2638 size_t count)
2639{
2640 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2641 int ret;
2642 char *name = kmalloc(count + 1, GFP_KERNEL);
2643 if (!name)
2644 return -ENOMEM;
2645
2646 snprintf(name, count, "%s", buf);
2647
2648 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2649
2650 ret = rbd_header_add_snap(rbd_dev,
2651 name, GFP_KERNEL);
2652 if (ret < 0)
2653 goto err_unlock;
2654
2655 ret = __rbd_refresh_header(rbd_dev, NULL);
2656 if (ret < 0)
2657 goto err_unlock;
2658
2659 /* shouldn't hold ctl_mutex when notifying.. notify might
2660 trigger a watch callback that would need to get that mutex */
2661 mutex_unlock(&ctl_mutex);
2662
2663 /* make a best effort, don't error if failed */
2664 rbd_req_sync_notify(rbd_dev);
2665
2666 ret = count;
2667 kfree(name);
2668 return ret;
2669
2670err_unlock:
2671 mutex_unlock(&ctl_mutex);
2672 kfree(name);
2673 return ret; 3220 return ret;
2674} 3221}
2675 3222
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 0924e9e41a60..cbe77fa105ba 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -15,15 +15,30 @@
15 15
16#include <linux/types.h> 16#include <linux/types.h>
17 17
18/* For format version 2, rbd image 'foo' consists of objects
19 * rbd_id.foo - id of image
20 * rbd_header.<id> - image metadata
21 * rbd_data.<id>.0000000000000000
22 * rbd_data.<id>.0000000000000001
23 * ... - data
24 * Clients do not access header data directly in rbd format 2.
25 */
26
27#define RBD_HEADER_PREFIX "rbd_header."
28#define RBD_DATA_PREFIX "rbd_data."
29#define RBD_ID_PREFIX "rbd_id."
30
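(Editorial note: a small standalone illustration of how these prefixes combine with an image's name and id to produce object names; userspace C, and the image name and id below are made up:)

#include <stdio.h>

#define RBD_HEADER_PREFIX "rbd_header."
#define RBD_DATA_PREFIX "rbd_data."
#define RBD_ID_PREFIX "rbd_id."

int main(void)
{
	const char *image_name = "foo";		/* user-visible image name */
	const char *image_id = "10026b8b4567";	/* hypothetical image id */

	printf("%s%s\n", RBD_ID_PREFIX, image_name);	/* rbd_id.foo */
	printf("%s%s\n", RBD_HEADER_PREFIX, image_id);	/* rbd_header.<id> */
	printf("%s%s.%016llx\n", RBD_DATA_PREFIX, image_id,
	       (unsigned long long) 0);			/* first data object */
	return 0;
}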
18/* 31/*
19 * rbd image 'foo' consists of objects 32 * For format version 1, rbd image 'foo' consists of objects
20 * foo.rbd - image metadata 33 * foo.rbd - image metadata
21 * foo.00000000 34 * rb.<idhi>.<idlo>.00000000
22 * foo.00000001 35 * rb.<idhi>.<idlo>.00000001
23 * ... - data 36 * ... - data
37 * There is no notion of a persistent image id in rbd format 1.
24 */ 38 */
25 39
26#define RBD_SUFFIX ".rbd" 40#define RBD_SUFFIX ".rbd"
41
27#define RBD_DIRECTORY "rbd_directory" 42#define RBD_DIRECTORY "rbd_directory"
28#define RBD_INFO "rbd_info" 43#define RBD_INFO "rbd_info"
29 44
@@ -47,7 +62,7 @@ struct rbd_image_snap_ondisk {
47 62
48struct rbd_image_header_ondisk { 63struct rbd_image_header_ondisk {
49 char text[40]; 64 char text[40];
50 char block_name[24]; 65 char object_prefix[24];
51 char signature[4]; 66 char signature[4];
52 char version[8]; 67 char version[8];
53 struct { 68 struct {
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
deleted file mode 100644
index fcec0225ac76..000000000000
--- a/drivers/block/ub.c
+++ /dev/null
@@ -1,2474 +0,0 @@
1/*
2 * The low performance USB storage driver (ub).
3 *
4 * Copyright (c) 1999, 2000 Matthew Dharm (mdharm-usb@one-eyed-alien.net)
5 * Copyright (C) 2004 Pete Zaitcev (zaitcev@yahoo.com)
6 *
7 * This work is a part of Linux kernel, is derived from it,
8 * and is not licensed separately. See file COPYING for details.
9 *
10 * TODO (sorted by decreasing priority)
11 * -- Return sense now that rq allows it (we always auto-sense anyway).
12 * -- set readonly flag for CDs, set removable flag for CF readers
13 * -- do inquiry and verify we got a disk and not a tape (for LUN mismatch)
14 * -- verify the 13 conditions and do bulk resets
15 * -- highmem
16 * -- move top_sense and work_bcs into separate allocations (if they survive)
17 * for cache purists and esoteric architectures.
18 * -- Allocate structure for LUN 0 before the first ub_sync_tur, avoid NULL. ?
 19 * -- prune comments, they are too voluminous
 20 * -- Resolve XXX's
21 * -- CLEAR, CLR2STS, CLRRS seem to be ripe for refactoring.
22 */
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/usb.h>
26#include <linux/usb_usual.h>
27#include <linux/blkdev.h>
28#include <linux/timer.h>
29#include <linux/scatterlist.h>
30#include <linux/slab.h>
31#include <linux/mutex.h>
32#include <scsi/scsi.h>
33
34#define DRV_NAME "ub"
35
36#define UB_MAJOR 180
37
38/*
39 * The command state machine is the key model for understanding of this driver.
40 *
41 * The general rule is that all transitions are done towards the bottom
42 * of the diagram, thus preventing any loops.
43 *
44 * An exception to that is how the STAT state is handled. A counter allows it
45 * to be re-entered along the path marked with [C].
46 *
47 * +--------+
48 * ! INIT !
49 * +--------+
50 * !
51 * ub_scsi_cmd_start fails ->--------------------------------------\
52 * ! !
53 * V !
54 * +--------+ !
55 * ! CMD ! !
56 * +--------+ !
57 * ! +--------+ !
58 * was -EPIPE -->-------------------------------->! CLEAR ! !
59 * ! +--------+ !
60 * ! ! !
61 * was error -->------------------------------------- ! --------->\
62 * ! ! !
63 * /--<-- cmd->dir == NONE ? ! !
64 * ! ! ! !
65 * ! V ! !
66 * ! +--------+ ! !
67 * ! ! DATA ! ! !
68 * ! +--------+ ! !
69 * ! ! +---------+ ! !
70 * ! was -EPIPE -->--------------->! CLR2STS ! ! !
71 * ! ! +---------+ ! !
72 * ! ! ! ! !
73 * ! ! was error -->---- ! --------->\
74 * ! was error -->--------------------- ! ------------- ! --------->\
75 * ! ! ! ! !
76 * ! V ! ! !
77 * \--->+--------+ ! ! !
78 * ! STAT !<--------------------------/ ! !
79 * /--->+--------+ ! !
80 * ! ! ! !
81 * [C] was -EPIPE -->-----------\ ! !
82 * ! ! ! ! !
83 * +<---- len == 0 ! ! !
84 * ! ! ! ! !
85 * ! was error -->--------------------------------------!---------->\
86 * ! ! ! ! !
87 * +<---- bad CSW ! ! !
88 * +<---- bad tag ! ! !
89 * ! ! V ! !
90 * ! ! +--------+ ! !
91 * ! ! ! CLRRS ! ! !
92 * ! ! +--------+ ! !
93 * ! ! ! ! !
94 * \------- ! --------------------[C]--------\ ! !
95 * ! ! ! !
96 * cmd->error---\ +--------+ ! !
97 * ! +--------------->! SENSE !<----------/ !
98 * STAT_FAIL----/ +--------+ !
99 * ! ! V
100 * ! V +--------+
101 * \--------------------------------\--------------------->! DONE !
102 * +--------+
103 */
104
105/*
106 * This many LUNs per USB device.
107 * Every one of them takes a host, see UB_MAX_HOSTS.
108 */
109#define UB_MAX_LUNS 9
110
111/*
112 */
113
114#define UB_PARTS_PER_LUN 8
115
116#define UB_MAX_CDB_SIZE 16 /* Corresponds to Bulk */
117
118#define UB_SENSE_SIZE 18
119
120/*
121 */
122struct ub_dev;
123
124#define UB_MAX_REQ_SG 9 /* cdrecord requires 32KB and maybe a header */
125#define UB_MAX_SECTORS 64
126
127/*
128 * A second is more than enough for a 32K transfer (UB_MAX_SECTORS)
129 * even if a webcam hogs the bus, but some devices need time to spin up.
130 */
131#define UB_URB_TIMEOUT (HZ*2)
132#define UB_DATA_TIMEOUT (HZ*5) /* ZIP does spin-ups in the data phase */
133#define UB_STAT_TIMEOUT (HZ*5) /* Same spinups and eject for a dataless cmd. */
134#define UB_CTRL_TIMEOUT (HZ/2) /* 500ms ought to be enough to clear a stall */
135
136/*
137 * An instance of a SCSI command in transit.
138 */
139#define UB_DIR_NONE 0
140#define UB_DIR_READ 1
141#define UB_DIR_ILLEGAL2 2
142#define UB_DIR_WRITE 3
143
144#define UB_DIR_CHAR(c) (((c)==UB_DIR_WRITE)? 'w': \
145 (((c)==UB_DIR_READ)? 'r': 'n'))
146
147enum ub_scsi_cmd_state {
148 UB_CMDST_INIT, /* Initial state */
149 UB_CMDST_CMD, /* Command submitted */
150 UB_CMDST_DATA, /* Data phase */
151 UB_CMDST_CLR2STS, /* Clearing before requesting status */
152 UB_CMDST_STAT, /* Status phase */
153 UB_CMDST_CLEAR, /* Clearing a stall (halt, actually) */
154 UB_CMDST_CLRRS, /* Clearing before retrying status */
155 UB_CMDST_SENSE, /* Sending Request Sense */
156 UB_CMDST_DONE /* Final state */
157};
158
159struct ub_scsi_cmd {
160 unsigned char cdb[UB_MAX_CDB_SIZE];
161 unsigned char cdb_len;
162
163 unsigned char dir; /* 0 - none, 1 - read, 3 - write. */
164 enum ub_scsi_cmd_state state;
165 unsigned int tag;
166 struct ub_scsi_cmd *next;
167
168 int error; /* Return code - valid upon done */
169 unsigned int act_len; /* Return size */
170 unsigned char key, asc, ascq; /* May be valid if error==-EIO */
171
172 int stat_count; /* Retries getting status. */
173 unsigned int timeo; /* jiffies until rq->timeout changes */
174
175 unsigned int len; /* Requested length */
176 unsigned int current_sg;
177 unsigned int nsg; /* sgv[nsg] */
178 struct scatterlist sgv[UB_MAX_REQ_SG];
179
180 struct ub_lun *lun;
181 void (*done)(struct ub_dev *, struct ub_scsi_cmd *);
182 void *back;
183};
184
185struct ub_request {
186 struct request *rq;
187 unsigned int current_try;
188 unsigned int nsg; /* sgv[nsg] */
189 struct scatterlist sgv[UB_MAX_REQ_SG];
190};
191
192/*
193 */
194struct ub_capacity {
195 unsigned long nsec; /* Linux size - 512 byte sectors */
196 unsigned int bsize; /* Linux hardsect_size */
197 unsigned int bshift; /* Shift between 512 and hard sects */
198};
199
200/*
201 * This is a direct take-off from linux/include/completion.h
202 * The difference is that I do not wait on this thing, just poll.
203 * When I want to wait (ub_probe), I just use the stock completion.
204 *
205 * Note that INIT_COMPLETION takes no lock. It is correct. But why
206 * in the bloody hell that thing takes struct instead of pointer to struct
207 * is quite beyond me. I just copied it from the stock completion.
208 */
209struct ub_completion {
210 unsigned int done;
211 spinlock_t lock;
212};
213
214static DEFINE_MUTEX(ub_mutex);
215static inline void ub_init_completion(struct ub_completion *x)
216{
217 x->done = 0;
218 spin_lock_init(&x->lock);
219}
220
221#define UB_INIT_COMPLETION(x) ((x).done = 0)
222
223static void ub_complete(struct ub_completion *x)
224{
225 unsigned long flags;
226
227 spin_lock_irqsave(&x->lock, flags);
228 x->done++;
229 spin_unlock_irqrestore(&x->lock, flags);
230}
231
232static int ub_is_completed(struct ub_completion *x)
233{
234 unsigned long flags;
235 int ret;
236
237 spin_lock_irqsave(&x->lock, flags);
238 ret = x->done;
239 spin_unlock_irqrestore(&x->lock, flags);
240 return ret;
241}
242
243/*
244 */
245struct ub_scsi_cmd_queue {
246 int qlen, qmax;
247 struct ub_scsi_cmd *head, *tail;
248};
249
250/*
251 * The block device instance (one per LUN).
252 */
253struct ub_lun {
254 struct ub_dev *udev;
255 struct list_head link;
256 struct gendisk *disk;
257 int id; /* Host index */
258 int num; /* LUN number */
259 char name[16];
260
261 int changed; /* Media was changed */
262 int removable;
263 int readonly;
264
265 struct ub_request urq;
266
267 /* Use Ingo's mempool if or when we have more than one command. */
268 /*
269 * Currently we never need more than one command for the whole device.
270 * However, giving every LUN a command is a cheap and automatic way
271 * to enforce fairness between them.
272 */
273 int cmda[1];
274 struct ub_scsi_cmd cmdv[1];
275
276 struct ub_capacity capacity;
277};
278
279/*
280 * The USB device instance.
281 */
282struct ub_dev {
283 spinlock_t *lock;
284 atomic_t poison; /* The USB device is disconnected */
285 int openc; /* protected by ub_lock! */
286 /* kref is too implicit for our taste */
287 int reset; /* Reset is running */
288 int bad_resid;
289 unsigned int tagcnt;
290 char name[12];
291 struct usb_device *dev;
292 struct usb_interface *intf;
293
294 struct list_head luns;
295
296 unsigned int send_bulk_pipe; /* cached pipe values */
297 unsigned int recv_bulk_pipe;
298 unsigned int send_ctrl_pipe;
299 unsigned int recv_ctrl_pipe;
300
301 struct tasklet_struct tasklet;
302
303 struct ub_scsi_cmd_queue cmd_queue;
304 struct ub_scsi_cmd top_rqs_cmd; /* REQUEST SENSE */
305 unsigned char top_sense[UB_SENSE_SIZE];
306
307 struct ub_completion work_done;
308 struct urb work_urb;
309 struct timer_list work_timer;
310 int last_pipe; /* What might need clearing */
311 __le32 signature; /* Learned signature */
312 struct bulk_cb_wrap work_bcb;
313 struct bulk_cs_wrap work_bcs;
314 struct usb_ctrlrequest work_cr;
315
316 struct work_struct reset_work;
317 wait_queue_head_t reset_wait;
318};
319
320/*
321 */
322static void ub_cleanup(struct ub_dev *sc);
323static int ub_request_fn_1(struct ub_lun *lun, struct request *rq);
324static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
325 struct ub_scsi_cmd *cmd, struct ub_request *urq);
326static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
327 struct ub_scsi_cmd *cmd, struct ub_request *urq);
328static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
329static void ub_end_rq(struct request *rq, unsigned int status);
330static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
331 struct ub_request *urq, struct ub_scsi_cmd *cmd);
332static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
333static void ub_urb_complete(struct urb *urb);
334static void ub_scsi_action(unsigned long _dev);
335static void ub_scsi_dispatch(struct ub_dev *sc);
336static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
337static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
338static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc);
339static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
340static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
341static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
342static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
343static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd,
344 int stalled_pipe);
345static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd);
346static void ub_reset_enter(struct ub_dev *sc, int try);
347static void ub_reset_task(struct work_struct *work);
348static int ub_sync_tur(struct ub_dev *sc, struct ub_lun *lun);
349static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun,
350 struct ub_capacity *ret);
351static int ub_sync_reset(struct ub_dev *sc);
352static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe);
353static int ub_probe_lun(struct ub_dev *sc, int lnum);
354
355/*
356 */
357#ifdef CONFIG_USB_LIBUSUAL
358
359#define ub_usb_ids usb_storage_usb_ids
360#else
361
362static const struct usb_device_id ub_usb_ids[] = {
363 { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_BULK) },
364 { }
365};
366
367MODULE_DEVICE_TABLE(usb, ub_usb_ids);
368#endif /* CONFIG_USB_LIBUSUAL */
369
370/*
371 * Find me a way to identify "next free minor" for add_disk(),
372 * and the array disappears the next day. However, the number of
373 * hosts has something to do with the naming and /proc/partitions.
374 * This has to be thought out in detail before changing.
375 * If UB_MAX_HOST was 1000, we'd use a bitmap. Or a better data structure.
376 */
377#define UB_MAX_HOSTS 26
378static char ub_hostv[UB_MAX_HOSTS];
379
380#define UB_QLOCK_NUM 5
381static spinlock_t ub_qlockv[UB_QLOCK_NUM];
382static int ub_qlock_next = 0;
383
384static DEFINE_SPINLOCK(ub_lock); /* Locks globals and ->openc */
385
386/*
387 * The id allocator.
388 *
389 * This also stores the host for indexing by minor, which is somewhat dirty.
390 */
391static int ub_id_get(void)
392{
393 unsigned long flags;
394 int i;
395
396 spin_lock_irqsave(&ub_lock, flags);
397 for (i = 0; i < UB_MAX_HOSTS; i++) {
398 if (ub_hostv[i] == 0) {
399 ub_hostv[i] = 1;
400 spin_unlock_irqrestore(&ub_lock, flags);
401 return i;
402 }
403 }
404 spin_unlock_irqrestore(&ub_lock, flags);
405 return -1;
406}
407
408static void ub_id_put(int id)
409{
410 unsigned long flags;
411
412 if (id < 0 || id >= UB_MAX_HOSTS) {
413 printk(KERN_ERR DRV_NAME ": bad host ID %d\n", id);
414 return;
415 }
416
417 spin_lock_irqsave(&ub_lock, flags);
418 if (ub_hostv[id] == 0) {
419 spin_unlock_irqrestore(&ub_lock, flags);
420 printk(KERN_ERR DRV_NAME ": freeing free host ID %d\n", id);
421 return;
422 }
423 ub_hostv[id] = 0;
424 spin_unlock_irqrestore(&ub_lock, flags);
425}
426
427/*
428 * This is necessitated by the fact that blk_cleanup_queue does not
 429 * necessarily destroy the queue. Instead, it may merely decrease q->refcnt.
430 * Since our blk_init_queue() passes a spinlock common with ub_dev,
431 * we have life time issues when ub_cleanup frees ub_dev.
432 */
433static spinlock_t *ub_next_lock(void)
434{
435 unsigned long flags;
436 spinlock_t *ret;
437
438 spin_lock_irqsave(&ub_lock, flags);
439 ret = &ub_qlockv[ub_qlock_next];
440 ub_qlock_next = (ub_qlock_next + 1) % UB_QLOCK_NUM;
441 spin_unlock_irqrestore(&ub_lock, flags);
442 return ret;
443}
444
445/*
446 * Downcount for deallocation. This rides on two assumptions:
447 * - once something is poisoned, its refcount cannot grow
448 * - opens cannot happen at this time (del_gendisk was done)
449 * If the above is true, we can drop the lock, which we need for
450 * blk_cleanup_queue(): the silly thing may attempt to sleep.
451 * [Actually, it never needs to sleep for us, but it calls might_sleep()]
452 */
453static void ub_put(struct ub_dev *sc)
454{
455 unsigned long flags;
456
457 spin_lock_irqsave(&ub_lock, flags);
458 --sc->openc;
459 if (sc->openc == 0 && atomic_read(&sc->poison)) {
460 spin_unlock_irqrestore(&ub_lock, flags);
461 ub_cleanup(sc);
462 } else {
463 spin_unlock_irqrestore(&ub_lock, flags);
464 }
465}
466
467/*
468 * Final cleanup and deallocation.
469 */
470static void ub_cleanup(struct ub_dev *sc)
471{
472 struct list_head *p;
473 struct ub_lun *lun;
474 struct request_queue *q;
475
476 while (!list_empty(&sc->luns)) {
477 p = sc->luns.next;
478 lun = list_entry(p, struct ub_lun, link);
479 list_del(p);
480
481 /* I don't think queue can be NULL. But... Stolen from sx8.c */
482 if ((q = lun->disk->queue) != NULL)
483 blk_cleanup_queue(q);
484 /*
485 * If we zero disk->private_data BEFORE put_disk, we have
486 * to check for NULL all over the place in open, release,
487 * check_media and revalidate, because the block level
488 * semaphore is well inside the put_disk.
489 * But we cannot zero after the call, because *disk is gone.
490 * The sd.c is blatantly racy in this area.
491 */
492 /* disk->private_data = NULL; */
493 put_disk(lun->disk);
494 lun->disk = NULL;
495
496 ub_id_put(lun->id);
497 kfree(lun);
498 }
499
500 usb_set_intfdata(sc->intf, NULL);
501 usb_put_intf(sc->intf);
502 usb_put_dev(sc->dev);
503 kfree(sc);
504}
505
506/*
507 * The "command allocator".
508 */
509static struct ub_scsi_cmd *ub_get_cmd(struct ub_lun *lun)
510{
511 struct ub_scsi_cmd *ret;
512
513 if (lun->cmda[0])
514 return NULL;
515 ret = &lun->cmdv[0];
516 lun->cmda[0] = 1;
517 return ret;
518}
519
520static void ub_put_cmd(struct ub_lun *lun, struct ub_scsi_cmd *cmd)
521{
522 if (cmd != &lun->cmdv[0]) {
523 printk(KERN_WARNING "%s: releasing a foreign cmd %p\n",
524 lun->name, cmd);
525 return;
526 }
527 if (!lun->cmda[0]) {
528 printk(KERN_WARNING "%s: releasing a free cmd\n", lun->name);
529 return;
530 }
531 lun->cmda[0] = 0;
532}
533
534/*
535 * The command queue.
536 */
537static void ub_cmdq_add(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
538{
539 struct ub_scsi_cmd_queue *t = &sc->cmd_queue;
540
541 if (t->qlen++ == 0) {
542 t->head = cmd;
543 t->tail = cmd;
544 } else {
545 t->tail->next = cmd;
546 t->tail = cmd;
547 }
548
549 if (t->qlen > t->qmax)
550 t->qmax = t->qlen;
551}
552
553static void ub_cmdq_insert(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
554{
555 struct ub_scsi_cmd_queue *t = &sc->cmd_queue;
556
557 if (t->qlen++ == 0) {
558 t->head = cmd;
559 t->tail = cmd;
560 } else {
561 cmd->next = t->head;
562 t->head = cmd;
563 }
564
565 if (t->qlen > t->qmax)
566 t->qmax = t->qlen;
567}
568
569static struct ub_scsi_cmd *ub_cmdq_pop(struct ub_dev *sc)
570{
571 struct ub_scsi_cmd_queue *t = &sc->cmd_queue;
572 struct ub_scsi_cmd *cmd;
573
574 if (t->qlen == 0)
575 return NULL;
576 if (--t->qlen == 0)
577 t->tail = NULL;
578 cmd = t->head;
579 t->head = cmd->next;
580 cmd->next = NULL;
581 return cmd;
582}
583
584#define ub_cmdq_peek(sc) ((sc)->cmd_queue.head)
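/*
 * A note on the queue discipline above: ub_cmdq_add() appends at the tail
 * and is used for ordinary commands, while ub_cmdq_insert() pushes at the
 * head so that the auto-issued REQUEST SENSE (see ub_state_sense() below)
 * runs before the command that triggered it, which stays queued until its
 * sense data arrives.
 */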
585
586/*
587 * The request function is our main entry point
588 */
589
590static void ub_request_fn(struct request_queue *q)
591{
592 struct ub_lun *lun = q->queuedata;
593 struct request *rq;
594
595 while ((rq = blk_peek_request(q)) != NULL) {
596 if (ub_request_fn_1(lun, rq) != 0) {
597 blk_stop_queue(q);
598 break;
599 }
600 }
601}
602
603static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
604{
605 struct ub_dev *sc = lun->udev;
606 struct ub_scsi_cmd *cmd;
607 struct ub_request *urq;
608 int n_elem;
609
610 if (atomic_read(&sc->poison)) {
611 blk_start_request(rq);
612 ub_end_rq(rq, DID_NO_CONNECT << 16);
613 return 0;
614 }
615
616 if (lun->changed && rq->cmd_type != REQ_TYPE_BLOCK_PC) {
617 blk_start_request(rq);
618 ub_end_rq(rq, SAM_STAT_CHECK_CONDITION);
619 return 0;
620 }
621
622 if (lun->urq.rq != NULL)
623 return -1;
624 if ((cmd = ub_get_cmd(lun)) == NULL)
625 return -1;
626 memset(cmd, 0, sizeof(struct ub_scsi_cmd));
627
628 blk_start_request(rq);
629
630 urq = &lun->urq;
631 memset(urq, 0, sizeof(struct ub_request));
632 urq->rq = rq;
633
634 /*
635 * get scatterlist from block layer
636 */
637 sg_init_table(&urq->sgv[0], UB_MAX_REQ_SG);
638 n_elem = blk_rq_map_sg(lun->disk->queue, rq, &urq->sgv[0]);
639 if (n_elem < 0) {
640 /* Impossible, because blk_rq_map_sg should not hit ENOMEM. */
641 printk(KERN_INFO "%s: failed request map (%d)\n",
642 lun->name, n_elem);
643 goto drop;
644 }
645 if (n_elem > UB_MAX_REQ_SG) { /* Paranoia */
646 printk(KERN_WARNING "%s: request with %d segments\n",
647 lun->name, n_elem);
648 goto drop;
649 }
650 urq->nsg = n_elem;
651
652 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
653 ub_cmd_build_packet(sc, lun, cmd, urq);
654 } else {
655 ub_cmd_build_block(sc, lun, cmd, urq);
656 }
657 cmd->state = UB_CMDST_INIT;
658 cmd->lun = lun;
659 cmd->done = ub_rw_cmd_done;
660 cmd->back = urq;
661
662 cmd->tag = sc->tagcnt++;
663 if (ub_submit_scsi(sc, cmd) != 0)
664 goto drop;
665
666 return 0;
667
668drop:
669 ub_put_cmd(lun, cmd);
670 ub_end_rq(rq, DID_ERROR << 16);
671 return 0;
672}
673
674static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
675 struct ub_scsi_cmd *cmd, struct ub_request *urq)
676{
677 struct request *rq = urq->rq;
678 unsigned int block, nblks;
679
680 if (rq_data_dir(rq) == WRITE)
681 cmd->dir = UB_DIR_WRITE;
682 else
683 cmd->dir = UB_DIR_READ;
684
685 cmd->nsg = urq->nsg;
686 memcpy(cmd->sgv, urq->sgv, sizeof(struct scatterlist) * cmd->nsg);
687
688 /*
689 * build the command
690 *
691 * The call to blk_queue_logical_block_size() guarantees that the request
692 * is aligned, but it is always given in 512-byte units.
693 */
694 block = blk_rq_pos(rq) >> lun->capacity.bshift;
695 nblks = blk_rq_sectors(rq) >> lun->capacity.bshift;
696
697 cmd->cdb[0] = (cmd->dir == UB_DIR_READ)? READ_10: WRITE_10;
698 /* 10-byte uses 4 bytes of LBA: 2147483648KB, 2097152MB, 2048GB */
699 cmd->cdb[2] = block >> 24;
700 cmd->cdb[3] = block >> 16;
701 cmd->cdb[4] = block >> 8;
702 cmd->cdb[5] = block;
703 cmd->cdb[7] = nblks >> 8;
704 cmd->cdb[8] = nblks;
705 cmd->cdb_len = 10;
706
707 cmd->len = blk_rq_bytes(rq);
708}
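/*
 * A rough worked example of the conversion above, assuming a hypothetical
 * LUN with 2048-byte sectors (bshift == 2): blk_rq_pos() and
 * blk_rq_sectors() are in 512-byte units, so a request at position 24
 * spanning 16 sectors becomes block = 24 >> 2 = 6 and nblks = 16 >> 2 = 4,
 * carried in the CDB as a big-endian 32-bit LBA (bytes 2-5) and a 16-bit
 * transfer length (bytes 7-8).
 */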
709
710static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
711 struct ub_scsi_cmd *cmd, struct ub_request *urq)
712{
713 struct request *rq = urq->rq;
714
715 if (blk_rq_bytes(rq) == 0) {
716 cmd->dir = UB_DIR_NONE;
717 } else {
718 if (rq_data_dir(rq) == WRITE)
719 cmd->dir = UB_DIR_WRITE;
720 else
721 cmd->dir = UB_DIR_READ;
722 }
723
724 cmd->nsg = urq->nsg;
725 memcpy(cmd->sgv, urq->sgv, sizeof(struct scatterlist) * cmd->nsg);
726
727 memcpy(&cmd->cdb, rq->cmd, rq->cmd_len);
728 cmd->cdb_len = rq->cmd_len;
729
730 cmd->len = blk_rq_bytes(rq);
731
732 /*
733 * To reapply this to every URB is not as incorrect as it looks.
734 * In return, we avoid any complicated tracking calculations.
735 */
736 cmd->timeo = rq->timeout;
737}
738
739static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
740{
741 struct ub_lun *lun = cmd->lun;
742 struct ub_request *urq = cmd->back;
743 struct request *rq;
744 unsigned int scsi_status;
745
746 rq = urq->rq;
747
748 if (cmd->error == 0) {
749 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
750 if (cmd->act_len >= rq->resid_len)
751 rq->resid_len = 0;
752 else
753 rq->resid_len -= cmd->act_len;
754 scsi_status = 0;
755 } else {
756 if (cmd->act_len != cmd->len) {
757 scsi_status = SAM_STAT_CHECK_CONDITION;
758 } else {
759 scsi_status = 0;
760 }
761 }
762 } else {
763 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
764 /* UB_SENSE_SIZE is smaller than SCSI_SENSE_BUFFERSIZE */
765 memcpy(rq->sense, sc->top_sense, UB_SENSE_SIZE);
766 rq->sense_len = UB_SENSE_SIZE;
767 if (sc->top_sense[0] != 0)
768 scsi_status = SAM_STAT_CHECK_CONDITION;
769 else
770 scsi_status = DID_ERROR << 16;
771 } else {
772 if (cmd->error == -EIO &&
773 (cmd->key == 0 ||
774 cmd->key == MEDIUM_ERROR ||
775 cmd->key == UNIT_ATTENTION)) {
776 if (ub_rw_cmd_retry(sc, lun, urq, cmd) == 0)
777 return;
778 }
779 scsi_status = SAM_STAT_CHECK_CONDITION;
780 }
781 }
782
783 urq->rq = NULL;
784
785 ub_put_cmd(lun, cmd);
786 ub_end_rq(rq, scsi_status);
787 blk_start_queue(lun->disk->queue);
788}
789
790static void ub_end_rq(struct request *rq, unsigned int scsi_status)
791{
792 int error;
793
794 if (scsi_status == 0) {
795 error = 0;
796 } else {
797 error = -EIO;
798 rq->errors = scsi_status;
799 }
800 __blk_end_request_all(rq, error);
801}
802
803static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
804 struct ub_request *urq, struct ub_scsi_cmd *cmd)
805{
806
807 if (atomic_read(&sc->poison))
808 return -ENXIO;
809
810 ub_reset_enter(sc, urq->current_try);
811
812 if (urq->current_try >= 3)
813 return -EIO;
814 urq->current_try++;
815
816 /* Remove this if anyone complains of flooding. */
817 printk(KERN_DEBUG "%s: dir %c len/act %d/%d "
818 "[sense %x %02x %02x] retry %d\n",
819 sc->name, UB_DIR_CHAR(cmd->dir), cmd->len, cmd->act_len,
820 cmd->key, cmd->asc, cmd->ascq, urq->current_try);
821
822 memset(cmd, 0, sizeof(struct ub_scsi_cmd));
823 ub_cmd_build_block(sc, lun, cmd, urq);
824
825 cmd->state = UB_CMDST_INIT;
826 cmd->lun = lun;
827 cmd->done = ub_rw_cmd_done;
828 cmd->back = urq;
829
830 cmd->tag = sc->tagcnt++;
831
832#if 0 /* Wasteful */
833 return ub_submit_scsi(sc, cmd);
834#else
835 ub_cmdq_add(sc, cmd);
836 return 0;
837#endif
838}
839
840/*
841 * Submit a regular SCSI operation (not an auto-sense).
842 *
843 * The Iron Law of Good Submit Routine is:
844 * Zero return - callback is done, Nonzero return - callback is not done.
845 * No exceptions.
846 *
847 * Host is assumed locked.
848 */
849static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
850{
851
852 if (cmd->state != UB_CMDST_INIT ||
853 (cmd->dir != UB_DIR_NONE && cmd->len == 0)) {
854 return -EINVAL;
855 }
856
857 ub_cmdq_add(sc, cmd);
858 /*
859 * We can call ub_scsi_dispatch(sc) right away here, but it's a little
860 * safer to jump to a tasklet, in case upper layers do something silly.
861 */
862 tasklet_schedule(&sc->tasklet);
863 return 0;
864}
865
866/*
867 * Submit the first URB for the queued command.
868 * This function does not deal with queueing in any way.
869 */
870static int ub_scsi_cmd_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
871{
872 struct bulk_cb_wrap *bcb;
873 int rc;
874
875 bcb = &sc->work_bcb;
876
877 /*
878 * ``If the allocation length is eighteen or greater, and a device
879 * server returns less than eighteen bytes of data, the application
880 * client should assume that the bytes not transferred would have been
881 * zeroes had the device server returned those bytes.''
882 *
883 * We zero sense for all commands so that when a packet request
884 * fails it does not return a stale sense.
885 */
886 memset(&sc->top_sense, 0, UB_SENSE_SIZE);
887
888 /* set up the command wrapper */
889 bcb->Signature = cpu_to_le32(US_BULK_CB_SIGN);
890 bcb->Tag = cmd->tag; /* Endianness is not important */
891 bcb->DataTransferLength = cpu_to_le32(cmd->len);
892 bcb->Flags = (cmd->dir == UB_DIR_READ) ? 0x80 : 0;
893 bcb->Lun = (cmd->lun != NULL) ? cmd->lun->num : 0;
894 bcb->Length = cmd->cdb_len;
895
896 /* copy the command payload */
897 memcpy(bcb->CDB, cmd->cdb, UB_MAX_CDB_SIZE);
898
899 UB_INIT_COMPLETION(sc->work_done);
900
901 sc->last_pipe = sc->send_bulk_pipe;
902 usb_fill_bulk_urb(&sc->work_urb, sc->dev, sc->send_bulk_pipe,
903 bcb, US_BULK_CB_WRAP_LEN, ub_urb_complete, sc);
904
905 if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
906 /* XXX Clear stalls */
907 ub_complete(&sc->work_done);
908 return rc;
909 }
910
911 sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
912 add_timer(&sc->work_timer);
913
914 cmd->state = UB_CMDST_CMD;
915 return 0;
916}
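/*
 * For reference, a sketch of the wire format assumed above (USB
 * mass-storage Bulk-Only Transport): the CBW built here is a 31-byte
 * little-endian structure -- the "USBC" signature, a Tag echoed back in
 * the CSW, the expected DataTransferLength, a Flags byte whose bit 7
 * selects device-to-host (0x80) versus host-to-device (0x00), the LUN,
 * the CDB length, and up to 16 CDB bytes. A TEST UNIT READY, for example,
 * goes out with DataTransferLength == 0, Flags == 0 and Length == 6.
 */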
917
918/*
919 * Timeout handler.
920 */
921static void ub_urb_timeout(unsigned long arg)
922{
923 struct ub_dev *sc = (struct ub_dev *) arg;
924 unsigned long flags;
925
926 spin_lock_irqsave(sc->lock, flags);
927 if (!ub_is_completed(&sc->work_done))
928 usb_unlink_urb(&sc->work_urb);
929 spin_unlock_irqrestore(sc->lock, flags);
930}
931
932/*
933 * Completion routine for the work URB.
934 *
935 * This can be called directly from usb_submit_urb (while we have
936 * the sc->lock taken) and from an interrupt (while we do NOT have
937 * the sc->lock taken). Therefore, bounce this off to a tasklet.
938 */
939static void ub_urb_complete(struct urb *urb)
940{
941 struct ub_dev *sc = urb->context;
942
943 ub_complete(&sc->work_done);
944 tasklet_schedule(&sc->tasklet);
945}
946
947static void ub_scsi_action(unsigned long _dev)
948{
949 struct ub_dev *sc = (struct ub_dev *) _dev;
950 unsigned long flags;
951
952 spin_lock_irqsave(sc->lock, flags);
953 ub_scsi_dispatch(sc);
954 spin_unlock_irqrestore(sc->lock, flags);
955}
956
957static void ub_scsi_dispatch(struct ub_dev *sc)
958{
959 struct ub_scsi_cmd *cmd;
960 int rc;
961
962 while (!sc->reset && (cmd = ub_cmdq_peek(sc)) != NULL) {
963 if (cmd->state == UB_CMDST_DONE) {
964 ub_cmdq_pop(sc);
965 (*cmd->done)(sc, cmd);
966 } else if (cmd->state == UB_CMDST_INIT) {
967 if ((rc = ub_scsi_cmd_start(sc, cmd)) == 0)
968 break;
969 cmd->error = rc;
970 cmd->state = UB_CMDST_DONE;
971 } else {
972 if (!ub_is_completed(&sc->work_done))
973 break;
974 del_timer(&sc->work_timer);
975 ub_scsi_urb_compl(sc, cmd);
976 }
977 }
978}
979
980static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
981{
982 struct urb *urb = &sc->work_urb;
983 struct bulk_cs_wrap *bcs;
984 int endp;
985 int len;
986 int rc;
987
988 if (atomic_read(&sc->poison)) {
989 ub_state_done(sc, cmd, -ENODEV);
990 return;
991 }
992
993 endp = usb_pipeendpoint(sc->last_pipe);
994 if (usb_pipein(sc->last_pipe))
995 endp |= USB_DIR_IN;
996
997 if (cmd->state == UB_CMDST_CLEAR) {
998 if (urb->status == -EPIPE) {
999 /*
1000 * STALL while clearing STALL.
1001 * The control pipe clears itself - nothing to do.
1002 */
1003 printk(KERN_NOTICE "%s: stall on control pipe\n",
1004 sc->name);
1005 goto Bad_End;
1006 }
1007
1008 /*
1009 * We ignore the result for the halt clear.
1010 */
1011
1012 usb_reset_endpoint(sc->dev, endp);
1013
1014 ub_state_sense(sc, cmd);
1015
1016 } else if (cmd->state == UB_CMDST_CLR2STS) {
1017 if (urb->status == -EPIPE) {
1018 printk(KERN_NOTICE "%s: stall on control pipe\n",
1019 sc->name);
1020 goto Bad_End;
1021 }
1022
1023 /*
1024 * We ignore the result for the halt clear.
1025 */
1026
1027 usb_reset_endpoint(sc->dev, endp);
1028
1029 ub_state_stat(sc, cmd);
1030
1031 } else if (cmd->state == UB_CMDST_CLRRS) {
1032 if (urb->status == -EPIPE) {
1033 printk(KERN_NOTICE "%s: stall on control pipe\n",
1034 sc->name);
1035 goto Bad_End;
1036 }
1037
1038 /*
1039 * We ignore the result for the halt clear.
1040 */
1041
1042 usb_reset_endpoint(sc->dev, endp);
1043
1044 ub_state_stat_counted(sc, cmd);
1045
1046 } else if (cmd->state == UB_CMDST_CMD) {
1047 switch (urb->status) {
1048 case 0:
1049 break;
1050 case -EOVERFLOW:
1051 goto Bad_End;
1052 case -EPIPE:
1053 rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe);
1054 if (rc != 0) {
1055 printk(KERN_NOTICE "%s: "
1056 "unable to submit clear (%d)\n",
1057 sc->name, rc);
1058 /*
1059 * This is typically ENOMEM or some other such failure.
1060 * Retrying is pointless. Just do Bad End on it...
1061 */
1062 ub_state_done(sc, cmd, rc);
1063 return;
1064 }
1065 cmd->state = UB_CMDST_CLEAR;
1066 return;
1067 case -ESHUTDOWN: /* unplug */
1068 case -EILSEQ: /* unplug timeout on uhci */
1069 ub_state_done(sc, cmd, -ENODEV);
1070 return;
1071 default:
1072 goto Bad_End;
1073 }
1074 if (urb->actual_length != US_BULK_CB_WRAP_LEN) {
1075 goto Bad_End;
1076 }
1077
1078 if (cmd->dir == UB_DIR_NONE || cmd->nsg < 1) {
1079 ub_state_stat(sc, cmd);
1080 return;
1081 }
1082
1083 // udelay(125); // usb-storage has this
1084 ub_data_start(sc, cmd);
1085
1086 } else if (cmd->state == UB_CMDST_DATA) {
1087 if (urb->status == -EPIPE) {
1088 rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe);
1089 if (rc != 0) {
1090 printk(KERN_NOTICE "%s: "
1091 "unable to submit clear (%d)\n",
1092 sc->name, rc);
1093 ub_state_done(sc, cmd, rc);
1094 return;
1095 }
1096 cmd->state = UB_CMDST_CLR2STS;
1097 return;
1098 }
1099 if (urb->status == -EOVERFLOW) {
1100 /*
1101 * A babble? Failure, but we must transfer CSW now.
1102 */
1103 cmd->error = -EOVERFLOW; /* A cheap trick... */
1104 ub_state_stat(sc, cmd);
1105 return;
1106 }
1107
1108 if (cmd->dir == UB_DIR_WRITE) {
1109 /*
1110 * Do not continue writes in case of a failure.
1111 * Doing so would cause sectors to be mixed up,
1112 * which is worse than sectors lost.
1113 *
1114 * We must try to read the CSW, or many devices
1115 * get confused.
1116 */
1117 len = urb->actual_length;
1118 if (urb->status != 0 ||
1119 len != cmd->sgv[cmd->current_sg].length) {
1120 cmd->act_len += len;
1121
1122 cmd->error = -EIO;
1123 ub_state_stat(sc, cmd);
1124 return;
1125 }
1126
1127 } else {
1128 /*
1129 * If an error occurs on read, we record it, and
1130 * continue to fetch data in order to avoid a bubble.
1131 *
1132 * As a small shortcut, we stop if we detect that
1133 * a CSW is mixed into the data.
1134 */
1135 if (urb->status != 0)
1136 cmd->error = -EIO;
1137
1138 len = urb->actual_length;
1139 if (urb->status != 0 ||
1140 len != cmd->sgv[cmd->current_sg].length) {
1141 if ((len & 0x1FF) == US_BULK_CS_WRAP_LEN)
1142 goto Bad_End;
1143 }
1144 }
1145
1146 cmd->act_len += urb->actual_length;
1147
1148 if (++cmd->current_sg < cmd->nsg) {
1149 ub_data_start(sc, cmd);
1150 return;
1151 }
1152 ub_state_stat(sc, cmd);
1153
1154 } else if (cmd->state == UB_CMDST_STAT) {
1155 if (urb->status == -EPIPE) {
1156 rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe);
1157 if (rc != 0) {
1158 printk(KERN_NOTICE "%s: "
1159 "unable to submit clear (%d)\n",
1160 sc->name, rc);
1161 ub_state_done(sc, cmd, rc);
1162 return;
1163 }
1164
1165 /*
1166 * Having a stall when getting CSW is an error, so
1167 * make sure upper levels are not oblivious to it.
1168 */
1169 cmd->error = -EIO; /* A cheap trick... */
1170
1171 cmd->state = UB_CMDST_CLRRS;
1172 return;
1173 }
1174
1175 /* Catch everything, including -EOVERFLOW and other nasties. */
1176 if (urb->status != 0)
1177 goto Bad_End;
1178
1179 if (urb->actual_length == 0) {
1180 ub_state_stat_counted(sc, cmd);
1181 return;
1182 }
1183
1184 /*
1185 * Check the returned Bulk protocol status.
1186 * The status block has to be validated first.
1187 */
1188
1189 bcs = &sc->work_bcs;
1190
1191 if (sc->signature == cpu_to_le32(0)) {
1192 /*
1193 * This is the first reply, so do not perform the check.
1194 * Instead, remember the signature the device uses
1195 * for future checks. But do not allow a null signature.
1196 */
1197 sc->signature = bcs->Signature;
1198 if (sc->signature == cpu_to_le32(0)) {
1199 ub_state_stat_counted(sc, cmd);
1200 return;
1201 }
1202 } else {
1203 if (bcs->Signature != sc->signature) {
1204 ub_state_stat_counted(sc, cmd);
1205 return;
1206 }
1207 }
1208
1209 if (bcs->Tag != cmd->tag) {
1210 /*
1211 * This usually happens when we disagree with the
1212 * device's microcode about something. For instance,
1213 * a few of them throw this after timeouts. They buffer
1214 * commands and reply to commands we timed out on before.
1215 * Without flushing these replies we loop forever.
1216 */
1217 ub_state_stat_counted(sc, cmd);
1218 return;
1219 }
1220
1221 if (!sc->bad_resid) {
1222 len = le32_to_cpu(bcs->Residue);
1223 if (len != cmd->len - cmd->act_len) {
1224 /*
1225 * Only start ignoring if this cmd ended well.
1226 */
1227 if (cmd->len == cmd->act_len) {
1228 printk(KERN_NOTICE "%s: "
1229 "bad residual %d of %d, ignoring\n",
1230 sc->name, len, cmd->len);
1231 sc->bad_resid = 1;
1232 }
1233 }
1234 }
1235
1236 switch (bcs->Status) {
1237 case US_BULK_STAT_OK:
1238 break;
1239 case US_BULK_STAT_FAIL:
1240 ub_state_sense(sc, cmd);
1241 return;
1242 case US_BULK_STAT_PHASE:
1243 goto Bad_End;
1244 default:
1245 printk(KERN_INFO "%s: unknown CSW status 0x%x\n",
1246 sc->name, bcs->Status);
1247 ub_state_done(sc, cmd, -EINVAL);
1248 return;
1249 }
1250
1251 /* Not zeroing error to preserve a babble indicator */
1252 if (cmd->error != 0) {
1253 ub_state_sense(sc, cmd);
1254 return;
1255 }
1256 cmd->state = UB_CMDST_DONE;
1257 ub_cmdq_pop(sc);
1258 (*cmd->done)(sc, cmd);
1259
1260 } else if (cmd->state == UB_CMDST_SENSE) {
1261 ub_state_done(sc, cmd, -EIO);
1262
1263 } else {
1264 printk(KERN_WARNING "%s: wrong command state %d\n",
1265 sc->name, cmd->state);
1266 ub_state_done(sc, cmd, -EINVAL);
1267 return;
1268 }
1269 return;
1270
1271Bad_End: /* Little Excel is dead */
1272 ub_state_done(sc, cmd, -EIO);
1273}
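/*
 * A condensed view of the state machine driven above: CMD -> DATA -> STAT
 * -> DONE on the happy path. A stall during the command phase goes through
 * CLEAR and then straight to REQUEST SENSE; a stall during the data or
 * status phase goes through CLR2STS or CLRRS and then (re)tries the CSW
 * read. A failed CSW (US_BULK_STAT_FAIL) or a previously recorded transfer
 * error also ends in SENSE, where ub_state_sense() queues a REQUEST SENSE
 * whose completion stores key/asc/ascq into the original command before it
 * is finished with -EIO.
 */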
1274
1275/*
1276 * Factorization helper for the command state machine:
1277 * Initiate a data segment transfer.
1278 */
1279static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1280{
1281 struct scatterlist *sg = &cmd->sgv[cmd->current_sg];
1282 int pipe;
1283 int rc;
1284
1285 UB_INIT_COMPLETION(sc->work_done);
1286
1287 if (cmd->dir == UB_DIR_READ)
1288 pipe = sc->recv_bulk_pipe;
1289 else
1290 pipe = sc->send_bulk_pipe;
1291 sc->last_pipe = pipe;
1292 usb_fill_bulk_urb(&sc->work_urb, sc->dev, pipe, sg_virt(sg),
1293 sg->length, ub_urb_complete, sc);
1294
1295 if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
1296 /* XXX Clear stalls */
1297 ub_complete(&sc->work_done);
1298 ub_state_done(sc, cmd, rc);
1299 return;
1300 }
1301
1302 if (cmd->timeo)
1303 sc->work_timer.expires = jiffies + cmd->timeo;
1304 else
1305 sc->work_timer.expires = jiffies + UB_DATA_TIMEOUT;
1306 add_timer(&sc->work_timer);
1307
1308 cmd->state = UB_CMDST_DATA;
1309}
1310
1311/*
1312 * Factorization helper for the command state machine:
1313 * Finish the command.
1314 */
1315static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc)
1316{
1317
1318 cmd->error = rc;
1319 cmd->state = UB_CMDST_DONE;
1320 ub_cmdq_pop(sc);
1321 (*cmd->done)(sc, cmd);
1322}
1323
1324/*
1325 * Factorization helper for the command state machine:
1326 * Submit a CSW read.
1327 */
1328static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1329{
1330 int rc;
1331
1332 UB_INIT_COMPLETION(sc->work_done);
1333
1334 sc->last_pipe = sc->recv_bulk_pipe;
1335 usb_fill_bulk_urb(&sc->work_urb, sc->dev, sc->recv_bulk_pipe,
1336 &sc->work_bcs, US_BULK_CS_WRAP_LEN, ub_urb_complete, sc);
1337
1338 if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
1339 /* XXX Clear stalls */
1340 ub_complete(&sc->work_done);
1341 ub_state_done(sc, cmd, rc);
1342 return -1;
1343 }
1344
1345 if (cmd->timeo)
1346 sc->work_timer.expires = jiffies + cmd->timeo;
1347 else
1348 sc->work_timer.expires = jiffies + UB_STAT_TIMEOUT;
1349 add_timer(&sc->work_timer);
1350 return 0;
1351}
1352
1353/*
1354 * Factorization helper for the command state machine:
1355 * Submit a CSW read and go to STAT state.
1356 */
1357static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1358{
1359
1360 if (__ub_state_stat(sc, cmd) != 0)
1361 return;
1362
1363 cmd->stat_count = 0;
1364 cmd->state = UB_CMDST_STAT;
1365}
1366
1367/*
1368 * Factorization helper for the command state machine:
1369 * Submit a CSW read and go to STAT state with counter (along [C] path).
1370 */
1371static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1372{
1373
1374 if (++cmd->stat_count >= 4) {
1375 ub_state_sense(sc, cmd);
1376 return;
1377 }
1378
1379 if (__ub_state_stat(sc, cmd) != 0)
1380 return;
1381
1382 cmd->state = UB_CMDST_STAT;
1383}
1384
1385/*
1386 * Factorization helper for the command state machine:
1387 * Submit a REQUEST SENSE and go to SENSE state.
1388 */
1389static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1390{
1391 struct ub_scsi_cmd *scmd;
1392 struct scatterlist *sg;
1393 int rc;
1394
1395 if (cmd->cdb[0] == REQUEST_SENSE) {
1396 rc = -EPIPE;
1397 goto error;
1398 }
1399
1400 scmd = &sc->top_rqs_cmd;
1401 memset(scmd, 0, sizeof(struct ub_scsi_cmd));
1402 scmd->cdb[0] = REQUEST_SENSE;
1403 scmd->cdb[4] = UB_SENSE_SIZE;
1404 scmd->cdb_len = 6;
1405 scmd->dir = UB_DIR_READ;
1406 scmd->state = UB_CMDST_INIT;
1407 scmd->nsg = 1;
1408 sg = &scmd->sgv[0];
1409 sg_init_table(sg, UB_MAX_REQ_SG);
1410 sg_set_page(sg, virt_to_page(sc->top_sense), UB_SENSE_SIZE,
1411 (unsigned long)sc->top_sense & (PAGE_SIZE-1));
1412 scmd->len = UB_SENSE_SIZE;
1413 scmd->lun = cmd->lun;
1414 scmd->done = ub_top_sense_done;
1415 scmd->back = cmd;
1416
1417 scmd->tag = sc->tagcnt++;
1418
1419 cmd->state = UB_CMDST_SENSE;
1420
1421 ub_cmdq_insert(sc, scmd);
1422 return;
1423
1424error:
1425 ub_state_done(sc, cmd, rc);
1426}
1427
1428/*
1429 * A helper for the command's state machine:
1430 * Submit a stall clear.
1431 */
1432static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd,
1433 int stalled_pipe)
1434{
1435 int endp;
1436 struct usb_ctrlrequest *cr;
1437 int rc;
1438
1439 endp = usb_pipeendpoint(stalled_pipe);
1440 if (usb_pipein (stalled_pipe))
1441 endp |= USB_DIR_IN;
1442
1443 cr = &sc->work_cr;
1444 cr->bRequestType = USB_RECIP_ENDPOINT;
1445 cr->bRequest = USB_REQ_CLEAR_FEATURE;
1446 cr->wValue = cpu_to_le16(USB_ENDPOINT_HALT);
1447 cr->wIndex = cpu_to_le16(endp);
1448 cr->wLength = cpu_to_le16(0);
1449
1450 UB_INIT_COMPLETION(sc->work_done);
1451
1452 usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe,
1453 (unsigned char*) cr, NULL, 0, ub_urb_complete, sc);
1454
1455 if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
1456 ub_complete(&sc->work_done);
1457 return rc;
1458 }
1459
1460 sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT;
1461 add_timer(&sc->work_timer);
1462 return 0;
1463}
1464
1465/*
1466 */
1467static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd)
1468{
1469 unsigned char *sense = sc->top_sense;
1470 struct ub_scsi_cmd *cmd;
1471
1472 /*
1473 * Find the command which triggered the unit attention or a check,
1474 * save the sense into it, and advance its state machine.
1475 */
1476 if ((cmd = ub_cmdq_peek(sc)) == NULL) {
1477 printk(KERN_WARNING "%s: sense done while idle\n", sc->name);
1478 return;
1479 }
1480 if (cmd != scmd->back) {
1481 printk(KERN_WARNING "%s: "
1482 "sense done for wrong command 0x%x\n",
1483 sc->name, cmd->tag);
1484 return;
1485 }
1486 if (cmd->state != UB_CMDST_SENSE) {
1487 printk(KERN_WARNING "%s: sense done with bad cmd state %d\n",
1488 sc->name, cmd->state);
1489 return;
1490 }
1491
1492 /*
1493 * Ignoring scmd->act_len, because the buffer was pre-zeroed.
1494 */
1495 cmd->key = sense[2] & 0x0F;
1496 cmd->asc = sense[12];
1497 cmd->ascq = sense[13];
1498
1499 ub_scsi_urb_compl(sc, cmd);
1500}
1501
1502/*
1503 * Reset management
1504 */
1505
1506static void ub_reset_enter(struct ub_dev *sc, int try)
1507{
1508
1509 if (sc->reset) {
1510 /* This happens often on multi-LUN devices. */
1511 return;
1512 }
1513 sc->reset = try + 1;
1514
1515#if 0 /* Not needed because the disconnect waits for us. */
1516 unsigned long flags;
1517 spin_lock_irqsave(&ub_lock, flags);
1518 sc->openc++;
1519 spin_unlock_irqrestore(&ub_lock, flags);
1520#endif
1521
1522#if 0 /* We let them stop themselves. */
1523 struct ub_lun *lun;
1524 list_for_each_entry(lun, &sc->luns, link) {
1525 blk_stop_queue(lun->disk->queue);
1526 }
1527#endif
1528
1529 schedule_work(&sc->reset_work);
1530}
1531
1532static void ub_reset_task(struct work_struct *work)
1533{
1534 struct ub_dev *sc = container_of(work, struct ub_dev, reset_work);
1535 unsigned long flags;
1536 struct ub_lun *lun;
1537 int rc;
1538
1539 if (!sc->reset) {
1540 printk(KERN_WARNING "%s: Running reset unrequested\n",
1541 sc->name);
1542 return;
1543 }
1544
1545 if (atomic_read(&sc->poison)) {
1546 ;
1547 } else if ((sc->reset & 1) == 0) {
1548 ub_sync_reset(sc);
1549 msleep(700); /* usb-storage sleeps 6s (!) */
1550 ub_probe_clear_stall(sc, sc->recv_bulk_pipe);
1551 ub_probe_clear_stall(sc, sc->send_bulk_pipe);
1552 } else if (sc->dev->actconfig->desc.bNumInterfaces != 1) {
1553 ;
1554 } else {
1555 rc = usb_lock_device_for_reset(sc->dev, sc->intf);
1556 if (rc < 0) {
1557 printk(KERN_NOTICE
1558 "%s: usb_lock_device_for_reset failed (%d)\n",
1559 sc->name, rc);
1560 } else {
1561 rc = usb_reset_device(sc->dev);
1562 if (rc < 0) {
1563 printk(KERN_NOTICE "%s: "
1564 "usb_lock_device_for_reset failed (%d)\n",
1565 sc->name, rc);
1566 }
1567 usb_unlock_device(sc->dev);
1568 }
1569 }
1570
1571 /*
1572 * In theory, no commands can be running while reset is active,
1573 * so nobody can ask for another reset, and so we do not need any
1574 * queues of resets or anything. We do need a spinlock though,
1576 * to interact with the block layer.
1576 */
1577 spin_lock_irqsave(sc->lock, flags);
1578 sc->reset = 0;
1579 tasklet_schedule(&sc->tasklet);
1580 list_for_each_entry(lun, &sc->luns, link) {
1581 blk_start_queue(lun->disk->queue);
1582 }
1583 wake_up(&sc->reset_wait);
1584 spin_unlock_irqrestore(sc->lock, flags);
1585}
1586
1587/*
1588 * XXX Reset brackets are too much hassle to implement, so just stub them
1589 * in order to prevent forced unbinding (which deadlocks solid when our
1590 * ->disconnect method waits for the reset to complete and this kills keventd).
1591 *
1592 * XXX Tell Alan to move usb_unlock_device inside of usb_reset_device,
1593 * or else the post_reset is invoked, and restarts I/O on a locked device.
1594 */
1595static int ub_pre_reset(struct usb_interface *iface) {
1596 return 0;
1597}
1598
1599static int ub_post_reset(struct usb_interface *iface) {
1600 return 0;
1601}
1602
1603/*
1604 * This is called from a process context.
1605 */
1606static void ub_revalidate(struct ub_dev *sc, struct ub_lun *lun)
1607{
1608
1609 lun->readonly = 0; /* XXX Query this from the device */
1610
1611 lun->capacity.nsec = 0;
1612 lun->capacity.bsize = 512;
1613 lun->capacity.bshift = 0;
1614
1615 if (ub_sync_tur(sc, lun) != 0)
1616 return; /* Not ready */
1617 lun->changed = 0;
1618
1619 if (ub_sync_read_cap(sc, lun, &lun->capacity) != 0) {
1620 /*
1621 * The retry here means something is wrong, either with the
1622 * device, with the transport, or with our code.
1623 * We keep this because sd.c has retries for capacity.
1624 */
1625 if (ub_sync_read_cap(sc, lun, &lun->capacity) != 0) {
1626 lun->capacity.nsec = 0;
1627 lun->capacity.bsize = 512;
1628 lun->capacity.bshift = 0;
1629 }
1630 }
1631}
1632
1633/*
1634 * The open function.
1635 * This is mostly needed to keep refcounting, but also to support
1636 * media checks on removable media drives.
1637 */
1638static int ub_bd_open(struct block_device *bdev, fmode_t mode)
1639{
1640 struct ub_lun *lun = bdev->bd_disk->private_data;
1641 struct ub_dev *sc = lun->udev;
1642 unsigned long flags;
1643 int rc;
1644
1645 spin_lock_irqsave(&ub_lock, flags);
1646 if (atomic_read(&sc->poison)) {
1647 spin_unlock_irqrestore(&ub_lock, flags);
1648 return -ENXIO;
1649 }
1650 sc->openc++;
1651 spin_unlock_irqrestore(&ub_lock, flags);
1652
1653 if (lun->removable || lun->readonly)
1654 check_disk_change(bdev);
1655
1656 /*
1657 * The sd.c considers ->media_present and ->changed not equivalent,
1658 * under some pretty murky conditions (a failure of READ CAPACITY).
1659 * We may need it one day.
1660 */
1661 if (lun->removable && lun->changed && !(mode & FMODE_NDELAY)) {
1662 rc = -ENOMEDIUM;
1663 goto err_open;
1664 }
1665
1666 if (lun->readonly && (mode & FMODE_WRITE)) {
1667 rc = -EROFS;
1668 goto err_open;
1669 }
1670
1671 return 0;
1672
1673err_open:
1674 ub_put(sc);
1675 return rc;
1676}
1677
1678static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode)
1679{
1680 int ret;
1681
1682 mutex_lock(&ub_mutex);
1683 ret = ub_bd_open(bdev, mode);
1684 mutex_unlock(&ub_mutex);
1685
1686 return ret;
1687}
1688
1689
1690/*
1691 */
1692static int ub_bd_release(struct gendisk *disk, fmode_t mode)
1693{
1694 struct ub_lun *lun = disk->private_data;
1695 struct ub_dev *sc = lun->udev;
1696
1697 mutex_lock(&ub_mutex);
1698 ub_put(sc);
1699 mutex_unlock(&ub_mutex);
1700
1701 return 0;
1702}
1703
1704/*
1705 * The ioctl interface.
1706 */
1707static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
1708 unsigned int cmd, unsigned long arg)
1709{
1710 void __user *usermem = (void __user *) arg;
1711 int ret;
1712
1713 mutex_lock(&ub_mutex);
1714 ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem);
1715 mutex_unlock(&ub_mutex);
1716
1717 return ret;
1718}
1719
1720/*
1721 * This is called by check_disk_change if we reported a media change.
1722 * The main objective here is to discover the features of the media such as
1723 * the capacity, read-only status, etc. USB storage generally does not
1724 * need to be spun up, but if we needed it, this would be the place.
1725 *
1726 * This call can sleep.
1727 *
1728 * The return code is not used.
1729 */
1730static int ub_bd_revalidate(struct gendisk *disk)
1731{
1732 struct ub_lun *lun = disk->private_data;
1733
1734 ub_revalidate(lun->udev, lun);
1735
1736 /* XXX Support sector size switching like in sr.c */
1737 blk_queue_logical_block_size(disk->queue, lun->capacity.bsize);
1738 set_capacity(disk, lun->capacity.nsec);
1739 // set_disk_ro(sdkp->disk, lun->readonly);
1740
1741 return 0;
1742}
1743
1744/*
1745 * The check is called by the block layer to verify if the media
1746 * is still available. It is supposed to be harmless, lightweight and
1747 * non-intrusive in case the media was not changed.
1748 *
1749 * This call can sleep.
1750 *
1751 * The return value is a DISK_EVENT_* mask, not a bool.
1752 */
1753static unsigned int ub_bd_check_events(struct gendisk *disk,
1754 unsigned int clearing)
1755{
1756 struct ub_lun *lun = disk->private_data;
1757
1758 if (!lun->removable)
1759 return 0;
1760
1761 /*
1762 * We always clear checks after every command, so this is not
1763 * as dangerous as it looks. If the TEST_UNIT_READY fails here,
1764 * the device is actually not ready, and operator or software
1765 * intervention is required. One dangerous item might be a drive which
1766 * spins itself down, and come the time to write dirty pages, this
1767 * will fail, and the block layer then discards the data. Since we never
1768 * spin drives up, such devices simply cannot be used with ub anyway.
1769 */
1770 if (ub_sync_tur(lun->udev, lun) != 0) {
1771 lun->changed = 1;
1772 return DISK_EVENT_MEDIA_CHANGE;
1773 }
1774
1775 return lun->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
1776}
1777
1778static const struct block_device_operations ub_bd_fops = {
1779 .owner = THIS_MODULE,
1780 .open = ub_bd_unlocked_open,
1781 .release = ub_bd_release,
1782 .ioctl = ub_bd_ioctl,
1783 .check_events = ub_bd_check_events,
1784 .revalidate_disk = ub_bd_revalidate,
1785};
1786
1787/*
1788 * Common ->done routine for commands executed synchronously.
1789 */
1790static void ub_probe_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1791{
1792 struct completion *cop = cmd->back;
1793 complete(cop);
1794}
1795
1796/*
1797 * Test if the device has a check condition on it, synchronously.
1798 */
1799static int ub_sync_tur(struct ub_dev *sc, struct ub_lun *lun)
1800{
1801 struct ub_scsi_cmd *cmd;
1802 enum { ALLOC_SIZE = sizeof(struct ub_scsi_cmd) };
1803 unsigned long flags;
1804 struct completion compl;
1805 int rc;
1806
1807 init_completion(&compl);
1808
1809 rc = -ENOMEM;
1810 if ((cmd = kzalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL)
1811 goto err_alloc;
1812
1813 cmd->cdb[0] = TEST_UNIT_READY;
1814 cmd->cdb_len = 6;
1815 cmd->dir = UB_DIR_NONE;
1816 cmd->state = UB_CMDST_INIT;
1817 cmd->lun = lun; /* This may be NULL, but that's ok */
1818 cmd->done = ub_probe_done;
1819 cmd->back = &compl;
1820
1821 spin_lock_irqsave(sc->lock, flags);
1822 cmd->tag = sc->tagcnt++;
1823
1824 rc = ub_submit_scsi(sc, cmd);
1825 spin_unlock_irqrestore(sc->lock, flags);
1826
1827 if (rc != 0)
1828 goto err_submit;
1829
1830 wait_for_completion(&compl);
1831
1832 rc = cmd->error;
1833
1834 if (rc == -EIO && cmd->key != 0) /* Retries for benh's key */
1835 rc = cmd->key;
1836
1837err_submit:
1838 kfree(cmd);
1839err_alloc:
1840 return rc;
1841}
1842
1843/*
1844 * Read the SCSI capacity synchronously (for probing).
1845 */
1846static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun,
1847 struct ub_capacity *ret)
1848{
1849 struct ub_scsi_cmd *cmd;
1850 struct scatterlist *sg;
1851 char *p;
1852 enum { ALLOC_SIZE = sizeof(struct ub_scsi_cmd) + 8 };
1853 unsigned long flags;
1854 unsigned int bsize, shift;
1855 unsigned long nsec;
1856 struct completion compl;
1857 int rc;
1858
1859 init_completion(&compl);
1860
1861 rc = -ENOMEM;
1862 if ((cmd = kzalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL)
1863 goto err_alloc;
1864 p = (char *)cmd + sizeof(struct ub_scsi_cmd);
1865
1866 cmd->cdb[0] = 0x25;
1867 cmd->cdb_len = 10;
1868 cmd->dir = UB_DIR_READ;
1869 cmd->state = UB_CMDST_INIT;
1870 cmd->nsg = 1;
1871 sg = &cmd->sgv[0];
1872 sg_init_table(sg, UB_MAX_REQ_SG);
1873 sg_set_page(sg, virt_to_page(p), 8, (unsigned long)p & (PAGE_SIZE-1));
1874 cmd->len = 8;
1875 cmd->lun = lun;
1876 cmd->done = ub_probe_done;
1877 cmd->back = &compl;
1878
1879 spin_lock_irqsave(sc->lock, flags);
1880 cmd->tag = sc->tagcnt++;
1881
1882 rc = ub_submit_scsi(sc, cmd);
1883 spin_unlock_irqrestore(sc->lock, flags);
1884
1885 if (rc != 0)
1886 goto err_submit;
1887
1888 wait_for_completion(&compl);
1889
1890 if (cmd->error != 0) {
1891 rc = -EIO;
1892 goto err_read;
1893 }
1894 if (cmd->act_len != 8) {
1895 rc = -EIO;
1896 goto err_read;
1897 }
1898
1899 /* sd.c special-cases sector size of 0 to mean 512. Needed? Safe? */
1900 nsec = be32_to_cpu(*(__be32 *)p) + 1;
1901 bsize = be32_to_cpu(*(__be32 *)(p + 4));
1902 switch (bsize) {
1903 case 512: shift = 0; break;
1904 case 1024: shift = 1; break;
1905 case 2048: shift = 2; break;
1906 case 4096: shift = 3; break;
1907 default:
1908 rc = -EDOM;
1909 goto err_inv_bsize;
1910 }
1911
1912 ret->bsize = bsize;
1913 ret->bshift = shift;
1914 ret->nsec = nsec << shift;
1915 rc = 0;
1916
1917err_inv_bsize:
1918err_read:
1919err_submit:
1920 kfree(cmd);
1921err_alloc:
1922 return rc;
1923}
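/*
 * A rough worked example of the decode above, with illustrative numbers
 * only: a device answering READ CAPACITY(10) with last LBA 0x000fffff and
 * a block size of 2048 gives nsec = 0x100000 device blocks and shift = 2,
 * so the capacity reported to the block layer is nsec << 2 = 0x400000
 * 512-byte sectors (2 GiB).
 */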
1924
1925/*
1926 */
1927static void ub_probe_urb_complete(struct urb *urb)
1928{
1929 struct completion *cop = urb->context;
1930 complete(cop);
1931}
1932
1933static void ub_probe_timeout(unsigned long arg)
1934{
1935 struct completion *cop = (struct completion *) arg;
1936 complete(cop);
1937}
1938
1939/*
1940 * Reset with a Bulk reset.
1941 */
1942static int ub_sync_reset(struct ub_dev *sc)
1943{
1944 int ifnum = sc->intf->cur_altsetting->desc.bInterfaceNumber;
1945 struct usb_ctrlrequest *cr;
1946 struct completion compl;
1947 struct timer_list timer;
1948 int rc;
1949
1950 init_completion(&compl);
1951
1952 cr = &sc->work_cr;
1953 cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE;
1954 cr->bRequest = US_BULK_RESET_REQUEST;
1955 cr->wValue = cpu_to_le16(0);
1956 cr->wIndex = cpu_to_le16(ifnum);
1957 cr->wLength = cpu_to_le16(0);
1958
1959 usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe,
1960 (unsigned char*) cr, NULL, 0, ub_probe_urb_complete, &compl);
1961
1962 if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) {
1963 printk(KERN_WARNING
1964 "%s: Unable to submit a bulk reset (%d)\n", sc->name, rc);
1965 return rc;
1966 }
1967
1968 init_timer(&timer);
1969 timer.function = ub_probe_timeout;
1970 timer.data = (unsigned long) &compl;
1971 timer.expires = jiffies + UB_CTRL_TIMEOUT;
1972 add_timer(&timer);
1973
1974 wait_for_completion(&compl);
1975
1976 del_timer_sync(&timer);
1977 usb_kill_urb(&sc->work_urb);
1978
1979 return sc->work_urb.status;
1980}
1981
1982/*
1983 * Get number of LUNs by the way of Bulk GetMaxLUN command.
1984 */
1985static int ub_sync_getmaxlun(struct ub_dev *sc)
1986{
1987 int ifnum = sc->intf->cur_altsetting->desc.bInterfaceNumber;
1988 unsigned char *p;
1989 enum { ALLOC_SIZE = 1 };
1990 struct usb_ctrlrequest *cr;
1991 struct completion compl;
1992 struct timer_list timer;
1993 int nluns;
1994 int rc;
1995
1996 init_completion(&compl);
1997
1998 rc = -ENOMEM;
1999 if ((p = kmalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL)
2000 goto err_alloc;
2001 *p = 55;
2002
2003 cr = &sc->work_cr;
2004 cr->bRequestType = USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE;
2005 cr->bRequest = US_BULK_GET_MAX_LUN;
2006 cr->wValue = cpu_to_le16(0);
2007 cr->wIndex = cpu_to_le16(ifnum);
2008 cr->wLength = cpu_to_le16(1);
2009
2010 usb_fill_control_urb(&sc->work_urb, sc->dev, sc->recv_ctrl_pipe,
2011 (unsigned char*) cr, p, 1, ub_probe_urb_complete, &compl);
2012
2013 if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0)
2014 goto err_submit;
2015
2016 init_timer(&timer);
2017 timer.function = ub_probe_timeout;
2018 timer.data = (unsigned long) &compl;
2019 timer.expires = jiffies + UB_CTRL_TIMEOUT;
2020 add_timer(&timer);
2021
2022 wait_for_completion(&compl);
2023
2024 del_timer_sync(&timer);
2025 usb_kill_urb(&sc->work_urb);
2026
2027 if ((rc = sc->work_urb.status) < 0)
2028 goto err_io;
2029
2030 if (sc->work_urb.actual_length != 1) {
2031 nluns = 0;
2032 } else {
2033 if ((nluns = *p) == 55) {
2034 nluns = 0;
2035 } else {
2036 /* GetMaxLUN returns the maximum LUN number */
2037 nluns += 1;
2038 if (nluns > UB_MAX_LUNS)
2039 nluns = UB_MAX_LUNS;
2040 }
2041 }
2042
2043 kfree(p);
2044 return nluns;
2045
2046err_io:
2047err_submit:
2048 kfree(p);
2049err_alloc:
2050 return rc;
2051}
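/*
 * For illustration: GET MAX LUN returns the highest LUN number rather than
 * a count, so a device answering 1 is probed as nluns = 2 (LUN 0 and
 * LUN 1). A zero-length reply or the untouched 55 marker makes this
 * function return 0 and the caller falls back to a single LUN, and
 * anything above UB_MAX_LUNS is clamped.
 */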
2052
2053/*
2054 * Clear initial stalls.
2055 */
2056static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe)
2057{
2058 int endp;
2059 struct usb_ctrlrequest *cr;
2060 struct completion compl;
2061 struct timer_list timer;
2062 int rc;
2063
2064 init_completion(&compl);
2065
2066 endp = usb_pipeendpoint(stalled_pipe);
2067 if (usb_pipein (stalled_pipe))
2068 endp |= USB_DIR_IN;
2069
2070 cr = &sc->work_cr;
2071 cr->bRequestType = USB_RECIP_ENDPOINT;
2072 cr->bRequest = USB_REQ_CLEAR_FEATURE;
2073 cr->wValue = cpu_to_le16(USB_ENDPOINT_HALT);
2074 cr->wIndex = cpu_to_le16(endp);
2075 cr->wLength = cpu_to_le16(0);
2076
2077 usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe,
2078 (unsigned char*) cr, NULL, 0, ub_probe_urb_complete, &compl);
2079
2080 if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) {
2081 printk(KERN_WARNING
2082 "%s: Unable to submit a probe clear (%d)\n", sc->name, rc);
2083 return rc;
2084 }
2085
2086 init_timer(&timer);
2087 timer.function = ub_probe_timeout;
2088 timer.data = (unsigned long) &compl;
2089 timer.expires = jiffies + UB_CTRL_TIMEOUT;
2090 add_timer(&timer);
2091
2092 wait_for_completion(&compl);
2093
2094 del_timer_sync(&timer);
2095 usb_kill_urb(&sc->work_urb);
2096
2097 usb_reset_endpoint(sc->dev, endp);
2098
2099 return 0;
2100}
2101
2102/*
2103 * Get the pipe settings.
2104 */
2105static int ub_get_pipes(struct ub_dev *sc, struct usb_device *dev,
2106 struct usb_interface *intf)
2107{
2108 struct usb_host_interface *altsetting = intf->cur_altsetting;
2109 struct usb_endpoint_descriptor *ep_in = NULL;
2110 struct usb_endpoint_descriptor *ep_out = NULL;
2111 struct usb_endpoint_descriptor *ep;
2112 int i;
2113
2114 /*
2115 * Find the endpoints we need.
2116 * We are expecting a minimum of 2 endpoints - in and out (bulk).
2117 * We will ignore any others.
2118 */
2119 for (i = 0; i < altsetting->desc.bNumEndpoints; i++) {
2120 ep = &altsetting->endpoint[i].desc;
2121
2122 /* Is it a BULK endpoint? */
2123 if (usb_endpoint_xfer_bulk(ep)) {
2124 /* BULK in or out? */
2125 if (usb_endpoint_dir_in(ep)) {
2126 if (ep_in == NULL)
2127 ep_in = ep;
2128 } else {
2129 if (ep_out == NULL)
2130 ep_out = ep;
2131 }
2132 }
2133 }
2134
2135 if (ep_in == NULL || ep_out == NULL) {
2136 printk(KERN_NOTICE "%s: failed endpoint check\n", sc->name);
2137 return -ENODEV;
2138 }
2139
2140 /* Calculate and store the pipe values */
2141 sc->send_ctrl_pipe = usb_sndctrlpipe(dev, 0);
2142 sc->recv_ctrl_pipe = usb_rcvctrlpipe(dev, 0);
2143 sc->send_bulk_pipe = usb_sndbulkpipe(dev,
2144 usb_endpoint_num(ep_out));
2145 sc->recv_bulk_pipe = usb_rcvbulkpipe(dev,
2146 usb_endpoint_num(ep_in));
2147
2148 return 0;
2149}
2150
2151/*
2152 * Probing is done in the process context, which allows us to cheat
2153 * and not build a state machine for the discovery.
2154 */
2155static int ub_probe(struct usb_interface *intf,
2156 const struct usb_device_id *dev_id)
2157{
2158 struct ub_dev *sc;
2159 int nluns;
2160 int rc;
2161 int i;
2162
2163 if (usb_usual_check_type(dev_id, USB_US_TYPE_UB))
2164 return -ENXIO;
2165
2166 rc = -ENOMEM;
2167 if ((sc = kzalloc(sizeof(struct ub_dev), GFP_KERNEL)) == NULL)
2168 goto err_core;
2169 sc->lock = ub_next_lock();
2170 INIT_LIST_HEAD(&sc->luns);
2171 usb_init_urb(&sc->work_urb);
2172 tasklet_init(&sc->tasklet, ub_scsi_action, (unsigned long)sc);
2173 atomic_set(&sc->poison, 0);
2174 INIT_WORK(&sc->reset_work, ub_reset_task);
2175 init_waitqueue_head(&sc->reset_wait);
2176
2177 init_timer(&sc->work_timer);
2178 sc->work_timer.data = (unsigned long) sc;
2179 sc->work_timer.function = ub_urb_timeout;
2180
2181 ub_init_completion(&sc->work_done);
2182 sc->work_done.done = 1; /* A little yuk, but oh well... */
2183
2184 sc->dev = interface_to_usbdev(intf);
2185 sc->intf = intf;
2186 // sc->ifnum = intf->cur_altsetting->desc.bInterfaceNumber;
2187 usb_set_intfdata(intf, sc);
2188 usb_get_dev(sc->dev);
2189 /*
2190 * Since we give the interface struct to the block level through
2191 * disk->driverfs_dev, we have to pin it. Otherwise, block_uevent
2192 * oopses on close after a disconnect (kernels 2.6.16 and up).
2193 */
2194 usb_get_intf(sc->intf);
2195
2196 snprintf(sc->name, 12, DRV_NAME "(%d.%d)",
2197 sc->dev->bus->busnum, sc->dev->devnum);
2198
2199 /* XXX Verify that we can handle the device (from descriptors) */
2200
2201 if (ub_get_pipes(sc, sc->dev, intf) != 0)
2202 goto err_dev_desc;
2203
2204 /*
2205 * At this point, all USB initialization is done, do upper layer.
2206 * We really hate halfway initialized structures, so from the
2207 * invariants perspective, this ub_dev is fully constructed at
2208 * this point.
2209 */
2210
2211 /*
2212 * This is needed to clear toggles. It is a problem only if we do
2213 * `rmmod ub && modprobe ub` without disconnects, but we like that.
2214 */
2215#if 0 /* iPod Mini fails if we do this (big white iPod works) */
2216 ub_probe_clear_stall(sc, sc->recv_bulk_pipe);
2217 ub_probe_clear_stall(sc, sc->send_bulk_pipe);
2218#endif
2219
2220 /*
2221 * The way this is used by the startup code is a little specific.
2222 * A SCSI check causes a USB stall. Our common case code sees it
2223 * and clears the check, after which the device is ready for use.
2224 * But if a check was not present, any command other than
2225 * TEST_UNIT_READY ends with a lockup (including REQUEST_SENSE).
2226 *
2227 * If we neglect to clear the SCSI check, the first real command fails
2228 * (which is the capacity readout). We clear that and retry, but why
2229 * cause spurious retries for no reason?
2230 *
2231 * Revalidation may start with its own TEST_UNIT_READY, but that one
2232 * has to succeed, so we clear checks with an additional one here.
2233 * In any case it's not our business how revalidation is implemented.
2234 */
2235 for (i = 0; i < 3; i++) { /* Retries for the schwag key from KS'04 */
2236 if ((rc = ub_sync_tur(sc, NULL)) <= 0) break;
2237 if (rc != 0x6) break;
2238 msleep(10);
2239 }
2240
2241 nluns = 1;
2242 for (i = 0; i < 3; i++) {
2243 if ((rc = ub_sync_getmaxlun(sc)) < 0)
2244 break;
2245 if (rc != 0) {
2246 nluns = rc;
2247 break;
2248 }
2249 msleep(100);
2250 }
2251
2252 for (i = 0; i < nluns; i++) {
2253 ub_probe_lun(sc, i);
2254 }
2255 return 0;
2256
2257err_dev_desc:
2258 usb_set_intfdata(intf, NULL);
2259 usb_put_intf(sc->intf);
2260 usb_put_dev(sc->dev);
2261 kfree(sc);
2262err_core:
2263 return rc;
2264}
2265
2266static int ub_probe_lun(struct ub_dev *sc, int lnum)
2267{
2268 struct ub_lun *lun;
2269 struct request_queue *q;
2270 struct gendisk *disk;
2271 int rc;
2272
2273 rc = -ENOMEM;
2274 if ((lun = kzalloc(sizeof(struct ub_lun), GFP_KERNEL)) == NULL)
2275 goto err_alloc;
2276 lun->num = lnum;
2277
2278 rc = -ENOSR;
2279 if ((lun->id = ub_id_get()) == -1)
2280 goto err_id;
2281
2282 lun->udev = sc;
2283
2284 snprintf(lun->name, 16, DRV_NAME "%c(%d.%d.%d)",
2285 lun->id + 'a', sc->dev->bus->busnum, sc->dev->devnum, lun->num);
2286
2287 lun->removable = 1; /* XXX Query this from the device */
2288 lun->changed = 1; /* ub_revalidate clears only */
2289 ub_revalidate(sc, lun);
2290
2291 rc = -ENOMEM;
2292 if ((disk = alloc_disk(UB_PARTS_PER_LUN)) == NULL)
2293 goto err_diskalloc;
2294
2295 sprintf(disk->disk_name, DRV_NAME "%c", lun->id + 'a');
2296 disk->major = UB_MAJOR;
2297 disk->first_minor = lun->id * UB_PARTS_PER_LUN;
2298 disk->fops = &ub_bd_fops;
2299 disk->private_data = lun;
2300 disk->driverfs_dev = &sc->intf->dev;
2301
2302 rc = -ENOMEM;
2303 if ((q = blk_init_queue(ub_request_fn, sc->lock)) == NULL)
2304 goto err_blkqinit;
2305
2306 disk->queue = q;
2307
2308 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
2309 blk_queue_max_segments(q, UB_MAX_REQ_SG);
2310 blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */
2311 blk_queue_max_hw_sectors(q, UB_MAX_SECTORS);
2312 blk_queue_logical_block_size(q, lun->capacity.bsize);
2313
2314 lun->disk = disk;
2315 q->queuedata = lun;
2316 list_add(&lun->link, &sc->luns);
2317
2318 set_capacity(disk, lun->capacity.nsec);
2319 if (lun->removable)
2320 disk->flags |= GENHD_FL_REMOVABLE;
2321
2322 add_disk(disk);
2323
2324 return 0;
2325
2326err_blkqinit:
2327 put_disk(disk);
2328err_diskalloc:
2329 ub_id_put(lun->id);
2330err_id:
2331 kfree(lun);
2332err_alloc:
2333 return rc;
2334}
2335
2336static void ub_disconnect(struct usb_interface *intf)
2337{
2338 struct ub_dev *sc = usb_get_intfdata(intf);
2339 struct ub_lun *lun;
2340 unsigned long flags;
2341
2342 /*
2343 * Prevent ub_bd_release from pulling the rug from under us.
2344 * XXX This is starting to look like a kref.
2345 * XXX Why not take this ref at probe time?
2346 */
2347 spin_lock_irqsave(&ub_lock, flags);
2348 sc->openc++;
2349 spin_unlock_irqrestore(&ub_lock, flags);
2350
2351 /*
2352 * Fence stall clearings, operations triggered by unlinkings and so on.
2353 * We do not attempt to unlink any URBs, because we do not trust the
2354 * unlink paths in HC drivers. Also, we get -84 upon disconnect anyway.
2355 */
2356 atomic_set(&sc->poison, 1);
2357
2358 /*
2359 * Wait for reset to end, if any.
2360 */
2361 wait_event(sc->reset_wait, !sc->reset);
2362
2363 /*
2364 * Blow away queued commands.
2365 *
2366 * Actually, this never works, because before we get here
2367 * the HCD terminates outstanding URB(s). It causes our
2368 * SCSI command queue to advance, commands fail to submit,
2369 * and the whole queue drains. So, we just use this code to
2370 * print warnings.
2371 */
2372 spin_lock_irqsave(sc->lock, flags);
2373 {
2374 struct ub_scsi_cmd *cmd;
2375 int cnt = 0;
2376 while ((cmd = ub_cmdq_peek(sc)) != NULL) {
2377 cmd->error = -ENOTCONN;
2378 cmd->state = UB_CMDST_DONE;
2379 ub_cmdq_pop(sc);
2380 (*cmd->done)(sc, cmd);
2381 cnt++;
2382 }
2383 if (cnt != 0) {
2384 printk(KERN_WARNING "%s: "
2385 "%d was queued after shutdown\n", sc->name, cnt);
2386 }
2387 }
2388 spin_unlock_irqrestore(sc->lock, flags);
2389
2390 /*
2391 * Unregister the upper layer.
2392 */
2393 list_for_each_entry(lun, &sc->luns, link) {
2394 del_gendisk(lun->disk);
2395 /*
2396 * I wish I could do:
2397 * queue_flag_set(QUEUE_FLAG_DEAD, q);
2398 * As it is, we rely on our internal poisoning and let
2399 * the upper levels spin furiously, failing all the I/O.
2400 */
2401 }
2402
2403 /*
2404 * Testing for -EINPROGRESS is always a bug, so we are bending
2405 * the rules a little.
2406 */
2407 spin_lock_irqsave(sc->lock, flags);
2408 if (sc->work_urb.status == -EINPROGRESS) { /* janitors: ignore */
2409 printk(KERN_WARNING "%s: "
2410 "URB is active after disconnect\n", sc->name);
2411 }
2412 spin_unlock_irqrestore(sc->lock, flags);
2413
2414 /*
2414 * There is virtually no chance that another CPU runs a timeout so long
2415 * after ub_urb_complete should have called del_timer, but only if the HCD
2417 * didn't forget to deliver a callback on unlink.
2418 */
2419 del_timer_sync(&sc->work_timer);
2420
2421 /*
2422 * At this point there must be no commands coming from anyone
2423 * and no URBs left in transit.
2424 */
2425
2426 ub_put(sc);
2427}
2428
2429static struct usb_driver ub_driver = {
2430 .name = "ub",
2431 .probe = ub_probe,
2432 .disconnect = ub_disconnect,
2433 .id_table = ub_usb_ids,
2434 .pre_reset = ub_pre_reset,
2435 .post_reset = ub_post_reset,
2436};
2437
2438static int __init ub_init(void)
2439{
2440 int rc;
2441 int i;
2442
2443 pr_info("'Low Performance USB Block' driver is deprecated. "
2444 "Please switch to usb-storage\n");
2445 for (i = 0; i < UB_QLOCK_NUM; i++)
2446 spin_lock_init(&ub_qlockv[i]);
2447
2448 if ((rc = register_blkdev(UB_MAJOR, DRV_NAME)) != 0)
2449 goto err_regblkdev;
2450
2451 if ((rc = usb_register(&ub_driver)) != 0)
2452 goto err_register;
2453
2454 usb_usual_set_present(USB_US_TYPE_UB);
2455 return 0;
2456
2457err_register:
2458 unregister_blkdev(UB_MAJOR, DRV_NAME);
2459err_regblkdev:
2460 return rc;
2461}
2462
2463static void __exit ub_exit(void)
2464{
2465 usb_deregister(&ub_driver);
2466
2467 unregister_blkdev(UB_MAJOR, DRV_NAME);
2468 usb_usual_clear_present(USB_US_TYPE_UB);
2469}
2470
2471module_init(ub_init);
2472module_exit(ub_exit);
2473
2474MODULE_LICENSE("GPL");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c0bbeb470754..0bdde8fba397 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -14,6 +14,9 @@
14 14
15#define PART_BITS 4 15#define PART_BITS 4
16 16
17static bool use_bio;
18module_param(use_bio, bool, S_IRUGO);
19
17static int major; 20static int major;
18static DEFINE_IDA(vd_index_ida); 21static DEFINE_IDA(vd_index_ida);
19 22
@@ -23,6 +26,7 @@ struct virtio_blk
23{ 26{
24 struct virtio_device *vdev; 27 struct virtio_device *vdev;
25 struct virtqueue *vq; 28 struct virtqueue *vq;
29 wait_queue_head_t queue_wait;
26 30
27 /* The disk structure for the kernel. */ 31 /* The disk structure for the kernel. */
28 struct gendisk *disk; 32 struct gendisk *disk;
@@ -51,53 +55,244 @@ struct virtio_blk
51struct virtblk_req 55struct virtblk_req
52{ 56{
53 struct request *req; 57 struct request *req;
58 struct bio *bio;
54 struct virtio_blk_outhdr out_hdr; 59 struct virtio_blk_outhdr out_hdr;
55 struct virtio_scsi_inhdr in_hdr; 60 struct virtio_scsi_inhdr in_hdr;
61 struct work_struct work;
62 struct virtio_blk *vblk;
63 int flags;
56 u8 status; 64 u8 status;
65 struct scatterlist sg[];
66};
67
68enum {
69 VBLK_IS_FLUSH = 1,
70 VBLK_REQ_FLUSH = 2,
71 VBLK_REQ_DATA = 4,
72 VBLK_REQ_FUA = 8,
57}; 73};
58 74
59static void blk_done(struct virtqueue *vq) 75static inline int virtblk_result(struct virtblk_req *vbr)
76{
77 switch (vbr->status) {
78 case VIRTIO_BLK_S_OK:
79 return 0;
80 case VIRTIO_BLK_S_UNSUPP:
81 return -ENOTTY;
82 default:
83 return -EIO;
84 }
85}
86
87static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
88 gfp_t gfp_mask)
60{ 89{
61 struct virtio_blk *vblk = vq->vdev->priv;
62 struct virtblk_req *vbr; 90 struct virtblk_req *vbr;
63 unsigned int len;
64 unsigned long flags;
65 91
66 spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); 92 vbr = mempool_alloc(vblk->pool, gfp_mask);
67 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 93 if (!vbr)
68 int error; 94 return NULL;
69 95
70 switch (vbr->status) { 96 vbr->vblk = vblk;
71 case VIRTIO_BLK_S_OK: 97 if (use_bio)
72 error = 0; 98 sg_init_table(vbr->sg, vblk->sg_elems);
73 break; 99
74 case VIRTIO_BLK_S_UNSUPP: 100 return vbr;
75 error = -ENOTTY; 101}
76 break; 102
77 default: 103static void virtblk_add_buf_wait(struct virtio_blk *vblk,
78 error = -EIO; 104 struct virtblk_req *vbr,
105 unsigned long out,
106 unsigned long in)
107{
108 DEFINE_WAIT(wait);
109
110 for (;;) {
111 prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
112 TASK_UNINTERRUPTIBLE);
113
114 spin_lock_irq(vblk->disk->queue->queue_lock);
115 if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
116 GFP_ATOMIC) < 0) {
117 spin_unlock_irq(vblk->disk->queue->queue_lock);
118 io_schedule();
119 } else {
120 virtqueue_kick(vblk->vq);
121 spin_unlock_irq(vblk->disk->queue->queue_lock);
79 break; 122 break;
80 } 123 }
81 124
82 switch (vbr->req->cmd_type) { 125 }
83 case REQ_TYPE_BLOCK_PC: 126
84 vbr->req->resid_len = vbr->in_hdr.residual; 127 finish_wait(&vblk->queue_wait, &wait);
85 vbr->req->sense_len = vbr->in_hdr.sense_len; 128}
86 vbr->req->errors = vbr->in_hdr.errors; 129
87 break; 130static inline void virtblk_add_req(struct virtblk_req *vbr,
88 case REQ_TYPE_SPECIAL: 131 unsigned int out, unsigned int in)
89 vbr->req->errors = (error != 0); 132{
90 break; 133 struct virtio_blk *vblk = vbr->vblk;
91 default: 134
92 break; 135 spin_lock_irq(vblk->disk->queue->queue_lock);
136 if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
137 GFP_ATOMIC) < 0)) {
138 spin_unlock_irq(vblk->disk->queue->queue_lock);
139 virtblk_add_buf_wait(vblk, vbr, out, in);
140 return;
141 }
142 virtqueue_kick(vblk->vq);
143 spin_unlock_irq(vblk->disk->queue->queue_lock);
144}
145
146static int virtblk_bio_send_flush(struct virtblk_req *vbr)
147{
148 unsigned int out = 0, in = 0;
149
150 vbr->flags |= VBLK_IS_FLUSH;
151 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
152 vbr->out_hdr.sector = 0;
153 vbr->out_hdr.ioprio = 0;
154 sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
155 sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
156
157 virtblk_add_req(vbr, out, in);
158
159 return 0;
160}
161
162static int virtblk_bio_send_data(struct virtblk_req *vbr)
163{
164 struct virtio_blk *vblk = vbr->vblk;
165 unsigned int num, out = 0, in = 0;
166 struct bio *bio = vbr->bio;
167
168 vbr->flags &= ~VBLK_IS_FLUSH;
169 vbr->out_hdr.type = 0;
170 vbr->out_hdr.sector = bio->bi_sector;
171 vbr->out_hdr.ioprio = bio_prio(bio);
172
173 sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
174
175 num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
176
177 sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
178 sizeof(vbr->status));
179
180 if (num) {
181 if (bio->bi_rw & REQ_WRITE) {
182 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
183 out += num;
184 } else {
185 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
186 in += num;
 93		}
187		}
188 }
189
190 virtblk_add_req(vbr, out, in);
191
192 return 0;
193}
194
195static void virtblk_bio_send_data_work(struct work_struct *work)
196{
197 struct virtblk_req *vbr;
198
199 vbr = container_of(work, struct virtblk_req, work);
200
201 virtblk_bio_send_data(vbr);
202}
203
204static void virtblk_bio_send_flush_work(struct work_struct *work)
205{
206 struct virtblk_req *vbr;
207
208 vbr = container_of(work, struct virtblk_req, work);
209
210 virtblk_bio_send_flush(vbr);
211}
212
213static inline void virtblk_request_done(struct virtblk_req *vbr)
214{
215 struct virtio_blk *vblk = vbr->vblk;
216 struct request *req = vbr->req;
217 int error = virtblk_result(vbr);
218
219 if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
220 req->resid_len = vbr->in_hdr.residual;
221 req->sense_len = vbr->in_hdr.sense_len;
222 req->errors = vbr->in_hdr.errors;
223 } else if (req->cmd_type == REQ_TYPE_SPECIAL) {
224 req->errors = (error != 0);
225 }
226
227 __blk_end_request_all(req, error);
228 mempool_free(vbr, vblk->pool);
229}
230
231static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
232{
233 struct virtio_blk *vblk = vbr->vblk;
234
235 if (vbr->flags & VBLK_REQ_DATA) {
236 /* Send out the actual write data */
237 INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
238 queue_work(virtblk_wq, &vbr->work);
239 } else {
240 bio_endio(vbr->bio, virtblk_result(vbr));
241 mempool_free(vbr, vblk->pool);
242 }
243}
244
245static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
246{
247 struct virtio_blk *vblk = vbr->vblk;
 94
 95		__blk_end_request_all(vbr->req, error);
248
249	if (unlikely(vbr->flags & VBLK_REQ_FUA)) {
250 /* Send out a flush before end the bio */
251 vbr->flags &= ~VBLK_REQ_DATA;
252 INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
253 queue_work(virtblk_wq, &vbr->work);
254 } else {
255 bio_endio(vbr->bio, virtblk_result(vbr));
 96		mempool_free(vbr, vblk->pool);
 97	}
256		mempool_free(vbr, vblk->pool);
257	}
258}
259
260static inline void virtblk_bio_done(struct virtblk_req *vbr)
261{
262 if (unlikely(vbr->flags & VBLK_IS_FLUSH))
263 virtblk_bio_flush_done(vbr);
264 else
265 virtblk_bio_data_done(vbr);
266}
267
268static void virtblk_done(struct virtqueue *vq)
269{
270 struct virtio_blk *vblk = vq->vdev->priv;
271 bool bio_done = false, req_done = false;
272 struct virtblk_req *vbr;
273 unsigned long flags;
274 unsigned int len;
275
276 spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
277 do {
278 virtqueue_disable_cb(vq);
279 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
280 if (vbr->bio) {
281 virtblk_bio_done(vbr);
282 bio_done = true;
283 } else {
284 virtblk_request_done(vbr);
285 req_done = true;
286 }
287 }
288 } while (!virtqueue_enable_cb(vq));
 98	/* In case queue is stopped waiting for more buffers. */
 99	blk_start_queue(vblk->disk->queue);
100	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
101}
102
289	/* In case queue is stopped waiting for more buffers. */
290	if (req_done)
291		blk_start_queue(vblk->disk->queue);
292	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
293
294	if (bio_done)
295		wake_up(&vblk->queue_wait);
296}
297
103static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 298static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
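Note: the definition of struct virtblk_req sits above this excerpt and is not part of the hunk. As a reading aid, the sketch below lists only the fields the new code above actually touches; the field order and some types are assumptions inferred from usage here, not taken from the source file.

/* Hedged reconstruction for reading the diff only; not the real definition. */
struct virtblk_req {
	struct request *req;		/* request-based path (vbr->req) */
	struct bio *bio;		/* bio-based path (vbr->bio) */
	struct virtio_blk_outhdr out_hdr;
	struct virtio_scsi_inhdr in_hdr;	/* type assumed; only residual/sense_len/errors are read */
	struct work_struct work;	/* deferred flush/data sends */
	struct virtio_blk *vblk;	/* back pointer set in virtblk_alloc_req() */
	int flags;			/* VBLK_IS_FLUSH, VBLK_REQ_FLUSH/DATA/FUA */
	u8 status;			/* written by the host, mapped by virtblk_result() */
	struct scatterlist sg[];	/* per-request sg table, sg_elems entries when use_bio is set */
};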
@@ -106,13 +301,13 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
106 unsigned long num, out = 0, in = 0; 301 unsigned long num, out = 0, in = 0;
107 struct virtblk_req *vbr; 302 struct virtblk_req *vbr;
108 303
109	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
304	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
110 if (!vbr) 305 if (!vbr)
111 /* When another request finishes we'll try again. */ 306 /* When another request finishes we'll try again. */
112 return false; 307 return false;
113 308
114 vbr->req = req; 309 vbr->req = req;
115
310	vbr->bio = NULL;
116 if (req->cmd_flags & REQ_FLUSH) { 311 if (req->cmd_flags & REQ_FLUSH) {
117 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 312 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
118 vbr->out_hdr.sector = 0; 313 vbr->out_hdr.sector = 0;
@@ -172,7 +367,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
172 } 367 }
173 } 368 }
174 369
175	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
370	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
371 GFP_ATOMIC) < 0) {
176 mempool_free(vbr, vblk->pool); 372 mempool_free(vbr, vblk->pool);
177 return false; 373 return false;
178 } 374 }
@@ -180,7 +376,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
180 return true; 376 return true;
181} 377}
182 378
183static void do_virtblk_request(struct request_queue *q)
379static void virtblk_request(struct request_queue *q)
184{ 380{
185 struct virtio_blk *vblk = q->queuedata; 381 struct virtio_blk *vblk = q->queuedata;
186 struct request *req; 382 struct request *req;
@@ -203,6 +399,34 @@ static void do_virtblk_request(struct request_queue *q)
203 virtqueue_kick(vblk->vq); 399 virtqueue_kick(vblk->vq);
204} 400}
205 401
402static void virtblk_make_request(struct request_queue *q, struct bio *bio)
403{
404 struct virtio_blk *vblk = q->queuedata;
405 struct virtblk_req *vbr;
406
407 BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
408
409 vbr = virtblk_alloc_req(vblk, GFP_NOIO);
410 if (!vbr) {
411 bio_endio(bio, -ENOMEM);
412 return;
413 }
414
415 vbr->bio = bio;
416 vbr->flags = 0;
417 if (bio->bi_rw & REQ_FLUSH)
418 vbr->flags |= VBLK_REQ_FLUSH;
419 if (bio->bi_rw & REQ_FUA)
420 vbr->flags |= VBLK_REQ_FUA;
421 if (bio->bi_size)
422 vbr->flags |= VBLK_REQ_DATA;
423
424 if (unlikely(vbr->flags & VBLK_REQ_FLUSH))
425 virtblk_bio_send_flush(vbr);
426 else
427 virtblk_bio_send_data(vbr);
428}
429
206/* return id (s/n) string for *disk to *id_str 430/* return id (s/n) string for *disk to *id_str
207 */ 431 */
208static int virtblk_get_id(struct gendisk *disk, char *id_str) 432static int virtblk_get_id(struct gendisk *disk, char *id_str)
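Note: virtblk_make_request() above is only wired in when the bio path is enabled (see the blk_queue_make_request() call in the probe hunk further down). The flag plumbing implements the ordering: a REQ_FLUSH bio sends the flush first and pushes any data from the flush completion via the workqueue, while a REQ_FUA bio sends the data first and chains a flush from its completion. The use_bio switch itself is a module parameter defined earlier in the file; the declaration below is a hypothetical sketch of such a switch, its name and permissions are assumed rather than quoted from this diff.

/* Hypothetical sketch of the bio-path switch referenced as use_bio above;
 * the real declaration lives near the top of virtio_blk.c and may differ. */
static bool use_bio;
module_param(use_bio, bool, S_IRUGO);
MODULE_PARM_DESC(use_bio, "Submit bios directly instead of going through the request queue");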
@@ -360,7 +584,7 @@ static int init_vq(struct virtio_blk *vblk)
360 int err = 0; 584 int err = 0;
361 585
362 /* We expect one virtqueue, for output. */ 586 /* We expect one virtqueue, for output. */
363	vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
587	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
364 if (IS_ERR(vblk->vq)) 588 if (IS_ERR(vblk->vq))
365 err = PTR_ERR(vblk->vq); 589 err = PTR_ERR(vblk->vq);
366 590
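Note: init_vq() now registers virtblk_done() (first hunk above) as the virtqueue callback. That handler follows the usual virtio completion idiom: disable further callbacks, drain virtqueue_get_buf() until empty, then re-enable and loop if a buffer raced in. A minimal sketch of the idiom follows, with a placeholder handle() standing in for the per-buffer completion work; it is not the driver's code, just the shape of the loop.

static void vq_done(struct virtqueue *vq)
{
	unsigned int len;
	void *buf;

	do {
		virtqueue_disable_cb(vq);	/* suppress notifications while draining */
		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
			handle(buf, len);	/* placeholder for per-buffer completion */
		/* virtqueue_enable_cb() returns false if buffers arrived meanwhile */
	} while (!virtqueue_enable_cb(vq));
}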
@@ -477,6 +701,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
477 struct virtio_blk *vblk; 701 struct virtio_blk *vblk;
478 struct request_queue *q; 702 struct request_queue *q;
479 int err, index; 703 int err, index;
704 int pool_size;
705
480 u64 cap; 706 u64 cap;
481 u32 v, blk_size, sg_elems, opt_io_size; 707 u32 v, blk_size, sg_elems, opt_io_size;
482 u16 min_io_size; 708 u16 min_io_size;
@@ -506,10 +732,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
506 goto out_free_index; 732 goto out_free_index;
507 } 733 }
508 734
735 init_waitqueue_head(&vblk->queue_wait);
509 vblk->vdev = vdev; 736 vblk->vdev = vdev;
510 vblk->sg_elems = sg_elems; 737 vblk->sg_elems = sg_elems;
511 sg_init_table(vblk->sg, vblk->sg_elems); 738 sg_init_table(vblk->sg, vblk->sg_elems);
512 mutex_init(&vblk->config_lock); 739 mutex_init(&vblk->config_lock);
740
513 INIT_WORK(&vblk->config_work, virtblk_config_changed_work); 741 INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
514 vblk->config_enable = true; 742 vblk->config_enable = true;
515 743
@@ -517,7 +745,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
517 if (err) 745 if (err)
518 goto out_free_vblk; 746 goto out_free_vblk;
519 747
520	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
748	pool_size = sizeof(struct virtblk_req);
749 if (use_bio)
750 pool_size += sizeof(struct scatterlist) * sg_elems;
751 vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
521 if (!vblk->pool) { 752 if (!vblk->pool) {
522 err = -ENOMEM; 753 err = -ENOMEM;
523 goto out_free_vq; 754 goto out_free_vq;
@@ -530,12 +761,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
530 goto out_mempool; 761 goto out_mempool;
531 } 762 }
532 763
533	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
764	q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
534 if (!q) { 765 if (!q) {
535 err = -ENOMEM; 766 err = -ENOMEM;
536 goto out_put_disk; 767 goto out_put_disk;
537 } 768 }
538 769
770 if (use_bio)
771 blk_queue_make_request(q, virtblk_make_request);
539 q->queuedata = vblk; 772 q->queuedata = vblk;
540 773
541 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 774 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
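Note: this hunk is where the two submission models diverge at probe time. The queue is always created with blk_init_queue() using the renamed virtblk_request() as its request_fn; when use_bio is set, the make_request hook is then overridden so each bio bypasses the block-layer request queue. A compressed sketch of the calls, mirroring the lines above with error handling simplified:

	q = blk_init_queue(virtblk_request, NULL);	/* request_fn path: requests staged by the block layer */
	if (!q)
		return -ENOMEM;				/* sketch only; probe really jumps to its error label */
	if (use_bio)
		blk_queue_make_request(q, virtblk_make_request);	/* bio path: bios handed straight to the driver */
	q->queuedata = vblk;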
@@ -620,7 +853,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
620 if (!err && opt_io_size) 853 if (!err && opt_io_size)
621 blk_queue_io_opt(q, blk_size * opt_io_size); 854 blk_queue_io_opt(q, blk_size * opt_io_size);
622 855
623
624 add_disk(vblk->disk); 856 add_disk(vblk->disk);
625 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); 857 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
626 if (err) 858 if (err)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 73f196ca713f..280a13846e6c 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -42,6 +42,7 @@
42 42
43#include <xen/events.h> 43#include <xen/events.h>
44#include <xen/page.h> 44#include <xen/page.h>
45#include <xen/xen.h>
45#include <asm/xen/hypervisor.h> 46#include <asm/xen/hypervisor.h>
46#include <asm/xen/hypercall.h> 47#include <asm/xen/hypercall.h>
47#include "common.h" 48#include "common.h"
@@ -337,7 +338,7 @@ static void xen_blkbk_unmap(struct pending_req *req)
337 invcount++; 338 invcount++;
338 } 339 }
339 340
340	ret = gnttab_unmap_refs(unmap, pages, invcount, false);
341	ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
341 BUG_ON(ret); 342 BUG_ON(ret);
342} 343}
343 344
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c2d2e5c1597..007db8986e84 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -670,7 +670,7 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
670 spin_unlock_irqrestore(&info->io_lock, flags); 670 spin_unlock_irqrestore(&info->io_lock, flags);
671 671
672 /* Flush gnttab callback work. Must be done with no locks held. */ 672 /* Flush gnttab callback work. Must be done with no locks held. */
673	flush_work_sync(&info->work);
673	flush_work(&info->work);
674 674
675 del_gendisk(info->gd); 675 del_gendisk(info->gd);
676 676
@@ -719,7 +719,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
719 spin_unlock_irq(&info->io_lock); 719 spin_unlock_irq(&info->io_lock);
720 720
721 /* Flush gnttab callback work. Must be done with no locks held. */ 721 /* Flush gnttab callback work. Must be done with no locks held. */
722	flush_work_sync(&info->work);
722	flush_work(&info->work);
723 723
724 /* Free resources associated with old device channel. */ 724 /* Free resources associated with old device channel. */
725 if (info->ring_ref != GRANT_INVALID_REF) { 725 if (info->ring_ref != GRANT_INVALID_REF) {
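Note: the two xen-blkfront hunks swap flush_work_sync() for flush_work(). Around this kernel release flush_work() itself became a full wait-for-completion barrier, so the _sync variant could be retired without changing behaviour here. A minimal, self-contained sketch of the pattern with hypothetical names (the real work item in this driver is info->work with its own handler):

/* Sketch with made-up names; flush_work() alone now waits for the handler to finish. */
static void my_handler(struct work_struct *work)
{
	/* deferred work goes here */
}
static DECLARE_WORK(my_work, my_handler);

static void my_teardown(void)
{
	queue_work(system_wq, &my_work);
	/* ... later, with no locks held ... */
	flush_work(&my_work);	/* returns only after my_handler() has completed */
}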