Diffstat (limited to 'drivers/block')
57 files changed, 20858 insertions, 1532 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index eb4fa1943944..c5f22bb0a48e 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -2533,9 +2533,8 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) | |||
2533 | Controller->RequestQueue[n] = RequestQueue; | 2533 | Controller->RequestQueue[n] = RequestQueue; |
2534 | blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); | 2534 | blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); |
2535 | RequestQueue->queuedata = Controller; | 2535 | RequestQueue->queuedata = Controller; |
2536 | blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit); | 2536 | blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit); |
2537 | blk_queue_max_phys_segments(RequestQueue, Controller->DriverScatterGatherLimit); | 2537 | blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand); |
2538 | blk_queue_max_sectors(RequestQueue, Controller->MaxBlocksPerCommand); | ||
2539 | disk->queue = RequestQueue; | 2538 | disk->queue = RequestQueue; |
2540 | sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n); | 2539 | sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n); |
2541 | disk->major = MajorNumber; | 2540 | disk->major = MajorNumber; |
@@ -7101,7 +7100,7 @@ static struct DAC960_privdata DAC960_BA_privdata = { | |||
7101 | 7100 | ||
7102 | static struct DAC960_privdata DAC960_LP_privdata = { | 7101 | static struct DAC960_privdata DAC960_LP_privdata = { |
7103 | .HardwareType = DAC960_LP_Controller, | 7102 | .HardwareType = DAC960_LP_Controller, |
7104 | .FirmwareType = DAC960_LP_Controller, | 7103 | .FirmwareType = DAC960_V2_Controller, |
7105 | .InterruptHandler = DAC960_LP_InterruptHandler, | 7104 | .InterruptHandler = DAC960_LP_InterruptHandler, |
7106 | .MemoryWindowSize = DAC960_LP_RegisterWindowSize, | 7105 | .MemoryWindowSize = DAC960_LP_RegisterWindowSize, |
7107 | }; | 7106 | }; |
@@ -7134,7 +7133,7 @@ static struct DAC960_privdata DAC960_P_privdata = { | |||
7134 | .MemoryWindowSize = DAC960_PD_RegisterWindowSize, | 7133 | .MemoryWindowSize = DAC960_PD_RegisterWindowSize, |
7135 | }; | 7134 | }; |
7136 | 7135 | ||
7137 | static struct pci_device_id DAC960_id_table[] = { | 7136 | static const struct pci_device_id DAC960_id_table[] = { |
7138 | { | 7137 | { |
7139 | .vendor = PCI_VENDOR_ID_MYLEX, | 7138 | .vendor = PCI_VENDOR_ID_MYLEX, |
7140 | .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM, | 7139 | .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM, |
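The DAC960 hunk above tracks the block layer's queue-limit consolidation: the separate hardware and physical segment limits collapse into a single segment limit, and the sector-limit setter is renamed to make explicit that it is the hardware limit. A minimal before/after sketch, assuming a request queue q and placeholder limits MY_SG_LIMIT and MY_MAX_SECTORS (names invented for illustration, not taken from this driver):

        /* Old-style limit setup, replaced throughout this series */
        blk_queue_max_hw_segments(q, MY_SG_LIMIT);
        blk_queue_max_phys_segments(q, MY_SG_LIMIT);
        blk_queue_max_sectors(q, MY_MAX_SECTORS);

        /* Equivalent setup with the consolidated API */
        blk_queue_max_segments(q, MY_SG_LIMIT);
        blk_queue_max_hw_sectors(q, MY_MAX_SECTORS);

The same substitution appears below in brd.c and cciss.c.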
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP | |||
271 | instead, which can be configured to be on-disk compatible with the | 271 | instead, which can be configured to be on-disk compatible with the |
272 | cryptoloop device. | 272 | cryptoloop device. |
273 | 273 | ||
274 | source "drivers/block/drbd/Kconfig" | ||
275 | |||
274 | config BLK_DEV_NBD | 276 | config BLK_DEV_NBD |
275 | tristate "Network block device support" | 277 | tristate "Network block device support" |
276 | depends on NET | 278 | depends on NET |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o | |||
36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o | 36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o |
37 | 37 | ||
38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o | 38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o |
39 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ | ||
39 | 40 | ||
40 | swim_mod-objs := swim.o swim_asm.o | 41 | swim_mod-objs := swim.o swim_asm.o |
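Together, the Kconfig and Makefile hunks hook the new DRBD driver into the drivers/block build. A sketch of what the referenced drivers/block/drbd/ files are expected to provide (the config symbol BLK_DEV_DRBD comes from the Makefile line above; the prompt text, help text, and object name shown here are illustrative only):

        # drivers/block/drbd/Kconfig (sketch)
        config BLK_DEV_DRBD
                tristate "DRBD Distributed Replicated Block Device support"
                help
                  Network-replicated block device; see the DRBD
                  documentation for details.

        # drivers/block/drbd/Makefile (sketch)
        obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o

With CONFIG_BLK_DEV_DRBD set to y or m, the kbuild line added above descends into drbd/ and builds it; with the option unset the directory is skipped entirely.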
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 055225839024..0182a22c423a 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -54,6 +54,7 @@ | |||
54 | */ | 54 | */ |
55 | 55 | ||
56 | #include <linux/module.h> | 56 | #include <linux/module.h> |
57 | #include <linux/slab.h> | ||
57 | 58 | ||
58 | #include <linux/fd.h> | 59 | #include <linux/fd.h> |
59 | #include <linux/hdreg.h> | 60 | #include <linux/hdreg.h> |
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 3af97d4da2db..035cefe4045a 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/backing-dev.h> | 9 | #include <linux/backing-dev.h> |
10 | #include <linux/fs.h> | 10 | #include <linux/fs.h> |
11 | #include <linux/ioctl.h> | 11 | #include <linux/ioctl.h> |
12 | #include <linux/slab.h> | ||
12 | #include <linux/genhd.h> | 13 | #include <linux/genhd.h> |
13 | #include <linux/netdevice.h> | 14 | #include <linux/netdevice.h> |
14 | #include "aoe.h" | 15 | #include "aoe.h" |
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 62141ec09a22..4a1b9e7464aa 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
9 | #include <linux/completion.h> | 9 | #include <linux/completion.h> |
10 | #include <linux/delay.h> | 10 | #include <linux/delay.h> |
11 | #include <linux/slab.h> | ||
11 | #include <linux/smp_lock.h> | 12 | #include <linux/smp_lock.h> |
12 | #include <linux/skbuff.h> | 13 | #include <linux/skbuff.h> |
13 | #include "aoe.h" | 14 | #include "aoe.h" |
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 13bb69d2abb3..5674bd01d96d 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -5,6 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/ata.h> | 7 | #include <linux/ata.h> |
8 | #include <linux/slab.h> | ||
8 | #include <linux/hdreg.h> | 9 | #include <linux/hdreg.h> |
9 | #include <linux/blkdev.h> | 10 | #include <linux/blkdev.h> |
10 | #include <linux/skbuff.h> | 11 | #include <linux/skbuff.h> |
@@ -735,21 +736,6 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector | |||
735 | part_stat_unlock(); | 736 | part_stat_unlock(); |
736 | } | 737 | } |
737 | 738 | ||
738 | /* | ||
739 | * Ensure we don't create aliases in VI caches | ||
740 | */ | ||
741 | static inline void | ||
742 | killalias(struct bio *bio) | ||
743 | { | ||
744 | struct bio_vec *bv; | ||
745 | int i; | ||
746 | |||
747 | if (bio_data_dir(bio) == READ) | ||
748 | __bio_for_each_segment(bv, bio, i, 0) { | ||
749 | flush_dcache_page(bv->bv_page); | ||
750 | } | ||
751 | } | ||
752 | |||
753 | void | 739 | void |
754 | aoecmd_ata_rsp(struct sk_buff *skb) | 740 | aoecmd_ata_rsp(struct sk_buff *skb) |
755 | { | 741 | { |
@@ -871,7 +857,7 @@ aoecmd_ata_rsp(struct sk_buff *skb) | |||
871 | if (buf->flags & BUFFL_FAIL) | 857 | if (buf->flags & BUFFL_FAIL) |
872 | bio_endio(buf->bio, -EIO); | 858 | bio_endio(buf->bio, -EIO); |
873 | else { | 859 | else { |
874 | killalias(buf->bio); | 860 | bio_flush_dcache_pages(buf->bio); |
875 | bio_endio(buf->bio, 0); | 861 | bio_endio(buf->bio, 0); |
876 | } | 862 | } |
877 | mempool_free(buf, d->bufpool); | 863 | mempool_free(buf, d->bufpool); |
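The aoecmd.c hunks drop the driver-local killalias() helper in favour of the block layer's bio_flush_dcache_pages(). A rough sketch of what that generic helper does on architectures that implement flush_dcache_page() (details such as the exact config guard vary by kernel version; this is not copied from this patch):

        void bio_flush_dcache_pages(struct bio *bi)
        {
                int i;
                struct bio_vec *bvec;

                /* Flush every page the bio touches so virtually indexed
                 * caches cannot serve stale aliases after completion. */
                bio_for_each_segment(bvec, bi, i)
                        flush_dcache_page(bvec->bv_page);
        }

Unlike the removed helper, it does not check bio_data_dir() itself, so the flush may also run on write completions; that is harmless, just slightly more work.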
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index fa67027789aa..0849280bfc1c 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
9 | #include <linux/netdevice.h> | 9 | #include <linux/netdevice.h> |
10 | #include <linux/delay.h> | 10 | #include <linux/delay.h> |
11 | #include <linux/slab.h> | ||
11 | #include "aoe.h" | 12 | #include "aoe.h" |
12 | 13 | ||
13 | static void dummy_timer(ulong); | 14 | static void dummy_timer(ulong); |
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index ce0d62cd71b2..4d3bc0d49df5 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -4,6 +4,7 @@ | |||
4 | * Ethernet portion of AoE driver | 4 | * Ethernet portion of AoE driver |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/gfp.h> | ||
7 | #include <linux/hdreg.h> | 8 | #include <linux/hdreg.h> |
8 | #include <linux/blkdev.h> | 9 | #include <linux/blkdev.h> |
9 | #include <linux/netdevice.h> | 10 | #include <linux/netdevice.h> |
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 847a9e57570a..e35cf59cbfde 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1470,18 +1470,13 @@ repeat: | |||
1470 | 1470 | ||
1471 | void do_fd_request(struct request_queue * q) | 1471 | void do_fd_request(struct request_queue * q) |
1472 | { | 1472 | { |
1473 | unsigned long flags; | ||
1474 | |||
1475 | DPRINT(("do_fd_request for pid %d\n",current->pid)); | 1473 | DPRINT(("do_fd_request for pid %d\n",current->pid)); |
1476 | while( fdc_busy ) sleep_on( &fdc_wait ); | 1474 | while( fdc_busy ) sleep_on( &fdc_wait ); |
1477 | fdc_busy = 1; | 1475 | fdc_busy = 1; |
1478 | stdma_lock(floppy_irq, NULL); | 1476 | stdma_lock(floppy_irq, NULL); |
1479 | 1477 | ||
1480 | atari_disable_irq( IRQ_MFP_FDC ); | 1478 | atari_disable_irq( IRQ_MFP_FDC ); |
1481 | local_save_flags(flags); /* The request function is called with ints | ||
1482 | local_irq_disable(); * disabled... so must save the IPL for later */ | ||
1483 | redo_fd_request(); | 1479 | redo_fd_request(); |
1484 | local_irq_restore(flags); | ||
1485 | atari_enable_irq( IRQ_MFP_FDC ); | 1480 | atari_enable_irq( IRQ_MFP_FDC ); |
1486 | } | 1481 | } |
1487 | 1482 | ||
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 4f688434daf1..6081e81d5738 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,9 +15,9 @@ | |||
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/bio.h> | 16 | #include <linux/bio.h> |
17 | #include <linux/highmem.h> | 17 | #include <linux/highmem.h> |
18 | #include <linux/gfp.h> | ||
19 | #include <linux/radix-tree.h> | 18 | #include <linux/radix-tree.h> |
20 | #include <linux/buffer_head.h> /* invalidate_bh_lrus() */ | 19 | #include <linux/buffer_head.h> /* invalidate_bh_lrus() */ |
20 | #include <linux/slab.h> | ||
21 | 21 | ||
22 | #include <asm/uaccess.h> | 22 | #include <asm/uaccess.h> |
23 | 23 | ||
@@ -434,7 +434,7 @@ static struct brd_device *brd_alloc(int i) | |||
434 | goto out_free_dev; | 434 | goto out_free_dev; |
435 | blk_queue_make_request(brd->brd_queue, brd_make_request); | 435 | blk_queue_make_request(brd->brd_queue, brd_make_request); |
436 | blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL); | 436 | blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL); |
437 | blk_queue_max_sectors(brd->brd_queue, 1024); | 437 | blk_queue_max_hw_sectors(brd->brd_queue, 1024); |
438 | blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); | 438 | blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); |
439 | 439 | ||
440 | disk = brd->brd_disk = alloc_disk(1 << part_shift); | 440 | disk = brd->brd_disk = alloc_disk(1 << part_shift); |
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 92b126394fa1..eb5ff0531cfb 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl); | |||
179 | static int deregister_disk(ctlr_info_t *h, int drv_index, | 179 | static int deregister_disk(ctlr_info_t *h, int drv_index, |
180 | int clear_all, int via_ioctl); | 180 | int clear_all, int via_ioctl); |
181 | 181 | ||
182 | static void cciss_read_capacity(int ctlr, int logvol, int withirq, | 182 | static void cciss_read_capacity(int ctlr, int logvol, |
183 | sector_t *total_size, unsigned int *block_size); | 183 | sector_t *total_size, unsigned int *block_size); |
184 | static void cciss_read_capacity_16(int ctlr, int logvol, int withirq, | 184 | static void cciss_read_capacity_16(int ctlr, int logvol, |
185 | sector_t *total_size, unsigned int *block_size); | 185 | sector_t *total_size, unsigned int *block_size); |
186 | static void cciss_geometry_inquiry(int ctlr, int logvol, | 186 | static void cciss_geometry_inquiry(int ctlr, int logvol, |
187 | int withirq, sector_t total_size, | 187 | sector_t total_size, |
188 | unsigned int block_size, InquiryData_struct *inq_buff, | 188 | unsigned int block_size, InquiryData_struct *inq_buff, |
189 | drive_info_struct *drv); | 189 | drive_info_struct *drv); |
190 | static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, | 190 | static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, |
191 | __u32); | 191 | __u32); |
192 | static void start_io(ctlr_info_t *h); | 192 | static void start_io(ctlr_info_t *h); |
193 | static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, | ||
194 | __u8 page_code, unsigned char *scsi3addr, int cmd_type); | ||
195 | static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, | 193 | static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, |
196 | __u8 page_code, unsigned char scsi3addr[], | 194 | __u8 page_code, unsigned char scsi3addr[], |
197 | int cmd_type); | 195 | int cmd_type); |
@@ -259,6 +257,79 @@ static inline void removeQ(CommandList_struct *c) | |||
259 | hlist_del_init(&c->list); | 257 | hlist_del_init(&c->list); |
260 | } | 258 | } |
261 | 259 | ||
260 | static void cciss_free_sg_chain_blocks(SGDescriptor_struct **cmd_sg_list, | ||
261 | int nr_cmds) | ||
262 | { | ||
263 | int i; | ||
264 | |||
265 | if (!cmd_sg_list) | ||
266 | return; | ||
267 | for (i = 0; i < nr_cmds; i++) { | ||
268 | kfree(cmd_sg_list[i]); | ||
269 | cmd_sg_list[i] = NULL; | ||
270 | } | ||
271 | kfree(cmd_sg_list); | ||
272 | } | ||
273 | |||
274 | static SGDescriptor_struct **cciss_allocate_sg_chain_blocks( | ||
275 | ctlr_info_t *h, int chainsize, int nr_cmds) | ||
276 | { | ||
277 | int j; | ||
278 | SGDescriptor_struct **cmd_sg_list; | ||
279 | |||
280 | if (chainsize <= 0) | ||
281 | return NULL; | ||
282 | |||
283 | cmd_sg_list = kmalloc(sizeof(*cmd_sg_list) * nr_cmds, GFP_KERNEL); | ||
284 | if (!cmd_sg_list) | ||
285 | return NULL; | ||
286 | |||
287 | /* Build up chain blocks for each command */ | ||
288 | for (j = 0; j < nr_cmds; j++) { | ||
289 | /* Need a block of chainsized s/g elements. */ | ||
290 | cmd_sg_list[j] = kmalloc((chainsize * | ||
291 | sizeof(*cmd_sg_list[j])), GFP_KERNEL); | ||
292 | if (!cmd_sg_list[j]) { | ||
293 | dev_err(&h->pdev->dev, "Cannot get memory " | ||
294 | "for s/g chains.\n"); | ||
295 | goto clean; | ||
296 | } | ||
297 | } | ||
298 | return cmd_sg_list; | ||
299 | clean: | ||
300 | cciss_free_sg_chain_blocks(cmd_sg_list, nr_cmds); | ||
301 | return NULL; | ||
302 | } | ||
303 | |||
304 | static void cciss_unmap_sg_chain_block(ctlr_info_t *h, CommandList_struct *c) | ||
305 | { | ||
306 | SGDescriptor_struct *chain_sg; | ||
307 | u64bit temp64; | ||
308 | |||
309 | if (c->Header.SGTotal <= h->max_cmd_sgentries) | ||
310 | return; | ||
311 | |||
312 | chain_sg = &c->SG[h->max_cmd_sgentries - 1]; | ||
313 | temp64.val32.lower = chain_sg->Addr.lower; | ||
314 | temp64.val32.upper = chain_sg->Addr.upper; | ||
315 | pci_unmap_single(h->pdev, temp64.val, chain_sg->Len, PCI_DMA_TODEVICE); | ||
316 | } | ||
317 | |||
318 | static void cciss_map_sg_chain_block(ctlr_info_t *h, CommandList_struct *c, | ||
319 | SGDescriptor_struct *chain_block, int len) | ||
320 | { | ||
321 | SGDescriptor_struct *chain_sg; | ||
322 | u64bit temp64; | ||
323 | |||
324 | chain_sg = &c->SG[h->max_cmd_sgentries - 1]; | ||
325 | chain_sg->Ext = CCISS_SG_CHAIN; | ||
326 | chain_sg->Len = len; | ||
327 | temp64.val = pci_map_single(h->pdev, chain_block, len, | ||
328 | PCI_DMA_TODEVICE); | ||
329 | chain_sg->Addr.lower = temp64.val32.lower; | ||
330 | chain_sg->Addr.upper = temp64.val32.upper; | ||
331 | } | ||
332 | |||
262 | #include "cciss_scsi.c" /* For SCSI tape support */ | 333 | #include "cciss_scsi.c" /* For SCSI tape support */ |
263 | 334 | ||
264 | static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG", | 335 | static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG", |
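The helpers added above give cciss scatter-gather chaining: a command block embeds only h->max_cmd_sgentries descriptors, and when a request needs more, the last embedded descriptor is rewritten as a chain pointer (Ext = CCISS_SG_CHAIN) whose Addr/Len describe a separately allocated per-command block of h->chainsize descriptors. Sketched for max_cmd_sgentries == 32, the value cciss_pci_init below picks for controllers reporting more than 512 SG elements:

        /*
         * Embedded list in the command block      Chain block (cmd_sg_list[i])
         *
         *   c->SG[0]  -> data segment 0             chain[0] -> data segment 31
         *   c->SG[1]  -> data segment 1             chain[1] -> data segment 32
         *   ...                                      ...
         *   c->SG[30] -> data segment 30
         *   c->SG[31] -> Ext  = CCISS_SG_CHAIN
         *                Addr = DMA address of the chain block
         *                Len  = bytes of chain descriptors mapped
         */

cciss_map_sg_chain_block() fills in that last descriptor and DMA-maps the chain block; cciss_unmap_sg_chain_block() undoes the mapping at completion time when SGTotal indicates a chain was used.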
@@ -339,6 +410,9 @@ static int cciss_seq_show(struct seq_file *seq, void *v) | |||
339 | if (*pos > h->highest_lun) | 410 | if (*pos > h->highest_lun) |
340 | return 0; | 411 | return 0; |
341 | 412 | ||
413 | if (drv == NULL) /* it's possible for h->drv[] to have holes. */ | ||
414 | return 0; | ||
415 | |||
342 | if (drv->heads == 0) | 416 | if (drv->heads == 0) |
343 | return 0; | 417 | return 0; |
344 | 418 | ||
@@ -424,12 +498,9 @@ cciss_proc_write(struct file *file, const char __user *buf, | |||
424 | if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { | 498 | if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { |
425 | struct seq_file *seq = file->private_data; | 499 | struct seq_file *seq = file->private_data; |
426 | ctlr_info_t *h = seq->private; | 500 | ctlr_info_t *h = seq->private; |
427 | int rc; | ||
428 | 501 | ||
429 | rc = cciss_engage_scsi(h->ctlr); | 502 | err = cciss_engage_scsi(h->ctlr); |
430 | if (rc != 0) | 503 | if (err == 0) |
431 | err = -rc; | ||
432 | else | ||
433 | err = length; | 504 | err = length; |
434 | } else | 505 | } else |
435 | #endif /* CONFIG_CISS_SCSI_TAPE */ | 506 | #endif /* CONFIG_CISS_SCSI_TAPE */ |
@@ -1346,26 +1417,27 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, | |||
1346 | kfree(buff); | 1417 | kfree(buff); |
1347 | return -ENOMEM; | 1418 | return -ENOMEM; |
1348 | } | 1419 | } |
1349 | // Fill in the command type | 1420 | /* Fill in the command type */ |
1350 | c->cmd_type = CMD_IOCTL_PEND; | 1421 | c->cmd_type = CMD_IOCTL_PEND; |
1351 | // Fill in Command Header | 1422 | /* Fill in Command Header */ |
1352 | c->Header.ReplyQueue = 0; // unused in simple mode | 1423 | c->Header.ReplyQueue = 0; /* unused in simple mode */ |
1353 | if (iocommand.buf_size > 0) // buffer to fill | 1424 | if (iocommand.buf_size > 0) /* buffer to fill */ |
1354 | { | 1425 | { |
1355 | c->Header.SGList = 1; | 1426 | c->Header.SGList = 1; |
1356 | c->Header.SGTotal = 1; | 1427 | c->Header.SGTotal = 1; |
1357 | } else // no buffers to fill | 1428 | } else /* no buffers to fill */ |
1358 | { | 1429 | { |
1359 | c->Header.SGList = 0; | 1430 | c->Header.SGList = 0; |
1360 | c->Header.SGTotal = 0; | 1431 | c->Header.SGTotal = 0; |
1361 | } | 1432 | } |
1362 | c->Header.LUN = iocommand.LUN_info; | 1433 | c->Header.LUN = iocommand.LUN_info; |
1363 | c->Header.Tag.lower = c->busaddr; // use the kernel address the cmd block for tag | 1434 | /* use the kernel address the cmd block for tag */ |
1435 | c->Header.Tag.lower = c->busaddr; | ||
1364 | 1436 | ||
1365 | // Fill in Request block | 1437 | /* Fill in Request block */ |
1366 | c->Request = iocommand.Request; | 1438 | c->Request = iocommand.Request; |
1367 | 1439 | ||
1368 | // Fill in the scatter gather information | 1440 | /* Fill in the scatter gather information */ |
1369 | if (iocommand.buf_size > 0) { | 1441 | if (iocommand.buf_size > 0) { |
1370 | temp64.val = pci_map_single(host->pdev, buff, | 1442 | temp64.val = pci_map_single(host->pdev, buff, |
1371 | iocommand.buf_size, | 1443 | iocommand.buf_size, |
@@ -1373,7 +1445,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, | |||
1373 | c->SG[0].Addr.lower = temp64.val32.lower; | 1445 | c->SG[0].Addr.lower = temp64.val32.lower; |
1374 | c->SG[0].Addr.upper = temp64.val32.upper; | 1446 | c->SG[0].Addr.upper = temp64.val32.upper; |
1375 | c->SG[0].Len = iocommand.buf_size; | 1447 | c->SG[0].Len = iocommand.buf_size; |
1376 | c->SG[0].Ext = 0; // we are not chaining | 1448 | c->SG[0].Ext = 0; /* we are not chaining */ |
1377 | } | 1449 | } |
1378 | c->waiting = &wait; | 1450 | c->waiting = &wait; |
1379 | 1451 | ||
@@ -1657,9 +1729,11 @@ static void cciss_softirq_done(struct request *rq) | |||
1657 | { | 1729 | { |
1658 | CommandList_struct *cmd = rq->completion_data; | 1730 | CommandList_struct *cmd = rq->completion_data; |
1659 | ctlr_info_t *h = hba[cmd->ctlr]; | 1731 | ctlr_info_t *h = hba[cmd->ctlr]; |
1732 | SGDescriptor_struct *curr_sg = cmd->SG; | ||
1660 | unsigned long flags; | 1733 | unsigned long flags; |
1661 | u64bit temp64; | 1734 | u64bit temp64; |
1662 | int i, ddir; | 1735 | int i, ddir; |
1736 | int sg_index = 0; | ||
1663 | 1737 | ||
1664 | if (cmd->Request.Type.Direction == XFER_READ) | 1738 | if (cmd->Request.Type.Direction == XFER_READ) |
1665 | ddir = PCI_DMA_FROMDEVICE; | 1739 | ddir = PCI_DMA_FROMDEVICE; |
@@ -1669,9 +1743,17 @@ static void cciss_softirq_done(struct request *rq) | |||
1669 | /* command did not need to be retried */ | 1743 | /* command did not need to be retried */ |
1670 | /* unmap the DMA mapping for all the scatter gather elements */ | 1744 | /* unmap the DMA mapping for all the scatter gather elements */ |
1671 | for (i = 0; i < cmd->Header.SGList; i++) { | 1745 | for (i = 0; i < cmd->Header.SGList; i++) { |
1672 | temp64.val32.lower = cmd->SG[i].Addr.lower; | 1746 | if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) { |
1673 | temp64.val32.upper = cmd->SG[i].Addr.upper; | 1747 | cciss_unmap_sg_chain_block(h, cmd); |
1674 | pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); | 1748 | /* Point to the next block */ |
1749 | curr_sg = h->cmd_sg_list[cmd->cmdindex]; | ||
1750 | sg_index = 0; | ||
1751 | } | ||
1752 | temp64.val32.lower = curr_sg[sg_index].Addr.lower; | ||
1753 | temp64.val32.upper = curr_sg[sg_index].Addr.upper; | ||
1754 | pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len, | ||
1755 | ddir); | ||
1756 | ++sg_index; | ||
1675 | } | 1757 | } |
1676 | 1758 | ||
1677 | #ifdef CCISS_DEBUG | 1759 | #ifdef CCISS_DEBUG |
@@ -1701,7 +1783,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h, | |||
1701 | * via the inquiry page 0. Model, vendor, and rev are set to empty strings if | 1783 | * via the inquiry page 0. Model, vendor, and rev are set to empty strings if |
1702 | * they cannot be read. | 1784 | * they cannot be read. |
1703 | */ | 1785 | */ |
1704 | static void cciss_get_device_descr(int ctlr, int logvol, int withirq, | 1786 | static void cciss_get_device_descr(int ctlr, int logvol, |
1705 | char *vendor, char *model, char *rev) | 1787 | char *vendor, char *model, char *rev) |
1706 | { | 1788 | { |
1707 | int rc; | 1789 | int rc; |
@@ -1717,14 +1799,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq, | |||
1717 | return; | 1799 | return; |
1718 | 1800 | ||
1719 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); | 1801 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); |
1720 | if (withirq) | 1802 | rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0, |
1721 | rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, | 1803 | scsi3addr, TYPE_CMD); |
1722 | sizeof(InquiryData_struct), 0, | ||
1723 | scsi3addr, TYPE_CMD); | ||
1724 | else | ||
1725 | rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf, | ||
1726 | sizeof(InquiryData_struct), 0, | ||
1727 | scsi3addr, TYPE_CMD); | ||
1728 | if (rc == IO_OK) { | 1804 | if (rc == IO_OK) { |
1729 | memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); | 1805 | memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); |
1730 | vendor[VENDOR_LEN] = '\0'; | 1806 | vendor[VENDOR_LEN] = '\0'; |
@@ -1743,7 +1819,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq, | |||
1743 | * number cannot be had, for whatever reason, 16 bytes of 0xff | 1819 | * number cannot be had, for whatever reason, 16 bytes of 0xff |
1744 | * are returned instead. | 1820 | * are returned instead. |
1745 | */ | 1821 | */ |
1746 | static void cciss_get_serial_no(int ctlr, int logvol, int withirq, | 1822 | static void cciss_get_serial_no(int ctlr, int logvol, |
1747 | unsigned char *serial_no, int buflen) | 1823 | unsigned char *serial_no, int buflen) |
1748 | { | 1824 | { |
1749 | #define PAGE_83_INQ_BYTES 64 | 1825 | #define PAGE_83_INQ_BYTES 64 |
@@ -1759,12 +1835,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq, | |||
1759 | return; | 1835 | return; |
1760 | memset(serial_no, 0, buflen); | 1836 | memset(serial_no, 0, buflen); |
1761 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); | 1837 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); |
1762 | if (withirq) | 1838 | rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, |
1763 | rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, | 1839 | PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); |
1764 | PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); | ||
1765 | else | ||
1766 | rc = sendcmd(CISS_INQUIRY, ctlr, buf, | ||
1767 | PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); | ||
1768 | if (rc == IO_OK) | 1840 | if (rc == IO_OK) |
1769 | memcpy(serial_no, &buf[8], buflen); | 1841 | memcpy(serial_no, &buf[8], buflen); |
1770 | kfree(buf); | 1842 | kfree(buf); |
@@ -1793,12 +1865,9 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, | |||
1793 | blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); | 1865 | blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); |
1794 | 1866 | ||
1795 | /* This is a hardware imposed limit. */ | 1867 | /* This is a hardware imposed limit. */ |
1796 | blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES); | 1868 | blk_queue_max_segments(disk->queue, h->maxsgentries); |
1797 | |||
1798 | /* This is a limit in the driver and could be eliminated. */ | ||
1799 | blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES); | ||
1800 | 1869 | ||
1801 | blk_queue_max_sectors(disk->queue, h->cciss_max_sectors); | 1870 | blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors); |
1802 | 1871 | ||
1803 | blk_queue_softirq_done(disk->queue, cciss_softirq_done); | 1872 | blk_queue_softirq_done(disk->queue, cciss_softirq_done); |
1804 | 1873 | ||
@@ -1852,18 +1921,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time, | |||
1852 | 1921 | ||
1853 | /* testing to see if 16-byte CDBs are already being used */ | 1922 | /* testing to see if 16-byte CDBs are already being used */ |
1854 | if (h->cciss_read == CCISS_READ_16) { | 1923 | if (h->cciss_read == CCISS_READ_16) { |
1855 | cciss_read_capacity_16(h->ctlr, drv_index, 1, | 1924 | cciss_read_capacity_16(h->ctlr, drv_index, |
1856 | &total_size, &block_size); | 1925 | &total_size, &block_size); |
1857 | 1926 | ||
1858 | } else { | 1927 | } else { |
1859 | cciss_read_capacity(ctlr, drv_index, 1, | 1928 | cciss_read_capacity(ctlr, drv_index, &total_size, &block_size); |
1860 | &total_size, &block_size); | ||
1861 | |||
1862 | /* if read_capacity returns all F's this volume is >2TB */ | 1929 | /* if read_capacity returns all F's this volume is >2TB */ |
1863 | /* in size so we switch to 16-byte CDB's for all */ | 1930 | /* in size so we switch to 16-byte CDB's for all */ |
1864 | /* read/write ops */ | 1931 | /* read/write ops */ |
1865 | if (total_size == 0xFFFFFFFFULL) { | 1932 | if (total_size == 0xFFFFFFFFULL) { |
1866 | cciss_read_capacity_16(ctlr, drv_index, 1, | 1933 | cciss_read_capacity_16(ctlr, drv_index, |
1867 | &total_size, &block_size); | 1934 | &total_size, &block_size); |
1868 | h->cciss_read = CCISS_READ_16; | 1935 | h->cciss_read = CCISS_READ_16; |
1869 | h->cciss_write = CCISS_WRITE_16; | 1936 | h->cciss_write = CCISS_WRITE_16; |
@@ -1873,14 +1940,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time, | |||
1873 | } | 1940 | } |
1874 | } | 1941 | } |
1875 | 1942 | ||
1876 | cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size, | 1943 | cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size, |
1877 | inq_buff, drvinfo); | 1944 | inq_buff, drvinfo); |
1878 | drvinfo->block_size = block_size; | 1945 | drvinfo->block_size = block_size; |
1879 | drvinfo->nr_blocks = total_size + 1; | 1946 | drvinfo->nr_blocks = total_size + 1; |
1880 | 1947 | ||
1881 | cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor, | 1948 | cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor, |
1882 | drvinfo->model, drvinfo->rev); | 1949 | drvinfo->model, drvinfo->rev); |
1883 | cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no, | 1950 | cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no, |
1884 | sizeof(drvinfo->serial_no)); | 1951 | sizeof(drvinfo->serial_no)); |
1885 | /* Save the lunid in case we deregister the disk, below. */ | 1952 | /* Save the lunid in case we deregister the disk, below. */ |
1886 | memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, | 1953 | memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, |
@@ -2424,7 +2491,7 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, | |||
2424 | c->Request.Type.Direction = XFER_READ; | 2491 | c->Request.Type.Direction = XFER_READ; |
2425 | c->Request.Timeout = 0; | 2492 | c->Request.Timeout = 0; |
2426 | c->Request.CDB[0] = cmd; | 2493 | c->Request.CDB[0] = cmd; |
2427 | c->Request.CDB[6] = (size >> 24) & 0xFF; //MSB | 2494 | c->Request.CDB[6] = (size >> 24) & 0xFF; /* MSB */ |
2428 | c->Request.CDB[7] = (size >> 16) & 0xFF; | 2495 | c->Request.CDB[7] = (size >> 16) & 0xFF; |
2429 | c->Request.CDB[8] = (size >> 8) & 0xFF; | 2496 | c->Request.CDB[8] = (size >> 8) & 0xFF; |
2430 | c->Request.CDB[9] = size & 0xFF; | 2497 | c->Request.CDB[9] = size & 0xFF; |
@@ -2531,6 +2598,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c) | |||
2531 | case 0: return IO_OK; /* no sense */ | 2598 | case 0: return IO_OK; /* no sense */ |
2532 | case 1: return IO_OK; /* recovered error */ | 2599 | case 1: return IO_OK; /* recovered error */ |
2533 | default: | 2600 | default: |
2601 | if (check_for_unit_attention(h, c)) | ||
2602 | return IO_NEEDS_RETRY; | ||
2534 | printk(KERN_WARNING "cciss%d: cmd 0x%02x " | 2603 | printk(KERN_WARNING "cciss%d: cmd 0x%02x " |
2535 | "check condition, sense key = 0x%02x\n", | 2604 | "check condition, sense key = 0x%02x\n", |
2536 | h->ctlr, c->Request.CDB[0], | 2605 | h->ctlr, c->Request.CDB[0], |
@@ -2672,7 +2741,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, | |||
2672 | } | 2741 | } |
2673 | 2742 | ||
2674 | static void cciss_geometry_inquiry(int ctlr, int logvol, | 2743 | static void cciss_geometry_inquiry(int ctlr, int logvol, |
2675 | int withirq, sector_t total_size, | 2744 | sector_t total_size, |
2676 | unsigned int block_size, | 2745 | unsigned int block_size, |
2677 | InquiryData_struct *inq_buff, | 2746 | InquiryData_struct *inq_buff, |
2678 | drive_info_struct *drv) | 2747 | drive_info_struct *drv) |
@@ -2683,21 +2752,15 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, | |||
2683 | 2752 | ||
2684 | memset(inq_buff, 0, sizeof(InquiryData_struct)); | 2753 | memset(inq_buff, 0, sizeof(InquiryData_struct)); |
2685 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); | 2754 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); |
2686 | if (withirq) | 2755 | return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff, |
2687 | return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, | 2756 | sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD); |
2688 | inq_buff, sizeof(*inq_buff), | ||
2689 | 0xC1, scsi3addr, TYPE_CMD); | ||
2690 | else | ||
2691 | return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff, | ||
2692 | sizeof(*inq_buff), 0xC1, scsi3addr, | ||
2693 | TYPE_CMD); | ||
2694 | if (return_code == IO_OK) { | 2757 | if (return_code == IO_OK) { |
2695 | if (inq_buff->data_byte[8] == 0xFF) { | 2758 | if (inq_buff->data_byte[8] == 0xFF) { |
2696 | printk(KERN_WARNING | 2759 | printk(KERN_WARNING |
2697 | "cciss: reading geometry failed, volume " | 2760 | "cciss: reading geometry failed, volume " |
2698 | "does not support reading geometry\n"); | 2761 | "does not support reading geometry\n"); |
2699 | drv->heads = 255; | 2762 | drv->heads = 255; |
2700 | drv->sectors = 32; // Sectors per track | 2763 | drv->sectors = 32; /* Sectors per track */ |
2701 | drv->cylinders = total_size + 1; | 2764 | drv->cylinders = total_size + 1; |
2702 | drv->raid_level = RAID_UNKNOWN; | 2765 | drv->raid_level = RAID_UNKNOWN; |
2703 | } else { | 2766 | } else { |
@@ -2723,7 +2786,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, | |||
2723 | } | 2786 | } |
2724 | 2787 | ||
2725 | static void | 2788 | static void |
2726 | cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, | 2789 | cciss_read_capacity(int ctlr, int logvol, sector_t *total_size, |
2727 | unsigned int *block_size) | 2790 | unsigned int *block_size) |
2728 | { | 2791 | { |
2729 | ReadCapdata_struct *buf; | 2792 | ReadCapdata_struct *buf; |
@@ -2737,14 +2800,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, | |||
2737 | } | 2800 | } |
2738 | 2801 | ||
2739 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); | 2802 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); |
2740 | if (withirq) | 2803 | return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf, |
2741 | return_code = sendcmd_withirq(CCISS_READ_CAPACITY, | 2804 | sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD); |
2742 | ctlr, buf, sizeof(ReadCapdata_struct), | ||
2743 | 0, scsi3addr, TYPE_CMD); | ||
2744 | else | ||
2745 | return_code = sendcmd(CCISS_READ_CAPACITY, | ||
2746 | ctlr, buf, sizeof(ReadCapdata_struct), | ||
2747 | 0, scsi3addr, TYPE_CMD); | ||
2748 | if (return_code == IO_OK) { | 2805 | if (return_code == IO_OK) { |
2749 | *total_size = be32_to_cpu(*(__be32 *) buf->total_size); | 2806 | *total_size = be32_to_cpu(*(__be32 *) buf->total_size); |
2750 | *block_size = be32_to_cpu(*(__be32 *) buf->block_size); | 2807 | *block_size = be32_to_cpu(*(__be32 *) buf->block_size); |
@@ -2756,8 +2813,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, | |||
2756 | kfree(buf); | 2813 | kfree(buf); |
2757 | } | 2814 | } |
2758 | 2815 | ||
2759 | static void | 2816 | static void cciss_read_capacity_16(int ctlr, int logvol, |
2760 | cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size) | 2817 | sector_t *total_size, unsigned int *block_size) |
2761 | { | 2818 | { |
2762 | ReadCapdata_struct_16 *buf; | 2819 | ReadCapdata_struct_16 *buf; |
2763 | int return_code; | 2820 | int return_code; |
@@ -2770,16 +2827,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, | |||
2770 | } | 2827 | } |
2771 | 2828 | ||
2772 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); | 2829 | log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); |
2773 | if (withirq) { | 2830 | return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, |
2774 | return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, | 2831 | ctlr, buf, sizeof(ReadCapdata_struct_16), |
2775 | ctlr, buf, sizeof(ReadCapdata_struct_16), | 2832 | 0, scsi3addr, TYPE_CMD); |
2776 | 0, scsi3addr, TYPE_CMD); | ||
2777 | } | ||
2778 | else { | ||
2779 | return_code = sendcmd(CCISS_READ_CAPACITY_16, | ||
2780 | ctlr, buf, sizeof(ReadCapdata_struct_16), | ||
2781 | 0, scsi3addr, TYPE_CMD); | ||
2782 | } | ||
2783 | if (return_code == IO_OK) { | 2833 | if (return_code == IO_OK) { |
2784 | *total_size = be64_to_cpu(*(__be64 *) buf->total_size); | 2834 | *total_size = be64_to_cpu(*(__be64 *) buf->total_size); |
2785 | *block_size = be32_to_cpu(*(__be32 *) buf->block_size); | 2835 | *block_size = be32_to_cpu(*(__be32 *) buf->block_size); |
@@ -2820,13 +2870,13 @@ static int cciss_revalidate(struct gendisk *disk) | |||
2820 | return 1; | 2870 | return 1; |
2821 | } | 2871 | } |
2822 | if (h->cciss_read == CCISS_READ_10) { | 2872 | if (h->cciss_read == CCISS_READ_10) { |
2823 | cciss_read_capacity(h->ctlr, logvol, 1, | 2873 | cciss_read_capacity(h->ctlr, logvol, |
2824 | &total_size, &block_size); | 2874 | &total_size, &block_size); |
2825 | } else { | 2875 | } else { |
2826 | cciss_read_capacity_16(h->ctlr, logvol, 1, | 2876 | cciss_read_capacity_16(h->ctlr, logvol, |
2827 | &total_size, &block_size); | 2877 | &total_size, &block_size); |
2828 | } | 2878 | } |
2829 | cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, | 2879 | cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size, |
2830 | inq_buff, drv); | 2880 | inq_buff, drv); |
2831 | 2881 | ||
2832 | blk_queue_logical_block_size(drv->queue, drv->block_size); | 2882 | blk_queue_logical_block_size(drv->queue, drv->block_size); |
@@ -2837,167 +2887,6 @@ static int cciss_revalidate(struct gendisk *disk) | |||
2837 | } | 2887 | } |
2838 | 2888 | ||
2839 | /* | 2889 | /* |
2840 | * Wait polling for a command to complete. | ||
2841 | * The memory mapped FIFO is polled for the completion. | ||
2842 | * Used only at init time, interrupts from the HBA are disabled. | ||
2843 | */ | ||
2844 | static unsigned long pollcomplete(int ctlr) | ||
2845 | { | ||
2846 | unsigned long done; | ||
2847 | int i; | ||
2848 | |||
2849 | /* Wait (up to 20 seconds) for a command to complete */ | ||
2850 | |||
2851 | for (i = 20 * HZ; i > 0; i--) { | ||
2852 | done = hba[ctlr]->access.command_completed(hba[ctlr]); | ||
2853 | if (done == FIFO_EMPTY) | ||
2854 | schedule_timeout_uninterruptible(1); | ||
2855 | else | ||
2856 | return done; | ||
2857 | } | ||
2858 | /* Invalid address to tell caller we ran out of time */ | ||
2859 | return 1; | ||
2860 | } | ||
2861 | |||
2862 | /* Send command c to controller h and poll for it to complete. | ||
2863 | * Turns interrupts off on the board. Used at driver init time | ||
2864 | * and during SCSI error recovery. | ||
2865 | */ | ||
2866 | static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c) | ||
2867 | { | ||
2868 | int i; | ||
2869 | unsigned long complete; | ||
2870 | int status = IO_ERROR; | ||
2871 | u64bit buff_dma_handle; | ||
2872 | |||
2873 | resend_cmd1: | ||
2874 | |||
2875 | /* Disable interrupt on the board. */ | ||
2876 | h->access.set_intr_mask(h, CCISS_INTR_OFF); | ||
2877 | |||
2878 | /* Make sure there is room in the command FIFO */ | ||
2879 | /* Actually it should be completely empty at this time */ | ||
2880 | /* unless we are in here doing error handling for the scsi */ | ||
2881 | /* tape side of the driver. */ | ||
2882 | for (i = 200000; i > 0; i--) { | ||
2883 | /* if fifo isn't full go */ | ||
2884 | if (!(h->access.fifo_full(h))) | ||
2885 | break; | ||
2886 | udelay(10); | ||
2887 | printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full," | ||
2888 | " waiting!\n", h->ctlr); | ||
2889 | } | ||
2890 | h->access.submit_command(h, c); /* Send the cmd */ | ||
2891 | do { | ||
2892 | complete = pollcomplete(h->ctlr); | ||
2893 | |||
2894 | #ifdef CCISS_DEBUG | ||
2895 | printk(KERN_DEBUG "cciss: command completed\n"); | ||
2896 | #endif /* CCISS_DEBUG */ | ||
2897 | |||
2898 | if (complete == 1) { | ||
2899 | printk(KERN_WARNING | ||
2900 | "cciss cciss%d: SendCmd Timeout out, " | ||
2901 | "No command list address returned!\n", h->ctlr); | ||
2902 | status = IO_ERROR; | ||
2903 | break; | ||
2904 | } | ||
2905 | |||
2906 | /* Make sure it's the command we're expecting. */ | ||
2907 | if ((complete & ~CISS_ERROR_BIT) != c->busaddr) { | ||
2908 | printk(KERN_WARNING "cciss%d: Unexpected command " | ||
2909 | "completion.\n", h->ctlr); | ||
2910 | continue; | ||
2911 | } | ||
2912 | |||
2913 | /* It is our command. If no error, we're done. */ | ||
2914 | if (!(complete & CISS_ERROR_BIT)) { | ||
2915 | status = IO_OK; | ||
2916 | break; | ||
2917 | } | ||
2918 | |||
2919 | /* There is an error... */ | ||
2920 | |||
2921 | /* if data overrun or underun on Report command ignore it */ | ||
2922 | if (((c->Request.CDB[0] == CISS_REPORT_LOG) || | ||
2923 | (c->Request.CDB[0] == CISS_REPORT_PHYS) || | ||
2924 | (c->Request.CDB[0] == CISS_INQUIRY)) && | ||
2925 | ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) || | ||
2926 | (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) { | ||
2927 | complete = c->busaddr; | ||
2928 | status = IO_OK; | ||
2929 | break; | ||
2930 | } | ||
2931 | |||
2932 | if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) { | ||
2933 | printk(KERN_WARNING "cciss%d: unsolicited abort %p\n", | ||
2934 | h->ctlr, c); | ||
2935 | if (c->retry_count < MAX_CMD_RETRIES) { | ||
2936 | printk(KERN_WARNING "cciss%d: retrying %p\n", | ||
2937 | h->ctlr, c); | ||
2938 | c->retry_count++; | ||
2939 | /* erase the old error information */ | ||
2940 | memset(c->err_info, 0, sizeof(c->err_info)); | ||
2941 | goto resend_cmd1; | ||
2942 | } | ||
2943 | printk(KERN_WARNING "cciss%d: retried %p too many " | ||
2944 | "times\n", h->ctlr, c); | ||
2945 | status = IO_ERROR; | ||
2946 | break; | ||
2947 | } | ||
2948 | |||
2949 | if (c->err_info->CommandStatus == CMD_UNABORTABLE) { | ||
2950 | printk(KERN_WARNING "cciss%d: command could not be " | ||
2951 | "aborted.\n", h->ctlr); | ||
2952 | status = IO_ERROR; | ||
2953 | break; | ||
2954 | } | ||
2955 | |||
2956 | if (c->err_info->CommandStatus == CMD_TARGET_STATUS) { | ||
2957 | status = check_target_status(h, c); | ||
2958 | break; | ||
2959 | } | ||
2960 | |||
2961 | printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr); | ||
2962 | printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n", | ||
2963 | c->Request.CDB[0], c->err_info->CommandStatus); | ||
2964 | status = IO_ERROR; | ||
2965 | break; | ||
2966 | |||
2967 | } while (1); | ||
2968 | |||
2969 | /* unlock the data buffer from DMA */ | ||
2970 | buff_dma_handle.val32.lower = c->SG[0].Addr.lower; | ||
2971 | buff_dma_handle.val32.upper = c->SG[0].Addr.upper; | ||
2972 | pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val, | ||
2973 | c->SG[0].Len, PCI_DMA_BIDIRECTIONAL); | ||
2974 | return status; | ||
2975 | } | ||
2976 | |||
2977 | /* | ||
2978 | * Send a command to the controller, and wait for it to complete. | ||
2979 | * Used at init time, and during SCSI error recovery. | ||
2980 | */ | ||
2981 | static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, | ||
2982 | __u8 page_code, unsigned char *scsi3addr, int cmd_type) | ||
2983 | { | ||
2984 | CommandList_struct *c; | ||
2985 | int status; | ||
2986 | |||
2987 | c = cmd_alloc(hba[ctlr], 1); | ||
2988 | if (!c) { | ||
2989 | printk(KERN_WARNING "cciss: unable to get memory"); | ||
2990 | return IO_ERROR; | ||
2991 | } | ||
2992 | status = fill_cmd(c, cmd, ctlr, buff, size, page_code, | ||
2993 | scsi3addr, cmd_type); | ||
2994 | if (status == IO_OK) | ||
2995 | status = sendcmd_core(hba[ctlr], c); | ||
2996 | cmd_free(hba[ctlr], c, 1); | ||
2997 | return status; | ||
2998 | } | ||
2999 | |||
3000 | /* | ||
3001 | * Map (physical) PCI mem into (virtual) kernel space | 2890 | * Map (physical) PCI mem into (virtual) kernel space |
3002 | */ | 2891 | */ |
3003 | static void __iomem *remap_pci_mem(ulong base, ulong size) | 2892 | static void __iomem *remap_pci_mem(ulong base, ulong size) |
@@ -3255,9 +3144,12 @@ static void do_cciss_request(struct request_queue *q) | |||
3255 | int seg; | 3144 | int seg; |
3256 | struct request *creq; | 3145 | struct request *creq; |
3257 | u64bit temp64; | 3146 | u64bit temp64; |
3258 | struct scatterlist tmp_sg[MAXSGENTRIES]; | 3147 | struct scatterlist *tmp_sg; |
3148 | SGDescriptor_struct *curr_sg; | ||
3259 | drive_info_struct *drv; | 3149 | drive_info_struct *drv; |
3260 | int i, dir; | 3150 | int i, dir; |
3151 | int sg_index = 0; | ||
3152 | int chained = 0; | ||
3261 | 3153 | ||
3262 | /* We call start_io here in case there is a command waiting on the | 3154 | /* We call start_io here in case there is a command waiting on the |
3263 | * queue that has not been sent. | 3155 | * queue that has not been sent. |
@@ -3270,13 +3162,14 @@ static void do_cciss_request(struct request_queue *q) | |||
3270 | if (!creq) | 3162 | if (!creq) |
3271 | goto startio; | 3163 | goto startio; |
3272 | 3164 | ||
3273 | BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); | 3165 | BUG_ON(creq->nr_phys_segments > h->maxsgentries); |
3274 | 3166 | ||
3275 | if ((c = cmd_alloc(h, 1)) == NULL) | 3167 | if ((c = cmd_alloc(h, 1)) == NULL) |
3276 | goto full; | 3168 | goto full; |
3277 | 3169 | ||
3278 | blk_start_request(creq); | 3170 | blk_start_request(creq); |
3279 | 3171 | ||
3172 | tmp_sg = h->scatter_list[c->cmdindex]; | ||
3280 | spin_unlock_irq(q->queue_lock); | 3173 | spin_unlock_irq(q->queue_lock); |
3281 | 3174 | ||
3282 | c->cmd_type = CMD_RWREQ; | 3175 | c->cmd_type = CMD_RWREQ; |
@@ -3284,19 +3177,19 @@ static void do_cciss_request(struct request_queue *q) | |||
3284 | 3177 | ||
3285 | /* fill in the request */ | 3178 | /* fill in the request */ |
3286 | drv = creq->rq_disk->private_data; | 3179 | drv = creq->rq_disk->private_data; |
3287 | c->Header.ReplyQueue = 0; // unused in simple mode | 3180 | c->Header.ReplyQueue = 0; /* unused in simple mode */ |
3288 | /* got command from pool, so use the command block index instead */ | 3181 | /* got command from pool, so use the command block index instead */ |
3289 | /* for direct lookups. */ | 3182 | /* for direct lookups. */ |
3290 | /* The first 2 bits are reserved for controller error reporting. */ | 3183 | /* The first 2 bits are reserved for controller error reporting. */ |
3291 | c->Header.Tag.lower = (c->cmdindex << 3); | 3184 | c->Header.Tag.lower = (c->cmdindex << 3); |
3292 | c->Header.Tag.lower |= 0x04; /* flag for direct lookup. */ | 3185 | c->Header.Tag.lower |= 0x04; /* flag for direct lookup. */ |
3293 | memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID)); | 3186 | memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID)); |
3294 | c->Request.CDBLen = 10; // 12 byte commands not in FW yet; | 3187 | c->Request.CDBLen = 10; /* 12 byte commands not in FW yet; */ |
3295 | c->Request.Type.Type = TYPE_CMD; // It is a command. | 3188 | c->Request.Type.Type = TYPE_CMD; /* It is a command. */ |
3296 | c->Request.Type.Attribute = ATTR_SIMPLE; | 3189 | c->Request.Type.Attribute = ATTR_SIMPLE; |
3297 | c->Request.Type.Direction = | 3190 | c->Request.Type.Direction = |
3298 | (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE; | 3191 | (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE; |
3299 | c->Request.Timeout = 0; // Don't time out | 3192 | c->Request.Timeout = 0; /* Don't time out */ |
3300 | c->Request.CDB[0] = | 3193 | c->Request.CDB[0] = |
3301 | (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write; | 3194 | (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write; |
3302 | start_blk = blk_rq_pos(creq); | 3195 | start_blk = blk_rq_pos(creq); |
@@ -3305,7 +3198,7 @@ static void do_cciss_request(struct request_queue *q) | |||
3305 | (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); | 3198 | (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); |
3306 | #endif /* CCISS_DEBUG */ | 3199 | #endif /* CCISS_DEBUG */ |
3307 | 3200 | ||
3308 | sg_init_table(tmp_sg, MAXSGENTRIES); | 3201 | sg_init_table(tmp_sg, h->maxsgentries); |
3309 | seg = blk_rq_map_sg(q, creq, tmp_sg); | 3202 | seg = blk_rq_map_sg(q, creq, tmp_sg); |
3310 | 3203 | ||
3311 | /* get the DMA records for the setup */ | 3204 | /* get the DMA records for the setup */ |
@@ -3314,33 +3207,54 @@ static void do_cciss_request(struct request_queue *q) | |||
3314 | else | 3207 | else |
3315 | dir = PCI_DMA_TODEVICE; | 3208 | dir = PCI_DMA_TODEVICE; |
3316 | 3209 | ||
3210 | curr_sg = c->SG; | ||
3211 | sg_index = 0; | ||
3212 | chained = 0; | ||
3213 | |||
3317 | for (i = 0; i < seg; i++) { | 3214 | for (i = 0; i < seg; i++) { |
3318 | c->SG[i].Len = tmp_sg[i].length; | 3215 | if (((sg_index+1) == (h->max_cmd_sgentries)) && |
3216 | !chained && ((seg - i) > 1)) { | ||
3217 | /* Point to next chain block. */ | ||
3218 | curr_sg = h->cmd_sg_list[c->cmdindex]; | ||
3219 | sg_index = 0; | ||
3220 | chained = 1; | ||
3221 | } | ||
3222 | curr_sg[sg_index].Len = tmp_sg[i].length; | ||
3319 | temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), | 3223 | temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), |
3320 | tmp_sg[i].offset, | 3224 | tmp_sg[i].offset, |
3321 | tmp_sg[i].length, dir); | 3225 | tmp_sg[i].length, dir); |
3322 | c->SG[i].Addr.lower = temp64.val32.lower; | 3226 | curr_sg[sg_index].Addr.lower = temp64.val32.lower; |
3323 | c->SG[i].Addr.upper = temp64.val32.upper; | 3227 | curr_sg[sg_index].Addr.upper = temp64.val32.upper; |
3324 | c->SG[i].Ext = 0; // we are not chaining | 3228 | curr_sg[sg_index].Ext = 0; /* we are not chaining */ |
3229 | ++sg_index; | ||
3325 | } | 3230 | } |
3231 | if (chained) | ||
3232 | cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex], | ||
3233 | (seg - (h->max_cmd_sgentries - 1)) * | ||
3234 | sizeof(SGDescriptor_struct)); | ||
3235 | |||
3326 | /* track how many SG entries we are using */ | 3236 | /* track how many SG entries we are using */ |
3327 | if (seg > h->maxSG) | 3237 | if (seg > h->maxSG) |
3328 | h->maxSG = seg; | 3238 | h->maxSG = seg; |
3329 | 3239 | ||
3330 | #ifdef CCISS_DEBUG | 3240 | #ifdef CCISS_DEBUG |
3331 | printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", | 3241 | printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments " |
3332 | blk_rq_sectors(creq), seg); | 3242 | "chained[%d]\n", |
3243 | blk_rq_sectors(creq), seg, chained); | ||
3333 | #endif /* CCISS_DEBUG */ | 3244 | #endif /* CCISS_DEBUG */ |
3334 | 3245 | ||
3335 | c->Header.SGList = c->Header.SGTotal = seg; | 3246 | c->Header.SGList = c->Header.SGTotal = seg + chained; |
3247 | if (seg > h->max_cmd_sgentries) | ||
3248 | c->Header.SGList = h->max_cmd_sgentries; | ||
3249 | |||
3336 | if (likely(blk_fs_request(creq))) { | 3250 | if (likely(blk_fs_request(creq))) { |
3337 | if(h->cciss_read == CCISS_READ_10) { | 3251 | if(h->cciss_read == CCISS_READ_10) { |
3338 | c->Request.CDB[1] = 0; | 3252 | c->Request.CDB[1] = 0; |
3339 | c->Request.CDB[2] = (start_blk >> 24) & 0xff; //MSB | 3253 | c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */ |
3340 | c->Request.CDB[3] = (start_blk >> 16) & 0xff; | 3254 | c->Request.CDB[3] = (start_blk >> 16) & 0xff; |
3341 | c->Request.CDB[4] = (start_blk >> 8) & 0xff; | 3255 | c->Request.CDB[4] = (start_blk >> 8) & 0xff; |
3342 | c->Request.CDB[5] = start_blk & 0xff; | 3256 | c->Request.CDB[5] = start_blk & 0xff; |
3343 | c->Request.CDB[6] = 0; // (sect >> 24) & 0xff; MSB | 3257 | c->Request.CDB[6] = 0; /* (sect >> 24) & 0xff; MSB */ |
3344 | c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff; | 3258 | c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff; |
3345 | c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff; | 3259 | c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff; |
3346 | c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0; | 3260 | c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0; |
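With chaining in place, the header accounting in do_cciss_request follows directly: SGTotal counts every descriptor the controller will walk, including the chain pointer, while SGList is capped at what physically fits in the command block. A worked example, assuming max_cmd_sgentries == 32 and a request that maps to seg == 40 scatter-gather entries:

        /* Data segments 0..30 fill c->SG[0..30]; c->SG[31] becomes the chain
         * pointer; segments 31..39 (nine of them) land in the chain block. */
        chained = 1;
        c->Header.SGTotal = seg + chained;         /* 40 + 1 = 41 */
        c->Header.SGList  = h->max_cmd_sgentries;  /* capped at 32 */

        /* cciss_map_sg_chain_block() is then called with
         * (seg - (max_cmd_sgentries - 1)) * sizeof(SGDescriptor_struct),
         * i.e. nine descriptors' worth of chain space. */

cciss_softirq_done() later walks the same layout, switching from the embedded list to h->cmd_sg_list[cmd->cmdindex] when it hits the CCISS_SG_CHAIN descriptor.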
@@ -3349,7 +3263,7 @@ static void do_cciss_request(struct request_queue *q) | |||
3349 | 3263 | ||
3350 | c->Request.CDBLen = 16; | 3264 | c->Request.CDBLen = 16; |
3351 | c->Request.CDB[1]= 0; | 3265 | c->Request.CDB[1]= 0; |
3352 | c->Request.CDB[2]= (upper32 >> 24) & 0xff; //MSB | 3266 | c->Request.CDB[2]= (upper32 >> 24) & 0xff; /* MSB */ |
3353 | c->Request.CDB[3]= (upper32 >> 16) & 0xff; | 3267 | c->Request.CDB[3]= (upper32 >> 16) & 0xff; |
3354 | c->Request.CDB[4]= (upper32 >> 8) & 0xff; | 3268 | c->Request.CDB[4]= (upper32 >> 8) & 0xff; |
3355 | c->Request.CDB[5]= upper32 & 0xff; | 3269 | c->Request.CDB[5]= upper32 & 0xff; |
@@ -3427,6 +3341,7 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id) | |||
3427 | printk(KERN_WARNING | 3341 | printk(KERN_WARNING |
3428 | "cciss: controller cciss%d failed, stopping.\n", | 3342 | "cciss: controller cciss%d failed, stopping.\n", |
3429 | h->ctlr); | 3343 | h->ctlr); |
3344 | spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); | ||
3430 | fail_all_cmds(h->ctlr); | 3345 | fail_all_cmds(h->ctlr); |
3431 | return IRQ_HANDLED; | 3346 | return IRQ_HANDLED; |
3432 | } | 3347 | } |
@@ -3513,28 +3428,33 @@ static int add_to_scan_list(struct ctlr_info *h) | |||
3513 | * @h: Pointer to the controller. | 3428 | * @h: Pointer to the controller. |
3514 | * | 3429 | * |
3515 | * Removes the controller from the rescan queue if present. Blocks if | 3430 | * Removes the controller from the rescan queue if present. Blocks if |
3516 | * the controller is currently conducting a rescan. | 3431 | * the controller is currently conducting a rescan. The controller |
3432 | * can be in one of three states: | ||
3433 | * 1. Doesn't need a scan | ||
3434 | * 2. On the scan list, but not scanning yet (we remove it) | ||
3435 | * 3. Busy scanning (and not on the list). In this case we want to wait for | ||
3436 | * the scan to complete to make sure the scanning thread for this | ||
3437 | * controller is completely idle. | ||
3517 | **/ | 3438 | **/ |
3518 | static void remove_from_scan_list(struct ctlr_info *h) | 3439 | static void remove_from_scan_list(struct ctlr_info *h) |
3519 | { | 3440 | { |
3520 | struct ctlr_info *test_h, *tmp_h; | 3441 | struct ctlr_info *test_h, *tmp_h; |
3521 | int scanning = 0; | ||
3522 | 3442 | ||
3523 | mutex_lock(&scan_mutex); | 3443 | mutex_lock(&scan_mutex); |
3524 | list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { | 3444 | list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { |
3525 | if (test_h == h) { | 3445 | if (test_h == h) { /* state 2. */ |
3526 | list_del(&h->scan_list); | 3446 | list_del(&h->scan_list); |
3527 | complete_all(&h->scan_wait); | 3447 | complete_all(&h->scan_wait); |
3528 | mutex_unlock(&scan_mutex); | 3448 | mutex_unlock(&scan_mutex); |
3529 | return; | 3449 | return; |
3530 | } | 3450 | } |
3531 | } | 3451 | } |
3532 | if (&h->busy_scanning) | 3452 | if (h->busy_scanning) { /* state 3. */ |
3533 | scanning = 0; | 3453 | mutex_unlock(&scan_mutex); |
3534 | mutex_unlock(&scan_mutex); | ||
3535 | |||
3536 | if (scanning) | ||
3537 | wait_for_completion(&h->scan_wait); | 3454 | wait_for_completion(&h->scan_wait); |
3455 | } else { /* state 1, nothing to do. */ | ||
3456 | mutex_unlock(&scan_mutex); | ||
3457 | } | ||
3538 | } | 3458 | } |
3539 | 3459 | ||
3540 | /** | 3460 | /** |
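The remove_from_scan_list() rewrite above also fixes the old wait logic: the removed code tested the address of busy_scanning, which is always non-zero, and then set the local scanning flag to 0, so the wait_for_completion() below it was dead code and the function never waited for an in-progress rescan. Reduced to the essentials:

        /* Old: the condition is always true and only ever clears the flag,
         * so the wait below can never run. */
        if (&h->busy_scanning)
                scanning = 0;
        mutex_unlock(&scan_mutex);
        if (scanning)
                wait_for_completion(&h->scan_wait);

        /* New: test the flag itself, drop the mutex, then wait. */
        if (h->busy_scanning) {
                mutex_unlock(&scan_mutex);
                wait_for_completion(&h->scan_wait);
        } else {
                mutex_unlock(&scan_mutex);
        }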
@@ -3573,13 +3493,11 @@ static int scan_thread(void *data) | |||
3573 | h->busy_scanning = 1; | 3493 | h->busy_scanning = 1; |
3574 | mutex_unlock(&scan_mutex); | 3494 | mutex_unlock(&scan_mutex); |
3575 | 3495 | ||
3576 | if (h) { | 3496 | rebuild_lun_table(h, 0, 0); |
3577 | rebuild_lun_table(h, 0, 0); | 3497 | complete_all(&h->scan_wait); |
3578 | complete_all(&h->scan_wait); | 3498 | mutex_lock(&scan_mutex); |
3579 | mutex_lock(&scan_mutex); | 3499 | h->busy_scanning = 0; |
3580 | h->busy_scanning = 0; | 3500 | mutex_unlock(&scan_mutex); |
3581 | mutex_unlock(&scan_mutex); | ||
3582 | } | ||
3583 | } | 3501 | } |
3584 | } | 3502 | } |
3585 | 3503 | ||
@@ -3605,8 +3523,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c) | |||
3605 | case REPORT_LUNS_CHANGED: | 3523 | case REPORT_LUNS_CHANGED: |
3606 | printk(KERN_WARNING "cciss%d: report LUN data " | 3524 | printk(KERN_WARNING "cciss%d: report LUN data " |
3607 | "changed\n", h->ctlr); | 3525 | "changed\n", h->ctlr); |
3608 | add_to_scan_list(h); | 3526 | /* |
3609 | wake_up_process(cciss_scan_thread); | 3527 | * Here, we could call add_to_scan_list and wake up the scan thread, |
3528 | * except that it's quite likely that we will get more than one | ||
3529 | * REPORT_LUNS_CHANGED condition in quick succession, which means | ||
3530 | * that those which occur after the first one will likely happen | ||
3531 | * *during* the scan_thread's rescan. And the rescan code is not | ||
3532 | * robust enough to restart in the middle, undoing what it has already | ||
3533 | * done, and it's not clear that it's even possible to do this, since | ||
3534 | * part of what it does is notify the block layer, which starts | ||
3535 | * doing its own i/o to read partition tables and so on, and the | ||
3536 | * driver doesn't have visibility to know what might need undoing. | ||
3537 | * In any event, if possible, it is horribly complicated to get right | ||
3538 | * so we just don't do it for now. | ||
3539 | * | ||
3540 | * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012. | ||
3541 | */ | ||
3610 | return 1; | 3542 | return 1; |
3611 | break; | 3543 | break; |
3612 | case POWER_OR_RESET: | 3544 | case POWER_OR_RESET: |
@@ -3888,6 +3820,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) | |||
3888 | * leave a little room for ioctl calls. | 3820 | * leave a little room for ioctl calls. |
3889 | */ | 3821 | */ |
3890 | c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); | 3822 | c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); |
3823 | c->maxsgentries = readl(&(c->cfgtable->MaxSGElements)); | ||
3824 | |||
3825 | /* | ||
3826 | * Limit native command to 32 s/g elements to save dma'able memory. | ||
3827 | * However, the spec says if 0, use 31 | ||
3828 | */ | ||
3829 | |||
3830 | c->max_cmd_sgentries = 31; | ||
3831 | if (c->maxsgentries > 512) { | ||
3832 | c->max_cmd_sgentries = 32; | ||
3833 | c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1; | ||
3834 | c->maxsgentries -= 1; /* account for chain pointer */ | ||
3835 | } else { | ||
3836 | c->maxsgentries = 31; /* Default to traditional value */ | ||
3837 | c->chainsize = 0; /* traditional */ | ||
3838 | } | ||
3839 | |||
3891 | c->product_name = products[prod_index].product_name; | 3840 | c->product_name = products[prod_index].product_name; |
3892 | c->access = *(products[prod_index].access); | 3841 | c->access = *(products[prod_index].access); |
3893 | c->nr_cmds = c->max_commands - 4; | 3842 | c->nr_cmds = c->max_commands - 4; |
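
As an aside for readers following the scatter/gather sizing in the hunk above: the arithmetic can be reproduced with a small standalone sketch (not part of the patch). The sample controller value of 544 for MaxSGElements below is purely hypothetical.

#include <stdio.h>

/* Hypothetical helper mirroring the cciss_pci_init() sizing logic above. */
static void cciss_sg_sizing_example(int maxsgentries)
{
	int max_cmd_sgentries = 31;	/* default: everything fits in the command */
	int chainsize = 0;		/* no chained s/g block */

	if (maxsgentries > 512) {
		/* 31 embedded entries plus one chain pointer in the command */
		max_cmd_sgentries = 32;
		chainsize = maxsgentries - max_cmd_sgentries + 1;
		maxsgentries -= 1;	/* account for the chain pointer */
	} else {
		maxsgentries = 31;	/* traditional limit */
	}
	printf("maxsg=%d embedded=%d chain=%d\n",
	       maxsgentries, max_cmd_sgentries, chainsize);
}

int main(void)
{
	cciss_sg_sizing_example(544);	/* hypothetical controller value */
	return 0;
}

For that input the sketch prints maxsg=543, embedded=32 and chain=513, matching what cciss_pci_init() would store in maxsgentries, max_cmd_sgentries and chainsize.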
@@ -4214,6 +4163,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, | |||
4214 | { | 4163 | { |
4215 | int i; | 4164 | int i; |
4216 | int j = 0; | 4165 | int j = 0; |
4166 | int k = 0; | ||
4217 | int rc; | 4167 | int rc; |
4218 | int dac, return_code; | 4168 | int dac, return_code; |
4219 | InquiryData_struct *inq_buff; | 4169 | InquiryData_struct *inq_buff; |
@@ -4317,6 +4267,26 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, | |||
4317 | printk(KERN_ERR "cciss: out of memory"); | 4267 | printk(KERN_ERR "cciss: out of memory"); |
4318 | goto clean4; | 4268 | goto clean4; |
4319 | } | 4269 | } |
4270 | |||
4271 | /* Need space for temp scatter list */ | ||
4272 | hba[i]->scatter_list = kmalloc(hba[i]->max_commands * | ||
4273 | sizeof(struct scatterlist *), | ||
4274 | GFP_KERNEL); | ||
4275 | for (k = 0; k < hba[i]->nr_cmds; k++) { | ||
4276 | hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * | ||
4277 | hba[i]->maxsgentries, | ||
4278 | GFP_KERNEL); | ||
4279 | if (hba[i]->scatter_list[k] == NULL) { | ||
4280 | printk(KERN_ERR "cciss%d: could not allocate " | ||
4281 | "s/g lists\n", i); | ||
4282 | goto clean4; | ||
4283 | } | ||
4284 | } | ||
4285 | hba[i]->cmd_sg_list = cciss_allocate_sg_chain_blocks(hba[i], | ||
4286 | hba[i]->chainsize, hba[i]->nr_cmds); | ||
4287 | if (!hba[i]->cmd_sg_list && hba[i]->chainsize > 0) | ||
4288 | goto clean4; | ||
4289 | |||
4320 | spin_lock_init(&hba[i]->lock); | 4290 | spin_lock_init(&hba[i]->lock); |
4321 | 4291 | ||
4322 | /* Initialize the pdev driver private data. | 4292 | /* Initialize the pdev driver private data. |
@@ -4362,7 +4332,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, | |||
4362 | 4332 | ||
4363 | cciss_procinit(i); | 4333 | cciss_procinit(i); |
4364 | 4334 | ||
4365 | hba[i]->cciss_max_sectors = 2048; | 4335 | hba[i]->cciss_max_sectors = 8192; |
4366 | 4336 | ||
4367 | rebuild_lun_table(hba[i], 1, 0); | 4337 | rebuild_lun_table(hba[i], 1, 0); |
4368 | hba[i]->busy_initializing = 0; | 4338 | hba[i]->busy_initializing = 0; |
@@ -4370,6 +4340,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, | |||
4370 | 4340 | ||
4371 | clean4: | 4341 | clean4: |
4372 | kfree(hba[i]->cmd_pool_bits); | 4342 | kfree(hba[i]->cmd_pool_bits); |
4343 | /* Free up sg elements */ | ||
4344 | for (k = 0; k < hba[i]->nr_cmds; k++) | ||
4345 | kfree(hba[i]->scatter_list[k]); | ||
4346 | kfree(hba[i]->scatter_list); | ||
4347 | cciss_free_sg_chain_blocks(hba[i]->cmd_sg_list, hba[i]->nr_cmds); | ||
4373 | if (hba[i]->cmd_pool) | 4348 | if (hba[i]->cmd_pool) |
4374 | pci_free_consistent(hba[i]->pdev, | 4349 | pci_free_consistent(hba[i]->pdev, |
4375 | hba[i]->nr_cmds * sizeof(CommandList_struct), | 4350 | hba[i]->nr_cmds * sizeof(CommandList_struct), |
@@ -4400,30 +4375,28 @@ clean_no_release_regions: | |||
4400 | 4375 | ||
4401 | static void cciss_shutdown(struct pci_dev *pdev) | 4376 | static void cciss_shutdown(struct pci_dev *pdev) |
4402 | { | 4377 | { |
4403 | ctlr_info_t *tmp_ptr; | 4378 | ctlr_info_t *h; |
4404 | int i; | 4379 | char *flush_buf; |
4405 | char flush_buf[4]; | ||
4406 | int return_code; | 4380 | int return_code; |
4407 | 4381 | ||
4408 | tmp_ptr = pci_get_drvdata(pdev); | 4382 | h = pci_get_drvdata(pdev); |
4409 | if (tmp_ptr == NULL) | 4383 | flush_buf = kzalloc(4, GFP_KERNEL); |
4410 | return; | 4384 | if (!flush_buf) { |
4411 | i = tmp_ptr->ctlr; | 4385 | printk(KERN_WARNING |
4412 | if (hba[i] == NULL) | 4386 | "cciss:%d cache not flushed, out of memory.\n", |
4387 | h->ctlr); | ||
4413 | return; | 4388 | return; |
4414 | |||
4415 | /* Turn board interrupts off and send the flush cache command */ | ||
4416 | /* sendcmd will turn off interrupt, and send the flush... | ||
4417 | * To write all data in the battery backed cache to disks */ | ||
4418 | memset(flush_buf, 0, 4); | ||
4419 | return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0, | ||
4420 | CTLR_LUNID, TYPE_CMD); | ||
4421 | if (return_code == IO_OK) { | ||
4422 | printk(KERN_INFO "Completed flushing cache on controller %d\n", i); | ||
4423 | } else { | ||
4424 | printk(KERN_WARNING "Error flushing cache on controller %d\n", i); | ||
4425 | } | 4389 | } |
4426 | free_irq(hba[i]->intr[2], hba[i]); | 4390 | /* write all data in the battery backed cache to disk */ |
4391 | memset(flush_buf, 0, 4); | ||
4392 | return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf, | ||
4393 | 4, 0, CTLR_LUNID, TYPE_CMD); | ||
4394 | kfree(flush_buf); | ||
4395 | if (return_code != IO_OK) | ||
4396 | printk(KERN_WARNING "cciss%d: Error flushing cache\n", | ||
4397 | h->ctlr); | ||
4398 | h->access.set_intr_mask(h, CCISS_INTR_OFF); | ||
4399 | free_irq(h->intr[2], h); | ||
4427 | } | 4400 | } |
4428 | 4401 | ||
4429 | static void __devexit cciss_remove_one(struct pci_dev *pdev) | 4402 | static void __devexit cciss_remove_one(struct pci_dev *pdev) |
@@ -4485,6 +4458,11 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) | |||
4485 | pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), | 4458 | pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), |
4486 | hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); | 4459 | hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); |
4487 | kfree(hba[i]->cmd_pool_bits); | 4460 | kfree(hba[i]->cmd_pool_bits); |
4461 | /* Free up sg elements */ | ||
4462 | for (j = 0; j < hba[i]->nr_cmds; j++) | ||
4463 | kfree(hba[i]->scatter_list[j]); | ||
4464 | kfree(hba[i]->scatter_list); | ||
4465 | cciss_free_sg_chain_blocks(hba[i]->cmd_sg_list, hba[i]->nr_cmds); | ||
4488 | /* | 4466 | /* |
4489 | * Deliberately omit pci_disable_device(): it does something nasty to | 4467 | * Deliberately omit pci_disable_device(): it does something nasty to |
4490 | * Smart Array controllers that pci_enable_device does not undo | 4468 | * Smart Array controllers that pci_enable_device does not undo |
@@ -4517,7 +4495,7 @@ static int __init cciss_init(void) | |||
4517 | * boundary. Given that we use pci_alloc_consistent() to allocate an | 4495 | * boundary. Given that we use pci_alloc_consistent() to allocate an |
4518 | * array of them, the size must be a multiple of 8 bytes. | 4496 | * array of them, the size must be a multiple of 8 bytes. |
4519 | */ | 4497 | */ |
4520 | BUILD_BUG_ON(sizeof(CommandList_struct) % 8); | 4498 | BUILD_BUG_ON(sizeof(CommandList_struct) % COMMANDLIST_ALIGNMENT); |
4521 | 4499 | ||
4522 | printk(KERN_INFO DRIVER_NAME "\n"); | 4500 | printk(KERN_INFO DRIVER_NAME "\n"); |
4523 | 4501 | ||
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 31524cf42c77..c5d411174db0 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h | |||
@@ -55,12 +55,12 @@ typedef struct _drive_info_struct | |||
55 | char device_initialized; /* indicates whether dev is initialized */ | 55 | char device_initialized; /* indicates whether dev is initialized */ |
56 | } drive_info_struct; | 56 | } drive_info_struct; |
57 | 57 | ||
58 | struct ctlr_info | 58 | struct ctlr_info |
59 | { | 59 | { |
60 | int ctlr; | 60 | int ctlr; |
61 | char devname[8]; | 61 | char devname[8]; |
62 | char *product_name; | 62 | char *product_name; |
63 | char firm_ver[4]; // Firmware version | 63 | char firm_ver[4]; /* Firmware version */ |
64 | struct pci_dev *pdev; | 64 | struct pci_dev *pdev; |
65 | __u32 board_id; | 65 | __u32 board_id; |
66 | void __iomem *vaddr; | 66 | void __iomem *vaddr; |
@@ -75,6 +75,16 @@ struct ctlr_info | |||
75 | int num_luns; | 75 | int num_luns; |
76 | int highest_lun; | 76 | int highest_lun; |
77 | int usage_count; /* number of opens on all minor devices */ | 77 | int usage_count; /* number of opens on all minor devices */ |
78 | /* Temporary scatter/gather list, plus the limits we track: | ||
79 | * the number of scatter/gathers supported and the | ||
80 | * number of scatter/gathers in the chained block. | ||
81 | */ | ||
82 | struct scatterlist **scatter_list; | ||
83 | int maxsgentries; | ||
84 | int chainsize; | ||
85 | int max_cmd_sgentries; | ||
86 | SGDescriptor_struct **cmd_sg_list; | ||
87 | |||
78 | # define DOORBELL_INT 0 | 88 | # define DOORBELL_INT 0 |
79 | # define PERF_MODE_INT 1 | 89 | # define PERF_MODE_INT 1 |
80 | # define SIMPLE_MODE_INT 2 | 90 | # define SIMPLE_MODE_INT 2 |
@@ -87,7 +97,7 @@ struct ctlr_info | |||
87 | BYTE cciss_write; | 97 | BYTE cciss_write; |
88 | BYTE cciss_read_capacity; | 98 | BYTE cciss_read_capacity; |
89 | 99 | ||
90 | // information about each logical volume | 100 | /* information about each logical volume */ |
91 | drive_info_struct *drv[CISS_MAX_LUN]; | 101 | drive_info_struct *drv[CISS_MAX_LUN]; |
92 | 102 | ||
93 | struct access_method access; | 103 | struct access_method access; |
@@ -100,7 +110,7 @@ struct ctlr_info | |||
100 | unsigned int maxSG; | 110 | unsigned int maxSG; |
101 | spinlock_t lock; | 111 | spinlock_t lock; |
102 | 112 | ||
103 | //* pointers to command and error info pool */ | 113 | /* pointers to command and error info pool */ |
104 | CommandList_struct *cmd_pool; | 114 | CommandList_struct *cmd_pool; |
105 | dma_addr_t cmd_pool_dhandle; | 115 | dma_addr_t cmd_pool_dhandle; |
106 | ErrorInfo_struct *errinfo_pool; | 116 | ErrorInfo_struct *errinfo_pool; |
@@ -118,12 +128,10 @@ struct ctlr_info | |||
118 | */ | 128 | */ |
119 | int next_to_run; | 129 | int next_to_run; |
120 | 130 | ||
121 | // Disk structures we need to pass back | 131 | /* Disk structures we need to pass back */ |
122 | struct gendisk *gendisk[CISS_MAX_LUN]; | 132 | struct gendisk *gendisk[CISS_MAX_LUN]; |
123 | #ifdef CONFIG_CISS_SCSI_TAPE | 133 | #ifdef CONFIG_CISS_SCSI_TAPE |
124 | void *scsi_ctlr; /* ptr to structure containing scsi related stuff */ | 134 | struct cciss_scsi_adapter_data_t *scsi_ctlr; |
125 | /* list of block side commands the scsi error handling sucked up */ | ||
126 | /* and saved for later processing */ | ||
127 | #endif | 135 | #endif |
128 | unsigned char alive; | 136 | unsigned char alive; |
129 | struct list_head scan_list; | 137 | struct list_head scan_list; |
@@ -299,4 +307,3 @@ struct board_type { | |||
299 | #define CCISS_LOCK(i) (&hba[i]->lock) | 307 | #define CCISS_LOCK(i) (&hba[i]->lock) |
300 | 308 | ||
301 | #endif /* CCISS_H */ | 309 | #endif /* CCISS_H */ |
302 | |||
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index dbaed1ea0da3..e624ff959cb6 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h | |||
@@ -1,30 +1,16 @@ | |||
1 | #ifndef CCISS_CMD_H | 1 | #ifndef CCISS_CMD_H |
2 | #define CCISS_CMD_H | 2 | #define CCISS_CMD_H |
3 | //########################################################################### | 3 | |
4 | //DEFINES | 4 | #include <linux/cciss_defs.h> |
5 | //########################################################################### | 5 | |
6 | /* DEFINES */ | ||
6 | #define CISS_VERSION "1.00" | 7 | #define CISS_VERSION "1.00" |
7 | 8 | ||
8 | //general boundary defintions | 9 | /* general boundary definitions */ |
9 | #define SENSEINFOBYTES 32//note that this value may vary between host implementations | 10 | #define MAXSGENTRIES 32 |
10 | #define MAXSGENTRIES 31 | 11 | #define CCISS_SG_CHAIN 0x80000000 |
11 | #define MAXREPLYQS 256 | 12 | #define MAXREPLYQS 256 |
12 | 13 | ||
13 | //Command Status value | ||
14 | #define CMD_SUCCESS 0x0000 | ||
15 | #define CMD_TARGET_STATUS 0x0001 | ||
16 | #define CMD_DATA_UNDERRUN 0x0002 | ||
17 | #define CMD_DATA_OVERRUN 0x0003 | ||
18 | #define CMD_INVALID 0x0004 | ||
19 | #define CMD_PROTOCOL_ERR 0x0005 | ||
20 | #define CMD_HARDWARE_ERR 0x0006 | ||
21 | #define CMD_CONNECTION_LOST 0x0007 | ||
22 | #define CMD_ABORTED 0x0008 | ||
23 | #define CMD_ABORT_FAILED 0x0009 | ||
24 | #define CMD_UNSOLICITED_ABORT 0x000A | ||
25 | #define CMD_TIMEOUT 0x000B | ||
26 | #define CMD_UNABORTABLE 0x000C | ||
27 | |||
28 | /* Unit Attentions ASC's as defined for the MSA2012sa */ | 14 | /* Unit Attentions ASC's as defined for the MSA2012sa */ |
29 | #define POWER_OR_RESET 0x29 | 15 | #define POWER_OR_RESET 0x29 |
30 | #define STATE_CHANGED 0x2a | 16 | #define STATE_CHANGED 0x2a |
@@ -48,30 +34,13 @@ | |||
48 | #define ASYM_ACCESS_CHANGED 0x06 | 34 | #define ASYM_ACCESS_CHANGED 0x06 |
49 | #define LUN_CAPACITY_CHANGED 0x09 | 35 | #define LUN_CAPACITY_CHANGED 0x09 |
50 | 36 | ||
51 | //transfer direction | 37 | /* config space register offsets */ |
52 | #define XFER_NONE 0x00 | ||
53 | #define XFER_WRITE 0x01 | ||
54 | #define XFER_READ 0x02 | ||
55 | #define XFER_RSVD 0x03 | ||
56 | |||
57 | //task attribute | ||
58 | #define ATTR_UNTAGGED 0x00 | ||
59 | #define ATTR_SIMPLE 0x04 | ||
60 | #define ATTR_HEADOFQUEUE 0x05 | ||
61 | #define ATTR_ORDERED 0x06 | ||
62 | #define ATTR_ACA 0x07 | ||
63 | |||
64 | //cdb type | ||
65 | #define TYPE_CMD 0x00 | ||
66 | #define TYPE_MSG 0x01 | ||
67 | |||
68 | //config space register offsets | ||
69 | #define CFG_VENDORID 0x00 | 38 | #define CFG_VENDORID 0x00 |
70 | #define CFG_DEVICEID 0x02 | 39 | #define CFG_DEVICEID 0x02 |
71 | #define CFG_I2OBAR 0x10 | 40 | #define CFG_I2OBAR 0x10 |
72 | #define CFG_MEM1BAR 0x14 | 41 | #define CFG_MEM1BAR 0x14 |
73 | 42 | ||
74 | //i2o space register offsets | 43 | /* i2o space register offsets */ |
75 | #define I2O_IBDB_SET 0x20 | 44 | #define I2O_IBDB_SET 0x20 |
76 | #define I2O_IBDB_CLEAR 0x70 | 45 | #define I2O_IBDB_CLEAR 0x70 |
77 | #define I2O_INT_STATUS 0x30 | 46 | #define I2O_INT_STATUS 0x30 |
@@ -80,7 +49,7 @@ | |||
80 | #define I2O_OBPOST_Q 0x44 | 49 | #define I2O_OBPOST_Q 0x44 |
81 | #define I2O_DMA1_CFG 0x214 | 50 | #define I2O_DMA1_CFG 0x214 |
82 | 51 | ||
83 | //Configuration Table | 52 | /* Configuration Table */ |
84 | #define CFGTBL_ChangeReq 0x00000001l | 53 | #define CFGTBL_ChangeReq 0x00000001l |
85 | #define CFGTBL_AccCmds 0x00000001l | 54 | #define CFGTBL_AccCmds 0x00000001l |
86 | 55 | ||
@@ -102,24 +71,17 @@ typedef union _u64bit | |||
102 | __u64 val; | 71 | __u64 val; |
103 | } u64bit; | 72 | } u64bit; |
104 | 73 | ||
105 | // Type defs used in the following structs | 74 | /* Type defs used in the following structs */ |
106 | #define BYTE __u8 | ||
107 | #define WORD __u16 | ||
108 | #define HWORD __u16 | ||
109 | #define DWORD __u32 | ||
110 | #define QWORD vals32 | 75 | #define QWORD vals32 |
111 | 76 | ||
112 | //########################################################################### | 77 | /* STRUCTURES */ |
113 | //STRUCTURES | ||
114 | //########################################################################### | ||
115 | #define CISS_MAX_LUN 1024 | ||
116 | #define CISS_MAX_PHYS_LUN 1024 | 78 | #define CISS_MAX_PHYS_LUN 1024 |
117 | // SCSI-3 Commands | 79 | /* SCSI-3 Commands */ |
118 | 80 | ||
119 | #pragma pack(1) | 81 | #pragma pack(1) |
120 | 82 | ||
121 | #define CISS_INQUIRY 0x12 | 83 | #define CISS_INQUIRY 0x12 |
122 | //Data returned | 84 | /* Data returned */ |
123 | typedef struct _InquiryData_struct | 85 | typedef struct _InquiryData_struct |
124 | { | 86 | { |
125 | BYTE data_byte[36]; | 87 | BYTE data_byte[36]; |
@@ -127,7 +89,7 @@ typedef struct _InquiryData_struct | |||
127 | 89 | ||
128 | #define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */ | 90 | #define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */ |
129 | #define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */ | 91 | #define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */ |
130 | // Data returned | 92 | /* Data returned */ |
131 | typedef struct _ReportLUNdata_struct | 93 | typedef struct _ReportLUNdata_struct |
132 | { | 94 | { |
133 | BYTE LUNListLength[4]; | 95 | BYTE LUNListLength[4]; |
@@ -138,8 +100,8 @@ typedef struct _ReportLUNdata_struct | |||
138 | #define CCISS_READ_CAPACITY 0x25 /* Read Capacity */ | 100 | #define CCISS_READ_CAPACITY 0x25 /* Read Capacity */ |
139 | typedef struct _ReadCapdata_struct | 101 | typedef struct _ReadCapdata_struct |
140 | { | 102 | { |
141 | BYTE total_size[4]; // Total size in blocks | 103 | BYTE total_size[4]; /* Total size in blocks */ |
142 | BYTE block_size[4]; // Size of blocks in bytes | 104 | BYTE block_size[4]; /* Size of blocks in bytes */ |
143 | } ReadCapdata_struct; | 105 | } ReadCapdata_struct; |
144 | 106 | ||
145 | #define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */ | 107 | #define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */ |
@@ -171,52 +133,13 @@ typedef struct _ReadCapdata_struct_16 | |||
171 | #define CDB_LEN10 10 | 133 | #define CDB_LEN10 10 |
172 | #define CDB_LEN16 16 | 134 | #define CDB_LEN16 16 |
173 | 135 | ||
174 | // BMIC commands | 136 | /* BMIC commands */ |
175 | #define BMIC_READ 0x26 | 137 | #define BMIC_READ 0x26 |
176 | #define BMIC_WRITE 0x27 | 138 | #define BMIC_WRITE 0x27 |
177 | #define BMIC_CACHE_FLUSH 0xc2 | 139 | #define BMIC_CACHE_FLUSH 0xc2 |
178 | #define CCISS_CACHE_FLUSH 0x01 //C2 was already being used by CCISS | 140 | #define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ |
179 | |||
180 | //Command List Structure | ||
181 | typedef union _SCSI3Addr_struct { | ||
182 | struct { | ||
183 | BYTE Dev; | ||
184 | BYTE Bus:6; | ||
185 | BYTE Mode:2; // b00 | ||
186 | } PeripDev; | ||
187 | struct { | ||
188 | BYTE DevLSB; | ||
189 | BYTE DevMSB:6; | ||
190 | BYTE Mode:2; // b01 | ||
191 | } LogDev; | ||
192 | struct { | ||
193 | BYTE Dev:5; | ||
194 | BYTE Bus:3; | ||
195 | BYTE Targ:6; | ||
196 | BYTE Mode:2; // b10 | ||
197 | } LogUnit; | ||
198 | } SCSI3Addr_struct; | ||
199 | |||
200 | typedef struct _PhysDevAddr_struct { | ||
201 | DWORD TargetId:24; | ||
202 | DWORD Bus:6; | ||
203 | DWORD Mode:2; | ||
204 | SCSI3Addr_struct Target[2]; //2 level target device addr | ||
205 | } PhysDevAddr_struct; | ||
206 | |||
207 | typedef struct _LogDevAddr_struct { | ||
208 | DWORD VolId:30; | ||
209 | DWORD Mode:2; | ||
210 | BYTE reserved[4]; | ||
211 | } LogDevAddr_struct; | ||
212 | |||
213 | typedef union _LUNAddr_struct { | ||
214 | BYTE LunAddrBytes[8]; | ||
215 | SCSI3Addr_struct SCSI3Lun[4]; | ||
216 | PhysDevAddr_struct PhysDev; | ||
217 | LogDevAddr_struct LogDev; | ||
218 | } LUNAddr_struct; | ||
219 | 141 | ||
142 | /* Command List Structure */ | ||
220 | #define CTLR_LUNID "\0\0\0\0\0\0\0\0" | 143 | #define CTLR_LUNID "\0\0\0\0\0\0\0\0" |
221 | 144 | ||
222 | typedef struct _CommandListHeader_struct { | 145 | typedef struct _CommandListHeader_struct { |
@@ -226,16 +149,6 @@ typedef struct _CommandListHeader_struct { | |||
226 | QWORD Tag; | 149 | QWORD Tag; |
227 | LUNAddr_struct LUN; | 150 | LUNAddr_struct LUN; |
228 | } CommandListHeader_struct; | 151 | } CommandListHeader_struct; |
229 | typedef struct _RequestBlock_struct { | ||
230 | BYTE CDBLen; | ||
231 | struct { | ||
232 | BYTE Type:3; | ||
233 | BYTE Attribute:3; | ||
234 | BYTE Direction:2; | ||
235 | } Type; | ||
236 | HWORD Timeout; | ||
237 | BYTE CDB[16]; | ||
238 | } RequestBlock_struct; | ||
239 | typedef struct _ErrDescriptor_struct { | 152 | typedef struct _ErrDescriptor_struct { |
240 | QWORD Addr; | 153 | QWORD Addr; |
241 | DWORD Len; | 154 | DWORD Len; |
@@ -246,28 +159,6 @@ typedef struct _SGDescriptor_struct { | |||
246 | DWORD Ext; | 159 | DWORD Ext; |
247 | } SGDescriptor_struct; | 160 | } SGDescriptor_struct; |
248 | 161 | ||
249 | typedef union _MoreErrInfo_struct{ | ||
250 | struct { | ||
251 | BYTE Reserved[3]; | ||
252 | BYTE Type; | ||
253 | DWORD ErrorInfo; | ||
254 | }Common_Info; | ||
255 | struct{ | ||
256 | BYTE Reserved[2]; | ||
257 | BYTE offense_size;//size of offending entry | ||
258 | BYTE offense_num; //byte # of offense 0-base | ||
259 | DWORD offense_value; | ||
260 | }Invalid_Cmd; | ||
261 | }MoreErrInfo_struct; | ||
262 | typedef struct _ErrorInfo_struct { | ||
263 | BYTE ScsiStatus; | ||
264 | BYTE SenseLen; | ||
265 | HWORD CommandStatus; | ||
266 | DWORD ResidualCnt; | ||
267 | MoreErrInfo_struct MoreErrInfo; | ||
268 | BYTE SenseInfo[SENSEINFOBYTES]; | ||
269 | } ErrorInfo_struct; | ||
270 | |||
271 | /* Command types */ | 162 | /* Command types */ |
272 | #define CMD_RWREQ 0x00 | 163 | #define CMD_RWREQ 0x00 |
273 | #define CMD_IOCTL_PEND 0x01 | 164 | #define CMD_IOCTL_PEND 0x01 |
@@ -276,10 +167,18 @@ typedef struct _ErrorInfo_struct { | |||
276 | #define CMD_MSG_TIMEOUT 0x05 | 167 | #define CMD_MSG_TIMEOUT 0x05 |
277 | #define CMD_MSG_STALE 0xff | 168 | #define CMD_MSG_STALE 0xff |
278 | 169 | ||
279 | /* This structure needs to be divisible by 8 for new | 170 | /* This structure needs to be divisible by COMMANDLIST_ALIGNMENT |
280 | * indexing method. | 171 | * because the low bits of the address are used to indicate |
172 | * whether the tag contains an index or an address. PAD_32 and | ||
173 | * PAD_64 can be adjusted independently as needed for 32-bit | ||
174 | * and 64-bit systems. | ||
281 | */ | 175 | */ |
282 | #define PADSIZE (sizeof(long) - 4) | 176 | #define COMMANDLIST_ALIGNMENT (8) |
177 | #define IS_64_BIT ((sizeof(long) - 4)/4) | ||
178 | #define IS_32_BIT (!IS_64_BIT) | ||
179 | #define PAD_32 (0) | ||
180 | #define PAD_64 (4) | ||
181 | #define PADSIZE (IS_32_BIT * PAD_32 + IS_64_BIT * PAD_64) | ||
283 | typedef struct _CommandList_struct { | 182 | typedef struct _CommandList_struct { |
284 | CommandListHeader_struct Header; | 183 | CommandListHeader_struct Header; |
285 | RequestBlock_struct Request; | 184 | RequestBlock_struct Request; |
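
For what it's worth, the PAD_32/PAD_64 selection above can be checked with a tiny standalone program (not part of the patch); only the macro arithmetic is taken from the hunk, and the printed values are illustrative.

#include <stdio.h>

#define COMMANDLIST_ALIGNMENT (8)
#define IS_64_BIT ((sizeof(long) - 4) / 4)	/* 1 on LP64, 0 on 32-bit */
#define IS_32_BIT (!IS_64_BIT)
#define PAD_32 (0)
#define PAD_64 (4)
#define PADSIZE (IS_32_BIT * PAD_32 + IS_64_BIT * PAD_64)

int main(void)
{
	/* On a 64-bit build, sizeof(long) == 8, so PADSIZE evaluates to 4;
	 * on a 32-bit build, sizeof(long) == 4, so PADSIZE evaluates to 0.
	 * Either way the pad keeps the command a multiple of
	 * COMMANDLIST_ALIGNMENT, leaving the low address bits free for the
	 * index/address tagging described in the comment above. */
	printf("64-bit=%zu pad=%zu align=%d\n",
	       (size_t)IS_64_BIT, (size_t)PADSIZE, COMMANDLIST_ALIGNMENT);
	return 0;
}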
@@ -299,7 +198,7 @@ typedef struct _CommandList_struct { | |||
299 | char pad[PADSIZE]; | 198 | char pad[PADSIZE]; |
300 | } CommandList_struct; | 199 | } CommandList_struct; |
301 | 200 | ||
302 | //Configuration Table Structure | 201 | /* Configuration Table Structure */ |
303 | typedef struct _HostWrite_struct { | 202 | typedef struct _HostWrite_struct { |
304 | DWORD TransportRequest; | 203 | DWORD TransportRequest; |
305 | DWORD Reserved; | 204 | DWORD Reserved; |
@@ -319,6 +218,10 @@ typedef struct _CfgTable_struct { | |||
319 | BYTE ServerName[16]; | 218 | BYTE ServerName[16]; |
320 | DWORD HeartBeat; | 219 | DWORD HeartBeat; |
321 | DWORD SCSI_Prefetch; | 220 | DWORD SCSI_Prefetch; |
221 | DWORD MaxSGElements; | ||
222 | DWORD MaxLogicalUnits; | ||
223 | DWORD MaxPhysicalDrives; | ||
224 | DWORD MaxPhysicalDrivesPerLogicalUnit; | ||
322 | } CfgTable_struct; | 225 | } CfgTable_struct; |
323 | #pragma pack() | 226 | #pragma pack() |
324 | #endif // CCISS_CMD_H | 227 | #endif /* CCISS_CMD_H */ |
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 3315268b4ec7..e1d0e2cfec72 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c | |||
@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = { | |||
84 | .queuecommand = cciss_scsi_queue_command, | 84 | .queuecommand = cciss_scsi_queue_command, |
85 | .can_queue = SCSI_CCISS_CAN_QUEUE, | 85 | .can_queue = SCSI_CCISS_CAN_QUEUE, |
86 | .this_id = 7, | 86 | .this_id = 7, |
87 | .sg_tablesize = MAXSGENTRIES, | ||
88 | .cmd_per_lun = 1, | 87 | .cmd_per_lun = 1, |
89 | .use_clustering = DISABLE_CLUSTERING, | 88 | .use_clustering = DISABLE_CLUSTERING, |
90 | /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */ | 89 | /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */ |
@@ -93,11 +92,16 @@ static struct scsi_host_template cciss_driver_template = { | |||
93 | }; | 92 | }; |
94 | 93 | ||
95 | #pragma pack(1) | 94 | #pragma pack(1) |
95 | |||
96 | #define SCSI_PAD_32 0 | ||
97 | #define SCSI_PAD_64 0 | ||
98 | |||
96 | struct cciss_scsi_cmd_stack_elem_t { | 99 | struct cciss_scsi_cmd_stack_elem_t { |
97 | CommandList_struct cmd; | 100 | CommandList_struct cmd; |
98 | ErrorInfo_struct Err; | 101 | ErrorInfo_struct Err; |
99 | __u32 busaddr; | 102 | __u32 busaddr; |
100 | __u32 pad; | 103 | int cmdindex; |
104 | u8 pad[IS_32_BIT * SCSI_PAD_32 + IS_64_BIT * SCSI_PAD_64]; | ||
101 | }; | 105 | }; |
102 | 106 | ||
103 | #pragma pack() | 107 | #pragma pack() |
@@ -118,16 +122,15 @@ struct cciss_scsi_cmd_stack_t { | |||
118 | struct cciss_scsi_adapter_data_t { | 122 | struct cciss_scsi_adapter_data_t { |
119 | struct Scsi_Host *scsi_host; | 123 | struct Scsi_Host *scsi_host; |
120 | struct cciss_scsi_cmd_stack_t cmd_stack; | 124 | struct cciss_scsi_cmd_stack_t cmd_stack; |
125 | SGDescriptor_struct **cmd_sg_list; | ||
121 | int registered; | 126 | int registered; |
122 | spinlock_t lock; // to protect ccissscsi[ctlr]; | 127 | spinlock_t lock; // to protect ccissscsi[ctlr]; |
123 | }; | 128 | }; |
124 | 129 | ||
125 | #define CPQ_TAPE_LOCK(ctlr, flags) spin_lock_irqsave( \ | 130 | #define CPQ_TAPE_LOCK(ctlr, flags) spin_lock_irqsave( \ |
126 | &(((struct cciss_scsi_adapter_data_t *) \ | 131 | &hba[ctlr]->scsi_ctlr->lock, flags); |
127 | hba[ctlr]->scsi_ctlr)->lock), flags); | ||
128 | #define CPQ_TAPE_UNLOCK(ctlr, flags) spin_unlock_irqrestore( \ | 132 | #define CPQ_TAPE_UNLOCK(ctlr, flags) spin_unlock_irqrestore( \ |
129 | &(((struct cciss_scsi_adapter_data_t *) \ | 133 | &hba[ctlr]->scsi_ctlr->lock, flags); |
130 | hba[ctlr]->scsi_ctlr)->lock), flags); | ||
131 | 134 | ||
132 | static CommandList_struct * | 135 | static CommandList_struct * |
133 | scsi_cmd_alloc(ctlr_info_t *h) | 136 | scsi_cmd_alloc(ctlr_info_t *h) |
@@ -143,7 +146,7 @@ scsi_cmd_alloc(ctlr_info_t *h) | |||
143 | struct cciss_scsi_cmd_stack_t *stk; | 146 | struct cciss_scsi_cmd_stack_t *stk; |
144 | u64bit temp64; | 147 | u64bit temp64; |
145 | 148 | ||
146 | sa = (struct cciss_scsi_adapter_data_t *) h->scsi_ctlr; | 149 | sa = h->scsi_ctlr; |
147 | stk = &sa->cmd_stack; | 150 | stk = &sa->cmd_stack; |
148 | 151 | ||
149 | if (stk->top < 0) | 152 | if (stk->top < 0) |
@@ -154,6 +157,7 @@ scsi_cmd_alloc(ctlr_info_t *h) | |||
154 | memset(&c->Err, 0, sizeof(c->Err)); | 157 | memset(&c->Err, 0, sizeof(c->Err)); |
155 | /* set physical addr of cmd and addr of scsi parameters */ | 158 | /* set physical addr of cmd and addr of scsi parameters */ |
156 | c->cmd.busaddr = c->busaddr; | 159 | c->cmd.busaddr = c->busaddr; |
160 | c->cmd.cmdindex = c->cmdindex; | ||
157 | /* (__u32) (stk->cmd_pool_handle + | 161 | /* (__u32) (stk->cmd_pool_handle + |
158 | (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */ | 162 | (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */ |
159 | 163 | ||
@@ -182,7 +186,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd) | |||
182 | struct cciss_scsi_adapter_data_t *sa; | 186 | struct cciss_scsi_adapter_data_t *sa; |
183 | struct cciss_scsi_cmd_stack_t *stk; | 187 | struct cciss_scsi_cmd_stack_t *stk; |
184 | 188 | ||
185 | sa = (struct cciss_scsi_adapter_data_t *) h->scsi_ctlr; | 189 | sa = h->scsi_ctlr; |
186 | stk = &sa->cmd_stack; | 190 | stk = &sa->cmd_stack; |
187 | if (stk->top >= CMD_STACK_SIZE) { | 191 | if (stk->top >= CMD_STACK_SIZE) { |
188 | printk("cciss: scsi_cmd_free called too many times.\n"); | 192 | printk("cciss: scsi_cmd_free called too many times.\n"); |
@@ -199,24 +203,31 @@ scsi_cmd_stack_setup(int ctlr, struct cciss_scsi_adapter_data_t *sa) | |||
199 | struct cciss_scsi_cmd_stack_t *stk; | 203 | struct cciss_scsi_cmd_stack_t *stk; |
200 | size_t size; | 204 | size_t size; |
201 | 205 | ||
206 | sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(hba[ctlr], | ||
207 | hba[ctlr]->chainsize, CMD_STACK_SIZE); | ||
208 | if (!sa->cmd_sg_list && hba[ctlr]->chainsize > 0) | ||
209 | return -ENOMEM; | ||
210 | |||
202 | stk = &sa->cmd_stack; | 211 | stk = &sa->cmd_stack; |
203 | size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; | 212 | size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; |
204 | 213 | ||
205 | // pci_alloc_consistent guarantees 32-bit DMA address will | 214 | /* Check alignment, see cciss_cmd.h near CommandList_struct def. */ |
206 | // be used | 215 | BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); |
207 | 216 | /* pci_alloc_consistent guarantees 32-bit DMA address will be used */ | |
208 | stk->pool = (struct cciss_scsi_cmd_stack_elem_t *) | 217 | stk->pool = (struct cciss_scsi_cmd_stack_elem_t *) |
209 | pci_alloc_consistent(hba[ctlr]->pdev, size, &stk->cmd_pool_handle); | 218 | pci_alloc_consistent(hba[ctlr]->pdev, size, &stk->cmd_pool_handle); |
210 | 219 | ||
211 | if (stk->pool == NULL) { | 220 | if (stk->pool == NULL) { |
212 | printk("stk->pool is null\n"); | 221 | cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); |
213 | return -1; | 222 | sa->cmd_sg_list = NULL; |
223 | return -ENOMEM; | ||
214 | } | 224 | } |
215 | 225 | ||
216 | for (i=0; i<CMD_STACK_SIZE; i++) { | 226 | for (i=0; i<CMD_STACK_SIZE; i++) { |
217 | stk->elem[i] = &stk->pool[i]; | 227 | stk->elem[i] = &stk->pool[i]; |
218 | stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + | 228 | stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + |
219 | (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); | 229 | (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); |
230 | stk->elem[i]->cmdindex = i; | ||
220 | } | 231 | } |
221 | stk->top = CMD_STACK_SIZE-1; | 232 | stk->top = CMD_STACK_SIZE-1; |
222 | return 0; | 233 | return 0; |
@@ -229,7 +240,7 @@ scsi_cmd_stack_free(int ctlr) | |||
229 | struct cciss_scsi_cmd_stack_t *stk; | 240 | struct cciss_scsi_cmd_stack_t *stk; |
230 | size_t size; | 241 | size_t size; |
231 | 242 | ||
232 | sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr; | 243 | sa = hba[ctlr]->scsi_ctlr; |
233 | stk = &sa->cmd_stack; | 244 | stk = &sa->cmd_stack; |
234 | if (stk->top != CMD_STACK_SIZE-1) { | 245 | if (stk->top != CMD_STACK_SIZE-1) { |
235 | printk( "cciss: %d scsi commands are still outstanding.\n", | 246 | printk( "cciss: %d scsi commands are still outstanding.\n", |
@@ -241,6 +252,7 @@ scsi_cmd_stack_free(int ctlr) | |||
241 | 252 | ||
242 | pci_free_consistent(hba[ctlr]->pdev, size, stk->pool, stk->cmd_pool_handle); | 253 | pci_free_consistent(hba[ctlr]->pdev, size, stk->pool, stk->cmd_pool_handle); |
243 | stk->pool = NULL; | 254 | stk->pool = NULL; |
255 | cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); | ||
244 | } | 256 | } |
245 | 257 | ||
246 | #if 0 | 258 | #if 0 |
@@ -530,8 +542,7 @@ adjust_cciss_scsi_table(int ctlr, int hostno, | |||
530 | CPQ_TAPE_LOCK(ctlr, flags); | 542 | CPQ_TAPE_LOCK(ctlr, flags); |
531 | 543 | ||
532 | if (hostno != -1) /* if it's not the first time... */ | 544 | if (hostno != -1) /* if it's not the first time... */ |
533 | sh = ((struct cciss_scsi_adapter_data_t *) | 545 | sh = hba[ctlr]->scsi_ctlr->scsi_host; |
534 | hba[ctlr]->scsi_ctlr)->scsi_host; | ||
535 | 546 | ||
536 | /* find any devices in ccissscsi[] that are not in | 547 | /* find any devices in ccissscsi[] that are not in |
537 | sd[] and remove them from ccissscsi[] */ | 548 | sd[] and remove them from ccissscsi[] */ |
@@ -702,7 +713,7 @@ cciss_scsi_setup(int cntl_num) | |||
702 | kfree(shba); | 713 | kfree(shba); |
703 | shba = NULL; | 714 | shba = NULL; |
704 | } | 715 | } |
705 | hba[cntl_num]->scsi_ctlr = (void *) shba; | 716 | hba[cntl_num]->scsi_ctlr = shba; |
706 | return; | 717 | return; |
707 | } | 718 | } |
708 | 719 | ||
@@ -725,6 +736,8 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag) | |||
725 | ctlr = hba[cp->ctlr]; | 736 | ctlr = hba[cp->ctlr]; |
726 | 737 | ||
727 | scsi_dma_unmap(cmd); | 738 | scsi_dma_unmap(cmd); |
739 | if (cp->Header.SGTotal > ctlr->max_cmd_sgentries) | ||
740 | cciss_unmap_sg_chain_block(ctlr, cp); | ||
728 | 741 | ||
729 | cmd->result = (DID_OK << 16); /* host byte */ | 742 | cmd->result = (DID_OK << 16); /* host byte */ |
730 | cmd->result |= (COMMAND_COMPLETE << 8); /* msg byte */ | 743 | cmd->result |= (COMMAND_COMPLETE << 8); /* msg byte */ |
@@ -755,7 +768,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag) | |||
755 | cp, | 768 | cp, |
756 | ei->ScsiStatus); | 769 | ei->ScsiStatus); |
757 | #endif | 770 | #endif |
758 | cmd->result |= (ei->ScsiStatus < 1); | 771 | cmd->result |= (ei->ScsiStatus << 1); |
759 | } | 772 | } |
760 | else { /* scsi status is zero??? How??? */ | 773 | else { /* scsi status is zero??? How??? */ |
761 | 774 | ||
@@ -847,9 +860,10 @@ cciss_scsi_detect(int ctlr) | |||
847 | sh->io_port = 0; // good enough? FIXME, | 860 | sh->io_port = 0; // good enough? FIXME, |
848 | sh->n_io_port = 0; // I don't think we use these two... | 861 | sh->n_io_port = 0; // I don't think we use these two... |
849 | sh->this_id = SELF_SCSI_ID; | 862 | sh->this_id = SELF_SCSI_ID; |
863 | sh->sg_tablesize = hba[ctlr]->maxsgentries; | ||
850 | 864 | ||
851 | ((struct cciss_scsi_adapter_data_t *) | 865 | ((struct cciss_scsi_adapter_data_t *) |
852 | hba[ctlr]->scsi_ctlr)->scsi_host = (void *) sh; | 866 | hba[ctlr]->scsi_ctlr)->scsi_host = sh; |
853 | sh->hostdata[0] = (unsigned long) hba[ctlr]; | 867 | sh->hostdata[0] = (unsigned long) hba[ctlr]; |
854 | sh->irq = hba[ctlr]->intr[SIMPLE_MODE_INT]; | 868 | sh->irq = hba[ctlr]->intr[SIMPLE_MODE_INT]; |
855 | sh->unique_id = sh->irq; | 869 | sh->unique_id = sh->irq; |
@@ -1364,34 +1378,54 @@ cciss_scsi_proc_info(struct Scsi_Host *sh, | |||
1364 | dma mapping and fills in the scatter gather entries of the | 1378 | dma mapping and fills in the scatter gather entries of the |
1365 | cciss command, cp. */ | 1379 | cciss command, cp. */ |
1366 | 1380 | ||
1367 | static void | 1381 | static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *cp, |
1368 | cciss_scatter_gather(struct pci_dev *pdev, | 1382 | struct scsi_cmnd *cmd) |
1369 | CommandList_struct *cp, | ||
1370 | struct scsi_cmnd *cmd) | ||
1371 | { | 1383 | { |
1372 | unsigned int len; | 1384 | unsigned int len; |
1373 | struct scatterlist *sg; | 1385 | struct scatterlist *sg; |
1374 | __u64 addr64; | 1386 | __u64 addr64; |
1375 | int use_sg, i; | 1387 | int request_nsgs, i, chained, sg_index; |
1376 | 1388 | struct cciss_scsi_adapter_data_t *sa = h->scsi_ctlr; | |
1377 | BUG_ON(scsi_sg_count(cmd) > MAXSGENTRIES); | 1389 | SGDescriptor_struct *curr_sg; |
1378 | 1390 | ||
1379 | use_sg = scsi_dma_map(cmd); | 1391 | BUG_ON(scsi_sg_count(cmd) > h->maxsgentries); |
1380 | if (use_sg) { /* not too many addrs? */ | 1392 | |
1381 | scsi_for_each_sg(cmd, sg, use_sg, i) { | 1393 | chained = 0; |
1394 | sg_index = 0; | ||
1395 | curr_sg = cp->SG; | ||
1396 | request_nsgs = scsi_dma_map(cmd); | ||
1397 | if (request_nsgs) { | ||
1398 | scsi_for_each_sg(cmd, sg, request_nsgs, i) { | ||
1399 | if (sg_index + 1 == h->max_cmd_sgentries && | ||
1400 | !chained && request_nsgs - i > 1) { | ||
1401 | chained = 1; | ||
1402 | sg_index = 0; | ||
1403 | curr_sg = sa->cmd_sg_list[cp->cmdindex]; | ||
1404 | } | ||
1382 | addr64 = (__u64) sg_dma_address(sg); | 1405 | addr64 = (__u64) sg_dma_address(sg); |
1383 | len = sg_dma_len(sg); | 1406 | len = sg_dma_len(sg); |
1384 | cp->SG[i].Addr.lower = | 1407 | curr_sg[sg_index].Addr.lower = |
1385 | (__u32) (addr64 & (__u64) 0x00000000FFFFFFFF); | 1408 | (__u32) (addr64 & 0x0FFFFFFFFULL); |
1386 | cp->SG[i].Addr.upper = | 1409 | curr_sg[sg_index].Addr.upper = |
1387 | (__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF); | 1410 | (__u32) ((addr64 >> 32) & 0x0FFFFFFFFULL); |
1388 | cp->SG[i].Len = len; | 1411 | curr_sg[sg_index].Len = len; |
1389 | cp->SG[i].Ext = 0; // we are not chaining | 1412 | curr_sg[sg_index].Ext = 0; |
1413 | ++sg_index; | ||
1390 | } | 1414 | } |
1415 | if (chained) | ||
1416 | cciss_map_sg_chain_block(h, cp, | ||
1417 | sa->cmd_sg_list[cp->cmdindex], | ||
1418 | (request_nsgs - (h->max_cmd_sgentries - 1)) * | ||
1419 | sizeof(SGDescriptor_struct)); | ||
1391 | } | 1420 | } |
1392 | 1421 | /* track how many SG entries we are using */ | |
1393 | cp->Header.SGList = (__u8) use_sg; /* no. SGs contig in this cmd */ | 1422 | if (request_nsgs > h->maxSG) |
1394 | cp->Header.SGTotal = (__u16) use_sg; /* total sgs in this cmd list */ | 1423 | h->maxSG = request_nsgs; |
1424 | cp->Header.SGTotal = (__u8) request_nsgs + chained; | ||
1425 | if (request_nsgs > h->max_cmd_sgentries) | ||
1426 | cp->Header.SGList = h->max_cmd_sgentries; | ||
1427 | else | ||
1428 | cp->Header.SGList = cp->Header.SGTotal; | ||
1395 | return; | 1429 | return; |
1396 | } | 1430 | } |
1397 | 1431 | ||
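
A hedged sketch (not part of the patch) of how the segment split performed by cciss_scatter_gather() above works out numerically. The assumption that the last embedded slot is given over to the chain descriptor is inferred from cciss_map_sg_chain_block() being passed request_nsgs - (max_cmd_sgentries - 1) entries; the sample values are hypothetical.

#include <stdio.h>

/* Illustrative only: split a request's segments the way the hunk above does. */
static void split_example(int request_nsgs, int max_cmd_sgentries)
{
	int embedded, chained_entries, sg_total;

	if (request_nsgs > max_cmd_sgentries) {
		/* assumption: last embedded slot is left for the chain descriptor */
		embedded = max_cmd_sgentries - 1;
		chained_entries = request_nsgs - embedded;
		sg_total = request_nsgs + 1;	/* SGTotal counts the chain descriptor */
	} else {
		embedded = request_nsgs;
		chained_entries = 0;
		sg_total = request_nsgs;
	}
	printf("nsgs=%d embedded=%d chained=%d SGTotal=%d\n",
	       request_nsgs, embedded, chained_entries, sg_total);
}

int main(void)
{
	split_example(10, 32);	/* fits entirely in the command */
	split_example(50, 32);	/* spills into the chain block */
	return 0;
}

Running it shows that a 10-segment request fits entirely in the command, while a 50-segment request keeps 31 descriptors embedded, pushes 19 into the chain block, and reports an SGTotal of 51 including the chain descriptor.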
@@ -1399,7 +1433,7 @@ cciss_scatter_gather(struct pci_dev *pdev, | |||
1399 | static int | 1433 | static int |
1400 | cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *)) | 1434 | cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *)) |
1401 | { | 1435 | { |
1402 | ctlr_info_t **c; | 1436 | ctlr_info_t *c; |
1403 | int ctlr, rc; | 1437 | int ctlr, rc; |
1404 | unsigned char scsi3addr[8]; | 1438 | unsigned char scsi3addr[8]; |
1405 | CommandList_struct *cp; | 1439 | CommandList_struct *cp; |
@@ -1407,8 +1441,8 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd | |||
1407 | 1441 | ||
1408 | // Get the ptr to our adapter structure (hba[i]) out of cmd->host. | 1442 | // Get the ptr to our adapter structure (hba[i]) out of cmd->host. |
1409 | // We violate cmd->host privacy here. (Is there another way?) | 1443 | // We violate cmd->host privacy here. (Is there another way?) |
1410 | c = (ctlr_info_t **) &cmd->device->host->hostdata[0]; | 1444 | c = (ctlr_info_t *) cmd->device->host->hostdata[0]; |
1411 | ctlr = (*c)->ctlr; | 1445 | ctlr = c->ctlr; |
1412 | 1446 | ||
1413 | rc = lookup_scsi3addr(ctlr, cmd->device->channel, cmd->device->id, | 1447 | rc = lookup_scsi3addr(ctlr, cmd->device->channel, cmd->device->id, |
1414 | cmd->device->lun, scsi3addr); | 1448 | cmd->device->lun, scsi3addr); |
@@ -1431,7 +1465,7 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd | |||
1431 | see what the device thinks of it. */ | 1465 | see what the device thinks of it. */ |
1432 | 1466 | ||
1433 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); | 1467 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); |
1434 | cp = scsi_cmd_alloc(*c); | 1468 | cp = scsi_cmd_alloc(c); |
1435 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); | 1469 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); |
1436 | if (cp == NULL) { /* trouble... */ | 1470 | if (cp == NULL) { /* trouble... */ |
1437 | printk("scsi_cmd_alloc returned NULL!\n"); | 1471 | printk("scsi_cmd_alloc returned NULL!\n"); |
@@ -1489,15 +1523,14 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd | |||
1489 | BUG(); | 1523 | BUG(); |
1490 | break; | 1524 | break; |
1491 | } | 1525 | } |
1492 | 1526 | cciss_scatter_gather(c, cp, cmd); | |
1493 | cciss_scatter_gather((*c)->pdev, cp, cmd); // Fill the SG list | ||
1494 | 1527 | ||
1495 | /* Put the request on the tail of the request queue */ | 1528 | /* Put the request on the tail of the request queue */ |
1496 | 1529 | ||
1497 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); | 1530 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); |
1498 | addQ(&(*c)->reqQ, cp); | 1531 | addQ(&c->reqQ, cp); |
1499 | (*c)->Qdepth++; | 1532 | c->Qdepth++; |
1500 | start_io(*c); | 1533 | start_io(c); |
1501 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); | 1534 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); |
1502 | 1535 | ||
1503 | /* the cmd'll come back via intr handler in complete_scsi_command() */ | 1536 | /* the cmd'll come back via intr handler in complete_scsi_command() */ |
@@ -1514,7 +1547,7 @@ cciss_unregister_scsi(int ctlr) | |||
1514 | /* we are being forcibly unloaded, and may not refuse. */ | 1547 | /* we are being forcibly unloaded, and may not refuse. */ |
1515 | 1548 | ||
1516 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); | 1549 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); |
1517 | sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr; | 1550 | sa = hba[ctlr]->scsi_ctlr; |
1518 | stk = &sa->cmd_stack; | 1551 | stk = &sa->cmd_stack; |
1519 | 1552 | ||
1520 | /* if we weren't ever actually registered, don't unregister */ | 1553 | /* if we weren't ever actually registered, don't unregister */ |
@@ -1541,13 +1574,13 @@ cciss_engage_scsi(int ctlr) | |||
1541 | unsigned long flags; | 1574 | unsigned long flags; |
1542 | 1575 | ||
1543 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); | 1576 | spin_lock_irqsave(CCISS_LOCK(ctlr), flags); |
1544 | sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr; | 1577 | sa = hba[ctlr]->scsi_ctlr; |
1545 | stk = &sa->cmd_stack; | 1578 | stk = &sa->cmd_stack; |
1546 | 1579 | ||
1547 | if (sa->registered) { | 1580 | if (sa->registered) { |
1548 | printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); | 1581 | printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); |
1549 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); | 1582 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); |
1550 | return ENXIO; | 1583 | return -ENXIO; |
1551 | } | 1584 | } |
1552 | sa->registered = 1; | 1585 | sa->registered = 1; |
1553 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); | 1586 | spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); |
@@ -1654,14 +1687,14 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd) | |||
1654 | int rc; | 1687 | int rc; |
1655 | CommandList_struct *cmd_in_trouble; | 1688 | CommandList_struct *cmd_in_trouble; |
1656 | unsigned char lunaddr[8]; | 1689 | unsigned char lunaddr[8]; |
1657 | ctlr_info_t **c; | 1690 | ctlr_info_t *c; |
1658 | int ctlr; | 1691 | int ctlr; |
1659 | 1692 | ||
1660 | /* find the controller to which the command to be aborted was sent */ | 1693 | /* find the controller to which the command to be aborted was sent */ |
1661 | c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0]; | 1694 | c = (ctlr_info_t *) scsicmd->device->host->hostdata[0]; |
1662 | if (c == NULL) /* paranoia */ | 1695 | if (c == NULL) /* paranoia */ |
1663 | return FAILED; | 1696 | return FAILED; |
1664 | ctlr = (*c)->ctlr; | 1697 | ctlr = c->ctlr; |
1665 | printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr); | 1698 | printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr); |
1666 | /* find the command that's giving us trouble */ | 1699 | /* find the command that's giving us trouble */ |
1667 | cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble; | 1700 | cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble; |
@@ -1671,7 +1704,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd) | |||
1671 | /* send a reset to the SCSI LUN which the command was sent to */ | 1704 | /* send a reset to the SCSI LUN which the command was sent to */ |
1672 | rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr, | 1705 | rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr, |
1673 | TYPE_MSG); | 1706 | TYPE_MSG); |
1674 | if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0) | 1707 | if (rc == 0 && wait_for_device_to_become_ready(c, lunaddr) == 0) |
1675 | return SUCCESS; | 1708 | return SUCCESS; |
1676 | printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr); | 1709 | printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr); |
1677 | return FAILED; | 1710 | return FAILED; |
@@ -1682,14 +1715,14 @@ static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd) | |||
1682 | int rc; | 1715 | int rc; |
1683 | CommandList_struct *cmd_to_abort; | 1716 | CommandList_struct *cmd_to_abort; |
1684 | unsigned char lunaddr[8]; | 1717 | unsigned char lunaddr[8]; |
1685 | ctlr_info_t **c; | 1718 | ctlr_info_t *c; |
1686 | int ctlr; | 1719 | int ctlr; |
1687 | 1720 | ||
1688 | /* find the controller to which the command to be aborted was sent */ | 1721 | /* find the controller to which the command to be aborted was sent */ |
1689 | c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0]; | 1722 | c = (ctlr_info_t *) scsicmd->device->host->hostdata[0]; |
1690 | if (c == NULL) /* paranoia */ | 1723 | if (c == NULL) /* paranoia */ |
1691 | return FAILED; | 1724 | return FAILED; |
1692 | ctlr = (*c)->ctlr; | 1725 | ctlr = c->ctlr; |
1693 | printk(KERN_WARNING "cciss%d: aborting tardy SCSI cmd\n", ctlr); | 1726 | printk(KERN_WARNING "cciss%d: aborting tardy SCSI cmd\n", ctlr); |
1694 | 1727 | ||
1695 | /* find the command to be aborted */ | 1728 | /* find the command to be aborted */ |
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h index 7b750245ae76..6d5822fe851a 100644 --- a/drivers/block/cciss_scsi.h +++ b/drivers/block/cciss_scsi.h | |||
@@ -25,16 +25,16 @@ | |||
25 | 25 | ||
26 | #include <scsi/scsicam.h> /* possibly irrelevant, since we don't show disks */ | 26 | #include <scsi/scsicam.h> /* possibly irrelevant, since we don't show disks */ |
27 | 27 | ||
28 | // the scsi id of the adapter... | 28 | /* the scsi id of the adapter... */ |
29 | #define SELF_SCSI_ID 15 | 29 | #define SELF_SCSI_ID 15 |
30 | // 15 is somewhat arbitrary, since the scsi-2 bus | 30 | /* 15 is somewhat arbitrary, since the scsi-2 bus |
31 | // that's presented by the driver to the OS is | 31 | that's presented by the driver to the OS is |
32 | // fabricated. The "real" scsi-3 bus the | 32 | fabricated. The "real" scsi-3 bus the |
33 | // hardware presents is fabricated too. | 33 | hardware presents is fabricated too. |
34 | // The actual, honest-to-goodness physical | 34 | The actual, honest-to-goodness physical |
35 | // bus that the devices are attached to is not | 35 | bus that the devices are attached to is not |
36 | // addressable natively, and may in fact turn | 36 | addressable natively, and may in fact turn |
37 | // out to be not scsi at all. | 37 | out to be not scsi at all. */ |
38 | 38 | ||
39 | #define SCSI_CCISS_CAN_QUEUE 2 | 39 | #define SCSI_CCISS_CAN_QUEUE 2 |
40 | 40 | ||
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 6422651ec364..91d11631cec9 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c | |||
@@ -448,11 +448,8 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev) | |||
448 | blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); | 448 | blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); |
449 | 449 | ||
450 | /* This is a hardware imposed limit. */ | 450 | /* This is a hardware imposed limit. */ |
451 | blk_queue_max_hw_segments(q, SG_MAX); | 451 | blk_queue_max_segments(q, SG_MAX); |
452 | 452 | ||
453 | /* This is a driver limit and could be eliminated. */ | ||
454 | blk_queue_max_phys_segments(q, SG_MAX); | ||
455 | |||
456 | init_timer(&hba[i]->timer); | 453 | init_timer(&hba[i]->timer); |
457 | hba[i]->timer.expires = jiffies + IDA_TIMER; | 454 | hba[i]->timer.expires = jiffies + IDA_TIMER; |
458 | hba[i]->timer.data = (unsigned long)hba[i]; | 455 | hba[i]->timer.data = (unsigned long)hba[i]; |
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig new file mode 100644 index 000000000000..df0983787390 --- /dev/null +++ b/drivers/block/drbd/Kconfig | |||
@@ -0,0 +1,71 @@ | |||
1 | # | ||
2 | # DRBD device driver configuration | ||
3 | # | ||
4 | |||
5 | comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" | ||
6 | depends on PROC_FS='n' || INET='n' || CONNECTOR='n' | ||
7 | |||
8 | config BLK_DEV_DRBD | ||
9 | tristate "DRBD Distributed Replicated Block Device support" | ||
10 | depends on PROC_FS && INET && CONNECTOR | ||
11 | select LRU_CACHE | ||
12 | default n | ||
13 | help | ||
14 | |||
15 | NOTE: In order to authenticate connections you have to select | ||
16 | CRYPTO_HMAC and a hash function as well. | ||
17 | |||
18 | DRBD is a shared-nothing, synchronously replicated block device. It | ||
19 | is designed to serve as a building block for high availability | ||
20 | clusters and in this context, is a "drop-in" replacement for shared | ||
21 | storage. Simplistically, you could see it as a network RAID 1. | ||
22 | |||
23 | Each minor device has a role, which can be 'primary' or 'secondary'. | ||
24 | On the node with the primary device the application is supposed to | ||
25 | run and to access the device (/dev/drbdX). Every write is sent to | ||
26 | the local 'lower level block device' and, across the network, to the | ||
27 | node with the device in 'secondary' state. The secondary device | ||
28 | simply writes the data to its lower level block device. | ||
29 | |||
30 | DRBD can also be used in dual-Primary mode (device writable on both | ||
31 | nodes), which means it can exhibit shared disk semantics in a | ||
32 | shared-nothing cluster. Needless to say, on top of dual-Primary | ||
33 | DRBD utilizing a cluster file system is necessary to maintain for | ||
34 | cache coherency. | ||
35 | |||
36 | For automatic failover you need a cluster manager (e.g. heartbeat). | ||
37 | See also: http://www.drbd.org/, http://www.linux-ha.org | ||
38 | |||
39 | If unsure, say N. | ||
40 | |||
41 | config DRBD_FAULT_INJECTION | ||
42 | bool "DRBD fault injection" | ||
43 | depends on BLK_DEV_DRBD | ||
44 | help | ||
45 | |||
46 | Say Y here if you want to simulate IO errors, in order to test DRBD's | ||
47 | behavior. | ||
48 | |||
49 | The actual simulation of IO errors is done by writing 3 values to | ||
50 | /sys/module/drbd/parameters/ | ||
51 | |||
52 | enable_faults: bitmask of... | ||
53 | 1 meta data write | ||
54 | 2 meta data read | ||
55 | 4 resync data write | ||
56 | 8 resync data read | ||
57 | 16 data write | ||
58 | 32 data read | ||
59 | 64 read ahead | ||
60 | 128 kmalloc of bitmap | ||
61 | 256 allocation of EE (epoch_entries) | ||
62 | |||
63 | fault_devs: bitmask of minor numbers | ||
64 | fault_rate: frequency in percent | ||
65 | |||
66 | Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. | ||
67 | echo 16 > /sys/module/drbd/parameters/enable_faults | ||
68 | echo 1 > /sys/module/drbd/parameters/fault_devs | ||
69 | echo 5 > /sys/module/drbd/parameters/fault_rate | ||
70 | |||
71 | If unsure, say N. | ||
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile new file mode 100644 index 000000000000..0d3f337ff5ff --- /dev/null +++ b/drivers/block/drbd/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | drbd-y := drbd_bitmap.o drbd_proc.o | ||
2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o | ||
3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o | ||
4 | |||
5 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o | ||
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c new file mode 100644 index 000000000000..df018990c422 --- /dev/null +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -0,0 +1,1433 @@ | |||
1 | /* | ||
2 | drbd_actlog.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/slab.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include "drbd_int.h" | ||
29 | #include "drbd_wrappers.h" | ||
30 | |||
31 | /* We maintain a trivial check sum in our on disk activity log. | ||
32 | * With that we can ensure correct operation even when the storage | ||
33 | * device might do a partial (last) sector write while losing power. | ||
34 | */ | ||
35 | struct __packed al_transaction { | ||
36 | u32 magic; | ||
37 | u32 tr_number; | ||
38 | struct __packed { | ||
39 | u32 pos; | ||
40 | u32 extent; } updates[1 + AL_EXTENTS_PT]; | ||
41 | u32 xor_sum; | ||
42 | }; | ||
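/*
 * Editorial note, not part of the driver: the xor_sum field above is the
 * "trivial check sum" the header comment mentions.  As a generic sketch only
 * (which 32-bit words the real transaction folds into xor_sum is an
 * assumption here, not taken from this file), such a checksum can be
 * computed like this:
 */
static u32 xor_checksum32_example(const u32 *words, size_t count)
{
	u32 sum = 0;
	size_t i;

	for (i = 0; i < count; i++)
		sum ^= words[i];
	return sum;	/* a mismatch on read-back flags a torn transaction */
}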
43 | |||
44 | struct update_odbm_work { | ||
45 | struct drbd_work w; | ||
46 | unsigned int enr; | ||
47 | }; | ||
48 | |||
49 | struct update_al_work { | ||
50 | struct drbd_work w; | ||
51 | struct lc_element *al_ext; | ||
52 | struct completion event; | ||
53 | unsigned int enr; | ||
54 | /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ | ||
55 | unsigned int old_enr; | ||
56 | }; | ||
57 | |||
58 | struct drbd_atodb_wait { | ||
59 | atomic_t count; | ||
60 | struct completion io_done; | ||
61 | struct drbd_conf *mdev; | ||
62 | int error; | ||
63 | }; | ||
64 | |||
65 | |||
66 | int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); | ||
67 | |||
68 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | ||
69 | struct drbd_backing_dev *bdev, | ||
70 | struct page *page, sector_t sector, | ||
71 | int rw, int size) | ||
72 | { | ||
73 | struct bio *bio; | ||
74 | struct drbd_md_io md_io; | ||
75 | int ok; | ||
76 | |||
77 | md_io.mdev = mdev; | ||
78 | init_completion(&md_io.event); | ||
79 | md_io.error = 0; | ||
80 | |||
81 | if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) | ||
82 | rw |= (1 << BIO_RW_BARRIER); | ||
83 | rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)); | ||
84 | |||
85 | retry: | ||
86 | bio = bio_alloc(GFP_NOIO, 1); | ||
87 | bio->bi_bdev = bdev->md_bdev; | ||
88 | bio->bi_sector = sector; | ||
89 | ok = (bio_add_page(bio, page, size, 0) == size); | ||
90 | if (!ok) | ||
91 | goto out; | ||
92 | bio->bi_private = &md_io; | ||
93 | bio->bi_end_io = drbd_md_io_complete; | ||
94 | bio->bi_rw = rw; | ||
95 | |||
96 | if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) | ||
97 | bio_endio(bio, -EIO); | ||
98 | else | ||
99 | submit_bio(rw, bio); | ||
100 | wait_for_completion(&md_io.event); | ||
101 | ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; | ||
102 | |||
103 | /* check for unsupported barrier op. | ||
104 | * would rather check on EOPNOTSUPP, but that is not reliable. | ||
105 | * don't try again for ANY return value != 0 */ | ||
106 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { | ||
107 | /* Try again with no barrier */ | ||
108 | dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); | ||
109 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
110 | rw &= ~(1 << BIO_RW_BARRIER); | ||
111 | bio_put(bio); | ||
112 | goto retry; | ||
113 | } | ||
114 | out: | ||
115 | bio_put(bio); | ||
116 | return ok; | ||
117 | } | ||
118 | |||
119 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | ||
120 | sector_t sector, int rw) | ||
121 | { | ||
122 | int logical_block_size, mask, ok; | ||
123 | int offset = 0; | ||
124 | struct page *iop = mdev->md_io_page; | ||
125 | |||
126 | D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); | ||
127 | |||
128 | BUG_ON(!bdev->md_bdev); | ||
129 | |||
130 | logical_block_size = bdev_logical_block_size(bdev->md_bdev); | ||
131 | if (logical_block_size == 0) | ||
132 | logical_block_size = MD_SECTOR_SIZE; | ||
133 | |||
134 | /* in case logical_block_size != 512 [ s390 only? ] */ | ||
135 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
136 | mask = (logical_block_size / MD_SECTOR_SIZE) - 1; | ||
137 | D_ASSERT(mask == 1 || mask == 3 || mask == 7); | ||
138 | D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); | ||
139 | offset = sector & mask; | ||
140 | sector = sector & ~mask; | ||
141 | iop = mdev->md_io_tmpp; | ||
142 | |||
143 | if (rw & WRITE) { | ||
144 | /* these are GFP_KERNEL pages, pre-allocated | ||
145 | * on device initialization */ | ||
146 | void *p = page_address(mdev->md_io_page); | ||
147 | void *hp = page_address(mdev->md_io_tmpp); | ||
148 | |||
149 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, | ||
150 | READ, logical_block_size); | ||
151 | |||
152 | if (unlikely(!ok)) { | ||
153 | dev_err(DEV, "drbd_md_sync_page_io(,%llus," | ||
154 | "READ [logical_block_size!=512]) failed!\n", | ||
155 | (unsigned long long)sector); | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); | ||
160 | } | ||
161 | } | ||
162 | |||
163 | if (sector < drbd_md_first_sector(bdev) || | ||
164 | sector > drbd_md_last_sector(bdev)) | ||
165 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", | ||
166 | current->comm, current->pid, __func__, | ||
167 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | ||
168 | |||
169 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); | ||
170 | if (unlikely(!ok)) { | ||
171 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", | ||
172 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { | ||
177 | void *p = page_address(mdev->md_io_page); | ||
178 | void *hp = page_address(mdev->md_io_tmpp); | ||
179 | |||
180 | memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); | ||
181 | } | ||
182 | |||
183 | return ok; | ||
184 | } | ||
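/* A worked example of the read-modify-write path above (a sketch, assuming a
 * backing device with 4096-byte logical blocks and 512-byte meta-data sectors,
 * so mask == 7):
 *
 *   write to meta-data sector 37:
 *     offset = 37 & 7 = 5;  sector = 37 & ~7 = 32;  iop = md_io_tmpp;
 *     1. read the full 4 KiB block (sectors 32..39) into md_io_tmpp
 *     2. memcpy() the 512-byte payload from md_io_page to offset 5*512 in md_io_tmpp
 *     3. write the full 4 KiB block back at sector 32
 *
 *   a READ works the other way round: read the 4 KiB block into md_io_tmpp,
 *   then copy the 512-byte slice at offset 5*512 back into md_io_page. */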
185 | |||
186 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | ||
187 | { | ||
188 | struct lc_element *al_ext; | ||
189 | struct lc_element *tmp; | ||
190 | unsigned long al_flags = 0; | ||
191 | |||
192 | spin_lock_irq(&mdev->al_lock); | ||
193 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | ||
194 | if (unlikely(tmp != NULL)) { | ||
195 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | ||
196 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
197 | spin_unlock_irq(&mdev->al_lock); | ||
198 | return NULL; | ||
199 | } | ||
200 | } | ||
201 | al_ext = lc_get(mdev->act_log, enr); | ||
202 | al_flags = mdev->act_log->flags; | ||
203 | spin_unlock_irq(&mdev->al_lock); | ||
204 | |||
205 | /* | ||
206 | if (!al_ext) { | ||
207 | if (al_flags & LC_STARVING) | ||
208 | dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); | ||
209 | if (al_flags & LC_DIRTY) | ||
210 | dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); | ||
211 | } | ||
212 | */ | ||
213 | |||
214 | return al_ext; | ||
215 | } | ||
216 | |||
217 | void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
218 | { | ||
219 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
220 | struct lc_element *al_ext; | ||
221 | struct update_al_work al_work; | ||
222 | |||
223 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | ||
224 | |||
225 | wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); | ||
226 | |||
227 | if (al_ext->lc_number != enr) { | ||
228 | /* drbd_al_write_transaction(mdev,al_ext,enr); | ||
229 | * recurses into generic_make_request(), which | ||
230 | * disallows recursion, bios being serialized on the | ||
231 | * current->bio_tail list now. | ||
232 | * we have to delegate updates to the activity log | ||
233 | * to the worker thread. */ | ||
234 | init_completion(&al_work.event); | ||
235 | al_work.al_ext = al_ext; | ||
236 | al_work.enr = enr; | ||
237 | al_work.old_enr = al_ext->lc_number; | ||
238 | al_work.w.cb = w_al_write_transaction; | ||
239 | drbd_queue_work_front(&mdev->data.work, &al_work.w); | ||
240 | wait_for_completion(&al_work.event); | ||
241 | |||
242 | mdev->al_writ_cnt++; | ||
243 | |||
244 | spin_lock_irq(&mdev->al_lock); | ||
245 | lc_changed(mdev->act_log, al_ext); | ||
246 | spin_unlock_irq(&mdev->al_lock); | ||
247 | wake_up(&mdev->al_wait); | ||
248 | } | ||
249 | } | ||
250 | |||
251 | void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) | ||
252 | { | ||
253 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
254 | struct lc_element *extent; | ||
255 | unsigned long flags; | ||
256 | |||
257 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
258 | |||
259 | extent = lc_find(mdev->act_log, enr); | ||
260 | |||
261 | if (!extent) { | ||
262 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
263 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); | ||
264 | return; | ||
265 | } | ||
266 | |||
267 | if (lc_put(mdev->act_log, extent) == 0) | ||
268 | wake_up(&mdev->al_wait); | ||
269 | |||
270 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
271 | } | ||
272 | |||
273 | int | ||
274 | w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
275 | { | ||
276 | struct update_al_work *aw = container_of(w, struct update_al_work, w); | ||
277 | struct lc_element *updated = aw->al_ext; | ||
278 | const unsigned int new_enr = aw->enr; | ||
279 | const unsigned int evicted = aw->old_enr; | ||
280 | struct al_transaction *buffer; | ||
281 | sector_t sector; | ||
282 | int i, n, mx; | ||
283 | unsigned int extent_nr; | ||
284 | u32 xor_sum = 0; | ||
285 | |||
286 | if (!get_ldev(mdev)) { | ||
287 | dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); | ||
288 | complete(&((struct update_al_work *)w)->event); | ||
289 | return 1; | ||
290 | } | ||
291 | /* do we have to do a bitmap write, first? | ||
292 | * TODO reduce maximum latency: | ||
293 | * submit both bios, then wait for both, | ||
294 | * instead of doing two synchronous sector writes. */ | ||
295 | if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) | ||
296 | drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); | ||
297 | |||
298 | mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ | ||
299 | buffer = (struct al_transaction *)page_address(mdev->md_io_page); | ||
300 | |||
301 | buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); | ||
302 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); | ||
303 | |||
304 | n = lc_index_of(mdev->act_log, updated); | ||
305 | |||
306 | buffer->updates[0].pos = cpu_to_be32(n); | ||
307 | buffer->updates[0].extent = cpu_to_be32(new_enr); | ||
308 | |||
309 | xor_sum ^= new_enr; | ||
310 | |||
311 | mx = min_t(int, AL_EXTENTS_PT, | ||
312 | mdev->act_log->nr_elements - mdev->al_tr_cycle); | ||
313 | for (i = 0; i < mx; i++) { | ||
314 | unsigned idx = mdev->al_tr_cycle + i; | ||
315 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; | ||
316 | buffer->updates[i+1].pos = cpu_to_be32(idx); | ||
317 | buffer->updates[i+1].extent = cpu_to_be32(extent_nr); | ||
318 | xor_sum ^= extent_nr; | ||
319 | } | ||
320 | for (; i < AL_EXTENTS_PT; i++) { | ||
321 | buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); | ||
322 | buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); | ||
323 | xor_sum ^= LC_FREE; | ||
324 | } | ||
325 | mdev->al_tr_cycle += AL_EXTENTS_PT; | ||
326 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | ||
327 | mdev->al_tr_cycle = 0; | ||
328 | |||
329 | buffer->xor_sum = cpu_to_be32(xor_sum); | ||
330 | |||
331 | sector = mdev->ldev->md.md_offset | ||
332 | + mdev->ldev->md.al_offset + mdev->al_tr_pos; | ||
333 | |||
334 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) | ||
335 | drbd_chk_io_error(mdev, 1, TRUE); | ||
336 | |||
337 | if (++mdev->al_tr_pos > | ||
338 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
339 | mdev->al_tr_pos = 0; | ||
340 | |||
341 | D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); | ||
342 | mdev->al_tr_number++; | ||
343 | |||
344 | mutex_unlock(&mdev->md_io_mutex); | ||
345 | |||
346 | complete(&((struct update_al_work *)w)->event); | ||
347 | put_ldev(mdev); | ||
348 | |||
349 | return 1; | ||
350 | } | ||
351 | |||
352 | /** | ||
353 | * drbd_al_read_tr() - Read a single transaction from the on disk activity log | ||
354 | * @mdev: DRBD device. | ||
355 | * @bdev: Block device to read from. | ||
356 | * @b: pointer to an al_transaction. | ||
357 | * @index: On disk slot of the transaction to read. | ||
358 | * | ||
359 | * Returns -1 on IO error, 0 on checksum error and 1 upon success. | ||
360 | */ | ||
361 | static int drbd_al_read_tr(struct drbd_conf *mdev, | ||
362 | struct drbd_backing_dev *bdev, | ||
363 | struct al_transaction *b, | ||
364 | int index) | ||
365 | { | ||
366 | sector_t sector; | ||
367 | int rv, i; | ||
368 | u32 xor_sum = 0; | ||
369 | |||
370 | sector = bdev->md.md_offset + bdev->md.al_offset + index; | ||
371 | |||
372 | /* Don't process error normally, | ||
373 | * as this is done before disk is attached! */ | ||
374 | if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) | ||
375 | return -1; | ||
376 | |||
377 | rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); | ||
378 | |||
379 | for (i = 0; i < AL_EXTENTS_PT + 1; i++) | ||
380 | xor_sum ^= be32_to_cpu(b->updates[i].extent); | ||
381 | rv &= (xor_sum == be32_to_cpu(b->xor_sum)); | ||
382 | |||
383 | return rv; | ||
384 | } | ||
385 | |||
386 | /** | ||
387 | * drbd_al_read_log() - Restores the activity log from its on disk representation. | ||
388 | * @mdev: DRBD device. | ||
389 | * @bdev: Block device to read from. | ||
390 | * | ||
391 | * Returns 1 on success, returns 0 when reading the log failed due to IO errors. | ||
392 | */ | ||
393 | int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
394 | { | ||
395 | struct al_transaction *buffer; | ||
396 | int i; | ||
397 | int rv; | ||
398 | int mx; | ||
399 | int active_extents = 0; | ||
400 | int transactions = 0; | ||
401 | int found_valid = 0; | ||
402 | int from = 0; | ||
403 | int to = 0; | ||
404 | u32 from_tnr = 0; | ||
405 | u32 to_tnr = 0; | ||
406 | u32 cnr; | ||
407 | |||
408 | mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); | ||
409 | |||
410 | /* lock out all other meta data io for now, | ||
411 | * and make sure the page is mapped. | ||
412 | */ | ||
413 | mutex_lock(&mdev->md_io_mutex); | ||
414 | buffer = page_address(mdev->md_io_page); | ||
415 | |||
416 | /* Find the valid transaction in the log */ | ||
417 | for (i = 0; i <= mx; i++) { | ||
418 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
419 | if (rv == 0) | ||
420 | continue; | ||
421 | if (rv == -1) { | ||
422 | mutex_unlock(&mdev->md_io_mutex); | ||
423 | return 0; | ||
424 | } | ||
425 | cnr = be32_to_cpu(buffer->tr_number); | ||
426 | |||
427 | if (++found_valid == 1) { | ||
428 | from = i; | ||
429 | to = i; | ||
430 | from_tnr = cnr; | ||
431 | to_tnr = cnr; | ||
432 | continue; | ||
433 | } | ||
434 | if ((int)cnr - (int)from_tnr < 0) { | ||
435 | D_ASSERT(from_tnr - cnr + i - from == mx+1); | ||
436 | from = i; | ||
437 | from_tnr = cnr; | ||
438 | } | ||
439 | if ((int)cnr - (int)to_tnr > 0) { | ||
440 | D_ASSERT(cnr - to_tnr == i - to); | ||
441 | to = i; | ||
442 | to_tnr = cnr; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | if (!found_valid) { | ||
447 | dev_warn(DEV, "No usable activity log found.\n"); | ||
448 | mutex_unlock(&mdev->md_io_mutex); | ||
449 | return 1; | ||
450 | } | ||
451 | |||
452 | /* Read the valid transactions. | ||
453 | * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ | ||
454 | i = from; | ||
455 | while (1) { | ||
456 | int j, pos; | ||
457 | unsigned int extent_nr; | ||
458 | unsigned int trn; | ||
459 | |||
460 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
461 | ERR_IF(rv == 0) goto cancel; | ||
462 | if (rv == -1) { | ||
463 | mutex_unlock(&mdev->md_io_mutex); | ||
464 | return 0; | ||
465 | } | ||
466 | |||
467 | trn = be32_to_cpu(buffer->tr_number); | ||
468 | |||
469 | spin_lock_irq(&mdev->al_lock); | ||
470 | |||
471 | /* This loop runs backwards because in the cyclic | ||
472 | elements there might be an old version of the | ||
473 | updated element (in slot 0). So the element in slot 0 | ||
474 | can overwrite old versions. */ | ||
475 | for (j = AL_EXTENTS_PT; j >= 0; j--) { | ||
476 | pos = be32_to_cpu(buffer->updates[j].pos); | ||
477 | extent_nr = be32_to_cpu(buffer->updates[j].extent); | ||
478 | |||
479 | if (extent_nr == LC_FREE) | ||
480 | continue; | ||
481 | |||
482 | lc_set(mdev->act_log, extent_nr, pos); | ||
483 | active_extents++; | ||
484 | } | ||
485 | spin_unlock_irq(&mdev->al_lock); | ||
486 | |||
487 | transactions++; | ||
488 | |||
489 | cancel: | ||
490 | if (i == to) | ||
491 | break; | ||
492 | i++; | ||
493 | if (i > mx) | ||
494 | i = 0; | ||
495 | } | ||
496 | |||
497 | mdev->al_tr_number = to_tnr+1; | ||
498 | mdev->al_tr_pos = to; | ||
499 | if (++mdev->al_tr_pos > | ||
500 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
501 | mdev->al_tr_pos = 0; | ||
502 | |||
503 | /* ok, we are done with it */ | ||
504 | mutex_unlock(&mdev->md_io_mutex); | ||
505 | |||
506 | dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", | ||
507 | transactions, active_extents); | ||
508 | |||
509 | return 1; | ||
510 | } | ||
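/* A small worked example of the from/to search above (a sketch, assuming
 * mx == 3, i.e. four on-disk transaction slots), with stored tr_numbers
 * { 7, 8, 5, 6 } in slots 0..3:
 *
 *   slot 0: tnr 7  -> first valid: from = to = 0
 *   slot 1: tnr 8  -> newer than to_tnr (8 > 7):   to = 1
 *   slot 2: tnr 5  -> older than from_tnr (5 < 7): from = 2
 *   slot 3: tnr 6  -> neither oldest nor newest, no change
 *
 * The transactions are then replayed in slot order 2, 3, 0, 1 (wrapping at mx),
 * after which al_tr_number = to_tnr + 1 = 9 and al_tr_pos = to + 1 = 2. */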
511 | |||
512 | static void atodb_endio(struct bio *bio, int error) | ||
513 | { | ||
514 | struct drbd_atodb_wait *wc = bio->bi_private; | ||
515 | struct drbd_conf *mdev = wc->mdev; | ||
516 | struct page *page; | ||
517 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
518 | |||
519 | /* strange behavior of some lower level drivers... | ||
520 | * fail the request by clearing the uptodate flag, | ||
521 | * but do not return any error?! */ | ||
522 | if (!error && !uptodate) | ||
523 | error = -EIO; | ||
524 | |||
525 | drbd_chk_io_error(mdev, error, TRUE); | ||
526 | if (error && wc->error == 0) | ||
527 | wc->error = error; | ||
528 | |||
529 | if (atomic_dec_and_test(&wc->count)) | ||
530 | complete(&wc->io_done); | ||
531 | |||
532 | page = bio->bi_io_vec[0].bv_page; | ||
533 | put_page(page); | ||
534 | bio_put(bio); | ||
535 | mdev->bm_writ_cnt++; | ||
536 | put_ldev(mdev); | ||
537 | } | ||
538 | |||
539 | /* sector to word */ | ||
540 | #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
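/* A quick sanity check of the shift above (a sketch, assuming 4 KiB bitmap
 * blocks (BM_BLOCK_SHIFT == 12), 16 MiB bitmap extents (BM_EXT_SHIFT == 24)
 * and 64-bit longs (LN2_BPL == 6)):
 *   S2W(s) = s << (24 - 12 - 6) = s * 64
 * i.e. one 512-byte bitmap sector holds 64 longs = 4096 bits, which at 4 KiB
 * per bit covers exactly one 16 MiB extent; S2W(enr) is then the long-word
 * offset of the bitmap sector describing extent enr. */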
541 | |||
542 | /* activity log to on disk bitmap -- prepare bio unless that sector | ||
543 | * is already covered by previously prepared bios */ | ||
544 | static int atodb_prepare_unless_covered(struct drbd_conf *mdev, | ||
545 | struct bio **bios, | ||
546 | unsigned int enr, | ||
547 | struct drbd_atodb_wait *wc) __must_hold(local) | ||
548 | { | ||
549 | struct bio *bio; | ||
550 | struct page *page; | ||
551 | sector_t on_disk_sector; | ||
552 | unsigned int page_offset = PAGE_SIZE; | ||
553 | int offset; | ||
554 | int i = 0; | ||
555 | int err = -ENOMEM; | ||
556 | |||
557 | /* We always write aligned, full 4k blocks, | ||
558 | * so we can ignore the logical_block_size (for now) */ | ||
559 | enr &= ~7U; | ||
560 | on_disk_sector = enr + mdev->ldev->md.md_offset | ||
561 | + mdev->ldev->md.bm_offset; | ||
562 | |||
563 | D_ASSERT(!(on_disk_sector & 7U)); | ||
564 | |||
565 | /* Check if that enr is already covered by an already created bio. | ||
566 | * Caution, bios[] is not NULL terminated, | ||
567 | * but only initialized to all NULL. | ||
568 | * For a completely scattered activity log, | ||
569 | * the last invocation iterates over all bios, | ||
570 | * and finds the last NULL entry. | ||
571 | */ | ||
572 | while ((bio = bios[i])) { | ||
573 | if (bio->bi_sector == on_disk_sector) | ||
574 | return 0; | ||
575 | i++; | ||
576 | } | ||
577 | /* bios[i] == NULL, the next not yet used slot */ | ||
578 | |||
579 | /* GFP_KERNEL, we are not in the write-out path */ | ||
580 | bio = bio_alloc(GFP_KERNEL, 1); | ||
581 | if (bio == NULL) | ||
582 | return -ENOMEM; | ||
583 | |||
584 | if (i > 0) { | ||
585 | const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; | ||
586 | page_offset = prev_bv->bv_offset + prev_bv->bv_len; | ||
587 | page = prev_bv->bv_page; | ||
588 | } | ||
589 | if (page_offset == PAGE_SIZE) { | ||
590 | page = alloc_page(__GFP_HIGHMEM); | ||
591 | if (page == NULL) | ||
592 | goto out_bio_put; | ||
593 | page_offset = 0; | ||
594 | } else { | ||
595 | get_page(page); | ||
596 | } | ||
597 | |||
598 | offset = S2W(enr); | ||
599 | drbd_bm_get_lel(mdev, offset, | ||
600 | min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset), | ||
601 | kmap(page) + page_offset); | ||
602 | kunmap(page); | ||
603 | |||
604 | bio->bi_private = wc; | ||
605 | bio->bi_end_io = atodb_endio; | ||
606 | bio->bi_bdev = mdev->ldev->md_bdev; | ||
607 | bio->bi_sector = on_disk_sector; | ||
608 | |||
609 | if (bio_add_page(bio, page, 4096, page_offset) != 4096) | ||
610 | goto out_put_page; | ||
611 | |||
612 | atomic_inc(&wc->count); | ||
613 | /* we already know that we may do this... | ||
614 | * get_ldev_if_state(mdev,D_ATTACHING); | ||
615 | * just get the extra reference, so that the local_cnt reflects | ||
616 | * the number of pending IO requests DRBD has issued to its backing device. | ||
617 | */ | ||
618 | atomic_inc(&mdev->local_cnt); | ||
619 | |||
620 | bios[i] = bio; | ||
621 | |||
622 | return 0; | ||
623 | |||
624 | out_put_page: | ||
625 | err = -EINVAL; | ||
626 | put_page(page); | ||
627 | out_bio_put: | ||
628 | bio_put(bio); | ||
629 | return err; | ||
630 | } | ||
631 | |||
632 | /** | ||
633 | * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents | ||
634 | * @mdev: DRBD device. | ||
635 | * | ||
636 | * Called when we detach (unconfigure) local storage, | ||
637 | * or when we go from R_PRIMARY to R_SECONDARY role. | ||
638 | */ | ||
639 | void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) | ||
640 | { | ||
641 | int i, nr_elements; | ||
642 | unsigned int enr; | ||
643 | struct bio **bios; | ||
644 | struct drbd_atodb_wait wc; | ||
645 | |||
646 | ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
647 | return; /* sorry, I don't have any act_log etc... */ | ||
648 | |||
649 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
650 | |||
651 | nr_elements = mdev->act_log->nr_elements; | ||
652 | |||
653 | /* GFP_KERNEL, we are not in anyone's write-out path */ | ||
654 | bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); | ||
655 | if (!bios) | ||
656 | goto submit_one_by_one; | ||
657 | |||
658 | atomic_set(&wc.count, 0); | ||
659 | init_completion(&wc.io_done); | ||
660 | wc.mdev = mdev; | ||
661 | wc.error = 0; | ||
662 | |||
663 | for (i = 0; i < nr_elements; i++) { | ||
664 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
665 | if (enr == LC_FREE) | ||
666 | continue; | ||
667 | /* next statement also does atomic_inc wc.count and local_cnt */ | ||
668 | if (atodb_prepare_unless_covered(mdev, bios, | ||
669 | enr/AL_EXT_PER_BM_SECT, | ||
670 | &wc)) | ||
671 | goto free_bios_submit_one_by_one; | ||
672 | } | ||
673 | |||
674 | /* unnecessary optimization? */ | ||
675 | lc_unlock(mdev->act_log); | ||
676 | wake_up(&mdev->al_wait); | ||
677 | |||
678 | /* all prepared, submit them */ | ||
679 | for (i = 0; i < nr_elements; i++) { | ||
680 | if (bios[i] == NULL) | ||
681 | break; | ||
682 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { | ||
683 | bios[i]->bi_rw = WRITE; | ||
684 | bio_endio(bios[i], -EIO); | ||
685 | } else { | ||
686 | submit_bio(WRITE, bios[i]); | ||
687 | } | ||
688 | } | ||
689 | |||
690 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); | ||
691 | |||
692 | /* always (try to) flush bitmap to stable storage */ | ||
693 | drbd_md_flush(mdev); | ||
694 | |||
695 | /* In case we did not submit a single IO, do not wait for | ||
696 | * them to complete (because we would wait forever here). | ||
697 | * | ||
698 | * In case we had IOs and they are already complete, there | ||
699 | * is no point in waiting anyway. | ||
700 | * Therefore this if () ... */ | ||
701 | if (atomic_read(&wc.count)) | ||
702 | wait_for_completion(&wc.io_done); | ||
703 | |||
704 | put_ldev(mdev); | ||
705 | |||
706 | kfree(bios); | ||
707 | return; | ||
708 | |||
709 | free_bios_submit_one_by_one: | ||
710 | /* free everything by calling the endio callback directly. */ | ||
711 | for (i = 0; i < nr_elements && bios[i]; i++) | ||
712 | bio_endio(bios[i], 0); | ||
713 | |||
714 | kfree(bios); | ||
715 | |||
716 | submit_one_by_one: | ||
717 | dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); | ||
718 | |||
719 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
720 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
721 | if (enr == LC_FREE) | ||
722 | continue; | ||
723 | /* Really slow: if we have al-extents 16..19 active, | ||
724 | * sector 4 will be written four times! Synchronous! */ | ||
725 | drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); | ||
726 | } | ||
727 | |||
728 | lc_unlock(mdev->act_log); | ||
729 | wake_up(&mdev->al_wait); | ||
730 | put_ldev(mdev); | ||
731 | } | ||
732 | |||
733 | /** | ||
734 | * drbd_al_apply_to_bm() - Sets the bitmap to dirty (1) where covered by active AL extents | ||
735 | * @mdev: DRBD device. | ||
736 | */ | ||
737 | void drbd_al_apply_to_bm(struct drbd_conf *mdev) | ||
738 | { | ||
739 | unsigned int enr; | ||
740 | unsigned long add = 0; | ||
741 | char ppb[10]; | ||
742 | int i; | ||
743 | |||
744 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
745 | |||
746 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
747 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
748 | if (enr == LC_FREE) | ||
749 | continue; | ||
750 | add += drbd_bm_ALe_set_all(mdev, enr); | ||
751 | } | ||
752 | |||
753 | lc_unlock(mdev->act_log); | ||
754 | wake_up(&mdev->al_wait); | ||
755 | |||
756 | dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", | ||
757 | ppsize(ppb, Bit2KB(add))); | ||
758 | } | ||
759 | |||
760 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | ||
761 | { | ||
762 | int rv; | ||
763 | |||
764 | spin_lock_irq(&mdev->al_lock); | ||
765 | rv = (al_ext->refcnt == 0); | ||
766 | if (likely(rv)) | ||
767 | lc_del(mdev->act_log, al_ext); | ||
768 | spin_unlock_irq(&mdev->al_lock); | ||
769 | |||
770 | return rv; | ||
771 | } | ||
772 | |||
773 | /** | ||
774 | * drbd_al_shrink() - Removes all active extents from the activity log | ||
775 | * @mdev: DRBD device. | ||
776 | * | ||
777 | * Removes all active extents from the activity log, waiting until | ||
778 | * the reference count of each entry has dropped to 0 first, of course. | ||
779 | * | ||
780 | * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() | ||
781 | */ | ||
782 | void drbd_al_shrink(struct drbd_conf *mdev) | ||
783 | { | ||
784 | struct lc_element *al_ext; | ||
785 | int i; | ||
786 | |||
787 | D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); | ||
788 | |||
789 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
790 | al_ext = lc_element_by_index(mdev->act_log, i); | ||
791 | if (al_ext->lc_number == LC_FREE) | ||
792 | continue; | ||
793 | wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); | ||
794 | } | ||
795 | |||
796 | wake_up(&mdev->al_wait); | ||
797 | } | ||
798 | |||
799 | static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
800 | { | ||
801 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | ||
802 | |||
803 | if (!get_ldev(mdev)) { | ||
804 | if (__ratelimit(&drbd_ratelimit_state)) | ||
805 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); | ||
806 | kfree(udw); | ||
807 | return 1; | ||
808 | } | ||
809 | |||
810 | drbd_bm_write_sect(mdev, udw->enr); | ||
811 | put_ldev(mdev); | ||
812 | |||
813 | kfree(udw); | ||
814 | |||
815 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { | ||
816 | switch (mdev->state.conn) { | ||
817 | case C_SYNC_SOURCE: case C_SYNC_TARGET: | ||
818 | case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: | ||
819 | drbd_resync_finished(mdev); | ||
820 | default: | ||
821 | /* nothing to do */ | ||
822 | break; | ||
823 | } | ||
824 | } | ||
825 | drbd_bcast_sync_progress(mdev); | ||
826 | |||
827 | return 1; | ||
828 | } | ||
829 | |||
830 | |||
831 | /* ATTENTION. The AL's extents are 4MB each, while the extents in the | ||
832 | * resync LRU-cache are 16MB each. | ||
833 | * The caller of this function has to hold a get_ldev() reference. | ||
834 | * | ||
835 | * TODO will be obsoleted once we have a caching lru of the on disk bitmap | ||
836 | */ | ||
837 | static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | ||
838 | int count, int success) | ||
839 | { | ||
840 | struct lc_element *e; | ||
841 | struct update_odbm_work *udw; | ||
842 | |||
843 | unsigned int enr; | ||
844 | |||
845 | D_ASSERT(atomic_read(&mdev->local_cnt)); | ||
846 | |||
847 | /* I simply assume that a sector/size pair never crosses | ||
848 | * a 16 MB extent border. (Currently this is true...) */ | ||
849 | enr = BM_SECT_TO_EXT(sector); | ||
850 | |||
851 | e = lc_get(mdev->resync, enr); | ||
852 | if (e) { | ||
853 | struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); | ||
854 | if (ext->lce.lc_number == enr) { | ||
855 | if (success) | ||
856 | ext->rs_left -= count; | ||
857 | else | ||
858 | ext->rs_failed += count; | ||
859 | if (ext->rs_left < ext->rs_failed) { | ||
860 | dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " | ||
861 | "rs_failed=%d count=%d\n", | ||
862 | (unsigned long long)sector, | ||
863 | ext->lce.lc_number, ext->rs_left, | ||
864 | ext->rs_failed, count); | ||
865 | dump_stack(); | ||
866 | |||
867 | lc_put(mdev->resync, &ext->lce); | ||
868 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
869 | return; | ||
870 | } | ||
871 | } else { | ||
872 | /* Normally this element should be in the cache, | ||
873 | * since drbd_rs_begin_io() already pulled it in. | ||
874 | * | ||
875 | * But maybe an application write finished, and we set | ||
876 | * something outside the resync lru_cache in sync. | ||
877 | */ | ||
878 | int rs_left = drbd_bm_e_weight(mdev, enr); | ||
879 | if (ext->flags != 0) { | ||
880 | dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" | ||
881 | " -> %d[%u;00]\n", | ||
882 | ext->lce.lc_number, ext->rs_left, | ||
883 | ext->flags, enr, rs_left); | ||
884 | ext->flags = 0; | ||
885 | } | ||
886 | if (ext->rs_failed) { | ||
887 | dev_warn(DEV, "Kicking resync_lru element enr=%u " | ||
888 | "out with rs_failed=%d\n", | ||
889 | ext->lce.lc_number, ext->rs_failed); | ||
890 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
891 | } | ||
892 | ext->rs_left = rs_left; | ||
893 | ext->rs_failed = success ? 0 : count; | ||
894 | lc_changed(mdev->resync, &ext->lce); | ||
895 | } | ||
896 | lc_put(mdev->resync, &ext->lce); | ||
897 | /* no race, we are within the al_lock! */ | ||
898 | |||
899 | if (ext->rs_left == ext->rs_failed) { | ||
900 | ext->rs_failed = 0; | ||
901 | |||
902 | udw = kmalloc(sizeof(*udw), GFP_ATOMIC); | ||
903 | if (udw) { | ||
904 | udw->enr = ext->lce.lc_number; | ||
905 | udw->w.cb = w_update_odbm; | ||
906 | drbd_queue_work_front(&mdev->data.work, &udw->w); | ||
907 | } else { | ||
908 | dev_warn(DEV, "Could not kmalloc an udw\n"); | ||
909 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
910 | } | ||
911 | } | ||
912 | } else { | ||
913 | dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", | ||
914 | mdev->resync_locked, | ||
915 | mdev->resync->nr_elements, | ||
916 | mdev->resync->flags); | ||
917 | } | ||
918 | } | ||
919 | |||
920 | /* clear the bit corresponding to the piece of storage in question: | ||
921 | * size bytes of data starting from sector. Only clear bits of the affected | ||
922 | * one or more _aligned_ BM_BLOCK_SIZE blocks. | ||
923 | * | ||
924 | * called by worker on C_SYNC_TARGET and receiver on C_SYNC_SOURCE. | ||
925 | * | ||
926 | */ | ||
927 | void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | ||
928 | const char *file, const unsigned int line) | ||
929 | { | ||
930 | /* Is called from worker and receiver context _only_ */ | ||
931 | unsigned long sbnr, ebnr, lbnr; | ||
932 | unsigned long count = 0; | ||
933 | sector_t esector, nr_sectors; | ||
934 | int wake_up = 0; | ||
935 | unsigned long flags; | ||
936 | |||
937 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
938 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | ||
939 | (unsigned long long)sector, size); | ||
940 | return; | ||
941 | } | ||
942 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
943 | esector = sector + (size >> 9) - 1; | ||
944 | |||
945 | ERR_IF(sector >= nr_sectors) return; | ||
946 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | ||
947 | |||
948 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
949 | |||
950 | /* we clear it (in sync). | ||
951 | * round up start sector, round down end sector. we make sure we only | ||
952 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
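/* A small worked example of the rounding above (a sketch, assuming 4 KiB
 * bitmap blocks, i.e. BM_SECT_PER_BIT == 8): sector = 13, size = 8192
 * (16 sectors) -> esector = 28;
 *   sbnr = BM_SECT_TO_BIT(13 + 7) = 20 >> 3 = 2
 *   ebnr = BM_SECT_TO_BIT(28 - 7) = 21 >> 3 = 2
 * so only bit 2 (sectors 16..23) gets cleared; the partially covered blocks
 * at sectors 8..15 and 24..31 are left alone. */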
953 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
954 | return; | ||
955 | if (unlikely(esector == (nr_sectors-1))) | ||
956 | ebnr = lbnr; | ||
957 | else | ||
958 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
959 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
960 | |||
961 | if (sbnr > ebnr) | ||
962 | return; | ||
963 | |||
964 | /* | ||
965 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
966 | * we count rs_{total,left} in bits, not sectors. | ||
967 | */ | ||
968 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
969 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); | ||
970 | if (count) { | ||
971 | /* we need the lock for drbd_try_clear_on_disk_bm */ | ||
972 | if (jiffies - mdev->rs_mark_time > HZ*10) { | ||
973 | /* should be rolling marks, | ||
974 | * but we only estimate anyway. */ | ||
975 | if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && | ||
976 | mdev->state.conn != C_PAUSED_SYNC_T && | ||
977 | mdev->state.conn != C_PAUSED_SYNC_S) { | ||
978 | mdev->rs_mark_time = jiffies; | ||
979 | mdev->rs_mark_left = drbd_bm_total_weight(mdev); | ||
980 | } | ||
981 | } | ||
982 | if (get_ldev(mdev)) { | ||
983 | drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); | ||
984 | put_ldev(mdev); | ||
985 | } | ||
986 | /* just wake_up unconditionally now; various lc_changed(), | ||
987 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
988 | wake_up = 1; | ||
989 | } | ||
990 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
991 | if (wake_up) | ||
992 | wake_up(&mdev->al_wait); | ||
993 | } | ||
994 | |||
995 | /* | ||
996 | * this is intended to set one request worth of data out of sync. | ||
997 | * affects at least 1 bit, | ||
998 | * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. | ||
999 | * | ||
1000 | * called by tl_clear and drbd_send_dblock (==drbd_make_request). | ||
1001 | * so this can be _any_ process. | ||
1002 | */ | ||
1003 | void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | ||
1004 | const char *file, const unsigned int line) | ||
1005 | { | ||
1006 | unsigned long sbnr, ebnr, lbnr, flags; | ||
1007 | sector_t esector, nr_sectors; | ||
1008 | unsigned int enr, count; | ||
1009 | struct lc_element *e; | ||
1010 | |||
1011 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1012 | dev_err(DEV, "sector: %llus, size: %d\n", | ||
1013 | (unsigned long long)sector, size); | ||
1014 | return; | ||
1015 | } | ||
1016 | |||
1017 | if (!get_ldev(mdev)) | ||
1018 | return; /* no disk, no metadata, no bitmap to set bits in */ | ||
1019 | |||
1020 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
1021 | esector = sector + (size >> 9) - 1; | ||
1022 | |||
1023 | ERR_IF(sector >= nr_sectors) | ||
1024 | goto out; | ||
1025 | ERR_IF(esector >= nr_sectors) | ||
1026 | esector = (nr_sectors-1); | ||
1027 | |||
1028 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1029 | |||
1030 | /* we set it out of sync, | ||
1031 | * we do not need to round anything here */ | ||
1032 | sbnr = BM_SECT_TO_BIT(sector); | ||
1033 | ebnr = BM_SECT_TO_BIT(esector); | ||
1034 | |||
1035 | /* ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1036 | * we count rs_{total,left} in bits, not sectors. */ | ||
1037 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
1038 | count = drbd_bm_set_bits(mdev, sbnr, ebnr); | ||
1039 | |||
1040 | enr = BM_SECT_TO_EXT(sector); | ||
1041 | e = lc_find(mdev->resync, enr); | ||
1042 | if (e) | ||
1043 | lc_entry(e, struct bm_extent, lce)->rs_left += count; | ||
1044 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1045 | |||
1046 | out: | ||
1047 | put_ldev(mdev); | ||
1048 | } | ||
1049 | |||
1050 | static | ||
1051 | struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | ||
1052 | { | ||
1053 | struct lc_element *e; | ||
1054 | struct bm_extent *bm_ext; | ||
1055 | int wakeup = 0; | ||
1056 | unsigned long rs_flags; | ||
1057 | |||
1058 | spin_lock_irq(&mdev->al_lock); | ||
1059 | if (mdev->resync_locked > mdev->resync->nr_elements/2) { | ||
1060 | spin_unlock_irq(&mdev->al_lock); | ||
1061 | return NULL; | ||
1062 | } | ||
1063 | e = lc_get(mdev->resync, enr); | ||
1064 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1065 | if (bm_ext) { | ||
1066 | if (bm_ext->lce.lc_number != enr) { | ||
1067 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | ||
1068 | bm_ext->rs_failed = 0; | ||
1069 | lc_changed(mdev->resync, &bm_ext->lce); | ||
1070 | wakeup = 1; | ||
1071 | } | ||
1072 | if (bm_ext->lce.refcnt == 1) | ||
1073 | mdev->resync_locked++; | ||
1074 | set_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1075 | } | ||
1076 | rs_flags = mdev->resync->flags; | ||
1077 | spin_unlock_irq(&mdev->al_lock); | ||
1078 | if (wakeup) | ||
1079 | wake_up(&mdev->al_wait); | ||
1080 | |||
1081 | if (!bm_ext) { | ||
1082 | if (rs_flags & LC_STARVING) | ||
1083 | dev_warn(DEV, "Have to wait for element" | ||
1084 | " (resync LRU too small?)\n"); | ||
1085 | BUG_ON(rs_flags & LC_DIRTY); | ||
1086 | } | ||
1087 | |||
1088 | return bm_ext; | ||
1089 | } | ||
1090 | |||
1091 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) | ||
1092 | { | ||
1093 | struct lc_element *al_ext; | ||
1094 | int rv = 0; | ||
1095 | |||
1096 | spin_lock_irq(&mdev->al_lock); | ||
1097 | if (unlikely(enr == mdev->act_log->new_number)) | ||
1098 | rv = 1; | ||
1099 | else { | ||
1100 | al_ext = lc_find(mdev->act_log, enr); | ||
1101 | if (al_ext) { | ||
1102 | if (al_ext->refcnt) | ||
1103 | rv = 1; | ||
1104 | } | ||
1105 | } | ||
1106 | spin_unlock_irq(&mdev->al_lock); | ||
1107 | |||
1108 | /* | ||
1109 | if (unlikely(rv)) { | ||
1110 | dev_info(DEV, "Delaying sync read until app's write is done\n"); | ||
1111 | } | ||
1112 | */ | ||
1113 | return rv; | ||
1114 | } | ||
1115 | |||
1116 | /** | ||
1117 | * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED | ||
1118 | * @mdev: DRBD device. | ||
1119 | * @sector: The sector number. | ||
1120 | * | ||
1121 | * This function sleeps on al_wait. Returns 1 on success, 0 if interrupted. | ||
1122 | */ | ||
1123 | int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
1124 | { | ||
1125 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1126 | struct bm_extent *bm_ext; | ||
1127 | int i, sig; | ||
1128 | |||
1129 | sig = wait_event_interruptible(mdev->al_wait, | ||
1130 | (bm_ext = _bme_get(mdev, enr))); | ||
1131 | if (sig) | ||
1132 | return 0; | ||
1133 | |||
1134 | if (test_bit(BME_LOCKED, &bm_ext->flags)) | ||
1135 | return 1; | ||
1136 | |||
1137 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | ||
1138 | sig = wait_event_interruptible(mdev->al_wait, | ||
1139 | !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); | ||
1140 | if (sig) { | ||
1141 | spin_lock_irq(&mdev->al_lock); | ||
1142 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) { | ||
1143 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1144 | mdev->resync_locked--; | ||
1145 | wake_up(&mdev->al_wait); | ||
1146 | } | ||
1147 | spin_unlock_irq(&mdev->al_lock); | ||
1148 | return 0; | ||
1149 | } | ||
1150 | } | ||
1151 | |||
1152 | set_bit(BME_LOCKED, &bm_ext->flags); | ||
1153 | |||
1154 | return 1; | ||
1155 | } | ||
1156 | |||
1157 | /** | ||
1158 | * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep | ||
1159 | * @mdev: DRBD device. | ||
1160 | * @sector: The sector number. | ||
1161 | * | ||
1162 | * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then | ||
1163 | * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN | ||
1164 | * if there is still application IO going on in this area. | ||
1165 | */ | ||
1166 | int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
1167 | { | ||
1168 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1169 | const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; | ||
1170 | struct lc_element *e; | ||
1171 | struct bm_extent *bm_ext; | ||
1172 | int i; | ||
1173 | |||
1174 | spin_lock_irq(&mdev->al_lock); | ||
1175 | if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { | ||
1176 | /* in case you have very heavy scattered io, it may | ||
1177 | * stall the syncer for an undefined time if we give up the ref count | ||
1178 | * when we try again and requeue. | ||
1179 | * | ||
1180 | * if we don't give up the refcount, but the next time | ||
1181 | * we are scheduled this extent has been "synced" by new | ||
1182 | * application writes, we'd miss the lc_put on the | ||
1183 | * extent we keep the refcount on. | ||
1184 | * so we remembered which extent we had to try again, and | ||
1185 | * if the next requested one is something else, we do | ||
1186 | * the lc_put here... | ||
1187 | * we also have to wake_up | ||
1188 | */ | ||
1189 | e = lc_find(mdev->resync, mdev->resync_wenr); | ||
1190 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1191 | if (bm_ext) { | ||
1192 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1193 | D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1194 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1195 | mdev->resync_wenr = LC_FREE; | ||
1196 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) | ||
1197 | mdev->resync_locked--; | ||
1198 | wake_up(&mdev->al_wait); | ||
1199 | } else { | ||
1200 | dev_alert(DEV, "LOGIC BUG\n"); | ||
1201 | } | ||
1202 | } | ||
1203 | /* TRY. */ | ||
1204 | e = lc_try_get(mdev->resync, enr); | ||
1205 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1206 | if (bm_ext) { | ||
1207 | if (test_bit(BME_LOCKED, &bm_ext->flags)) | ||
1208 | goto proceed; | ||
1209 | if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
1210 | mdev->resync_locked++; | ||
1211 | } else { | ||
1212 | /* we did set the BME_NO_WRITES, | ||
1213 | * but then could not set BME_LOCKED, | ||
1214 | * so we tried again. | ||
1215 | * drop the extra reference. */ | ||
1216 | bm_ext->lce.refcnt--; | ||
1217 | D_ASSERT(bm_ext->lce.refcnt > 0); | ||
1218 | } | ||
1219 | goto check_al; | ||
1220 | } else { | ||
1221 | /* do we rather want to try later? */ | ||
1222 | if (mdev->resync_locked > mdev->resync->nr_elements-3) | ||
1223 | goto try_again; | ||
1224 | /* Do or do not. There is no try. -- Yoda */ | ||
1225 | e = lc_get(mdev->resync, enr); | ||
1226 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1227 | if (!bm_ext) { | ||
1228 | const unsigned long rs_flags = mdev->resync->flags; | ||
1229 | if (rs_flags & LC_STARVING) | ||
1230 | dev_warn(DEV, "Have to wait for element" | ||
1231 | " (resync LRU too small?)\n"); | ||
1232 | BUG_ON(rs_flags & LC_DIRTY); | ||
1233 | goto try_again; | ||
1234 | } | ||
1235 | if (bm_ext->lce.lc_number != enr) { | ||
1236 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | ||
1237 | bm_ext->rs_failed = 0; | ||
1238 | lc_changed(mdev->resync, &bm_ext->lce); | ||
1239 | wake_up(&mdev->al_wait); | ||
1240 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); | ||
1241 | } | ||
1242 | set_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1243 | D_ASSERT(bm_ext->lce.refcnt == 1); | ||
1244 | mdev->resync_locked++; | ||
1245 | goto check_al; | ||
1246 | } | ||
1247 | check_al: | ||
1248 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | ||
1249 | if (unlikely(al_enr+i == mdev->act_log->new_number)) | ||
1250 | goto try_again; | ||
1251 | if (lc_is_used(mdev->act_log, al_enr+i)) | ||
1252 | goto try_again; | ||
1253 | } | ||
1254 | set_bit(BME_LOCKED, &bm_ext->flags); | ||
1255 | proceed: | ||
1256 | mdev->resync_wenr = LC_FREE; | ||
1257 | spin_unlock_irq(&mdev->al_lock); | ||
1258 | return 0; | ||
1259 | |||
1260 | try_again: | ||
1261 | if (bm_ext) | ||
1262 | mdev->resync_wenr = enr; | ||
1263 | spin_unlock_irq(&mdev->al_lock); | ||
1264 | return -EAGAIN; | ||
1265 | } | ||
1266 | |||
1267 | void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) | ||
1268 | { | ||
1269 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1270 | struct lc_element *e; | ||
1271 | struct bm_extent *bm_ext; | ||
1272 | unsigned long flags; | ||
1273 | |||
1274 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
1275 | e = lc_find(mdev->resync, enr); | ||
1276 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1277 | if (!bm_ext) { | ||
1278 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1279 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1280 | dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); | ||
1281 | return; | ||
1282 | } | ||
1283 | |||
1284 | if (bm_ext->lce.refcnt == 0) { | ||
1285 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1286 | dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " | ||
1287 | "but refcnt is 0!?\n", | ||
1288 | (unsigned long long)sector, enr); | ||
1289 | return; | ||
1290 | } | ||
1291 | |||
1292 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) { | ||
1293 | clear_bit(BME_LOCKED, &bm_ext->flags); | ||
1294 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1295 | mdev->resync_locked--; | ||
1296 | wake_up(&mdev->al_wait); | ||
1297 | } | ||
1298 | |||
1299 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1300 | } | ||
1301 | |||
1302 | /** | ||
1303 | * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) | ||
1304 | * @mdev: DRBD device. | ||
1305 | */ | ||
1306 | void drbd_rs_cancel_all(struct drbd_conf *mdev) | ||
1307 | { | ||
1308 | spin_lock_irq(&mdev->al_lock); | ||
1309 | |||
1310 | if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ | ||
1311 | lc_reset(mdev->resync); | ||
1312 | put_ldev(mdev); | ||
1313 | } | ||
1314 | mdev->resync_locked = 0; | ||
1315 | mdev->resync_wenr = LC_FREE; | ||
1316 | spin_unlock_irq(&mdev->al_lock); | ||
1317 | wake_up(&mdev->al_wait); | ||
1318 | } | ||
1319 | |||
1320 | /** | ||
1321 | * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU | ||
1322 | * @mdev: DRBD device. | ||
1323 | * | ||
1324 | * Returns 0 upon success, -EAGAIN if at least one reference count was | ||
1325 | * not zero. | ||
1326 | */ | ||
1327 | int drbd_rs_del_all(struct drbd_conf *mdev) | ||
1328 | { | ||
1329 | struct lc_element *e; | ||
1330 | struct bm_extent *bm_ext; | ||
1331 | int i; | ||
1332 | |||
1333 | spin_lock_irq(&mdev->al_lock); | ||
1334 | |||
1335 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
1336 | /* ok, ->resync is there. */ | ||
1337 | for (i = 0; i < mdev->resync->nr_elements; i++) { | ||
1338 | e = lc_element_by_index(mdev->resync, i); | ||
1339 | bm_ext = lc_entry(e, struct bm_extent, lce); | ||
1340 | if (bm_ext->lce.lc_number == LC_FREE) | ||
1341 | continue; | ||
1342 | if (bm_ext->lce.lc_number == mdev->resync_wenr) { | ||
1343 | dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" | ||
1344 | " got 'synced' by application io\n", | ||
1345 | mdev->resync_wenr); | ||
1346 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1347 | D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1348 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1349 | mdev->resync_wenr = LC_FREE; | ||
1350 | lc_put(mdev->resync, &bm_ext->lce); | ||
1351 | } | ||
1352 | if (bm_ext->lce.refcnt != 0) { | ||
1353 | dev_info(DEV, "Retrying drbd_rs_del_all() later. " | ||
1354 | "refcnt=%d\n", bm_ext->lce.refcnt); | ||
1355 | put_ldev(mdev); | ||
1356 | spin_unlock_irq(&mdev->al_lock); | ||
1357 | return -EAGAIN; | ||
1358 | } | ||
1359 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1360 | D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1361 | lc_del(mdev->resync, &bm_ext->lce); | ||
1362 | } | ||
1363 | D_ASSERT(mdev->resync->used == 0); | ||
1364 | put_ldev(mdev); | ||
1365 | } | ||
1366 | spin_unlock_irq(&mdev->al_lock); | ||
1367 | |||
1368 | return 0; | ||
1369 | } | ||
1370 | |||
1371 | /** | ||
1372 | * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks | ||
1373 | * @mdev: DRBD device. | ||
1374 | * @sector: The sector number. | ||
1375 | * @size: Size of failed IO operation, in bytes. | ||
1376 | */ | ||
1377 | void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | ||
1378 | { | ||
1379 | /* Is called from worker and receiver context _only_ */ | ||
1380 | unsigned long sbnr, ebnr, lbnr; | ||
1381 | unsigned long count; | ||
1382 | sector_t esector, nr_sectors; | ||
1383 | int wake_up = 0; | ||
1384 | |||
1385 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1386 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | ||
1387 | (unsigned long long)sector, size); | ||
1388 | return; | ||
1389 | } | ||
1390 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
1391 | esector = sector + (size >> 9) - 1; | ||
1392 | |||
1393 | ERR_IF(sector >= nr_sectors) return; | ||
1394 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | ||
1395 | |||
1396 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1397 | |||
1398 | /* | ||
1399 | * round up start sector, round down end sector. we make sure we only | ||
1400 | * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
1401 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
1402 | return; | ||
1403 | if (unlikely(esector == (nr_sectors-1))) | ||
1404 | ebnr = lbnr; | ||
1405 | else | ||
1406 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
1407 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
1408 | |||
1409 | if (sbnr > ebnr) | ||
1410 | return; | ||
1411 | |||
1412 | /* | ||
1413 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1414 | * we count rs_{total,left} in bits, not sectors. | ||
1415 | */ | ||
1416 | spin_lock_irq(&mdev->al_lock); | ||
1417 | count = drbd_bm_count_bits(mdev, sbnr, ebnr); | ||
1418 | if (count) { | ||
1419 | mdev->rs_failed += count; | ||
1420 | |||
1421 | if (get_ldev(mdev)) { | ||
1422 | drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); | ||
1423 | put_ldev(mdev); | ||
1424 | } | ||
1425 | |||
1426 | /* just wake_up unconditionally now; various lc_changed(), | ||
1427 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
1428 | wake_up = 1; | ||
1429 | } | ||
1430 | spin_unlock_irq(&mdev->al_lock); | ||
1431 | if (wake_up) | ||
1432 | wake_up(&mdev->al_wait); | ||
1433 | } | ||
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 000000000000..3390716898d5 --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -0,0 +1,1328 @@ | |||
1 | /* | ||
2 | drbd_bitmap.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #include <linux/bitops.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/drbd.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <asm/kmap_types.h> | ||
31 | #include "drbd_int.h" | ||
32 | |||
33 | /* OPAQUE outside this file! | ||
34 | * interface defined in drbd_int.h | ||
35 | |||
36 | * convention: | ||
37 | * function name drbd_bm_... => used elsewhere, "public". | ||
38 | * function name bm_... => internal to implementation, "private". | ||
39 | |||
40 | * Note that since find_first_bit returns int, at the current granularity of | ||
41 | * the bitmap (4KB per bit), this implementation "only" supports up to | ||
42 | * 1<<(32+12) == 16 TB... | ||
43 | */ | ||
44 | |||
45 | /* | ||
46 | * NOTE | ||
47 | * Access to the *bm_pages is protected by bm_lock. | ||
48 | * It is safe to read the other members within the lock. | ||
49 | * | ||
50 | * drbd_bm_set_bits is called from bio_endio callbacks. | ||
51 | * We may be called with irq already disabled, | ||
52 | * so we need spin_lock_irqsave(). | ||
53 | * And we need the kmap_atomic. | ||
54 | */ | ||
55 | struct drbd_bitmap { | ||
56 | struct page **bm_pages; | ||
57 | spinlock_t bm_lock; | ||
58 | /* WARNING unsigned long bm_*: | ||
59 | * a 32bit bit offset is just enough for a 512 MB bitmap. | ||
60 | * it will blow up if we make the bitmap bigger... | ||
61 | * not that it makes much sense to have a bitmap that large, | ||
62 | * rather change the granularity to 16k or 64k or something. | ||
63 | * (that implies other problems, however...) | ||
64 | */ | ||
65 | unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ | ||
66 | unsigned long bm_bits; | ||
67 | size_t bm_words; | ||
68 | size_t bm_number_of_pages; | ||
69 | sector_t bm_dev_capacity; | ||
70 | struct mutex bm_change; /* serializes resize operations */ | ||
71 | |||
72 | atomic_t bm_async_io; | ||
73 | wait_queue_head_t bm_io_wait; | ||
74 | |||
75 | unsigned long bm_flags; | ||
76 | |||
77 | /* debugging aid, in case we are still racy somewhere */ | ||
78 | char *bm_why; | ||
79 | struct task_struct *bm_task; | ||
80 | }; | ||
81 | |||
82 | /* definition of bits in bm_flags */ | ||
83 | #define BM_LOCKED 0 | ||
84 | #define BM_MD_IO_ERROR 1 | ||
85 | #define BM_P_VMALLOCED 2 | ||
86 | |||
87 | static int bm_is_locked(struct drbd_bitmap *b) | ||
88 | { | ||
89 | return test_bit(BM_LOCKED, &b->bm_flags); | ||
90 | } | ||
91 | |||
92 | #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) | ||
93 | static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) | ||
94 | { | ||
95 | struct drbd_bitmap *b = mdev->bitmap; | ||
96 | if (!__ratelimit(&drbd_ratelimit_state)) | ||
97 | return; | ||
98 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", | ||
99 | current == mdev->receiver.task ? "receiver" : | ||
100 | current == mdev->asender.task ? "asender" : | ||
101 | current == mdev->worker.task ? "worker" : current->comm, | ||
102 | func, b->bm_why ?: "?", | ||
103 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
104 | b->bm_task == mdev->asender.task ? "asender" : | ||
105 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
106 | } | ||
107 | |||
108 | void drbd_bm_lock(struct drbd_conf *mdev, char *why) | ||
109 | { | ||
110 | struct drbd_bitmap *b = mdev->bitmap; | ||
111 | int trylock_failed; | ||
112 | |||
113 | if (!b) { | ||
114 | dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n"); | ||
115 | return; | ||
116 | } | ||
117 | |||
118 | trylock_failed = !mutex_trylock(&b->bm_change); | ||
119 | |||
120 | if (trylock_failed) { | ||
121 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", | ||
122 | current == mdev->receiver.task ? "receiver" : | ||
123 | current == mdev->asender.task ? "asender" : | ||
124 | current == mdev->worker.task ? "worker" : current->comm, | ||
125 | why, b->bm_why ?: "?", | ||
126 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
127 | b->bm_task == mdev->asender.task ? "asender" : | ||
128 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
129 | mutex_lock(&b->bm_change); | ||
130 | } | ||
131 | if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) | ||
132 | dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); | ||
133 | |||
134 | b->bm_why = why; | ||
135 | b->bm_task = current; | ||
136 | } | ||
137 | |||
138 | void drbd_bm_unlock(struct drbd_conf *mdev) | ||
139 | { | ||
140 | struct drbd_bitmap *b = mdev->bitmap; | ||
141 | if (!b) { | ||
142 | dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n"); | ||
143 | return; | ||
144 | } | ||
145 | |||
146 | if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) | ||
147 | dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); | ||
148 | |||
149 | b->bm_why = NULL; | ||
150 | b->bm_task = NULL; | ||
151 | mutex_unlock(&b->bm_change); | ||
152 | } | ||
153 | |||
154 | /* word offset to long pointer */ | ||
155 | static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) | ||
156 | { | ||
157 | struct page *page; | ||
158 | unsigned long page_nr; | ||
159 | |||
160 | /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ | ||
161 | page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); | ||
162 | BUG_ON(page_nr >= b->bm_number_of_pages); | ||
163 | page = b->bm_pages[page_nr]; | ||
164 | |||
165 | return (unsigned long *) kmap_atomic(page, km); | ||
166 | } | ||
167 | |||
168 | static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) | ||
169 | { | ||
170 | return __bm_map_paddr(b, offset, KM_IRQ1); | ||
171 | } | ||
172 | |||
173 | static void __bm_unmap(unsigned long *p_addr, const enum km_type km) | ||
174 | { | ||
175 | kunmap_atomic(p_addr, km); | ||
176 | } | ||
177 | |||
178 | static void bm_unmap(unsigned long *p_addr) | ||
179 | { | ||
180 | return __bm_unmap(p_addr, KM_IRQ1); | ||
181 | } | ||
182 | |||
183 | /* long word offset of _bitmap_ sector */ | ||
184 | #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
185 | /* word offset from start of bitmap to word number _in_page_ | ||
186 | * modulo longs per page | ||
187 | #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))) | ||
188 | hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) | ||
189 | so do it explicitly: | ||
190 | */ | ||
191 | #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) | ||
192 | |||
193 | /* Long words per page */ | ||
194 | #define LWPP (PAGE_SIZE/sizeof(long)) | ||
195 | |||
196 | /* | ||
197 | * actually most functions herein should take a struct drbd_bitmap*, not a | ||
198 | * struct drbd_conf*, but for the debug macros I like to have the mdev around | ||
199 | * to be able to report device-specific messages. | ||
200 | */ | ||
201 | |||
202 | static void bm_free_pages(struct page **pages, unsigned long number) | ||
203 | { | ||
204 | unsigned long i; | ||
205 | if (!pages) | ||
206 | return; | ||
207 | |||
208 | for (i = 0; i < number; i++) { | ||
209 | if (!pages[i]) { | ||
210 | printk(KERN_ALERT "drbd: bm_free_pages tried to free " | ||
211 | "a NULL pointer; i=%lu n=%lu\n", | ||
212 | i, number); | ||
213 | continue; | ||
214 | } | ||
215 | __free_page(pages[i]); | ||
216 | pages[i] = NULL; | ||
217 | } | ||
218 | } | ||
219 | |||
220 | static void bm_vk_free(void *ptr, int v) | ||
221 | { | ||
222 | if (v) | ||
223 | vfree(ptr); | ||
224 | else | ||
225 | kfree(ptr); | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * "have" and "want" are NUMBER OF PAGES. | ||
230 | */ | ||
231 | static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | ||
232 | { | ||
233 | struct page **old_pages = b->bm_pages; | ||
234 | struct page **new_pages, *page; | ||
235 | unsigned int i, bytes, vmalloced = 0; | ||
236 | unsigned long have = b->bm_number_of_pages; | ||
237 | |||
238 | BUG_ON(have == 0 && old_pages != NULL); | ||
239 | BUG_ON(have != 0 && old_pages == NULL); | ||
240 | |||
241 | if (have == want) | ||
242 | return old_pages; | ||
243 | |||
244 | /* Trying kmalloc first, falling back to vmalloc. | ||
245 | * GFP_KERNEL is ok, as this is done when a lower level disk is | ||
246 | * "attached" to the drbd. Context is receiver thread or cqueue | ||
247 | * thread. As we have no disk yet, we are not in the IO path, | ||
248 | * not even the IO path of the peer. */ | ||
249 | bytes = sizeof(struct page *)*want; | ||
250 | new_pages = kmalloc(bytes, GFP_KERNEL); | ||
251 | if (!new_pages) { | ||
252 | new_pages = vmalloc(bytes); | ||
253 | if (!new_pages) | ||
254 | return NULL; | ||
255 | vmalloced = 1; | ||
256 | } | ||
257 | |||
258 | memset(new_pages, 0, bytes); | ||
259 | if (want >= have) { | ||
260 | for (i = 0; i < have; i++) | ||
261 | new_pages[i] = old_pages[i]; | ||
262 | for (; i < want; i++) { | ||
263 | page = alloc_page(GFP_HIGHUSER); | ||
264 | if (!page) { | ||
265 | bm_free_pages(new_pages + have, i - have); | ||
266 | bm_vk_free(new_pages, vmalloced); | ||
267 | return NULL; | ||
268 | } | ||
269 | new_pages[i] = page; | ||
270 | } | ||
271 | } else { | ||
272 | for (i = 0; i < want; i++) | ||
273 | new_pages[i] = old_pages[i]; | ||
274 | /* NOT HERE, we are outside the spinlock! | ||
275 | bm_free_pages(old_pages + want, have - want); | ||
276 | */ | ||
277 | } | ||
278 | |||
279 | if (vmalloced) | ||
280 | set_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
281 | else | ||
282 | clear_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
283 | |||
284 | return new_pages; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * called on driver init only. TODO: call when a device is created. | ||
289 | * allocates the drbd_bitmap, and stores it in mdev->bitmap. | ||
290 | */ | ||
291 | int drbd_bm_init(struct drbd_conf *mdev) | ||
292 | { | ||
293 | struct drbd_bitmap *b = mdev->bitmap; | ||
294 | WARN_ON(b != NULL); | ||
295 | b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); | ||
296 | if (!b) | ||
297 | return -ENOMEM; | ||
298 | spin_lock_init(&b->bm_lock); | ||
299 | mutex_init(&b->bm_change); | ||
300 | init_waitqueue_head(&b->bm_io_wait); | ||
301 | |||
302 | mdev->bitmap = b; | ||
303 | |||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) | ||
308 | { | ||
309 | ERR_IF(!mdev->bitmap) return 0; | ||
310 | return mdev->bitmap->bm_dev_capacity; | ||
311 | } | ||
312 | |||
313 | /* called on driver unload. TODO: call when a device is destroyed. | ||
314 | */ | ||
315 | void drbd_bm_cleanup(struct drbd_conf *mdev) | ||
316 | { | ||
317 | ERR_IF (!mdev->bitmap) return; | ||
318 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); | ||
319 | bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); | ||
320 | kfree(mdev->bitmap); | ||
321 | mdev->bitmap = NULL; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * since (b->bm_bits % BITS_PER_LONG) != 0, | ||
326 | * this masks out the remaining bits. | ||
327 | * Returns the number of bits cleared. | ||
328 | */ | ||
329 | static int bm_clear_surplus(struct drbd_bitmap *b) | ||
330 | { | ||
331 | const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; | ||
332 | size_t w = b->bm_bits >> LN2_BPL; | ||
333 | int cleared = 0; | ||
334 | unsigned long *p_addr, *bm; | ||
335 | |||
336 | p_addr = bm_map_paddr(b, w); | ||
337 | bm = p_addr + MLPP(w); | ||
338 | if (w < b->bm_words) { | ||
339 | cleared = hweight_long(*bm & ~mask); | ||
340 | *bm &= mask; | ||
341 | w++; bm++; | ||
342 | } | ||
343 | |||
344 | if (w < b->bm_words) { | ||
345 | cleared += hweight_long(*bm); | ||
346 | *bm = 0; | ||
347 | } | ||
348 | bm_unmap(p_addr); | ||
349 | return cleared; | ||
350 | } | ||
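The masking done here can be tried out in isolation; a small user-space sketch (not part of the patch), using only standard C:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

int main(void)
{
	unsigned long bm_bits   = 70;   /* example: bitmap with 70 valid bits */
	unsigned long mask      = (1UL << (bm_bits & (BITS_PER_LONG - 1))) - 1;
	unsigned long last_word = bm_bits / BITS_PER_LONG;

	/* bm_clear_surplus(): word &= mask;   bm_set_surplus(): word |= ~mask */
	printf("last word index %lu, keep-mask 0x%lx\n", last_word, mask);
	return 0;
}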
351 | |||
352 | static void bm_set_surplus(struct drbd_bitmap *b) | ||
353 | { | ||
354 | const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; | ||
355 | size_t w = b->bm_bits >> LN2_BPL; | ||
356 | unsigned long *p_addr, *bm; | ||
357 | |||
358 | p_addr = bm_map_paddr(b, w); | ||
359 | bm = p_addr + MLPP(w); | ||
360 | if (w < b->bm_words) { | ||
361 | *bm |= ~mask; | ||
362 | bm++; w++; | ||
363 | } | ||
364 | |||
365 | if (w < b->bm_words) { | ||
366 | *bm = ~(0UL); | ||
367 | } | ||
368 | bm_unmap(p_addr); | ||
369 | } | ||
370 | |||
371 | static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) | ||
372 | { | ||
373 | unsigned long *p_addr, *bm, offset = 0; | ||
374 | unsigned long bits = 0; | ||
375 | unsigned long i, do_now; | ||
376 | |||
377 | while (offset < b->bm_words) { | ||
378 | i = do_now = min_t(size_t, b->bm_words-offset, LWPP); | ||
379 | p_addr = __bm_map_paddr(b, offset, KM_USER0); | ||
380 | bm = p_addr + MLPP(offset); | ||
381 | while (i--) { | ||
382 | #ifndef __LITTLE_ENDIAN | ||
383 | if (swap_endian) | ||
384 | *bm = lel_to_cpu(*bm); | ||
385 | #endif | ||
386 | bits += hweight_long(*bm++); | ||
387 | } | ||
388 | __bm_unmap(p_addr, KM_USER0); | ||
389 | offset += do_now; | ||
390 | cond_resched(); | ||
391 | } | ||
392 | |||
393 | return bits; | ||
394 | } | ||
395 | |||
396 | static unsigned long bm_count_bits(struct drbd_bitmap *b) | ||
397 | { | ||
398 | return __bm_count_bits(b, 0); | ||
399 | } | ||
400 | |||
401 | static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b) | ||
402 | { | ||
403 | return __bm_count_bits(b, 1); | ||
404 | } | ||
405 | |||
406 | /* offset and len in long words.*/ | ||
407 | static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) | ||
408 | { | ||
409 | unsigned long *p_addr, *bm; | ||
410 | size_t do_now, end; | ||
411 | |||
412 | #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) | ||
413 | |||
414 | end = offset + len; | ||
415 | |||
416 | if (end > b->bm_words) { | ||
417 | printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); | ||
418 | return; | ||
419 | } | ||
420 | |||
421 | while (offset < end) { | ||
422 | do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; | ||
423 | p_addr = bm_map_paddr(b, offset); | ||
424 | bm = p_addr + MLPP(offset); | ||
425 | if (bm+do_now > p_addr + LWPP) { | ||
426 | printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", | ||
427 | p_addr, bm, (int)do_now); | ||
428 | break; /* bail out; the rest of this memset is skipped */ | ||
429 | } | ||
430 | memset(bm, c, do_now * sizeof(long)); | ||
431 | bm_unmap(p_addr); | ||
432 | offset += do_now; | ||
433 | } | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * make sure the bitmap has enough room for the attached storage, | ||
438 | * if necessary, resize. | ||
439 | * called whenever we may have changed the device size. | ||
440 | * returns -ENOMEM if we could not allocate enough memory, 0 on success. | ||
441 | * In case this is actually a resize, we copy the old bitmap into the new one. | ||
442 | * Otherwise, the bitmap is initialized to all bits set. | ||
443 | */ | ||
444 | int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) | ||
445 | { | ||
446 | struct drbd_bitmap *b = mdev->bitmap; | ||
447 | unsigned long bits, words, owords, obits, *p_addr, *bm; | ||
448 | unsigned long want, have, onpages; /* number of pages */ | ||
449 | struct page **npages, **opages = NULL; | ||
450 | int err = 0, growing; | ||
451 | int opages_vmalloced; | ||
452 | |||
453 | ERR_IF(!b) return -ENOMEM; | ||
454 | |||
455 | drbd_bm_lock(mdev, "resize"); | ||
456 | |||
457 | dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", | ||
458 | (unsigned long long)capacity); | ||
459 | |||
460 | if (capacity == b->bm_dev_capacity) | ||
461 | goto out; | ||
462 | |||
463 | opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
464 | |||
465 | if (capacity == 0) { | ||
466 | spin_lock_irq(&b->bm_lock); | ||
467 | opages = b->bm_pages; | ||
468 | onpages = b->bm_number_of_pages; | ||
469 | owords = b->bm_words; | ||
470 | b->bm_pages = NULL; | ||
471 | b->bm_number_of_pages = | ||
472 | b->bm_set = | ||
473 | b->bm_bits = | ||
474 | b->bm_words = | ||
475 | b->bm_dev_capacity = 0; | ||
476 | spin_unlock_irq(&b->bm_lock); | ||
477 | bm_free_pages(opages, onpages); | ||
478 | bm_vk_free(opages, opages_vmalloced); | ||
479 | goto out; | ||
480 | } | ||
481 | bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); | ||
482 | |||
483 | /* if we would use | ||
484 | words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; | ||
485 | a 32bit host could present the wrong number of words | ||
486 | to a 64bit host. | ||
487 | */ | ||
488 | words = ALIGN(bits, 64) >> LN2_BPL; | ||
489 | |||
490 | if (get_ldev(mdev)) { | ||
491 | D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); | ||
492 | put_ldev(mdev); | ||
493 | } | ||
494 | |||
495 | /* one extra long to catch off by one errors */ | ||
496 | want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; | ||
497 | have = b->bm_number_of_pages; | ||
498 | if (want == have) { | ||
499 | D_ASSERT(b->bm_pages != NULL); | ||
500 | npages = b->bm_pages; | ||
501 | } else { | ||
502 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) | ||
503 | npages = NULL; | ||
504 | else | ||
505 | npages = bm_realloc_pages(b, want); | ||
506 | } | ||
507 | |||
508 | if (!npages) { | ||
509 | err = -ENOMEM; | ||
510 | goto out; | ||
511 | } | ||
512 | |||
513 | spin_lock_irq(&b->bm_lock); | ||
514 | opages = b->bm_pages; | ||
515 | owords = b->bm_words; | ||
516 | obits = b->bm_bits; | ||
517 | |||
518 | growing = bits > obits; | ||
519 | if (opages) | ||
520 | bm_set_surplus(b); | ||
521 | |||
522 | b->bm_pages = npages; | ||
523 | b->bm_number_of_pages = want; | ||
524 | b->bm_bits = bits; | ||
525 | b->bm_words = words; | ||
526 | b->bm_dev_capacity = capacity; | ||
527 | |||
528 | if (growing) { | ||
529 | bm_memset(b, owords, 0xff, words-owords); | ||
530 | b->bm_set += bits - obits; | ||
531 | } | ||
532 | |||
533 | if (want < have) { | ||
534 | /* implicit: (opages != NULL) && (opages != npages) */ | ||
535 | bm_free_pages(opages + want, have - want); | ||
536 | } | ||
537 | |||
538 | p_addr = bm_map_paddr(b, words); | ||
539 | bm = p_addr + MLPP(words); | ||
540 | *bm = DRBD_MAGIC; | ||
541 | bm_unmap(p_addr); | ||
542 | |||
543 | (void)bm_clear_surplus(b); | ||
544 | |||
545 | spin_unlock_irq(&b->bm_lock); | ||
546 | if (opages != npages) | ||
547 | bm_vk_free(opages, opages_vmalloced); | ||
548 | if (!growing) | ||
549 | b->bm_set = bm_count_bits(b); | ||
550 | dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); | ||
551 | |||
552 | out: | ||
553 | drbd_bm_unlock(mdev); | ||
554 | return err; | ||
555 | } | ||
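To get a feel for the sizing arithmetic above, here is a hypothetical user-space sketch (not part of the patch); it assumes DRBD's 4 KiB-per-bit granularity (so 8 device sectors per bit) and 4 KiB pages, neither of which is defined in this hunk, and mirrors the bits/words/pages computation for a 64-bit host:

#include <stdio.h>

#define SECT_PER_BIT	8ULL			/* 4096-byte bit / 512-byte sector */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) / (a) * (a))

int main(void)
{
	unsigned long long capacity = 2097152;	/* sectors: a 1 GiB device */
	unsigned long long bits  = ALIGN_UP(capacity, SECT_PER_BIT) / SECT_PER_BIT;
	unsigned long long words = ALIGN_UP(bits, 64) / 64;	/* 64-bit long words */
	unsigned long long pages = ALIGN_UP((words + 1) * 8, 4096) / 4096;	/* +1 guard long */

	printf("capacity=%llu sectors -> bits=%llu words=%llu pages=%llu\n",
	       capacity, bits, words, pages);
	return 0;
}

The extra long word accounted for in "pages" corresponds to the "one extra long to catch off by one errors" guard in the function above.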
556 | |||
557 | /* inherently racy: | ||
558 | * if not protected by other means, return value may be out of date when | ||
559 | * leaving this function... | ||
560 | * we still need to lock it, since it is important that this returns | ||
561 | * bm_set == 0 precisely. | ||
562 | * | ||
563 | * maybe bm_set should be atomic_t ? | ||
564 | */ | ||
565 | static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) | ||
566 | { | ||
567 | struct drbd_bitmap *b = mdev->bitmap; | ||
568 | unsigned long s; | ||
569 | unsigned long flags; | ||
570 | |||
571 | ERR_IF(!b) return 0; | ||
572 | ERR_IF(!b->bm_pages) return 0; | ||
573 | |||
574 | spin_lock_irqsave(&b->bm_lock, flags); | ||
575 | s = b->bm_set; | ||
576 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
577 | |||
578 | return s; | ||
579 | } | ||
580 | |||
581 | unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) | ||
582 | { | ||
583 | unsigned long s; | ||
584 | /* if I don't have a disk, I don't know about out-of-sync status */ | ||
585 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | ||
586 | return 0; | ||
587 | s = _drbd_bm_total_weight(mdev); | ||
588 | put_ldev(mdev); | ||
589 | return s; | ||
590 | } | ||
591 | |||
592 | size_t drbd_bm_words(struct drbd_conf *mdev) | ||
593 | { | ||
594 | struct drbd_bitmap *b = mdev->bitmap; | ||
595 | ERR_IF(!b) return 0; | ||
596 | ERR_IF(!b->bm_pages) return 0; | ||
597 | |||
598 | return b->bm_words; | ||
599 | } | ||
600 | |||
601 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) | ||
602 | { | ||
603 | struct drbd_bitmap *b = mdev->bitmap; | ||
604 | ERR_IF(!b) return 0; | ||
605 | |||
606 | return b->bm_bits; | ||
607 | } | ||
608 | |||
609 | /* merge 'number' words from 'buffer' into the bitmap, starting at 'offset'. | ||
610 | * buffer[i] is expected to be little endian unsigned long. | ||
611 | * bitmap must be locked by drbd_bm_lock. | ||
612 | * currently only used from receive_bitmap. | ||
613 | */ | ||
614 | void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, | ||
615 | unsigned long *buffer) | ||
616 | { | ||
617 | struct drbd_bitmap *b = mdev->bitmap; | ||
618 | unsigned long *p_addr, *bm; | ||
619 | unsigned long word, bits; | ||
620 | size_t end, do_now; | ||
621 | |||
622 | end = offset + number; | ||
623 | |||
624 | ERR_IF(!b) return; | ||
625 | ERR_IF(!b->bm_pages) return; | ||
626 | if (number == 0) | ||
627 | return; | ||
628 | WARN_ON(offset >= b->bm_words); | ||
629 | WARN_ON(end > b->bm_words); | ||
630 | |||
631 | spin_lock_irq(&b->bm_lock); | ||
632 | while (offset < end) { | ||
633 | do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; | ||
634 | p_addr = bm_map_paddr(b, offset); | ||
635 | bm = p_addr + MLPP(offset); | ||
636 | offset += do_now; | ||
637 | while (do_now--) { | ||
638 | bits = hweight_long(*bm); | ||
639 | word = *bm | lel_to_cpu(*buffer++); | ||
640 | *bm++ = word; | ||
641 | b->bm_set += hweight_long(word) - bits; | ||
642 | } | ||
643 | bm_unmap(p_addr); | ||
644 | } | ||
645 | /* with 32bit <-> 64bit cross-platform connect | ||
646 | * this is only correct for current usage, | ||
647 | * where we _know_ that we are 64 bit aligned, | ||
648 | * and know that this function is used in this way, too... | ||
649 | */ | ||
650 | if (end == b->bm_words) | ||
651 | b->bm_set -= bm_clear_surplus(b); | ||
652 | |||
653 | spin_unlock_irq(&b->bm_lock); | ||
654 | } | ||
655 | |||
656 | /* copy 'number' words from the bitmap, starting at 'offset', into 'buffer'. | ||
657 | * buffer[i] will be little endian unsigned long. | ||
658 | */ | ||
659 | void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | ||
660 | unsigned long *buffer) | ||
661 | { | ||
662 | struct drbd_bitmap *b = mdev->bitmap; | ||
663 | unsigned long *p_addr, *bm; | ||
664 | size_t end, do_now; | ||
665 | |||
666 | end = offset + number; | ||
667 | |||
668 | ERR_IF(!b) return; | ||
669 | ERR_IF(!b->bm_pages) return; | ||
670 | |||
671 | spin_lock_irq(&b->bm_lock); | ||
672 | if ((offset >= b->bm_words) || | ||
673 | (end > b->bm_words) || | ||
674 | (number <= 0)) | ||
675 | dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n", | ||
676 | (unsigned long) offset, | ||
677 | (unsigned long) number, | ||
678 | (unsigned long) b->bm_words); | ||
679 | else { | ||
680 | while (offset < end) { | ||
681 | do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; | ||
682 | p_addr = bm_map_paddr(b, offset); | ||
683 | bm = p_addr + MLPP(offset); | ||
684 | offset += do_now; | ||
685 | while (do_now--) | ||
686 | *buffer++ = cpu_to_lel(*bm++); | ||
687 | bm_unmap(p_addr); | ||
688 | } | ||
689 | } | ||
690 | spin_unlock_irq(&b->bm_lock); | ||
691 | } | ||
692 | |||
693 | /* set all bits in the bitmap */ | ||
694 | void drbd_bm_set_all(struct drbd_conf *mdev) | ||
695 | { | ||
696 | struct drbd_bitmap *b = mdev->bitmap; | ||
697 | ERR_IF(!b) return; | ||
698 | ERR_IF(!b->bm_pages) return; | ||
699 | |||
700 | spin_lock_irq(&b->bm_lock); | ||
701 | bm_memset(b, 0, 0xff, b->bm_words); | ||
702 | (void)bm_clear_surplus(b); | ||
703 | b->bm_set = b->bm_bits; | ||
704 | spin_unlock_irq(&b->bm_lock); | ||
705 | } | ||
706 | |||
707 | /* clear all bits in the bitmap */ | ||
708 | void drbd_bm_clear_all(struct drbd_conf *mdev) | ||
709 | { | ||
710 | struct drbd_bitmap *b = mdev->bitmap; | ||
711 | ERR_IF(!b) return; | ||
712 | ERR_IF(!b->bm_pages) return; | ||
713 | |||
714 | spin_lock_irq(&b->bm_lock); | ||
715 | bm_memset(b, 0, 0, b->bm_words); | ||
716 | b->bm_set = 0; | ||
717 | spin_unlock_irq(&b->bm_lock); | ||
718 | } | ||
719 | |||
720 | static void bm_async_io_complete(struct bio *bio, int error) | ||
721 | { | ||
722 | struct drbd_bitmap *b = bio->bi_private; | ||
723 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
724 | |||
725 | |||
726 | /* strange behavior of some lower level drivers... | ||
727 | * fail the request by clearing the uptodate flag, | ||
728 | * but do not return any error?! | ||
729 | * do we want to WARN() on this? */ | ||
730 | if (!error && !uptodate) | ||
731 | error = -EIO; | ||
732 | |||
733 | if (error) { | ||
734 | /* doh. what now? | ||
735 | * for now, set all bits, and flag MD_IO_ERROR */ | ||
736 | __set_bit(BM_MD_IO_ERROR, &b->bm_flags); | ||
737 | } | ||
738 | if (atomic_dec_and_test(&b->bm_async_io)) | ||
739 | wake_up(&b->bm_io_wait); | ||
740 | |||
741 | bio_put(bio); | ||
742 | } | ||
743 | |||
744 | static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) | ||
745 | { | ||
746 | /* we are process context. we always get a bio */ | ||
747 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); | ||
748 | unsigned int len; | ||
749 | sector_t on_disk_sector = | ||
750 | mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; | ||
751 | on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); | ||
752 | |||
753 | /* this might happen with a very small | ||
754 | * flexible external meta data device */ | ||
755 | len = min_t(unsigned int, PAGE_SIZE, | ||
756 | (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); | ||
757 | |||
758 | bio->bi_bdev = mdev->ldev->md_bdev; | ||
759 | bio->bi_sector = on_disk_sector; | ||
760 | bio_add_page(bio, b->bm_pages[page_nr], len, 0); | ||
761 | bio->bi_private = b; | ||
762 | bio->bi_end_io = bm_async_io_complete; | ||
763 | |||
764 | if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { | ||
765 | bio->bi_rw |= rw; | ||
766 | bio_endio(bio, -EIO); | ||
767 | } else { | ||
768 | submit_bio(rw, bio); | ||
769 | } | ||
770 | } | ||
771 | |||
772 | # if defined(__LITTLE_ENDIAN) | ||
773 | /* nothing to do, on disk == in memory */ | ||
774 | # define bm_cpu_to_lel(x) ((void)0) | ||
775 | # else | ||
776 | void bm_cpu_to_lel(struct drbd_bitmap *b) | ||
777 | { | ||
778 | /* need to cpu_to_lel all the pages ... | ||
779 | * this may be optimized by using | ||
780 | * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; | ||
781 | * the following is still not optimal, but better than nothing */ | ||
782 | unsigned int i; | ||
783 | unsigned long *p_addr, *bm; | ||
784 | if (b->bm_set == 0) { | ||
785 | /* no bits set at all: nothing to swap, every word is 0 */ | ||
786 | i = b->bm_number_of_pages; | ||
787 | } else if (b->bm_set == b->bm_bits) { | ||
788 | /* only the last page */ | ||
789 | i = b->bm_number_of_pages - 1; | ||
790 | } else { | ||
791 | /* all pages */ | ||
792 | i = 0; | ||
793 | } | ||
794 | for (; i < b->bm_number_of_pages; i++) { | ||
795 | p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); | ||
796 | for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) | ||
797 | *bm = cpu_to_lel(*bm); | ||
798 | kunmap_atomic(p_addr, KM_USER0); | ||
799 | } | ||
800 | } | ||
801 | # endif | ||
802 | /* lel_to_cpu == cpu_to_lel */ | ||
803 | # define bm_lel_to_cpu(x) bm_cpu_to_lel(x) | ||
804 | |||
805 | /* | ||
806 | * bm_rw: read/write the whole bitmap from/to its on disk location. | ||
807 | */ | ||
808 | static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) | ||
809 | { | ||
810 | struct drbd_bitmap *b = mdev->bitmap; | ||
811 | /* sector_t sector; */ | ||
812 | int bm_words, num_pages, i; | ||
813 | unsigned long now; | ||
814 | char ppb[10]; | ||
815 | int err = 0; | ||
816 | |||
817 | WARN_ON(!bm_is_locked(b)); | ||
818 | |||
819 | /* no spinlock here, the drbd_bm_lock should be enough! */ | ||
820 | |||
821 | bm_words = drbd_bm_words(mdev); | ||
822 | num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; | ||
823 | |||
824 | /* on disk bitmap is little endian */ | ||
825 | if (rw == WRITE) | ||
826 | bm_cpu_to_lel(b); | ||
827 | |||
828 | now = jiffies; | ||
829 | atomic_set(&b->bm_async_io, num_pages); | ||
830 | __clear_bit(BM_MD_IO_ERROR, &b->bm_flags); | ||
831 | |||
832 | /* let the layers below us try to merge these bios... */ | ||
833 | for (i = 0; i < num_pages; i++) | ||
834 | bm_page_io_async(mdev, b, i, rw); | ||
835 | |||
836 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); | ||
837 | wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); | ||
838 | |||
839 | if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { | ||
840 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); | ||
841 | drbd_chk_io_error(mdev, 1, TRUE); | ||
842 | err = -EIO; | ||
843 | } | ||
844 | |||
845 | now = jiffies; | ||
846 | if (rw == WRITE) { | ||
847 | /* swap back endianness */ | ||
848 | bm_lel_to_cpu(b); | ||
849 | /* flush bitmap to stable storage */ | ||
850 | drbd_md_flush(mdev); | ||
851 | } else /* rw == READ */ { | ||
852 | /* just read, if necessary adjust endianness */ | ||
853 | b->bm_set = bm_count_bits_swap_endian(b); | ||
854 | dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", | ||
855 | jiffies - now); | ||
856 | } | ||
857 | now = b->bm_set; | ||
858 | |||
859 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on-disk bitmap.\n", | ||
860 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | ||
861 | |||
862 | return err; | ||
863 | } | ||
864 | |||
865 | /** | ||
866 | * drbd_bm_read() - Read the whole bitmap from its on disk location. | ||
867 | * @mdev: DRBD device. | ||
868 | */ | ||
869 | int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) | ||
870 | { | ||
871 | return bm_rw(mdev, READ); | ||
872 | } | ||
873 | |||
874 | /** | ||
875 | * drbd_bm_write() - Write the whole bitmap to its on disk location. | ||
876 | * @mdev: DRBD device. | ||
877 | */ | ||
878 | int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) | ||
879 | { | ||
880 | return bm_rw(mdev, WRITE); | ||
881 | } | ||
882 | |||
883 | /** | ||
884 | * drbd_bm_write_sect() - Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap | ||
885 | * @mdev: DRBD device. | ||
886 | * @enr: Extent number in the resync lru (happens to be sector offset) | ||
887 | * | ||
888 | * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered | ||
889 | * by a single sector write. Therefore enr == sector offset from the | ||
890 | * start of the bitmap. | ||
891 | */ | ||
892 | int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) | ||
893 | { | ||
894 | sector_t on_disk_sector = enr + mdev->ldev->md.md_offset | ||
895 | + mdev->ldev->md.bm_offset; | ||
896 | int bm_words, num_words, offset; | ||
897 | int err = 0; | ||
898 | |||
899 | mutex_lock(&mdev->md_io_mutex); | ||
900 | bm_words = drbd_bm_words(mdev); | ||
901 | offset = S2W(enr); /* word offset into bitmap */ | ||
902 | num_words = min(S2W(1), bm_words - offset); | ||
903 | if (num_words < S2W(1)) | ||
904 | memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); | ||
905 | drbd_bm_get_lel(mdev, offset, num_words, | ||
906 | page_address(mdev->md_io_page)); | ||
907 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { | ||
908 | int i; | ||
909 | err = -EIO; | ||
910 | dev_err(DEV, "IO ERROR writing bitmap sector %lu " | ||
911 | "(meta-disk sector %llus)\n", | ||
912 | enr, (unsigned long long)on_disk_sector); | ||
913 | drbd_chk_io_error(mdev, 1, TRUE); | ||
914 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) | ||
915 | drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); | ||
916 | } | ||
917 | mdev->bm_writ_cnt++; | ||
918 | mutex_unlock(&mdev->md_io_mutex); | ||
919 | return err; | ||
920 | } | ||
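As a side note on why enr can double as the sector offset: one 512-byte bitmap sector holds 4096 bits, and at DRBD's 4 KiB-per-bit granularity (assumed here, it is not defined in this hunk) that covers 16 MiB of device data. A throwaway user-space sketch (not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long enr            = 3;                  /* bitmap sector number    */
	unsigned long words_per_sect = 512 / sizeof(long); /* longs per bitmap sector */
	unsigned long offset         = enr * words_per_sect;   /* matches S2W(enr) on a 64-bit host */
	unsigned long long covered   = 512ULL * 8 * 4096;  /* sector bits * 4 KiB per bit */

	printf("enr=%lu -> word offset %lu; one sector covers %llu MiB\n",
	       enr, offset, covered >> 20);
	return 0;
}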
921 | |||
922 | /* NOTE | ||
923 | * find_first_bit returns int, we return unsigned long. | ||
924 | * should not make much difference anyway, but ... | ||
925 | * | ||
926 | * this returns a bit number, NOT a sector! | ||
927 | */ | ||
928 | #define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) | ||
929 | static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, | ||
930 | const int find_zero_bit, const enum km_type km) | ||
931 | { | ||
932 | struct drbd_bitmap *b = mdev->bitmap; | ||
933 | unsigned long i = -1UL; | ||
934 | unsigned long *p_addr; | ||
935 | unsigned long bit_offset; /* bit offset of the mapped page. */ | ||
936 | |||
937 | if (bm_fo > b->bm_bits) { | ||
938 | dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); | ||
939 | } else { | ||
940 | while (bm_fo < b->bm_bits) { | ||
941 | unsigned long offset; | ||
942 | bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ | ||
943 | offset = bit_offset >> LN2_BPL; /* word offset of the page */ | ||
944 | p_addr = __bm_map_paddr(b, offset, km); | ||
945 | |||
946 | if (find_zero_bit) | ||
947 | i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); | ||
948 | else | ||
949 | i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); | ||
950 | |||
951 | __bm_unmap(p_addr, km); | ||
952 | if (i < PAGE_SIZE*8) { | ||
953 | i = bit_offset + i; | ||
954 | if (i >= b->bm_bits) | ||
955 | break; | ||
956 | goto found; | ||
957 | } | ||
958 | bm_fo = bit_offset + PAGE_SIZE*8; | ||
959 | } | ||
960 | i = -1UL; | ||
961 | } | ||
962 | found: | ||
963 | return i; | ||
964 | } | ||
965 | |||
966 | static unsigned long bm_find_next(struct drbd_conf *mdev, | ||
967 | unsigned long bm_fo, const int find_zero_bit) | ||
968 | { | ||
969 | struct drbd_bitmap *b = mdev->bitmap; | ||
970 | unsigned long i = -1UL; | ||
971 | |||
972 | ERR_IF(!b) return i; | ||
973 | ERR_IF(!b->bm_pages) return i; | ||
974 | |||
975 | spin_lock_irq(&b->bm_lock); | ||
976 | if (bm_is_locked(b)) | ||
977 | bm_print_lock_info(mdev); | ||
978 | |||
979 | i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); | ||
980 | |||
981 | spin_unlock_irq(&b->bm_lock); | ||
982 | return i; | ||
983 | } | ||
984 | |||
985 | unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) | ||
986 | { | ||
987 | return bm_find_next(mdev, bm_fo, 0); | ||
988 | } | ||
989 | |||
990 | #if 0 | ||
991 | /* not yet needed for anything. */ | ||
992 | unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) | ||
993 | { | ||
994 | return bm_find_next(mdev, bm_fo, 1); | ||
995 | } | ||
996 | #endif | ||
997 | |||
998 | /* does not spin_lock_irqsave. | ||
999 | * you must take drbd_bm_lock() first */ | ||
1000 | unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) | ||
1001 | { | ||
1002 | /* WARN_ON(!bm_is_locked(mdev)); */ | ||
1003 | return __bm_find_next(mdev, bm_fo, 0, KM_USER1); | ||
1004 | } | ||
1005 | |||
1006 | unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) | ||
1007 | { | ||
1008 | /* WARN_ON(!bm_is_locked(mdev)); */ | ||
1009 | return __bm_find_next(mdev, bm_fo, 1, KM_USER1); | ||
1010 | } | ||
1011 | |||
1012 | /* returns number of bits actually changed. | ||
1013 | * for val != 0, we change 0 -> 1, return code positive | ||
1014 | * for val == 0, we change 1 -> 0, return code negative | ||
1015 | * wants bitnr, not sector. | ||
1016 | * expected to be called for only a few bits (e - s about BITS_PER_LONG). | ||
1017 | * Must hold bitmap lock already. */ | ||
1018 | int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
1019 | unsigned long e, int val, const enum km_type km) | ||
1020 | { | ||
1021 | struct drbd_bitmap *b = mdev->bitmap; | ||
1022 | unsigned long *p_addr = NULL; | ||
1023 | unsigned long bitnr; | ||
1024 | unsigned long last_page_nr = -1UL; | ||
1025 | int c = 0; | ||
1026 | |||
1027 | if (e >= b->bm_bits) { | ||
1028 | dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", | ||
1029 | s, e, b->bm_bits); | ||
1030 | e = b->bm_bits ? b->bm_bits - 1 : 0; | ||
1031 | } | ||
1032 | for (bitnr = s; bitnr <= e; bitnr++) { | ||
1033 | unsigned long offset = bitnr>>LN2_BPL; | ||
1034 | unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); | ||
1035 | if (page_nr != last_page_nr) { | ||
1036 | if (p_addr) | ||
1037 | __bm_unmap(p_addr, km); | ||
1038 | p_addr = __bm_map_paddr(b, offset, km); | ||
1039 | last_page_nr = page_nr; | ||
1040 | } | ||
1041 | if (val) | ||
1042 | c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); | ||
1043 | else | ||
1044 | c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); | ||
1045 | } | ||
1046 | if (p_addr) | ||
1047 | __bm_unmap(p_addr, km); | ||
1048 | b->bm_set += c; | ||
1049 | return c; | ||
1050 | } | ||
1051 | |||
1052 | /* returns number of bits actually changed. | ||
1053 | * for val != 0, we change 0 -> 1, return code positive | ||
1054 | * for val == 0, we change 1 -> 0, return code negative | ||
1055 | * wants bitnr, not sector */ | ||
1056 | int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
1057 | const unsigned long e, int val) | ||
1058 | { | ||
1059 | unsigned long flags; | ||
1060 | struct drbd_bitmap *b = mdev->bitmap; | ||
1061 | int c = 0; | ||
1062 | |||
1063 | ERR_IF(!b) return 1; | ||
1064 | ERR_IF(!b->bm_pages) return 0; | ||
1065 | |||
1066 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1067 | if (bm_is_locked(b)) | ||
1068 | bm_print_lock_info(mdev); | ||
1069 | |||
1070 | c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); | ||
1071 | |||
1072 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1073 | return c; | ||
1074 | } | ||
1075 | |||
1076 | /* returns number of bits changed 0 -> 1 */ | ||
1077 | int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1078 | { | ||
1079 | return bm_change_bits_to(mdev, s, e, 1); | ||
1080 | } | ||
1081 | |||
1082 | /* returns number of bits changed 1 -> 0 */ | ||
1083 | int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1084 | { | ||
1085 | return -bm_change_bits_to(mdev, s, e, 0); | ||
1086 | } | ||
1087 | |||
1088 | /* sets all bits in full words, | ||
1089 | * from first_word up to, but not including, last_word */ | ||
1090 | static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, | ||
1091 | int page_nr, int first_word, int last_word) | ||
1092 | { | ||
1093 | int i; | ||
1094 | int bits; | ||
1095 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); | ||
1096 | for (i = first_word; i < last_word; i++) { | ||
1097 | bits = hweight_long(paddr[i]); | ||
1098 | paddr[i] = ~0UL; | ||
1099 | b->bm_set += BITS_PER_LONG - bits; | ||
1100 | } | ||
1101 | kunmap_atomic(paddr, KM_USER0); | ||
1102 | } | ||
1103 | |||
1104 | /* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. | ||
1105 | * You must first drbd_bm_lock(). | ||
1106 | * Can be called to set the whole bitmap in one go. | ||
1107 | * Sets bits from s to e _inclusive_. */ | ||
1108 | void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1109 | { | ||
1110 | /* First set_bit from the first bit (s) | ||
1111 | * up to the next long boundary (sl), | ||
1112 | * then assign full words up to the last long boundary (el), | ||
1113 | * then set_bit up to and including the last bit (e). | ||
1114 | * | ||
1115 | * Do not use memset, because we must account for changes, | ||
1116 | * so we need to loop over the words with hweight() anyways. | ||
1117 | */ | ||
1118 | unsigned long sl = ALIGN(s,BITS_PER_LONG); | ||
1119 | unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); | ||
1120 | int first_page; | ||
1121 | int last_page; | ||
1122 | int page_nr; | ||
1123 | int first_word; | ||
1124 | int last_word; | ||
1125 | |||
1126 | if (e - s <= 3*BITS_PER_LONG) { | ||
1127 | /* don't bother; el and sl may even be wrong. */ | ||
1128 | __bm_change_bits_to(mdev, s, e, 1, KM_USER0); | ||
1129 | return; | ||
1130 | } | ||
1131 | |||
1132 | /* difference is large enough that we can trust sl and el */ | ||
1133 | |||
1134 | /* bits filling the current long */ | ||
1135 | if (sl) | ||
1136 | __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); | ||
1137 | |||
1138 | first_page = sl >> (3 + PAGE_SHIFT); | ||
1139 | last_page = el >> (3 + PAGE_SHIFT); | ||
1140 | |||
1141 | /* MLPP: modulo longs per page */ | ||
1142 | /* LWPP: long words per page */ | ||
1143 | first_word = MLPP(sl >> LN2_BPL); | ||
1144 | last_word = LWPP; | ||
1145 | |||
1146 | /* first and full pages, unless first page == last page */ | ||
1147 | for (page_nr = first_page; page_nr < last_page; page_nr++) { | ||
1148 | bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); | ||
1149 | cond_resched(); | ||
1150 | first_word = 0; | ||
1151 | } | ||
1152 | |||
1153 | /* last page (respectively only page, for first page == last page) */ | ||
1154 | last_word = MLPP(el >> LN2_BPL); | ||
1155 | bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); | ||
1156 | |||
1157 | /* possibly trailing bits. | ||
1158 | * example: (e & 63) == 63, el will be e+1. | ||
1159 | * if that even was the very last bit, | ||
1160 | * it would trigger an assert in __bm_change_bits_to() | ||
1161 | */ | ||
1162 | if (el <= e) | ||
1163 | __bm_change_bits_to(mdev, el, e, 1, KM_USER0); | ||
1164 | } | ||
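The head/full-words/tail split described at the top of this function can be made concrete with a small user-space sketch (not part of the patch), assuming 64-bit longs:

#include <stdio.h>

#define BPL 64UL	/* BITS_PER_LONG on a 64-bit host */

int main(void)
{
	unsigned long s  = 100, e = 1000;               /* set bits 100..1000 inclusive */
	unsigned long sl = (s + BPL - 1) & ~(BPL - 1);  /* first long boundary >= s     */
	unsigned long el = (e + 1) & ~(BPL - 1);        /* last long boundary <= e + 1  */

	printf("head bits:  %lu..%lu (set bit by bit)\n", s, sl - 1);
	printf("full longs: %lu..%lu (set word by word)\n", sl, el - 1);
	printf("tail bits:  %lu..%lu (set bit by bit)\n", el, e);
	return 0;
}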
1165 | |||
1166 | /* returns bit state | ||
1167 | * wants bitnr, NOT sector. | ||
1168 | * inherently racy... area needs to be locked by means of {al,rs}_lru | ||
1169 | * 1 ... bit set | ||
1170 | * 0 ... bit not set | ||
1171 | * -1 ... first out of bounds access, stop testing for bits! | ||
1172 | */ | ||
1173 | int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) | ||
1174 | { | ||
1175 | unsigned long flags; | ||
1176 | struct drbd_bitmap *b = mdev->bitmap; | ||
1177 | unsigned long *p_addr; | ||
1178 | int i; | ||
1179 | |||
1180 | ERR_IF(!b) return 0; | ||
1181 | ERR_IF(!b->bm_pages) return 0; | ||
1182 | |||
1183 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1184 | if (bm_is_locked(b)) | ||
1185 | bm_print_lock_info(mdev); | ||
1186 | if (bitnr < b->bm_bits) { | ||
1187 | unsigned long offset = bitnr>>LN2_BPL; | ||
1188 | p_addr = bm_map_paddr(b, offset); | ||
1189 | i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0; | ||
1190 | bm_unmap(p_addr); | ||
1191 | } else if (bitnr == b->bm_bits) { | ||
1192 | i = -1; | ||
1193 | } else { /* (bitnr > b->bm_bits) */ | ||
1194 | dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1195 | i = 0; | ||
1196 | } | ||
1197 | |||
1198 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1199 | return i; | ||
1200 | } | ||
1201 | |||
1202 | /* returns number of bits set in the range [s, e] */ | ||
1203 | int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1204 | { | ||
1205 | unsigned long flags; | ||
1206 | struct drbd_bitmap *b = mdev->bitmap; | ||
1207 | unsigned long *p_addr = NULL, page_nr = -1; | ||
1208 | unsigned long bitnr; | ||
1209 | int c = 0; | ||
1210 | size_t w; | ||
1211 | |||
1212 | /* If this is called without a bitmap, that is a bug. But just to be | ||
1213 | * robust in case we screwed up elsewhere, in that case pretend there | ||
1214 | * was one dirty bit in the requested area, so we won't try to do a | ||
1215 | * local read there (no bitmap probably implies no disk) */ | ||
1216 | ERR_IF(!b) return 1; | ||
1217 | ERR_IF(!b->bm_pages) return 1; | ||
1218 | |||
1219 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1220 | if (bm_is_locked(b)) | ||
1221 | bm_print_lock_info(mdev); | ||
1222 | for (bitnr = s; bitnr <= e; bitnr++) { | ||
1223 | w = bitnr >> LN2_BPL; | ||
1224 | if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { | ||
1225 | page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); | ||
1226 | if (p_addr) | ||
1227 | bm_unmap(p_addr); | ||
1228 | p_addr = bm_map_paddr(b, w); | ||
1229 | } | ||
1230 | ERR_IF (bitnr >= b->bm_bits) { | ||
1231 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1232 | } else { | ||
1233 | c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); | ||
1234 | } | ||
1235 | } | ||
1236 | if (p_addr) | ||
1237 | bm_unmap(p_addr); | ||
1238 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1239 | return c; | ||
1240 | } | ||
1241 | |||
1242 | |||
1243 | /* inherently racy... | ||
1244 | * return value may be already out-of-date when this function returns. | ||
1245 | * but the general usage is that this is only used during a cstate when bits are | ||
1246 | * only cleared, not set, and we typically only care about the case when the return | ||
1247 | * value is zero, or we already "locked" this "bitmap extent" by other means. | ||
1248 | * | ||
1249 | * enr is bm-extent number, since we chose to name one sector (512 bytes) | ||
1250 | * worth of the bitmap a "bitmap extent". | ||
1251 | * | ||
1252 | * TODO | ||
1253 | * I think since we use it like a reference count, we should use the real | ||
1254 | * reference count of some bitmap extent element from some lru instead... | ||
1255 | * | ||
1256 | */ | ||
1257 | int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | ||
1258 | { | ||
1259 | struct drbd_bitmap *b = mdev->bitmap; | ||
1260 | int count, s, e; | ||
1261 | unsigned long flags; | ||
1262 | unsigned long *p_addr, *bm; | ||
1263 | |||
1264 | ERR_IF(!b) return 0; | ||
1265 | ERR_IF(!b->bm_pages) return 0; | ||
1266 | |||
1267 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1268 | if (bm_is_locked(b)) | ||
1269 | bm_print_lock_info(mdev); | ||
1270 | |||
1271 | s = S2W(enr); | ||
1272 | e = min((size_t)S2W(enr+1), b->bm_words); | ||
1273 | count = 0; | ||
1274 | if (s < b->bm_words) { | ||
1275 | int n = e-s; | ||
1276 | p_addr = bm_map_paddr(b, s); | ||
1277 | bm = p_addr + MLPP(s); | ||
1278 | while (n--) | ||
1279 | count += hweight_long(*bm++); | ||
1280 | bm_unmap(p_addr); | ||
1281 | } else { | ||
1282 | dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); | ||
1283 | } | ||
1284 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1285 | return count; | ||
1286 | } | ||
1287 | |||
1288 | /* set all bits covered by the AL-extent al_enr */ | ||
1289 | unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) | ||
1290 | { | ||
1291 | struct drbd_bitmap *b = mdev->bitmap; | ||
1292 | unsigned long *p_addr, *bm; | ||
1293 | unsigned long weight; | ||
1294 | int count, s, e, i, do_now; | ||
1295 | ERR_IF(!b) return 0; | ||
1296 | ERR_IF(!b->bm_pages) return 0; | ||
1297 | |||
1298 | spin_lock_irq(&b->bm_lock); | ||
1299 | if (bm_is_locked(b)) | ||
1300 | bm_print_lock_info(mdev); | ||
1301 | weight = b->bm_set; | ||
1302 | |||
1303 | s = al_enr * BM_WORDS_PER_AL_EXT; | ||
1304 | e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); | ||
1305 | /* assert that s and e are on the same page */ | ||
1306 | D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) | ||
1307 | == s >> (PAGE_SHIFT - LN2_BPL + 3)); | ||
1308 | count = 0; | ||
1309 | if (s < b->bm_words) { | ||
1310 | i = do_now = e-s; | ||
1311 | p_addr = bm_map_paddr(b, s); | ||
1312 | bm = p_addr + MLPP(s); | ||
1313 | while (i--) { | ||
1314 | count += hweight_long(*bm); | ||
1315 | *bm = -1UL; | ||
1316 | bm++; | ||
1317 | } | ||
1318 | bm_unmap(p_addr); | ||
1319 | b->bm_set += do_now*BITS_PER_LONG - count; | ||
1320 | if (e == b->bm_words) | ||
1321 | b->bm_set -= bm_clear_surplus(b); | ||
1322 | } else { | ||
1323 | dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); | ||
1324 | } | ||
1325 | weight = b->bm_set - weight; | ||
1326 | spin_unlock_irq(&b->bm_lock); | ||
1327 | return weight; | ||
1328 | } | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 index 000000000000..e5e86a781820 --- /dev/null +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -0,0 +1,2261 @@ | |||
1 | /* | ||
2 | drbd_int.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #ifndef _DRBD_INT_H | ||
27 | #define _DRBD_INT_H | ||
28 | |||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/version.h> | ||
32 | #include <linux/list.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <linux/bitops.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/crypto.h> | ||
37 | #include <linux/ratelimit.h> | ||
38 | #include <linux/tcp.h> | ||
39 | #include <linux/mutex.h> | ||
40 | #include <linux/major.h> | ||
41 | #include <linux/blkdev.h> | ||
42 | #include <linux/genhd.h> | ||
43 | #include <net/tcp.h> | ||
44 | #include <linux/lru_cache.h> | ||
45 | |||
46 | #ifdef __CHECKER__ | ||
47 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | ||
48 | # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) | ||
49 | # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) | ||
50 | # define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) | ||
51 | #else | ||
52 | # define __protected_by(x) | ||
53 | # define __protected_read_by(x) | ||
54 | # define __protected_write_by(x) | ||
55 | # define __must_hold(x) | ||
56 | #endif | ||
57 | |||
58 | #define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) | ||
59 | |||
60 | /* module parameter, defined in drbd_main.c */ | ||
61 | extern unsigned int minor_count; | ||
62 | extern int disable_sendpage; | ||
63 | extern int allow_oos; | ||
64 | extern unsigned int cn_idx; | ||
65 | |||
66 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
67 | extern int enable_faults; | ||
68 | extern int fault_rate; | ||
69 | extern int fault_devs; | ||
70 | #endif | ||
71 | |||
72 | extern char usermode_helper[]; | ||
73 | |||
74 | |||
75 | #ifndef TRUE | ||
76 | #define TRUE 1 | ||
77 | #endif | ||
78 | #ifndef FALSE | ||
79 | #define FALSE 0 | ||
80 | #endif | ||
81 | |||
82 | /* I don't remember why XCPU ... | ||
83 | * This is used to wake the asender, | ||
84 | * and to interrupt the sending task | ||
85 | * on disconnect. | ||
86 | */ | ||
87 | #define DRBD_SIG SIGXCPU | ||
88 | |||
89 | /* This is used to stop/restart our threads. | ||
90 | * Cannot use SIGTERM nor SIGKILL, since these | ||
91 | * are sent out by init on runlevel changes. | ||
92 | * I choose SIGHUP for now. | ||
93 | */ | ||
94 | #define DRBD_SIGKILL SIGHUP | ||
95 | |||
96 | /* All EEs on the free list should have ID_VACANT (== 0); | ||
97 | * freshly allocated EEs get !ID_VACANT (== 1), | ||
98 | * so if it says "cannot dereference null pointer at address 0x00000001", | ||
99 | * it is most likely one of these :( */ | ||
100 | |||
101 | #define ID_IN_SYNC (4711ULL) | ||
102 | #define ID_OUT_OF_SYNC (4712ULL) | ||
103 | |||
104 | #define ID_SYNCER (-1ULL) | ||
105 | #define ID_VACANT 0 | ||
106 | #define is_syncer_block_id(id) ((id) == ID_SYNCER) | ||
107 | |||
108 | struct drbd_conf; | ||
109 | |||
110 | |||
111 | /* to shorten dev_warn(DEV, "msg"); and related statements */ | ||
112 | #define DEV (disk_to_dev(mdev->vdisk)) | ||
113 | |||
114 | #define D_ASSERT(exp) if (!(exp)) \ | ||
115 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) | ||
116 | |||
117 | #define ERR_IF(exp) if (({ \ | ||
118 | int _b = (exp) != 0; \ | ||
119 | if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ | ||
120 | __func__, #exp, __FILE__, __LINE__); \ | ||
121 | _b; \ | ||
122 | })) | ||
123 | |||
124 | /* Defines to control fault insertion */ | ||
125 | enum { | ||
126 | DRBD_FAULT_MD_WR = 0, /* meta data write */ | ||
127 | DRBD_FAULT_MD_RD = 1, /* read */ | ||
128 | DRBD_FAULT_RS_WR = 2, /* resync */ | ||
129 | DRBD_FAULT_RS_RD = 3, | ||
130 | DRBD_FAULT_DT_WR = 4, /* data */ | ||
131 | DRBD_FAULT_DT_RD = 5, | ||
132 | DRBD_FAULT_DT_RA = 6, /* data read ahead */ | ||
133 | DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ | ||
134 | DRBD_FAULT_AL_EE = 8, /* alloc ee */ | ||
135 | |||
136 | DRBD_FAULT_MAX, | ||
137 | }; | ||
138 | |||
139 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
140 | extern unsigned int | ||
141 | _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); | ||
142 | static inline int | ||
143 | drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { | ||
144 | return fault_rate && | ||
145 | (enable_faults & (1<<type)) && | ||
146 | _drbd_insert_fault(mdev, type); | ||
147 | } | ||
148 | #define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t))) | ||
149 | |||
150 | #else | ||
151 | #define FAULT_ACTIVE(_m, _t) (0) | ||
152 | #endif | ||
153 | |||
154 | /* integer division, round _UP_ to the next integer */ | ||
155 | #define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) | ||
156 | /* usual integer division */ | ||
157 | #define div_floor(A, B) ((A)/(B)) | ||
158 | |||
159 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
160 | /* 4th incarnation of the disk layout. */ | ||
161 | #define DRBD_MD_MAGIC (DRBD_MAGIC+4) | ||
162 | |||
163 | extern struct drbd_conf **minor_table; | ||
164 | extern struct ratelimit_state drbd_ratelimit_state; | ||
165 | |||
166 | /* on the wire */ | ||
167 | enum drbd_packets { | ||
168 | /* receiver (data socket) */ | ||
169 | P_DATA = 0x00, | ||
170 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ | ||
171 | P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ | ||
172 | P_BARRIER = 0x03, | ||
173 | P_BITMAP = 0x04, | ||
174 | P_BECOME_SYNC_TARGET = 0x05, | ||
175 | P_BECOME_SYNC_SOURCE = 0x06, | ||
176 | P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ | ||
177 | P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ | ||
178 | P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ | ||
179 | P_SYNC_PARAM = 0x0a, | ||
180 | P_PROTOCOL = 0x0b, | ||
181 | P_UUIDS = 0x0c, | ||
182 | P_SIZES = 0x0d, | ||
183 | P_STATE = 0x0e, | ||
184 | P_SYNC_UUID = 0x0f, | ||
185 | P_AUTH_CHALLENGE = 0x10, | ||
186 | P_AUTH_RESPONSE = 0x11, | ||
187 | P_STATE_CHG_REQ = 0x12, | ||
188 | |||
189 | /* asender (meta socket) */ | ||
190 | P_PING = 0x13, | ||
191 | P_PING_ACK = 0x14, | ||
192 | P_RECV_ACK = 0x15, /* Used in protocol B */ | ||
193 | P_WRITE_ACK = 0x16, /* Used in protocol C */ | ||
194 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ | ||
195 | P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ | ||
196 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ | ||
197 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ | ||
198 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ | ||
199 | P_BARRIER_ACK = 0x1c, | ||
200 | P_STATE_CHG_REPLY = 0x1d, | ||
201 | |||
202 | /* "new" commands, no longer fitting into the ordering scheme above */ | ||
203 | |||
204 | P_OV_REQUEST = 0x1e, /* data socket */ | ||
205 | P_OV_REPLY = 0x1f, | ||
206 | P_OV_RESULT = 0x20, /* meta socket */ | ||
207 | P_CSUM_RS_REQUEST = 0x21, /* data socket */ | ||
208 | P_RS_IS_IN_SYNC = 0x22, /* meta socket */ | ||
209 | P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ | ||
210 | P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ | ||
211 | |||
212 | P_MAX_CMD = 0x25, | ||
213 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ | ||
214 | P_MAX_OPT_CMD = 0x101, | ||
215 | |||
216 | /* special command ids for handshake */ | ||
217 | |||
218 | P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ | ||
219 | P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ | ||
220 | |||
221 | P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ | ||
222 | }; | ||
223 | |||
224 | static inline const char *cmdname(enum drbd_packets cmd) | ||
225 | { | ||
226 | /* THINK may need to become several global tables | ||
227 | * when we want to support more than | ||
228 | * one PRO_VERSION */ | ||
229 | static const char *cmdnames[] = { | ||
230 | [P_DATA] = "Data", | ||
231 | [P_DATA_REPLY] = "DataReply", | ||
232 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
233 | [P_BARRIER] = "Barrier", | ||
234 | [P_BITMAP] = "ReportBitMap", | ||
235 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
236 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
237 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
238 | [P_DATA_REQUEST] = "DataRequest", | ||
239 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
240 | [P_SYNC_PARAM] = "SyncParam", | ||
241 | [P_SYNC_PARAM89] = "SyncParam89", | ||
242 | [P_PROTOCOL] = "ReportProtocol", | ||
243 | [P_UUIDS] = "ReportUUIDs", | ||
244 | [P_SIZES] = "ReportSizes", | ||
245 | [P_STATE] = "ReportState", | ||
246 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
247 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
248 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
249 | [P_PING] = "Ping", | ||
250 | [P_PING_ACK] = "PingAck", | ||
251 | [P_RECV_ACK] = "RecvAck", | ||
252 | [P_WRITE_ACK] = "WriteAck", | ||
253 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
254 | [P_DISCARD_ACK] = "DiscardAck", | ||
255 | [P_NEG_ACK] = "NegAck", | ||
256 | [P_NEG_DREPLY] = "NegDReply", | ||
257 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
258 | [P_BARRIER_ACK] = "BarrierAck", | ||
259 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
260 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
261 | [P_OV_REQUEST] = "OVRequest", | ||
262 | [P_OV_REPLY] = "OVReply", | ||
263 | [P_OV_RESULT] = "OVResult", | ||
264 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
265 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
266 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
267 | [P_MAX_CMD] = NULL, | ||
268 | }; | ||
269 | |||
270 | if (cmd == P_HAND_SHAKE_M) | ||
271 | return "HandShakeM"; | ||
272 | if (cmd == P_HAND_SHAKE_S) | ||
273 | return "HandShakeS"; | ||
274 | if (cmd == P_HAND_SHAKE) | ||
275 | return "HandShake"; | ||
276 | if (cmd >= P_MAX_CMD) | ||
277 | return "Unknown"; | ||
278 | return cmdnames[cmd]; | ||
279 | } | ||
280 | |||
281 | /* for sending/receiving the bitmap, | ||
282 | * possibly in some encoding scheme */ | ||
283 | struct bm_xfer_ctx { | ||
284 | /* "const" | ||
285 | * stores total bits and long words | ||
286 | * of the bitmap, so we don't need to | ||
287 | * call the accessor functions over and again. */ | ||
288 | unsigned long bm_bits; | ||
289 | unsigned long bm_words; | ||
290 | /* during xfer, current position within the bitmap */ | ||
291 | unsigned long bit_offset; | ||
292 | unsigned long word_offset; | ||
293 | |||
294 | /* statistics; index: (h->command == P_BITMAP) */ | ||
295 | unsigned packets[2]; | ||
296 | unsigned bytes[2]; | ||
297 | }; | ||
298 | |||
299 | extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, | ||
300 | const char *direction, struct bm_xfer_ctx *c); | ||
301 | |||
302 | static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) | ||
303 | { | ||
304 | /* word_offset counts "native long words" (32 or 64 bit), | ||
305 | * aligned at 64 bit. | ||
306 | * Encoded packet may end at an unaligned bit offset. | ||
307 | * In case a fallback clear text packet is transmitted in | ||
308 | * between, we adjust this offset back to the last 64bit | ||
309 | * aligned "native long word", which makes coding and decoding | ||
310 | * the plain text bitmap much more convenient. */ | ||
311 | #if BITS_PER_LONG == 64 | ||
312 | c->word_offset = c->bit_offset >> 6; | ||
313 | #elif BITS_PER_LONG == 32 | ||
314 | c->word_offset = c->bit_offset >> 5; | ||
315 | c->word_offset &= ~(1UL); | ||
316 | #else | ||
317 | # error "unsupported BITS_PER_LONG" | ||
318 | #endif | ||
319 | } | ||
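A tiny user-space sketch (not part of the patch) of the mapping this helper performs, for both word sizes:

#include <stdio.h>

int main(void)
{
	unsigned long bit_offset = 1000;

	unsigned long w64 = bit_offset >> 6;            /* native 64-bit long words               */
	unsigned long w32 = (bit_offset >> 5) & ~1UL;   /* 32-bit long words, kept 64-bit aligned */

	printf("bit %lu -> word %lu (64-bit host) / word %lu (32-bit host)\n",
	       bit_offset, w64, w32);
	return 0;
}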
320 | |||
321 | #ifndef __packed | ||
322 | #define __packed __attribute__((packed)) | ||
323 | #endif | ||
324 | |||
325 | /* This is the layout for a packet on the wire. | ||
326 | * The byteorder is the network byte order. | ||
327 | * (except the block_id and barrier fields; | ||
328 | * these are pointers to local structs | ||
329 | * and have no relevance for the partner, | ||
330 | * which just echoes them as received.) | ||
331 | * | ||
332 | * NOTE that the payload starts at a long aligned offset, | ||
333 | * regardless of 32 or 64 bit arch! | ||
334 | */ | ||
335 | struct p_header { | ||
336 | u32 magic; | ||
337 | u16 command; | ||
338 | u16 length; /* bytes of data after this header */ | ||
339 | u8 payload[0]; | ||
340 | } __packed; | ||
341 | /* 8 bytes. packet FIXED for the next century! */ | ||
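For illustration only, a user-space sketch (not part of the patch) of filling this fixed 8-byte header in network byte order; the real DRBD_MAGIC is defined elsewhere in the tree, so a placeholder is used, while 0x13 is P_PING from the enum above:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

struct p_header_example {
	uint32_t magic;
	uint16_t command;
	uint16_t length;	/* bytes of payload following the header */
} __attribute__((packed));

int main(void)
{
	struct p_header_example h;

	h.magic   = htonl(0);		/* placeholder; DRBD_MAGIC (defined elsewhere) goes here */
	h.command = htons(0x13);	/* P_PING, from the enum above */
	h.length  = htons(0);		/* P_PING carries no payload   */

	printf("wire header is %zu bytes, command on the wire 0x%04x\n",
	       sizeof(h), (unsigned)ntohs(h.command));
	return 0;
}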
342 | |||
343 | /* | ||
344 | * short commands, packets without payload, plain p_header: | ||
345 | * P_PING | ||
346 | * P_PING_ACK | ||
347 | * P_BECOME_SYNC_TARGET | ||
348 | * P_BECOME_SYNC_SOURCE | ||
349 | * P_UNPLUG_REMOTE | ||
350 | */ | ||
351 | |||
352 | /* | ||
353 | * commands with out-of-struct payload: | ||
354 | * P_BITMAP (no additional fields) | ||
355 | * P_DATA, P_DATA_REPLY (see p_data) | ||
356 | * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) | ||
357 | */ | ||
358 | |||
359 | /* these defines must not be changed without changing the protocol version */ | ||
360 | #define DP_HARDBARRIER 1 | ||
361 | #define DP_RW_SYNC 2 | ||
362 | #define DP_MAY_SET_IN_SYNC 4 | ||
363 | |||
364 | struct p_data { | ||
365 | struct p_header head; | ||
366 | u64 sector; /* 64 bits sector number */ | ||
367 | u64 block_id; /* to identify the request in protocol B&C */ | ||
368 | u32 seq_num; | ||
369 | u32 dp_flags; | ||
370 | } __packed; | ||
371 | |||
372 | /* | ||
373 | * commands which share a struct: | ||
374 | * p_block_ack: | ||
375 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), | ||
376 | * P_DISCARD_ACK (proto C, two-primaries conflict detection) | ||
377 | * p_block_req: | ||
378 | * P_DATA_REQUEST, P_RS_DATA_REQUEST | ||
379 | */ | ||
380 | struct p_block_ack { | ||
381 | struct p_header head; | ||
382 | u64 sector; | ||
383 | u64 block_id; | ||
384 | u32 blksize; | ||
385 | u32 seq_num; | ||
386 | } __packed; | ||
387 | |||
388 | |||
389 | struct p_block_req { | ||
390 | struct p_header head; | ||
391 | u64 sector; | ||
392 | u64 block_id; | ||
393 | u32 blksize; | ||
394 | u32 pad; /* to multiple of 8 Byte */ | ||
395 | } __packed; | ||
396 | |||
397 | /* | ||
398 | * commands with their own struct for additional fields: | ||
399 | * P_HAND_SHAKE | ||
400 | * P_BARRIER | ||
401 | * P_BARRIER_ACK | ||
402 | * P_SYNC_PARAM | ||
403 | * ReportParams | ||
404 | */ | ||
405 | |||
406 | struct p_handshake { | ||
407 | struct p_header head; /* 8 bytes */ | ||
408 | u32 protocol_min; | ||
409 | u32 feature_flags; | ||
410 | u32 protocol_max; | ||
411 | |||
412 | /* should be more than enough for future enhancements; | ||
413 | * for now, feature_flags and the reserved array shall be zero. | ||
414 | */ | ||
415 | |||
416 | u32 _pad; | ||
417 | u64 reserverd[7]; | ||
418 | } __packed; | ||
419 | /* 80 bytes, FIXED for the next century */ | ||
420 | |||
421 | struct p_barrier { | ||
422 | struct p_header head; | ||
423 | u32 barrier; /* barrier number _handle_ only */ | ||
424 | u32 pad; /* to multiple of 8 Byte */ | ||
425 | } __packed; | ||
426 | |||
427 | struct p_barrier_ack { | ||
428 | struct p_header head; | ||
429 | u32 barrier; | ||
430 | u32 set_size; | ||
431 | } __packed; | ||
432 | |||
433 | struct p_rs_param { | ||
434 | struct p_header head; | ||
435 | u32 rate; | ||
436 | |||
437 | /* Since protocol version 88 and higher. */ | ||
438 | char verify_alg[0]; | ||
439 | } __packed; | ||
440 | |||
441 | struct p_rs_param_89 { | ||
442 | struct p_header head; | ||
443 | u32 rate; | ||
444 | /* protocol version 89: */ | ||
445 | char verify_alg[SHARED_SECRET_MAX]; | ||
446 | char csums_alg[SHARED_SECRET_MAX]; | ||
447 | } __packed; | ||
448 | |||
449 | enum drbd_conn_flags { | ||
450 | CF_WANT_LOSE = 1, | ||
451 | CF_DRY_RUN = 2, | ||
452 | }; | ||
453 | |||
454 | struct p_protocol { | ||
455 | struct p_header head; | ||
456 | u32 protocol; | ||
457 | u32 after_sb_0p; | ||
458 | u32 after_sb_1p; | ||
459 | u32 after_sb_2p; | ||
460 | u32 conn_flags; | ||
461 | u32 two_primaries; | ||
462 | |||
463 | /* Since protocol version 87 and higher. */ | ||
464 | char integrity_alg[0]; | ||
465 | |||
466 | } __packed; | ||
467 | |||
468 | struct p_uuids { | ||
469 | struct p_header head; | ||
470 | u64 uuid[UI_EXTENDED_SIZE]; | ||
471 | } __packed; | ||
472 | |||
473 | struct p_rs_uuid { | ||
474 | struct p_header head; | ||
475 | u64 uuid; | ||
476 | } __packed; | ||
477 | |||
478 | struct p_sizes { | ||
479 | struct p_header head; | ||
480 | u64 d_size; /* size of disk */ | ||
481 | u64 u_size; /* user requested size */ | ||
482 | u64 c_size; /* current exported size */ | ||
483 | u32 max_segment_size; /* Maximal size of a BIO */ | ||
484 | u32 queue_order_type; | ||
485 | } __packed; | ||
486 | |||
487 | struct p_state { | ||
488 | struct p_header head; | ||
489 | u32 state; | ||
490 | } __packed; | ||
491 | |||
492 | struct p_req_state { | ||
493 | struct p_header head; | ||
494 | u32 mask; | ||
495 | u32 val; | ||
496 | } __packed; | ||
497 | |||
498 | struct p_req_state_reply { | ||
499 | struct p_header head; | ||
500 | u32 retcode; | ||
501 | } __packed; | ||
502 | |||
503 | struct p_drbd06_param { | ||
504 | u64 size; | ||
505 | u32 state; | ||
506 | u32 blksize; | ||
507 | u32 protocol; | ||
508 | u32 version; | ||
509 | u32 gen_cnt[5]; | ||
510 | u32 bit_map_gen[5]; | ||
511 | } __packed; | ||
512 | |||
513 | struct p_discard { | ||
514 | struct p_header head; | ||
515 | u64 block_id; | ||
516 | u32 seq_num; | ||
517 | u32 pad; | ||
518 | } __packed; | ||
519 | |||
520 | /* Valid values for the encoding field. | ||
521 | * Bump proto version when changing this. */ | ||
522 | enum drbd_bitmap_code { | ||
523 | /* RLE_VLI_Bytes = 0, | ||
524 | * and other bit variants had been defined during | ||
525 | * algorithm evaluation. */ | ||
526 | RLE_VLI_Bits = 2, | ||
527 | }; | ||
528 | |||
529 | struct p_compressed_bm { | ||
530 | struct p_header head; | ||
531 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code | ||
532 | * (encoding & 0x80): polarity (set/unset) of first runlength | ||
533 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits | ||
534 | * used to pad up to head.length bytes | ||
535 | */ | ||
536 | u8 encoding; | ||
537 | |||
538 | u8 code[0]; | ||
539 | } __packed; | ||
540 | |||
541 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | ||
542 | static inline enum drbd_bitmap_code | ||
543 | DCBP_get_code(struct p_compressed_bm *p) | ||
544 | { | ||
545 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
546 | } | ||
547 | |||
548 | static inline void | ||
549 | DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) | ||
550 | { | ||
551 | BUG_ON(code & ~0xf); | ||
552 | p->encoding = (p->encoding & ~0xf) | code; | ||
553 | } | ||
554 | |||
555 | static inline int | ||
556 | DCBP_get_start(struct p_compressed_bm *p) | ||
557 | { | ||
558 | return (p->encoding & 0x80) != 0; | ||
559 | } | ||
560 | |||
561 | static inline void | ||
562 | DCBP_set_start(struct p_compressed_bm *p, int set) | ||
563 | { | ||
564 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
565 | } | ||
566 | |||
567 | static inline int | ||
568 | DCBP_get_pad_bits(struct p_compressed_bm *p) | ||
569 | { | ||
570 | return (p->encoding >> 4) & 0x7; | ||
571 | } | ||
572 | |||
573 | static inline void | ||
574 | DCBP_set_pad_bits(struct p_compressed_bm *p, int n) | ||
575 | { | ||
576 | BUG_ON(n & ~0x7); | ||
577 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
578 | } | ||
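A small decode-side sketch may help here; it is editorial (not part of the patch) and only exercises the accessors defined above to show where each field lives inside the single encoding byte.

static inline void example_decode_encoding(struct p_compressed_bm *p)
{
	/* bits 0..3: which bitmap encoding was used */
	enum drbd_bitmap_code code = DCBP_get_code(p);
	/* bit 7: polarity (set/unset) of the first run length */
	int first_run_is_set = DCBP_get_start(p);
	/* bits 4..6: trailing zero bits padding up to head.length bytes */
	int pad_bits = DCBP_get_pad_bits(p);

	(void)code;
	(void)first_run_is_set;
	(void)pad_bits;
}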
579 | |||
580 | /* one bitmap packet, including the p_header, | ||
581 | * should fit within one _architecture independent_ page. | ||
582 | * so we need to use the fixed size 4KiB page size | ||
583 | * most architectures have used for a long time. | ||
584 | */ | ||
585 | #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) | ||
586 | #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) | ||
587 | #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) | ||
588 | #if (PAGE_SIZE < 4096) | ||
589 | /* drbd_send_bitmap / receive_bitmap would break horribly */ | ||
590 | #error "PAGE_SIZE too small" | ||
591 | #endif | ||
592 | |||
593 | union p_polymorph { | ||
594 | struct p_header header; | ||
595 | struct p_handshake handshake; | ||
596 | struct p_data data; | ||
597 | struct p_block_ack block_ack; | ||
598 | struct p_barrier barrier; | ||
599 | struct p_barrier_ack barrier_ack; | ||
600 | struct p_rs_param_89 rs_param_89; | ||
601 | struct p_protocol protocol; | ||
602 | struct p_sizes sizes; | ||
603 | struct p_uuids uuids; | ||
604 | struct p_state state; | ||
605 | struct p_req_state req_state; | ||
606 | struct p_req_state_reply req_state_reply; | ||
607 | struct p_block_req block_req; | ||
608 | } __packed; | ||
609 | |||
610 | /**********************************************************************/ | ||
611 | enum drbd_thread_state { | ||
612 | None, | ||
613 | Running, | ||
614 | Exiting, | ||
615 | Restarting | ||
616 | }; | ||
617 | |||
618 | struct drbd_thread { | ||
619 | spinlock_t t_lock; | ||
620 | struct task_struct *task; | ||
621 | struct completion stop; | ||
622 | enum drbd_thread_state t_state; | ||
623 | int (*function) (struct drbd_thread *); | ||
624 | struct drbd_conf *mdev; | ||
625 | int reset_cpu_mask; | ||
626 | }; | ||
627 | |||
628 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | ||
629 | { | ||
630 | /* THINK testing the t_state seems to be uncritical in all cases | ||
631 | * (except thread_{start,stop}), so we can read it *without* the lock. | ||
632 | * --lge */ | ||
633 | |||
634 | smp_rmb(); | ||
635 | return thi->t_state; | ||
636 | } | ||
637 | |||
638 | |||
639 | /* | ||
640 | * Having this as the first member of a struct provides sort of "inheritance". | ||
641 | * "derived" structs can be "drbd_queue_work()"ed. | ||
642 | * The callback should know and cast back to the descendant struct. | ||
643 | * drbd_request and drbd_epoch_entry are descendants of drbd_work. | ||
644 | */ | ||
645 | struct drbd_work; | ||
646 | typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); | ||
647 | struct drbd_work { | ||
648 | struct list_head list; | ||
649 | drbd_work_cb cb; | ||
650 | }; | ||
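A hypothetical descendant makes the first-member convention above concrete; the struct and callback below are illustrative only and do not appear in the patch.

struct example_work {			/* hypothetical descendant of drbd_work */
	struct drbd_work w;		/* must remain the first member */
	int payload;
};

static int example_work_cb(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the embedded drbd_work is first, so the cast recovers the container;
	 * container_of(w, struct example_work, w) would work as well */
	struct example_work *ew = (struct example_work *)w;

	if (!cancel)
		ew->payload++;	/* the actual work would go here */
	return 1;
}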
651 | |||
652 | struct drbd_tl_epoch; | ||
653 | struct drbd_request { | ||
654 | struct drbd_work w; | ||
655 | struct drbd_conf *mdev; | ||
656 | |||
657 | /* if local IO is not allowed, will be NULL. | ||
658 | * if local IO _is_ allowed, holds the locally submitted bio clone, | ||
659 | * or, after local IO completion, the ERR_PTR(error). | ||
660 | * see drbd_endio_pri(). */ | ||
661 | struct bio *private_bio; | ||
662 | |||
663 | struct hlist_node colision; | ||
664 | sector_t sector; | ||
665 | unsigned int size; | ||
666 | unsigned int epoch; /* barrier_nr */ | ||
667 | |||
668 | /* barrier_nr: used to check on "completion" whether this req was in | ||
669 | * the current epoch, and we therefore have to close it, | ||
670 | * starting a new epoch... | ||
671 | */ | ||
672 | |||
673 | /* up to here, the struct layout is identical to drbd_epoch_entry; | ||
674 | * we might be able to use that to our advantage... */ | ||
675 | |||
676 | struct list_head tl_requests; /* ring list in the transfer log */ | ||
677 | struct bio *master_bio; /* master bio pointer */ | ||
678 | unsigned long rq_state; /* see comments above _req_mod() */ | ||
679 | int seq_num; | ||
680 | unsigned long start_time; | ||
681 | }; | ||
682 | |||
683 | struct drbd_tl_epoch { | ||
684 | struct drbd_work w; | ||
685 | struct list_head requests; /* requests before */ | ||
686 | struct drbd_tl_epoch *next; /* pointer to the next barrier */ | ||
687 | unsigned int br_number; /* the barriers identifier. */ | ||
688 | int n_req; /* number of requests attached before this barrier */ | ||
689 | }; | ||
690 | |||
691 | struct drbd_request; | ||
692 | |||
693 | /* These Tl_epoch_entries may be in one of 6 lists: | ||
694 | active_ee .. data packet being written | ||
695 | sync_ee .. syncer block being written | ||
696 | done_ee .. block written, need to send P_WRITE_ACK | ||
697 | read_ee .. [RS]P_DATA_REQUEST being read | ||
698 | */ | ||
699 | |||
700 | struct drbd_epoch { | ||
701 | struct list_head list; | ||
702 | unsigned int barrier_nr; | ||
703 | atomic_t epoch_size; /* increased on every request added. */ | ||
704 | atomic_t active; /* increased on every req. added, and dec on every finished. */ | ||
705 | unsigned long flags; | ||
706 | }; | ||
707 | |||
708 | /* drbd_epoch flag bits */ | ||
709 | enum { | ||
710 | DE_BARRIER_IN_NEXT_EPOCH_ISSUED, | ||
711 | DE_BARRIER_IN_NEXT_EPOCH_DONE, | ||
712 | DE_CONTAINS_A_BARRIER, | ||
713 | DE_HAVE_BARRIER_NUMBER, | ||
714 | DE_IS_FINISHING, | ||
715 | }; | ||
716 | |||
717 | enum epoch_event { | ||
718 | EV_PUT, | ||
719 | EV_GOT_BARRIER_NR, | ||
720 | EV_BARRIER_DONE, | ||
721 | EV_BECAME_LAST, | ||
722 | EV_CLEANUP = 32, /* used as flag */ | ||
723 | }; | ||
724 | |||
725 | struct drbd_epoch_entry { | ||
726 | struct drbd_work w; | ||
727 | struct drbd_conf *mdev; | ||
728 | struct bio *private_bio; | ||
729 | struct hlist_node colision; | ||
730 | sector_t sector; | ||
731 | unsigned int size; | ||
732 | struct drbd_epoch *epoch; | ||
733 | |||
734 | /* up to here, the struct layout is identical to drbd_request; | ||
735 | * we might be able to use that to our advantage... */ | ||
736 | |||
737 | unsigned int flags; | ||
738 | u64 block_id; | ||
739 | }; | ||
740 | |||
741 | struct drbd_wq_barrier { | ||
742 | struct drbd_work w; | ||
743 | struct completion done; | ||
744 | }; | ||
745 | |||
746 | struct digest_info { | ||
747 | int digest_size; | ||
748 | void *digest; | ||
749 | }; | ||
750 | |||
751 | /* ee flag bits */ | ||
752 | enum { | ||
753 | __EE_CALL_AL_COMPLETE_IO, | ||
754 | __EE_CONFLICT_PENDING, | ||
755 | __EE_MAY_SET_IN_SYNC, | ||
756 | __EE_IS_BARRIER, | ||
757 | }; | ||
758 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | ||
759 | #define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) | ||
760 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | ||
761 | #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) | ||
762 | |||
763 | /* global flag bits */ | ||
764 | enum { | ||
765 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
766 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
767 | SEND_PING, /* whether asender should send a ping asap */ | ||
768 | |||
769 | STOP_SYNC_TIMER, /* tell timer to cancel itself */ | ||
770 | UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ | ||
771 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ | ||
772 | MD_DIRTY, /* current uuids and flags not yet on disk */ | ||
773 | DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ | ||
774 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ | ||
775 | CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ | ||
776 | CL_ST_CHG_SUCCESS, | ||
777 | CL_ST_CHG_FAIL, | ||
778 | CRASHED_PRIMARY, /* This node was a crashed primary. | ||
779 | * Gets cleared when the state.conn | ||
780 | * goes into C_CONNECTED state. */ | ||
781 | WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ | ||
782 | NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ | ||
783 | CONSIDER_RESYNC, | ||
784 | |||
785 | MD_NO_BARRIER, /* meta data device does not support barriers, | ||
786 | so don't even try */ | ||
787 | SUSPEND_IO, /* suspend application io */ | ||
788 | BITMAP_IO, /* suspend application io; | ||
789 | once no more io in flight, start bitmap io */ | ||
790 | BITMAP_IO_QUEUED, /* Started bitmap IO */ | ||
791 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | ||
792 | NET_CONGESTED, /* The data socket is congested */ | ||
793 | |||
794 | CONFIG_PENDING, /* serialization of (re)configuration requests. | ||
795 | * if set, also prevents the device from dying */ | ||
796 | DEVICE_DYING, /* device became unconfigured, | ||
797 | * but worker thread is still handling the cleanup. | ||
798 | * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed, | ||
799 | * while this is set. */ | ||
800 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from | ||
801 | * the peer, if it changed there as well. */ | ||
802 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
803 | GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ | ||
804 | }; | ||
805 | |||
806 | struct drbd_bitmap; /* opaque for drbd_conf */ | ||
807 | |||
808 | /* TODO sort members for performance | ||
809 | * MAYBE group them further */ | ||
810 | |||
811 | /* THINK maybe we actually want to use the default "event/%s" worker threads | ||
812 | * or similar in linux 2.6, which uses per cpu data and threads. | ||
813 | * | ||
814 | * To be general, this might need a spin_lock member. | ||
815 | * For now, please use the mdev->req_lock to protect list_head, | ||
816 | * see drbd_queue_work below. | ||
817 | */ | ||
818 | struct drbd_work_queue { | ||
819 | struct list_head q; | ||
820 | struct semaphore s; /* producers up it, worker down()s it */ | ||
821 | spinlock_t q_lock; /* to protect the list. */ | ||
822 | }; | ||
823 | |||
824 | struct drbd_socket { | ||
825 | struct drbd_work_queue work; | ||
826 | struct mutex mutex; | ||
827 | struct socket *socket; | ||
828 | /* this way we get our | ||
829 | * send/receive buffers off the stack */ | ||
830 | union p_polymorph sbuf; | ||
831 | union p_polymorph rbuf; | ||
832 | }; | ||
833 | |||
834 | struct drbd_md { | ||
835 | u64 md_offset; /* sector offset to 'super' block */ | ||
836 | |||
837 | u64 la_size_sect; /* last agreed size, unit sectors */ | ||
838 | u64 uuid[UI_SIZE]; | ||
839 | u64 device_uuid; | ||
840 | u32 flags; | ||
841 | u32 md_size_sect; | ||
842 | |||
843 | s32 al_offset; /* signed relative sector offset to al area */ | ||
844 | s32 bm_offset; /* signed relative sector offset to bitmap */ | ||
845 | |||
846 | /* u32 al_nr_extents; important for restoring the AL | ||
847 | * is stored into sync_conf.al_extents, which in turn | ||
848 | * gets applied to act_log->nr_elements | ||
849 | */ | ||
850 | }; | ||
851 | |||
852 | /* for sync_conf and other types... */ | ||
853 | #define NL_PACKET(name, number, fields) struct name { fields }; | ||
854 | #define NL_INTEGER(pn,pr,member) int member; | ||
855 | #define NL_INT64(pn,pr,member) __u64 member; | ||
856 | #define NL_BIT(pn,pr,member) unsigned member:1; | ||
857 | #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; | ||
858 | #include "linux/drbd_nl.h" | ||
859 | |||
860 | struct drbd_backing_dev { | ||
861 | struct block_device *backing_bdev; | ||
862 | struct block_device *md_bdev; | ||
863 | struct file *lo_file; | ||
864 | struct file *md_file; | ||
865 | struct drbd_md md; | ||
866 | struct disk_conf dc; /* The user provided config... */ | ||
867 | sector_t known_size; /* last known size of that backing device */ | ||
868 | }; | ||
869 | |||
870 | struct drbd_md_io { | ||
871 | struct drbd_conf *mdev; | ||
872 | struct completion event; | ||
873 | int error; | ||
874 | }; | ||
875 | |||
876 | struct bm_io_work { | ||
877 | struct drbd_work w; | ||
878 | char *why; | ||
879 | int (*io_fn)(struct drbd_conf *mdev); | ||
880 | void (*done)(struct drbd_conf *mdev, int rv); | ||
881 | }; | ||
882 | |||
883 | enum write_ordering_e { | ||
884 | WO_none, | ||
885 | WO_drain_io, | ||
886 | WO_bdev_flush, | ||
887 | WO_bio_barrier | ||
888 | }; | ||
889 | |||
890 | struct drbd_conf { | ||
891 | /* things that are stored as / read from meta data on disk */ | ||
892 | unsigned long flags; | ||
893 | |||
894 | /* configured by drbdsetup */ | ||
895 | struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ | ||
896 | struct syncer_conf sync_conf; | ||
897 | struct drbd_backing_dev *ldev __protected_by(local); | ||
898 | |||
899 | sector_t p_size; /* partner's disk size */ | ||
900 | struct request_queue *rq_queue; | ||
901 | struct block_device *this_bdev; | ||
902 | struct gendisk *vdisk; | ||
903 | |||
904 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | ||
905 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
906 | int agreed_pro_version; /* actually used protocol version */ | ||
907 | unsigned long last_received; /* in jiffies, either socket */ | ||
908 | unsigned int ko_count; | ||
909 | struct drbd_work resync_work, | ||
910 | unplug_work, | ||
911 | md_sync_work; | ||
912 | struct timer_list resync_timer; | ||
913 | struct timer_list md_sync_timer; | ||
914 | |||
915 | /* Used after attach while negotiating new disk state. */ | ||
916 | union drbd_state new_state_tmp; | ||
917 | |||
918 | union drbd_state state; | ||
919 | wait_queue_head_t misc_wait; | ||
920 | wait_queue_head_t state_wait; /* upon each state change. */ | ||
921 | unsigned int send_cnt; | ||
922 | unsigned int recv_cnt; | ||
923 | unsigned int read_cnt; | ||
924 | unsigned int writ_cnt; | ||
925 | unsigned int al_writ_cnt; | ||
926 | unsigned int bm_writ_cnt; | ||
927 | atomic_t ap_bio_cnt; /* Requests we need to complete */ | ||
928 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ | ||
929 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ | ||
930 | atomic_t unacked_cnt; /* Need to send replys for */ | ||
931 | atomic_t local_cnt; /* Waiting for local completion */ | ||
932 | atomic_t net_cnt; /* Users of net_conf */ | ||
933 | spinlock_t req_lock; | ||
934 | struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ | ||
935 | struct drbd_tl_epoch *newest_tle; | ||
936 | struct drbd_tl_epoch *oldest_tle; | ||
937 | struct list_head out_of_sequence_requests; | ||
938 | struct hlist_head *tl_hash; | ||
939 | unsigned int tl_hash_s; | ||
940 | |||
941 | /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ | ||
942 | unsigned long rs_total; | ||
943 | /* number of sync IOs that failed in this run */ | ||
944 | unsigned long rs_failed; | ||
945 | /* Syncer's start time [unit jiffies] */ | ||
946 | unsigned long rs_start; | ||
947 | /* cumulated time in PausedSyncX state [unit jiffies] */ | ||
948 | unsigned long rs_paused; | ||
949 | /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ | ||
950 | unsigned long rs_mark_left; | ||
951 | /* marks's time [unit jiffies] */ | ||
952 | unsigned long rs_mark_time; | ||
953 | /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ | ||
954 | unsigned long rs_same_csum; | ||
955 | |||
956 | /* where does the admin want us to start? (sector) */ | ||
957 | sector_t ov_start_sector; | ||
958 | /* where are we now? (sector) */ | ||
959 | sector_t ov_position; | ||
960 | /* Start sector of out of sync range (to merge printk reporting). */ | ||
961 | sector_t ov_last_oos_start; | ||
962 | /* size of out-of-sync range in sectors. */ | ||
963 | sector_t ov_last_oos_size; | ||
964 | unsigned long ov_left; /* in bits */ | ||
965 | struct crypto_hash *csums_tfm; | ||
966 | struct crypto_hash *verify_tfm; | ||
967 | |||
968 | struct drbd_thread receiver; | ||
969 | struct drbd_thread worker; | ||
970 | struct drbd_thread asender; | ||
971 | struct drbd_bitmap *bitmap; | ||
972 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ | ||
973 | |||
974 | /* Used to track operations of resync... */ | ||
975 | struct lru_cache *resync; | ||
976 | /* Number of locked elements in resync LRU */ | ||
977 | unsigned int resync_locked; | ||
978 | /* resync extent number waiting for application requests */ | ||
979 | unsigned int resync_wenr; | ||
980 | |||
981 | int open_cnt; | ||
982 | u64 *p_uuid; | ||
983 | struct drbd_epoch *current_epoch; | ||
984 | spinlock_t epoch_lock; | ||
985 | unsigned int epochs; | ||
986 | enum write_ordering_e write_ordering; | ||
987 | struct list_head active_ee; /* IO in progress */ | ||
988 | struct list_head sync_ee; /* IO in progress */ | ||
989 | struct list_head done_ee; /* send ack */ | ||
990 | struct list_head read_ee; /* IO in progress */ | ||
991 | struct list_head net_ee; /* zero-copy network send in progress */ | ||
992 | struct hlist_head *ee_hash; /* is protected by req_lock! */ | ||
993 | unsigned int ee_hash_s; | ||
994 | |||
995 | /* this one is protected by ee_lock, single thread */ | ||
996 | struct drbd_epoch_entry *last_write_w_barrier; | ||
997 | |||
998 | int next_barrier_nr; | ||
999 | struct hlist_head *app_reads_hash; /* is protected by req_lock */ | ||
1000 | struct list_head resync_reads; | ||
1001 | atomic_t pp_in_use; | ||
1002 | wait_queue_head_t ee_wait; | ||
1003 | struct page *md_io_page; /* one page buffer for md_io */ | ||
1004 | struct page *md_io_tmpp; /* for logical_block_size != 512 */ | ||
1005 | struct mutex md_io_mutex; /* protects the md_io_buffer */ | ||
1006 | spinlock_t al_lock; | ||
1007 | wait_queue_head_t al_wait; | ||
1008 | struct lru_cache *act_log; /* activity log */ | ||
1009 | unsigned int al_tr_number; | ||
1010 | int al_tr_cycle; | ||
1011 | int al_tr_pos; /* position of the next transaction in the journal */ | ||
1012 | struct crypto_hash *cram_hmac_tfm; | ||
1013 | struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ | ||
1014 | struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ | ||
1015 | void *int_dig_out; | ||
1016 | void *int_dig_in; | ||
1017 | void *int_dig_vv; | ||
1018 | wait_queue_head_t seq_wait; | ||
1019 | atomic_t packet_seq; | ||
1020 | unsigned int peer_seq; | ||
1021 | spinlock_t peer_seq_lock; | ||
1022 | unsigned int minor; | ||
1023 | unsigned long comm_bm_set; /* communicated number of set bits. */ | ||
1024 | cpumask_var_t cpu_mask; | ||
1025 | struct bm_io_work bm_io_work; | ||
1026 | u64 ed_uuid; /* UUID of the exposed data */ | ||
1027 | struct mutex state_mutex; | ||
1028 | char congestion_reason; /* Why we were congested... */ | ||
1029 | }; | ||
1030 | |||
1031 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | ||
1032 | { | ||
1033 | struct drbd_conf *mdev; | ||
1034 | |||
1035 | mdev = minor < minor_count ? minor_table[minor] : NULL; | ||
1036 | |||
1037 | return mdev; | ||
1038 | } | ||
1039 | |||
1040 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | ||
1041 | { | ||
1042 | return mdev->minor; | ||
1043 | } | ||
1044 | |||
1045 | /* returns 1 if it was successful, | ||
1046 | * returns 0 if there was no data socket. | ||
1047 | * so wherever you are going to use the data.socket, e.g. do | ||
1048 | * if (!drbd_get_data_sock(mdev)) | ||
1049 | * return 0; | ||
1050 | * CODE(); | ||
1051 | * drbd_put_data_sock(mdev); | ||
1052 | */ | ||
1053 | static inline int drbd_get_data_sock(struct drbd_conf *mdev) | ||
1054 | { | ||
1055 | mutex_lock(&mdev->data.mutex); | ||
1056 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1057 | * while we were waiting in down()... */ | ||
1058 | if (unlikely(mdev->data.socket == NULL)) { | ||
1059 | mutex_unlock(&mdev->data.mutex); | ||
1060 | return 0; | ||
1061 | } | ||
1062 | return 1; | ||
1063 | } | ||
1064 | |||
1065 | static inline void drbd_put_data_sock(struct drbd_conf *mdev) | ||
1066 | { | ||
1067 | mutex_unlock(&mdev->data.mutex); | ||
1068 | } | ||
1069 | |||
1070 | /* | ||
1071 | * function declarations | ||
1072 | *************************/ | ||
1073 | |||
1074 | /* drbd_main.c */ | ||
1075 | |||
1076 | enum chg_state_flags { | ||
1077 | CS_HARD = 1, | ||
1078 | CS_VERBOSE = 2, | ||
1079 | CS_WAIT_COMPLETE = 4, | ||
1080 | CS_SERIALIZE = 8, | ||
1081 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
1082 | }; | ||
1083 | |||
1084 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | ||
1085 | extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
1086 | union drbd_state mask, union drbd_state val); | ||
1087 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
1088 | union drbd_state); | ||
1089 | extern int _drbd_request_state(struct drbd_conf *, union drbd_state, | ||
1090 | union drbd_state, enum chg_state_flags); | ||
1091 | extern int __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
1092 | enum chg_state_flags, struct completion *done); | ||
1093 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
1094 | union drbd_state, int); | ||
1095 | extern int drbd_thread_start(struct drbd_thread *thi); | ||
1096 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); | ||
1097 | #ifdef CONFIG_SMP | ||
1098 | extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); | ||
1099 | extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); | ||
1100 | #else | ||
1101 | #define drbd_thread_current_set_cpu(A) ({}) | ||
1102 | #define drbd_calc_cpu_mask(A) ({}) | ||
1103 | #endif | ||
1104 | extern void drbd_free_resources(struct drbd_conf *mdev); | ||
1105 | extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
1106 | unsigned int set_size); | ||
1107 | extern void tl_clear(struct drbd_conf *mdev); | ||
1108 | extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); | ||
1109 | extern void drbd_free_sock(struct drbd_conf *mdev); | ||
1110 | extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, | ||
1111 | void *buf, size_t size, unsigned msg_flags); | ||
1112 | extern int drbd_send_protocol(struct drbd_conf *mdev); | ||
1113 | extern int drbd_send_uuids(struct drbd_conf *mdev); | ||
1114 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | ||
1115 | extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); | ||
1116 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); | ||
1117 | extern int _drbd_send_state(struct drbd_conf *mdev); | ||
1118 | extern int drbd_send_state(struct drbd_conf *mdev); | ||
1119 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | ||
1120 | enum drbd_packets cmd, struct p_header *h, | ||
1121 | size_t size, unsigned msg_flags); | ||
1122 | #define USE_DATA_SOCKET 1 | ||
1123 | #define USE_META_SOCKET 0 | ||
1124 | extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1125 | enum drbd_packets cmd, struct p_header *h, | ||
1126 | size_t size); | ||
1127 | extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1128 | char *data, size_t size); | ||
1129 | extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); | ||
1130 | extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, | ||
1131 | u32 set_size); | ||
1132 | extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1133 | struct drbd_epoch_entry *e); | ||
1134 | extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1135 | struct p_block_req *rp); | ||
1136 | extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1137 | struct p_data *dp); | ||
1138 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1139 | sector_t sector, int blksize, u64 block_id); | ||
1140 | extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1141 | struct drbd_epoch_entry *e); | ||
1142 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); | ||
1143 | extern int _drbd_send_barrier(struct drbd_conf *mdev, | ||
1144 | struct drbd_tl_epoch *barrier); | ||
1145 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | ||
1146 | sector_t sector, int size, u64 block_id); | ||
1147 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, | ||
1148 | sector_t sector, int size, | ||
1149 | void *digest, int digest_size, | ||
1150 | enum drbd_packets cmd); | ||
1151 | extern int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size); | ||
1152 | |||
1153 | extern int drbd_send_bitmap(struct drbd_conf *mdev); | ||
1154 | extern int _drbd_send_bitmap(struct drbd_conf *mdev); | ||
1155 | extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); | ||
1156 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); | ||
1157 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | ||
1158 | |||
1159 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
1160 | extern void drbd_md_sync(struct drbd_conf *mdev); | ||
1161 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | ||
1162 | /* maybe define them below as inline? */ | ||
1163 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1164 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1165 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1166 | extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1167 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); | ||
1168 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); | ||
1169 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); | ||
1170 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); | ||
1171 | extern void drbd_md_mark_dirty(struct drbd_conf *mdev); | ||
1172 | extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, | ||
1173 | int (*io_fn)(struct drbd_conf *), | ||
1174 | void (*done)(struct drbd_conf *, int), | ||
1175 | char *why); | ||
1176 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | ||
1177 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | ||
1178 | extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); | ||
1179 | |||
1180 | |||
1181 | /* Meta data layout | ||
1182 | We reserve a 128MB Block (4k aligned) | ||
1183 | * either at the end of the backing device | ||
1184 | * or on a separate meta data device. */ | ||
1185 | |||
1186 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | ||
1187 | /* The following numbers are sectors */ | ||
1188 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | ||
1189 | #define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ | ||
1190 | /* Allows up to about 3.8TB */ | ||
1191 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) | ||
1192 | |||
1193 | /* Since the smallest IO unit is usually 512 bytes */ | ||
1194 | #define MD_SECTOR_SHIFT 9 | ||
1195 | #define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) | ||
1196 | |||
1197 | /* activity log */ | ||
1198 | #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ | ||
1199 | #define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ | ||
1200 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) | ||
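The numeric consequences of these constants are easy to lose track of; the following editorial comment spells out the values they evaluate to (derived from the definitions above, not separately stated in the patch).

/* Worked values (editorial):
 *   MD_RESERVED_SECT = 128 << 11 = 262144 sectors = 128 MiB reserved
 *   MD_AL_OFFSET     = 8 sectors   -> AL starts 4 KiB into the meta area
 *   MD_AL_MAX_SIZE   = 64 sectors  -> 32 KiB of activity log
 *   MD_BM_OFFSET     = 8 + 64 = 72 sectors -> bitmap starts 36 KiB in
 *   AL_EXTENTS_PT    = (512 - 12)/8 - 1 = 61 extents per 512 B AL sector
 *   AL_EXTENT_SIZE   = 1 << 22 = 4 MiB of storage per AL extent
 */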
1201 | |||
1202 | #if BITS_PER_LONG == 32 | ||
1203 | #define LN2_BPL 5 | ||
1204 | #define cpu_to_lel(A) cpu_to_le32(A) | ||
1205 | #define lel_to_cpu(A) le32_to_cpu(A) | ||
1206 | #elif BITS_PER_LONG == 64 | ||
1207 | #define LN2_BPL 6 | ||
1208 | #define cpu_to_lel(A) cpu_to_le64(A) | ||
1209 | #define lel_to_cpu(A) le64_to_cpu(A) | ||
1210 | #else | ||
1211 | #error "LN2 of BITS_PER_LONG unknown!" | ||
1212 | #endif | ||
1213 | |||
1214 | /* resync bitmap */ | ||
1215 | /* 16MB sized 'bitmap extent' to track syncer usage */ | ||
1216 | struct bm_extent { | ||
1217 | int rs_left; /* number of bits set (out of sync) in this extent. */ | ||
1218 | int rs_failed; /* number of failed resync requests in this extent. */ | ||
1219 | unsigned long flags; | ||
1220 | struct lc_element lce; | ||
1221 | }; | ||
1222 | |||
1223 | #define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ | ||
1224 | #define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ | ||
1225 | |||
1226 | /* drbd_bitmap.c */ | ||
1227 | /* | ||
1228 | * We need to store one bit for a block. | ||
1229 | * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. | ||
1230 | * Bit 0 ==> local node thinks this block is binary identical on both nodes | ||
1231 | * Bit 1 ==> local node thinks this block needs to be synced. | ||
1232 | */ | ||
1233 | |||
1234 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | ||
1235 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) | ||
1236 | /* (9+3) : 512 bytes @ 8 bits; representing 16M storage | ||
1237 | * per sector of on disk bitmap */ | ||
1238 | #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ | ||
1239 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) | ||
1240 | |||
1241 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) | ||
1242 | #error "HAVE YOU FIXED drbdmeta AS WELL??" | ||
1243 | #endif | ||
1244 | |||
1245 | /* this many _storage_ sectors are described by one bit */ | ||
1246 | #define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9)) | ||
1247 | #define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) | ||
1248 | #define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) | ||
1249 | |||
1250 | /* bit to represented kilo byte conversion */ | ||
1251 | #define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) | ||
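To make the shift arithmetic concrete, here is an editorial worked example for the 1 GiB case mentioned in the comment above; the numbers follow directly from the macro definitions.

/* Worked example (editorial): a 1 GiB device has 2097152 512-byte sectors.
 *   BM_SECT_TO_BIT(2097152) = 2097152 >> 3 = 262144 bits
 *                             -> 262144 / 8 = 32 KiB of bitmap, as noted above
 *   BM_BIT_TO_SECT(1)       = 8 sectors, i.e. each bit covers 4 KiB of storage
 *   Bit2KB(262144)          = 262144 << 2 = 1048576 KiB = 1 GiB of storage covered
 */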
1252 | |||
1253 | /* in which _bitmap_ extent (resp. sector) the bit for a certain | ||
1254 | * _storage_ sector is located */ | ||
1255 | #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) | ||
1256 | |||
1257 | /* how many _storage_ sectors we have per bitmap sector */ | ||
1258 | #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) | ||
1259 | #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) | ||
1260 | |||
1261 | /* in one sector of the bitmap, we have this many activity_log extents. */ | ||
1262 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) | ||
1263 | #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
1264 | |||
1265 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) | ||
1266 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) | ||
1267 | |||
1268 | /* the extent in "PER_EXTENT" below is an activity log extent | ||
1269 | * we need that many (long words/bytes) to store the bitmap | ||
1270 | * of one AL_EXTENT_SIZE chunk of storage. | ||
1271 | * we can store the bitmap for that many AL_EXTENTS within | ||
1272 | * one sector of the _on_disk_ bitmap: | ||
1273 | * bit 0 bit 37 bit 38 bit (512*8)-1 | ||
1274 | * ...|........|........|.. // ..|........| | ||
1275 | * sect. 0 `296 `304 ^(512*8*8)-1 | ||
1276 | * | ||
1277 | #define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) | ||
1278 | #define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 | ||
1279 | #define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 | ||
1280 | */ | ||
1281 | |||
1282 | #define DRBD_MAX_SECTORS_32 (0xffffffffLU) | ||
1283 | #define DRBD_MAX_SECTORS_BM \ | ||
1284 | ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) | ||
1285 | #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 | ||
1286 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | ||
1287 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM | ||
1288 | #elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 | ||
1289 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 | ||
1290 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 | ||
1291 | #else | ||
1292 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | ||
1293 | /* 16 TB in units of sectors */ | ||
1294 | #if BITS_PER_LONG == 32 | ||
1295 | /* adjust by one page worth of bitmap, | ||
1296 | * so we won't wrap around in drbd_bm_find_next_bit. | ||
1297 | * you should use 64bit OS for that much storage, anyways. */ | ||
1298 | #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) | ||
1299 | #else | ||
1300 | #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32) | ||
1301 | #endif | ||
1302 | #endif | ||
1303 | |||
1304 | /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. | ||
1305 | * With a value of 6, all IO within one 32K block maps to the same slot of the | ||
1306 | * hash table. */ | ||
1307 | #define HT_SHIFT 6 | ||
1308 | #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) | ||
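A quick editorial note on what the shift works out to:

/* Editorial: DRBD_MAX_SEGMENT_SIZE = 1U << (9 + 6) = 32768 bytes, i.e. the
 * 32K block referred to in the comment above; shifting a sector number right
 * by HT_SHIFT therefore selects one such 32K-aligned block. */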
1309 | |||
1310 | /* Number of elements in the app_reads_hash */ | ||
1311 | #define APP_R_HSIZE 15 | ||
1312 | |||
1313 | extern int drbd_bm_init(struct drbd_conf *mdev); | ||
1314 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); | ||
1315 | extern void drbd_bm_cleanup(struct drbd_conf *mdev); | ||
1316 | extern void drbd_bm_set_all(struct drbd_conf *mdev); | ||
1317 | extern void drbd_bm_clear_all(struct drbd_conf *mdev); | ||
1318 | extern int drbd_bm_set_bits( | ||
1319 | struct drbd_conf *mdev, unsigned long s, unsigned long e); | ||
1320 | extern int drbd_bm_clear_bits( | ||
1321 | struct drbd_conf *mdev, unsigned long s, unsigned long e); | ||
1322 | /* bm_set_bits variant for use while holding drbd_bm_lock */ | ||
1323 | extern void _drbd_bm_set_bits(struct drbd_conf *mdev, | ||
1324 | const unsigned long s, const unsigned long e); | ||
1325 | extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); | ||
1326 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | ||
1327 | extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); | ||
1328 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | ||
1329 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | ||
1330 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | ||
1331 | unsigned long al_enr); | ||
1332 | extern size_t drbd_bm_words(struct drbd_conf *mdev); | ||
1333 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); | ||
1334 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); | ||
1335 | extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1336 | /* bm_find_next variants for use while you hold drbd_bm_lock() */ | ||
1337 | extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1338 | extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1339 | extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); | ||
1340 | extern int drbd_bm_rs_done(struct drbd_conf *mdev); | ||
1341 | /* for receive_bitmap */ | ||
1342 | extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, | ||
1343 | size_t number, unsigned long *buffer); | ||
1344 | /* for _drbd_send_bitmap and drbd_bm_write_sect */ | ||
1345 | extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, | ||
1346 | size_t number, unsigned long *buffer); | ||
1347 | |||
1348 | extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); | ||
1349 | extern void drbd_bm_unlock(struct drbd_conf *mdev); | ||
1350 | |||
1351 | extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); | ||
1352 | /* drbd_main.c */ | ||
1353 | |||
1354 | extern struct kmem_cache *drbd_request_cache; | ||
1355 | extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ | ||
1356 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | ||
1357 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | ||
1358 | extern mempool_t *drbd_request_mempool; | ||
1359 | extern mempool_t *drbd_ee_mempool; | ||
1360 | |||
1361 | extern struct page *drbd_pp_pool; /* drbd's page pool */ | ||
1362 | extern spinlock_t drbd_pp_lock; | ||
1363 | extern int drbd_pp_vacant; | ||
1364 | extern wait_queue_head_t drbd_pp_wait; | ||
1365 | |||
1366 | extern rwlock_t global_state_lock; | ||
1367 | |||
1368 | extern struct drbd_conf *drbd_new_device(unsigned int minor); | ||
1369 | extern void drbd_free_mdev(struct drbd_conf *mdev); | ||
1370 | |||
1371 | extern int proc_details; | ||
1372 | |||
1373 | /* drbd_req */ | ||
1374 | extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); | ||
1375 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | ||
1376 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); | ||
1377 | extern int is_valid_ar_handle(struct drbd_request *, sector_t); | ||
1378 | |||
1379 | |||
1380 | /* drbd_nl.c */ | ||
1381 | extern void drbd_suspend_io(struct drbd_conf *mdev); | ||
1382 | extern void drbd_resume_io(struct drbd_conf *mdev); | ||
1383 | extern char *ppsize(char *buf, unsigned long long size); | ||
1384 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); | ||
1385 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | ||
1386 | extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); | ||
1387 | extern void resync_after_online_grow(struct drbd_conf *); | ||
1388 | extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); | ||
1389 | extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, | ||
1390 | int force); | ||
1391 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); | ||
1392 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); | ||
1393 | |||
1394 | /* drbd_worker.c */ | ||
1395 | extern int drbd_worker(struct drbd_thread *thi); | ||
1396 | extern int drbd_alter_sa(struct drbd_conf *mdev, int na); | ||
1397 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); | ||
1398 | extern void resume_next_sg(struct drbd_conf *mdev); | ||
1399 | extern void suspend_other_sg(struct drbd_conf *mdev); | ||
1400 | extern int drbd_resync_finished(struct drbd_conf *mdev); | ||
1401 | /* maybe rather drbd_main.c ? */ | ||
1402 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, | ||
1403 | struct drbd_backing_dev *bdev, sector_t sector, int rw); | ||
1404 | extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); | ||
1405 | |||
1406 | static inline void ov_oos_print(struct drbd_conf *mdev) | ||
1407 | { | ||
1408 | if (mdev->ov_last_oos_size) { | ||
1409 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", | ||
1410 | (unsigned long long)mdev->ov_last_oos_start, | ||
1411 | (unsigned long)mdev->ov_last_oos_size); | ||
1412 | } | ||
1413 | mdev->ov_last_oos_size = 0; | ||
1414 | } | ||
1415 | |||
1416 | |||
1417 | extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); | ||
1418 | /* worker callbacks */ | ||
1419 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); | ||
1420 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); | ||
1421 | extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); | ||
1422 | extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); | ||
1423 | extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); | ||
1424 | extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); | ||
1425 | extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); | ||
1426 | extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); | ||
1427 | extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); | ||
1428 | extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); | ||
1429 | extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); | ||
1430 | extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); | ||
1431 | extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); | ||
1432 | extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); | ||
1433 | extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); | ||
1434 | extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); | ||
1435 | extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); | ||
1436 | extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); | ||
1437 | |||
1438 | extern void resync_timer_fn(unsigned long data); | ||
1439 | |||
1440 | /* drbd_receiver.c */ | ||
1441 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | ||
1442 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | ||
1443 | u64 id, | ||
1444 | sector_t sector, | ||
1445 | unsigned int data_size, | ||
1446 | gfp_t gfp_mask) __must_hold(local); | ||
1447 | extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); | ||
1448 | extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1449 | struct list_head *head); | ||
1450 | extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1451 | struct list_head *head); | ||
1452 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); | ||
1453 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); | ||
1454 | extern void drbd_flush_workqueue(struct drbd_conf *mdev); | ||
1455 | |||
1456 | /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to | ||
1457 | * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ | ||
1458 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, | ||
1459 | char __user *optval, int optlen) | ||
1460 | { | ||
1461 | int err; | ||
1462 | if (level == SOL_SOCKET) | ||
1463 | err = sock_setsockopt(sock, level, optname, optval, optlen); | ||
1464 | else | ||
1465 | err = sock->ops->setsockopt(sock, level, optname, optval, | ||
1466 | optlen); | ||
1467 | return err; | ||
1468 | } | ||
1469 | |||
1470 | static inline void drbd_tcp_cork(struct socket *sock) | ||
1471 | { | ||
1472 | int __user val = 1; | ||
1473 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | ||
1474 | (char __user *)&val, sizeof(val)); | ||
1475 | } | ||
1476 | |||
1477 | static inline void drbd_tcp_uncork(struct socket *sock) | ||
1478 | { | ||
1479 | int __user val = 0; | ||
1480 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | ||
1481 | (char __user *)&val, sizeof(val)); | ||
1482 | } | ||
1483 | |||
1484 | static inline void drbd_tcp_nodelay(struct socket *sock) | ||
1485 | { | ||
1486 | int __user val = 1; | ||
1487 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, | ||
1488 | (char __user *)&val, sizeof(val)); | ||
1489 | } | ||
1490 | |||
1491 | static inline void drbd_tcp_quickack(struct socket *sock) | ||
1492 | { | ||
1493 | int __user val = 1; | ||
1494 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, | ||
1495 | (char __user *)&val, sizeof(val)); | ||
1496 | } | ||
1497 | |||
1498 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); | ||
1499 | |||
1500 | /* drbd_proc.c */ | ||
1501 | extern struct proc_dir_entry *drbd_proc; | ||
1502 | extern const struct file_operations drbd_proc_fops; | ||
1503 | extern const char *drbd_conn_str(enum drbd_conns s); | ||
1504 | extern const char *drbd_role_str(enum drbd_role s); | ||
1505 | |||
1506 | /* drbd_actlog.c */ | ||
1507 | extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1508 | extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); | ||
1509 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | ||
1510 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1511 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1512 | extern void drbd_rs_cancel_all(struct drbd_conf *mdev); | ||
1513 | extern int drbd_rs_del_all(struct drbd_conf *mdev); | ||
1514 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, | ||
1515 | sector_t sector, int size); | ||
1516 | extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); | ||
1517 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, | ||
1518 | int size, const char *file, const unsigned int line); | ||
1519 | #define drbd_set_in_sync(mdev, sector, size) \ | ||
1520 | __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) | ||
1521 | extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | ||
1522 | int size, const char *file, const unsigned int line); | ||
1523 | #define drbd_set_out_of_sync(mdev, sector, size) \ | ||
1524 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | ||
1525 | extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); | ||
1526 | extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); | ||
1527 | extern void drbd_al_shrink(struct drbd_conf *mdev); | ||
1528 | |||
1529 | |||
1530 | /* drbd_nl.c */ | ||
1531 | |||
1532 | void drbd_nl_cleanup(void); | ||
1533 | int __init drbd_nl_init(void); | ||
1534 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); | ||
1535 | void drbd_bcast_sync_progress(struct drbd_conf *mdev); | ||
1536 | void drbd_bcast_ee(struct drbd_conf *mdev, | ||
1537 | const char *reason, const int dgs, | ||
1538 | const char* seen_hash, const char* calc_hash, | ||
1539 | const struct drbd_epoch_entry* e); | ||
1540 | |||
1541 | |||
1542 | /** | ||
1543 | * DOC: DRBD State macros | ||
1544 | * | ||
1545 | * These macros are used to express state changes in easily readable form. | ||
1546 | * | ||
1547 | * The NS macros expand to a mask and a value that can be OR'ed onto the | ||
1548 | * current state once the spinlock (req_lock) has been taken. | ||
1549 | * | ||
1550 | * The _NS macros are used for state functions that get called with the | ||
1551 | * spinlock. These macros expand directly to the new state value. | ||
1552 | * | ||
1553 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
1554 | * to express state changes that affect more than one aspect of the state. | ||
1555 | * | ||
1556 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
1557 | * means that the network connection was established and that the peer | ||
1558 | * is in the secondary role. | ||
1559 | */ | ||
1560 | #define role_MASK R_MASK | ||
1561 | #define peer_MASK R_MASK | ||
1562 | #define disk_MASK D_MASK | ||
1563 | #define pdsk_MASK D_MASK | ||
1564 | #define conn_MASK C_MASK | ||
1565 | #define susp_MASK 1 | ||
1566 | #define user_isp_MASK 1 | ||
1567 | #define aftr_isp_MASK 1 | ||
1568 | |||
1569 | #define NS(T, S) \ | ||
1570 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
1571 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
1572 | #define NS2(T1, S1, T2, S2) \ | ||
1573 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1574 | mask.T2 = T2##_MASK; mask; }), \ | ||
1575 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1576 | val.T2 = (S2); val; }) | ||
1577 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
1578 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1579 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
1580 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1581 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
1582 | |||
1583 | #define _NS(D, T, S) \ | ||
1584 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) | ||
1585 | #define _NS2(D, T1, S1, T2, S2) \ | ||
1586 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1587 | __ns.T2 = (S2); __ns; }) | ||
1588 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
1589 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1590 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
1591 | |||
1592 | /* | ||
1593 | * inline helper functions | ||
1594 | *************************/ | ||
1595 | |||
1596 | static inline void drbd_state_lock(struct drbd_conf *mdev) | ||
1597 | { | ||
1598 | wait_event(mdev->misc_wait, | ||
1599 | !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
1600 | } | ||
1601 | |||
1602 | static inline void drbd_state_unlock(struct drbd_conf *mdev) | ||
1603 | { | ||
1604 | clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); | ||
1605 | wake_up(&mdev->misc_wait); | ||
1606 | } | ||
1607 | |||
1608 | static inline int _drbd_set_state(struct drbd_conf *mdev, | ||
1609 | union drbd_state ns, enum chg_state_flags flags, | ||
1610 | struct completion *done) | ||
1611 | { | ||
1612 | int rv; | ||
1613 | |||
1614 | read_lock(&global_state_lock); | ||
1615 | rv = __drbd_set_state(mdev, ns, flags, done); | ||
1616 | read_unlock(&global_state_lock); | ||
1617 | |||
1618 | return rv; | ||
1619 | } | ||
1620 | |||
1621 | /** | ||
1622 | * drbd_request_state() - Request a state change | ||
1623 | * @mdev: DRBD device. | ||
1624 | * @mask: mask of state bits to change. | ||
1625 | * @val: value of new state bits. | ||
1626 | * | ||
1627 | * This is the most graceful way of requesting a state change. It is | ||
1628 | * quite verbose in case the state change is not possible, and all those | ||
1629 | * state changes are globally serialized. | ||
1630 | */ | ||
1631 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
1632 | union drbd_state mask, | ||
1633 | union drbd_state val) | ||
1634 | { | ||
1635 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | ||
1636 | } | ||
1637 | |||
1638 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) | ||
1639 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) | ||
1640 | { | ||
1641 | switch (mdev->ldev->dc.on_io_error) { | ||
1642 | case EP_PASS_ON: | ||
1643 | if (!forcedetach) { | ||
1644 | if (printk_ratelimit()) | ||
1645 | dev_err(DEV, "Local IO failed in %s. " | ||
1646 | "Passing error on...\n", where); | ||
1647 | break; | ||
1648 | } | ||
1649 | /* NOTE fall through to detach case if forcedetach set */ | ||
1650 | case EP_DETACH: | ||
1651 | case EP_CALL_HELPER: | ||
1652 | if (mdev->state.disk > D_FAILED) { | ||
1653 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | ||
1654 | dev_err(DEV, "Local IO failed in %s. " | ||
1655 | "Detaching...\n", where); | ||
1656 | } | ||
1657 | break; | ||
1658 | } | ||
1659 | } | ||
1660 | |||
1661 | /** | ||
1662 | * drbd_chk_io_error() - Handle the on_io_error setting; should be called from all IO completion handlers | ||
1663 | * @mdev: DRBD device. | ||
1664 | * @error: Error code passed to the IO completion callback | ||
1665 | * @forcedetach: Force detach. I.e. the error happened while accessing the meta data | ||
1666 | * | ||
1667 | * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) | ||
1668 | */ | ||
1669 | #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) | ||
1670 | static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | ||
1671 | int error, int forcedetach, const char *where) | ||
1672 | { | ||
1673 | if (error) { | ||
1674 | unsigned long flags; | ||
1675 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
1676 | __drbd_chk_io_error_(mdev, forcedetach, where); | ||
1677 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | |||
1682 | /** | ||
1683 | * drbd_md_first_sector() - Returns the first sector number of the meta data area | ||
1684 | * @bdev: Meta data block device. | ||
1685 | * | ||
1686 | * BTW, for internal meta data, this happens to be the maximum capacity | ||
1687 | * we could agree upon with our peer node. | ||
1688 | */ | ||
1689 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1690 | { | ||
1691 | switch (bdev->dc.meta_dev_idx) { | ||
1692 | case DRBD_MD_INDEX_INTERNAL: | ||
1693 | case DRBD_MD_INDEX_FLEX_INT: | ||
1694 | return bdev->md.md_offset + bdev->md.bm_offset; | ||
1695 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1696 | default: | ||
1697 | return bdev->md.md_offset; | ||
1698 | } | ||
1699 | } | ||
1700 | |||
1701 | /** | ||
1702 | * drbd_md_last_sector() - Return the last sector number of the meta data area | ||
1703 | * @bdev: Meta data block device. | ||
1704 | */ | ||
1705 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | ||
1706 | { | ||
1707 | switch (bdev->dc.meta_dev_idx) { | ||
1708 | case DRBD_MD_INDEX_INTERNAL: | ||
1709 | case DRBD_MD_INDEX_FLEX_INT: | ||
1710 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | ||
1711 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1712 | default: | ||
1713 | return bdev->md.md_offset + bdev->md.md_size_sect; | ||
1714 | } | ||
1715 | } | ||
1716 | |||
1717 | /* Returns the number of 512 byte sectors of the device */ | ||
1718 | static inline sector_t drbd_get_capacity(struct block_device *bdev) | ||
1719 | { | ||
1720 | /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ | ||
1721 | return bdev ? bdev->bd_inode->i_size >> 9 : 0; | ||
1722 | } | ||
1723 | |||
1724 | /** | ||
1725 | * drbd_get_max_capacity() - Returns the capacity we announce to our peer | ||
1726 | * @bdev: Meta data block device. | ||
1727 | * | ||
1728 | * returns the capacity we announce to our peer. We clip ourselves at the | ||
1729 | * various MAX_SECTORS, because if we don't, the current implementation will | ||
1730 | * oops sooner or later | ||
1731 | */ | ||
1732 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | ||
1733 | { | ||
1734 | sector_t s; | ||
1735 | switch (bdev->dc.meta_dev_idx) { | ||
1736 | case DRBD_MD_INDEX_INTERNAL: | ||
1737 | case DRBD_MD_INDEX_FLEX_INT: | ||
1738 | s = drbd_get_capacity(bdev->backing_bdev) | ||
1739 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | ||
1740 | drbd_md_first_sector(bdev)) | ||
1741 | : 0; | ||
1742 | break; | ||
1743 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1744 | s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | ||
1745 | drbd_get_capacity(bdev->backing_bdev)); | ||
1746 | /* clip at maximum size the meta device can support */ | ||
1747 | s = min_t(sector_t, s, | ||
1748 | BM_EXT_TO_SECT(bdev->md.md_size_sect | ||
1749 | - bdev->md.bm_offset)); | ||
1750 | break; | ||
1751 | default: | ||
1752 | s = min_t(sector_t, DRBD_MAX_SECTORS, | ||
1753 | drbd_get_capacity(bdev->backing_bdev)); | ||
1754 | } | ||
1755 | return s; | ||
1756 | } | ||
1757 | |||
1758 | /** | ||
1759 | * drbd_md_ss__() - Return the sector number of our meta data super block | ||
1760 | * @mdev: DRBD device. | ||
1761 | * @bdev: Meta data block device. | ||
1762 | */ | ||
1763 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | ||
1764 | struct drbd_backing_dev *bdev) | ||
1765 | { | ||
1766 | switch (bdev->dc.meta_dev_idx) { | ||
1767 | default: /* external, some index */ | ||
1768 | return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; | ||
1769 | case DRBD_MD_INDEX_INTERNAL: | ||
1770 | /* with drbd08, internal meta data is always "flexible" */ | ||
1771 | case DRBD_MD_INDEX_FLEX_INT: | ||
1772 | /* sizeof(struct md_on_disk_07) == 4k | ||
1773 | * position: last 4k aligned block of 4k size */ | ||
1774 | if (!bdev->backing_bdev) { | ||
1775 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1776 | dev_err(DEV, "bdev->backing_bdev==NULL\n"); | ||
1777 | dump_stack(); | ||
1778 | } | ||
1779 | return 0; | ||
1780 | } | ||
1781 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) | ||
1782 | - MD_AL_OFFSET; | ||
1783 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1784 | return 0; | ||
1785 | } | ||
1786 | } | ||
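A small worked example of the internal case above (illustrative only; it assumes MD_AL_OFFSET is 8 sectors, i.e. one 4k block, matching the "last 4k aligned block of 4k size" comment):

	/* backing device of, say, 2000005 sectors (512 byte each) */
	sector_t capacity = 2000005;
	sector_t aligned  = capacity & ~7ULL;   /* 2000000: round down to a 4k boundary */
	sector_t md_ss    = aligned - 8;        /* 1999992, assuming MD_AL_OFFSET == 8 */
	/* the super block then occupies the last full 4k block: sectors 1999992..1999999 */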
1787 | |||
1788 | static inline void | ||
1789 | _drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | ||
1790 | { | ||
1791 | list_add_tail(&w->list, &q->q); | ||
1792 | up(&q->s); | ||
1793 | } | ||
1794 | |||
1795 | static inline void | ||
1796 | drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) | ||
1797 | { | ||
1798 | unsigned long flags; | ||
1799 | spin_lock_irqsave(&q->q_lock, flags); | ||
1800 | list_add(&w->list, &q->q); | ||
1801 | up(&q->s); /* within the spinlock, | ||
1802 | see comment near end of drbd_worker() */ | ||
1803 | spin_unlock_irqrestore(&q->q_lock, flags); | ||
1804 | } | ||
1805 | |||
1806 | static inline void | ||
1807 | drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | ||
1808 | { | ||
1809 | unsigned long flags; | ||
1810 | spin_lock_irqsave(&q->q_lock, flags); | ||
1811 | list_add_tail(&w->list, &q->q); | ||
1812 | up(&q->s); /* within the spinlock, | ||
1813 | see comment near end of drbd_worker() */ | ||
1814 | spin_unlock_irqrestore(&q->q_lock, flags); | ||
1815 | } | ||
1816 | |||
1817 | static inline void wake_asender(struct drbd_conf *mdev) | ||
1818 | { | ||
1819 | if (test_bit(SIGNAL_ASENDER, &mdev->flags)) | ||
1820 | force_sig(DRBD_SIG, mdev->asender.task); | ||
1821 | } | ||
1822 | |||
1823 | static inline void request_ping(struct drbd_conf *mdev) | ||
1824 | { | ||
1825 | set_bit(SEND_PING, &mdev->flags); | ||
1826 | wake_asender(mdev); | ||
1827 | } | ||
1828 | |||
1829 | static inline int drbd_send_short_cmd(struct drbd_conf *mdev, | ||
1830 | enum drbd_packets cmd) | ||
1831 | { | ||
1832 | struct p_header h; | ||
1833 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); | ||
1834 | } | ||
1835 | |||
1836 | static inline int drbd_send_ping(struct drbd_conf *mdev) | ||
1837 | { | ||
1838 | struct p_header h; | ||
1839 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); | ||
1840 | } | ||
1841 | |||
1842 | static inline int drbd_send_ping_ack(struct drbd_conf *mdev) | ||
1843 | { | ||
1844 | struct p_header h; | ||
1845 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); | ||
1846 | } | ||
1847 | |||
1848 | static inline void drbd_thread_stop(struct drbd_thread *thi) | ||
1849 | { | ||
1850 | _drbd_thread_stop(thi, FALSE, TRUE); | ||
1851 | } | ||
1852 | |||
1853 | static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) | ||
1854 | { | ||
1855 | _drbd_thread_stop(thi, FALSE, FALSE); | ||
1856 | } | ||
1857 | |||
1858 | static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) | ||
1859 | { | ||
1860 | _drbd_thread_stop(thi, TRUE, FALSE); | ||
1861 | } | ||
1862 | |||
1863 | /* counts how many answer packets we expect from our peer, | ||
1864 | * for either explicit application requests, | ||
1865 | * or implicit barrier packets as necessary. | ||
1866 | * increased: | ||
1867 | * w_send_barrier | ||
1868 | * _req_mod(req, queue_for_net_write or queue_for_net_read); | ||
1869 | * it is much easier and equally valid to count what we queue for the | ||
1870 | * worker, even before it actually was queued or sent. | ||
1871 | * (drbd_make_request_common; recovery path on read io-error) | ||
1872 | * decreased: | ||
1873 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) | ||
1874 | * _req_mod(req, data_received) | ||
1875 | * [from receive_DataReply] | ||
1876 | * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) | ||
1877 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] | ||
1878 | * for some reason it is NOT decreased in got_NegAck, | ||
1879 | * but in the resulting cleanup code from report_params. | ||
1880 | * we should try to remember the reason for that... | ||
1881 | * _req_mod(req, send_failed or send_canceled) | ||
1882 | * _req_mod(req, connection_lost_while_pending) | ||
1883 | * [from tl_clear_barrier] | ||
1884 | */ | ||
1885 | static inline void inc_ap_pending(struct drbd_conf *mdev) | ||
1886 | { | ||
1887 | atomic_inc(&mdev->ap_pending_cnt); | ||
1888 | } | ||
1889 | |||
1890 | #define ERR_IF_CNT_IS_NEGATIVE(which) \ | ||
1891 | if (atomic_read(&mdev->which) < 0) \ | ||
1892 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ | ||
1893 | __func__ , __LINE__ , \ | ||
1894 | atomic_read(&mdev->which)) | ||
1895 | |||
1896 | #define dec_ap_pending(mdev) do { \ | ||
1897 | typecheck(struct drbd_conf *, mdev); \ | ||
1898 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ | ||
1899 | wake_up(&mdev->misc_wait); \ | ||
1900 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) | ||
1901 | |||
1902 | /* counts how many resync-related answers we still expect from the peer | ||
1903 | * increase decrease | ||
1904 | * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) | ||
1905 | * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) | ||
1906 | * (or P_NEG_ACK with ID_SYNCER) | ||
1907 | */ | ||
1908 | static inline void inc_rs_pending(struct drbd_conf *mdev) | ||
1909 | { | ||
1910 | atomic_inc(&mdev->rs_pending_cnt); | ||
1911 | } | ||
1912 | |||
1913 | #define dec_rs_pending(mdev) do { \ | ||
1914 | typecheck(struct drbd_conf *, mdev); \ | ||
1915 | atomic_dec(&mdev->rs_pending_cnt); \ | ||
1916 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) | ||
1917 | |||
1918 | /* counts how many answers we still need to send to the peer. | ||
1919 | * increased on | ||
1920 | * receive_Data unless protocol A; | ||
1921 | * we need to send a P_RECV_ACK (proto B) | ||
1922 | * or P_WRITE_ACK (proto C) | ||
1923 | * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK | ||
1924 | * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA | ||
1925 | * receive_Barrier_* we need to send a P_BARRIER_ACK | ||
1926 | */ | ||
1927 | static inline void inc_unacked(struct drbd_conf *mdev) | ||
1928 | { | ||
1929 | atomic_inc(&mdev->unacked_cnt); | ||
1930 | } | ||
1931 | |||
1932 | #define dec_unacked(mdev) do { \ | ||
1933 | typecheck(struct drbd_conf *, mdev); \ | ||
1934 | atomic_dec(&mdev->unacked_cnt); \ | ||
1935 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
1936 | |||
1937 | #define sub_unacked(mdev, n) do { \ | ||
1938 | typecheck(struct drbd_conf *, mdev); \ | ||
1939 | atomic_sub(n, &mdev->unacked_cnt); \ | ||
1940 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
1941 | |||
1942 | |||
1943 | static inline void put_net_conf(struct drbd_conf *mdev) | ||
1944 | { | ||
1945 | if (atomic_dec_and_test(&mdev->net_cnt)) | ||
1946 | wake_up(&mdev->misc_wait); | ||
1947 | } | ||
1948 | |||
1949 | /** | ||
1950 | * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there | ||
1951 | * @mdev: DRBD device. | ||
1952 | * | ||
1953 | * You have to call put_net_conf() when finished working with mdev->net_conf. | ||
1954 | */ | ||
1955 | static inline int get_net_conf(struct drbd_conf *mdev) | ||
1956 | { | ||
1957 | int have_net_conf; | ||
1958 | |||
1959 | atomic_inc(&mdev->net_cnt); | ||
1960 | have_net_conf = mdev->state.conn >= C_UNCONNECTED; | ||
1961 | if (!have_net_conf) | ||
1962 | put_net_conf(mdev); | ||
1963 | return have_net_conf; | ||
1964 | } | ||
1965 | |||
1966 | /** | ||
1967 | * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev | ||
1968 | * @M: DRBD device. | ||
1969 | * | ||
1970 | * You have to call put_ldev() when finished working with mdev->ldev. | ||
1971 | */ | ||
1972 | #define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT)) | ||
1973 | #define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS)) | ||
1974 | |||
1975 | static inline void put_ldev(struct drbd_conf *mdev) | ||
1976 | { | ||
1977 | __release(local); | ||
1978 | if (atomic_dec_and_test(&mdev->local_cnt)) | ||
1979 | wake_up(&mdev->misc_wait); | ||
1980 | D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); | ||
1981 | } | ||
1982 | |||
1983 | #ifndef __CHECKER__ | ||
1984 | static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | ||
1985 | { | ||
1986 | int io_allowed; | ||
1987 | |||
1988 | atomic_inc(&mdev->local_cnt); | ||
1989 | io_allowed = (mdev->state.disk >= mins); | ||
1990 | if (!io_allowed) | ||
1991 | put_ldev(mdev); | ||
1992 | return io_allowed; | ||
1993 | } | ||
1994 | #else | ||
1995 | extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins); | ||
1996 | #endif | ||
1997 | |||
1998 | /* you must have an "get_ldev" reference */ | ||
1999 | static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, | ||
2000 | unsigned long *bits_left, unsigned int *per_mil_done) | ||
2001 | { | ||
2002 | /* | ||
2003 | * this is to break it at compile time when we change that | ||
2004 | * (we may feel 4TB maximum storage per drbd is not enough) | ||
2005 | */ | ||
2006 | typecheck(unsigned long, mdev->rs_total); | ||
2007 | |||
2008 | /* note: both rs_total and rs_left are in bits, i.e. in | ||
2009 | * units of BM_BLOCK_SIZE. | ||
2010 | * for the percentage, we don't care. */ | ||
2011 | |||
2012 | *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; | ||
2013 | /* >> 10 to prevent overflow, | ||
2014 | * +1 to prevent division by zero */ | ||
2015 | if (*bits_left > mdev->rs_total) { | ||
2016 | /* doh. maybe a logic bug somewhere. | ||
2017 | * may also be just a race condition | ||
2018 | * between this and a disconnect during sync. | ||
2019 | * for now, just prevent in-kernel buffer overflow. | ||
2020 | */ | ||
2021 | smp_rmb(); | ||
2022 | dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", | ||
2023 | drbd_conn_str(mdev->state.conn), | ||
2024 | *bits_left, mdev->rs_total, mdev->rs_failed); | ||
2025 | *per_mil_done = 0; | ||
2026 | } else { | ||
2027 | /* make sure the calculation happens in long context */ | ||
2028 | unsigned long tmp = 1000UL - | ||
2029 | (*bits_left >> 10)*1000UL | ||
2030 | / ((mdev->rs_total >> 10) + 1UL); | ||
2031 | *per_mil_done = tmp; | ||
2032 | } | ||
2033 | } | ||
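A quick arithmetic sketch of the per-mil calculation above (numbers chosen arbitrarily):

	/* rs_total = 1048576 bits, bits_left = 262144 bits (25% still to go) */
	unsigned long tmp = 1000UL - (262144UL >> 10) * 1000UL / ((1048576UL >> 10) + 1UL);
	/* = 1000 - 256 * 1000 / 1025 = 1000 - 249 = 751, i.e. 75.1% done */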
2034 | |||
2035 | |||
2036 | /* this throttles on-the-fly application requests | ||
2037 | * according to max_buffers settings; | ||
2038 | * maybe re-implement using semaphores? */ | ||
2039 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) | ||
2040 | { | ||
2041 | int mxb = 1000000; /* arbitrary limit on open requests */ | ||
2042 | if (get_net_conf(mdev)) { | ||
2043 | mxb = mdev->net_conf->max_buffers; | ||
2044 | put_net_conf(mdev); | ||
2045 | } | ||
2046 | return mxb; | ||
2047 | } | ||
2048 | |||
2049 | static inline int drbd_state_is_stable(union drbd_state s) | ||
2050 | { | ||
2051 | |||
2052 | /* DO NOT add a default clause, we want the compiler to warn us | ||
2053 | * for any newly introduced state we may have forgotten to add here */ | ||
2054 | |||
2055 | switch ((enum drbd_conns)s.conn) { | ||
2056 | /* new io only accepted when there is no connection, ... */ | ||
2057 | case C_STANDALONE: | ||
2058 | case C_WF_CONNECTION: | ||
2059 | /* ... or there is a well established connection. */ | ||
2060 | case C_CONNECTED: | ||
2061 | case C_SYNC_SOURCE: | ||
2062 | case C_SYNC_TARGET: | ||
2063 | case C_VERIFY_S: | ||
2064 | case C_VERIFY_T: | ||
2065 | case C_PAUSED_SYNC_S: | ||
2066 | case C_PAUSED_SYNC_T: | ||
2067 | /* maybe stable, look at the disk state */ | ||
2068 | break; | ||
2069 | |||
2070 | /* no new io accepted during transitional states | ||
2071 | * like handshake or teardown */ | ||
2072 | case C_DISCONNECTING: | ||
2073 | case C_UNCONNECTED: | ||
2074 | case C_TIMEOUT: | ||
2075 | case C_BROKEN_PIPE: | ||
2076 | case C_NETWORK_FAILURE: | ||
2077 | case C_PROTOCOL_ERROR: | ||
2078 | case C_TEAR_DOWN: | ||
2079 | case C_WF_REPORT_PARAMS: | ||
2080 | case C_STARTING_SYNC_S: | ||
2081 | case C_STARTING_SYNC_T: | ||
2082 | case C_WF_BITMAP_S: | ||
2083 | case C_WF_BITMAP_T: | ||
2084 | case C_WF_SYNC_UUID: | ||
2085 | case C_MASK: | ||
2086 | /* not "stable" */ | ||
2087 | return 0; | ||
2088 | } | ||
2089 | |||
2090 | switch ((enum drbd_disk_state)s.disk) { | ||
2091 | case D_DISKLESS: | ||
2092 | case D_INCONSISTENT: | ||
2093 | case D_OUTDATED: | ||
2094 | case D_CONSISTENT: | ||
2095 | case D_UP_TO_DATE: | ||
2096 | /* disk state is stable as well. */ | ||
2097 | break; | ||
2098 | |||
2099 | /* no new io accepted during transitional states */ | ||
2100 | case D_ATTACHING: | ||
2101 | case D_FAILED: | ||
2102 | case D_NEGOTIATING: | ||
2103 | case D_UNKNOWN: | ||
2104 | case D_MASK: | ||
2105 | /* not "stable" */ | ||
2106 | return 0; | ||
2107 | } | ||
2108 | |||
2109 | return 1; | ||
2110 | } | ||
2111 | |||
2112 | static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) | ||
2113 | { | ||
2114 | int mxb = drbd_get_max_buffers(mdev); | ||
2115 | |||
2116 | if (mdev->state.susp) | ||
2117 | return 0; | ||
2118 | if (test_bit(SUSPEND_IO, &mdev->flags)) | ||
2119 | return 0; | ||
2120 | |||
2121 | /* to avoid potential deadlock or bitmap corruption, | ||
2122 | * in various places, we only allow new application io | ||
2123 | * to start during "stable" states. */ | ||
2124 | |||
2125 | /* no new io accepted when attaching or detaching the disk */ | ||
2126 | if (!drbd_state_is_stable(mdev->state)) | ||
2127 | return 0; | ||
2128 | |||
2129 | /* since some older kernels don't have atomic_add_unless, | ||
2130 | * and we are within the spinlock anyway, we have this workaround. */ | ||
2131 | if (atomic_read(&mdev->ap_bio_cnt) > mxb) | ||
2132 | return 0; | ||
2133 | if (test_bit(BITMAP_IO, &mdev->flags)) | ||
2134 | return 0; | ||
2135 | return 1; | ||
2136 | } | ||
2137 | |||
2138 | /* I'd like to use wait_event_lock_irq, | ||
2139 | * but I'm not sure when it got introduced, | ||
2140 | * and not sure when it has 3 or 4 arguments */ | ||
2141 | static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) | ||
2142 | { | ||
2143 | /* compare with after_state_ch, | ||
2144 | * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ | ||
2145 | DEFINE_WAIT(wait); | ||
2146 | |||
2147 | /* we wait here | ||
2148 | * as long as the device is suspended, | ||
2149 | * as long as the bitmap is still on the fly during the connection | ||
2150 | * handshake, and as long as we would exceed the max_buffer limit. | ||
2151 | * | ||
2152 | * to avoid races with the reconnect code, | ||
2153 | * we need to atomic_inc within the spinlock. */ | ||
2154 | |||
2155 | spin_lock_irq(&mdev->req_lock); | ||
2156 | while (!__inc_ap_bio_cond(mdev)) { | ||
2157 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
2158 | spin_unlock_irq(&mdev->req_lock); | ||
2159 | schedule(); | ||
2160 | finish_wait(&mdev->misc_wait, &wait); | ||
2161 | spin_lock_irq(&mdev->req_lock); | ||
2162 | } | ||
2163 | atomic_add(one_or_two, &mdev->ap_bio_cnt); | ||
2164 | spin_unlock_irq(&mdev->req_lock); | ||
2165 | } | ||
2166 | |||
2167 | static inline void dec_ap_bio(struct drbd_conf *mdev) | ||
2168 | { | ||
2169 | int mxb = drbd_get_max_buffers(mdev); | ||
2170 | int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); | ||
2171 | |||
2172 | D_ASSERT(ap_bio >= 0); | ||
2173 | /* this currently does wake_up for every dec_ap_bio! | ||
2174 | * maybe rather introduce some type of hysteresis? | ||
2175 | * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ | ||
2176 | if (ap_bio < mxb) | ||
2177 | wake_up(&mdev->misc_wait); | ||
2178 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { | ||
2179 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | ||
2180 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | ||
2181 | } | ||
2182 | } | ||
2183 | |||
2184 | static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | ||
2185 | { | ||
2186 | mdev->ed_uuid = val; | ||
2187 | } | ||
2188 | |||
2189 | static inline int seq_cmp(u32 a, u32 b) | ||
2190 | { | ||
2191 | /* we assume wrap around at 32bit. | ||
2192 | * for wrap around at 24bit (old atomic_t), | ||
2193 | * we'd have to | ||
2194 | * a <<= 8; b <<= 8; | ||
2195 | */ | ||
2196 | return (s32)(a) - (s32)(b); | ||
2197 | } | ||
2198 | #define seq_lt(a, b) (seq_cmp((a), (b)) < 0) | ||
2199 | #define seq_gt(a, b) (seq_cmp((a), (b)) > 0) | ||
2200 | #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) | ||
2201 | #define seq_le(a, b) (seq_cmp((a), (b)) <= 0) | ||
2202 | /* CAUTION: please no side effects in arguments! */ | ||
2203 | #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) | ||
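A minimal sketch of the wrap-around behaviour (illustrative values):

	u32 older = 0xfffffffeU, newer = 5;
	/* seq_cmp(newer, older) == (s32)5 - (s32)0xfffffffe == 5 - (-2) == 7 > 0 */
	/* so seq_gt(newer, older) holds and seq_max(newer, older) == 5,         */
	/* even though 5 < 0xfffffffe as plain unsigned numbers.                 */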
2204 | |||
2205 | static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) | ||
2206 | { | ||
2207 | unsigned int m; | ||
2208 | spin_lock(&mdev->peer_seq_lock); | ||
2209 | m = seq_max(mdev->peer_seq, new_seq); | ||
2210 | mdev->peer_seq = m; | ||
2211 | spin_unlock(&mdev->peer_seq_lock); | ||
2212 | if (m == new_seq) | ||
2213 | wake_up(&mdev->seq_wait); | ||
2214 | } | ||
2215 | |||
2216 | static inline void drbd_update_congested(struct drbd_conf *mdev) | ||
2217 | { | ||
2218 | struct sock *sk = mdev->data.socket->sk; | ||
2219 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
2220 | set_bit(NET_CONGESTED, &mdev->flags); | ||
2221 | } | ||
2222 | |||
2223 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) | ||
2224 | { | ||
2225 | /* sorry, we currently have no working implementation | ||
2226 | * of distributed TCQ stuff */ | ||
2227 | #ifndef QUEUE_ORDERED_NONE | ||
2228 | #define QUEUE_ORDERED_NONE 0 | ||
2229 | #endif | ||
2230 | return QUEUE_ORDERED_NONE; | ||
2231 | } | ||
2232 | |||
2233 | static inline void drbd_blk_run_queue(struct request_queue *q) | ||
2234 | { | ||
2235 | if (q && q->unplug_fn) | ||
2236 | q->unplug_fn(q); | ||
2237 | } | ||
2238 | |||
2239 | static inline void drbd_kick_lo(struct drbd_conf *mdev) | ||
2240 | { | ||
2241 | if (get_ldev(mdev)) { | ||
2242 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); | ||
2243 | put_ldev(mdev); | ||
2244 | } | ||
2245 | } | ||
2246 | |||
2247 | static inline void drbd_md_flush(struct drbd_conf *mdev) | ||
2248 | { | ||
2249 | int r; | ||
2250 | |||
2251 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) | ||
2252 | return; | ||
2253 | |||
2254 | r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); | ||
2255 | if (r) { | ||
2256 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
2257 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | ||
2258 | } | ||
2259 | } | ||
2260 | |||
2261 | #endif | ||
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 000000000000..93d1f9b469d4 --- /dev/null +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -0,0 +1,3716 @@ | |||
1 | /* | ||
2 | drbd.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev | ||
11 | from Logicworks, Inc. for making SDP replication support possible. | ||
12 | |||
13 | drbd is free software; you can redistribute it and/or modify | ||
14 | it under the terms of the GNU General Public License as published by | ||
15 | the Free Software Foundation; either version 2, or (at your option) | ||
16 | any later version. | ||
17 | |||
18 | drbd is distributed in the hope that it will be useful, | ||
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
21 | GNU General Public License for more details. | ||
22 | |||
23 | You should have received a copy of the GNU General Public License | ||
24 | along with drbd; see the file COPYING. If not, write to | ||
25 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | |||
27 | */ | ||
28 | |||
29 | #include <linux/module.h> | ||
30 | #include <linux/drbd.h> | ||
31 | #include <asm/uaccess.h> | ||
32 | #include <asm/types.h> | ||
33 | #include <net/sock.h> | ||
34 | #include <linux/ctype.h> | ||
35 | #include <linux/smp_lock.h> | ||
36 | #include <linux/fs.h> | ||
37 | #include <linux/file.h> | ||
38 | #include <linux/proc_fs.h> | ||
39 | #include <linux/init.h> | ||
40 | #include <linux/mm.h> | ||
41 | #include <linux/memcontrol.h> | ||
42 | #include <linux/mm_inline.h> | ||
43 | #include <linux/slab.h> | ||
44 | #include <linux/random.h> | ||
45 | #include <linux/reboot.h> | ||
46 | #include <linux/notifier.h> | ||
47 | #include <linux/kthread.h> | ||
48 | |||
49 | #define __KERNEL_SYSCALLS__ | ||
50 | #include <linux/unistd.h> | ||
51 | #include <linux/vmalloc.h> | ||
52 | |||
53 | #include <linux/drbd_limits.h> | ||
54 | #include "drbd_int.h" | ||
55 | #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ | ||
56 | |||
57 | #include "drbd_vli.h" | ||
58 | |||
59 | struct after_state_chg_work { | ||
60 | struct drbd_work w; | ||
61 | union drbd_state os; | ||
62 | union drbd_state ns; | ||
63 | enum chg_state_flags flags; | ||
64 | struct completion *done; | ||
65 | }; | ||
66 | |||
67 | int drbdd_init(struct drbd_thread *); | ||
68 | int drbd_worker(struct drbd_thread *); | ||
69 | int drbd_asender(struct drbd_thread *); | ||
70 | |||
71 | int drbd_init(void); | ||
72 | static int drbd_open(struct block_device *bdev, fmode_t mode); | ||
73 | static int drbd_release(struct gendisk *gd, fmode_t mode); | ||
74 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
75 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
76 | union drbd_state ns, enum chg_state_flags flags); | ||
77 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
78 | static void md_sync_timer_fn(unsigned long data); | ||
79 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
80 | |||
81 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | ||
82 | "Lars Ellenberg <lars@linbit.com>"); | ||
83 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); | ||
84 | MODULE_VERSION(REL_VERSION); | ||
85 | MODULE_LICENSE("GPL"); | ||
86 | MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); | ||
87 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); | ||
88 | |||
89 | #include <linux/moduleparam.h> | ||
90 | /* allow_open_on_secondary */ | ||
91 | MODULE_PARM_DESC(allow_oos, "DONT USE!"); | ||
92 | /* thanks to these macros, if compiled into the kernel (not-module), | ||
93 | * this becomes the boot parameter drbd.minor_count */ | ||
94 | module_param(minor_count, uint, 0444); | ||
95 | module_param(disable_sendpage, bool, 0644); | ||
96 | module_param(allow_oos, bool, 0); | ||
97 | module_param(cn_idx, uint, 0444); | ||
98 | module_param(proc_details, int, 0644); | ||
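For instance (illustrative value), when drbd is compiled into the kernel, the minor_count parameter above can be set on the kernel command line as:

	drbd.minor_count=64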
99 | |||
100 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
101 | int enable_faults; | ||
102 | int fault_rate; | ||
103 | static int fault_count; | ||
104 | int fault_devs; | ||
105 | /* bitmap of enabled faults */ | ||
106 | module_param(enable_faults, int, 0664); | ||
107 | /* fault rate % value - applies to all enabled faults */ | ||
108 | module_param(fault_rate, int, 0664); | ||
109 | /* count of faults inserted */ | ||
110 | module_param(fault_count, int, 0664); | ||
111 | /* bitmap of devices to insert faults on */ | ||
112 | module_param(fault_devs, int, 0644); | ||
113 | #endif | ||
114 | |||
115 | /* module parameter, defined */ | ||
116 | unsigned int minor_count = 32; | ||
117 | int disable_sendpage; | ||
118 | int allow_oos; | ||
119 | unsigned int cn_idx = CN_IDX_DRBD; | ||
120 | int proc_details; /* Detail level in proc drbd */ | ||
121 | |||
122 | /* Module parameter for setting the user mode helper program | ||
123 | * to run. Default is /sbin/drbdadm */ | ||
124 | char usermode_helper[80] = "/sbin/drbdadm"; | ||
125 | |||
126 | module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); | ||
127 | |||
128 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks | ||
129 | * as member "struct gendisk *vdisk;" | ||
130 | */ | ||
131 | struct drbd_conf **minor_table; | ||
132 | |||
133 | struct kmem_cache *drbd_request_cache; | ||
134 | struct kmem_cache *drbd_ee_cache; /* epoch entries */ | ||
135 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | ||
136 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | ||
137 | mempool_t *drbd_request_mempool; | ||
138 | mempool_t *drbd_ee_mempool; | ||
139 | |||
140 | /* I do not use a standard mempool, because: | ||
141 | 1) I want to hand out the pre-allocated objects first. | ||
142 | 2) I want to be able to interrupt sleeping allocation with a signal. | ||
143 | Note: This is a singly linked list; the next pointer is the private | ||
144 | member of struct page. | ||
145 | */ | ||
146 | struct page *drbd_pp_pool; | ||
147 | spinlock_t drbd_pp_lock; | ||
148 | int drbd_pp_vacant; | ||
149 | wait_queue_head_t drbd_pp_wait; | ||
150 | |||
151 | DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); | ||
152 | |||
153 | static const struct block_device_operations drbd_ops = { | ||
154 | .owner = THIS_MODULE, | ||
155 | .open = drbd_open, | ||
156 | .release = drbd_release, | ||
157 | }; | ||
158 | |||
159 | #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) | ||
160 | |||
161 | #ifdef __CHECKER__ | ||
162 | /* When checking with sparse, and this is an inline function, sparse will | ||
163 | give tons of false positives. When this is a real function, sparse works. | ||
164 | */ | ||
165 | int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | ||
166 | { | ||
167 | int io_allowed; | ||
168 | |||
169 | atomic_inc(&mdev->local_cnt); | ||
170 | io_allowed = (mdev->state.disk >= mins); | ||
171 | if (!io_allowed) { | ||
172 | if (atomic_dec_and_test(&mdev->local_cnt)) | ||
173 | wake_up(&mdev->misc_wait); | ||
174 | } | ||
175 | return io_allowed; | ||
176 | } | ||
177 | |||
178 | #endif | ||
179 | |||
180 | /** | ||
181 | * DOC: The transfer log | ||
182 | * | ||
183 | * The transfer log is a single linked list of &struct drbd_tl_epoch objects. | ||
184 | * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail | ||
185 | * of the list. There is always at least one &struct drbd_tl_epoch object. | ||
186 | * | ||
187 | * Each &struct drbd_tl_epoch has a circular double linked list of requests | ||
188 | * attached. | ||
189 | */ | ||
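A minimal sketch (illustration only, not part of the patch) of how such an epoch list is traversed; compare the loop in tl_clear() below:

	struct drbd_tl_epoch *b;
	/* callers hold mdev->req_lock while walking the transfer log */
	for (b = mdev->oldest_tle; b != NULL; b = b->next) {
		/* b->requests: circular double linked list of struct drbd_request,
		 * linked via their tl_requests member; b->n_req counts them */
	}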
190 | static int tl_init(struct drbd_conf *mdev) | ||
191 | { | ||
192 | struct drbd_tl_epoch *b; | ||
193 | |||
194 | /* during device minor initialization, we may well use GFP_KERNEL */ | ||
195 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); | ||
196 | if (!b) | ||
197 | return 0; | ||
198 | INIT_LIST_HEAD(&b->requests); | ||
199 | INIT_LIST_HEAD(&b->w.list); | ||
200 | b->next = NULL; | ||
201 | b->br_number = 4711; | ||
202 | b->n_req = 0; | ||
203 | b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
204 | |||
205 | mdev->oldest_tle = b; | ||
206 | mdev->newest_tle = b; | ||
207 | INIT_LIST_HEAD(&mdev->out_of_sequence_requests); | ||
208 | |||
209 | mdev->tl_hash = NULL; | ||
210 | mdev->tl_hash_s = 0; | ||
211 | |||
212 | return 1; | ||
213 | } | ||
214 | |||
215 | static void tl_cleanup(struct drbd_conf *mdev) | ||
216 | { | ||
217 | D_ASSERT(mdev->oldest_tle == mdev->newest_tle); | ||
218 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
219 | kfree(mdev->oldest_tle); | ||
220 | mdev->oldest_tle = NULL; | ||
221 | kfree(mdev->unused_spare_tle); | ||
222 | mdev->unused_spare_tle = NULL; | ||
223 | kfree(mdev->tl_hash); | ||
224 | mdev->tl_hash = NULL; | ||
225 | mdev->tl_hash_s = 0; | ||
226 | } | ||
227 | |||
228 | /** | ||
229 | * _tl_add_barrier() - Adds a barrier to the transfer log | ||
230 | * @mdev: DRBD device. | ||
231 | * @new: Barrier to be added before the current head of the TL. | ||
232 | * | ||
233 | * The caller must hold the req_lock. | ||
234 | */ | ||
235 | void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) | ||
236 | { | ||
237 | struct drbd_tl_epoch *newest_before; | ||
238 | |||
239 | INIT_LIST_HEAD(&new->requests); | ||
240 | INIT_LIST_HEAD(&new->w.list); | ||
241 | new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
242 | new->next = NULL; | ||
243 | new->n_req = 0; | ||
244 | |||
245 | newest_before = mdev->newest_tle; | ||
246 | /* never send a barrier number == 0, because that is special-cased | ||
247 | * when using TCQ for our write ordering code */ | ||
248 | new->br_number = (newest_before->br_number+1) ?: 1; | ||
249 | if (mdev->newest_tle != new) { | ||
250 | mdev->newest_tle->next = new; | ||
251 | mdev->newest_tle = new; | ||
252 | } | ||
253 | } | ||
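A tiny illustration of the GNU "?:" fallback above: on 32 bit wrap-around the barrier number skips 0, as the comment requires.

	unsigned int prev = 0xffffffffU;
	unsigned int next = (prev + 1) ?: 1;    /* (0) ?: 1  evaluates to 1 */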
254 | |||
255 | /** | ||
256 | * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL | ||
257 | * @mdev: DRBD device. | ||
258 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. | ||
259 | * @set_size: Expected number of requests before that barrier. | ||
260 | * | ||
261 | * In case the passed barrier_nr or set_size does not match the oldest | ||
262 | * &struct drbd_tl_epoch object, this function will cause a termination | ||
263 | * of the connection. | ||
264 | */ | ||
265 | void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
266 | unsigned int set_size) | ||
267 | { | ||
268 | struct drbd_tl_epoch *b, *nob; /* next old barrier */ | ||
269 | struct list_head *le, *tle; | ||
270 | struct drbd_request *r; | ||
271 | |||
272 | spin_lock_irq(&mdev->req_lock); | ||
273 | |||
274 | b = mdev->oldest_tle; | ||
275 | |||
276 | /* first some paranoia code */ | ||
277 | if (b == NULL) { | ||
278 | dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", | ||
279 | barrier_nr); | ||
280 | goto bail; | ||
281 | } | ||
282 | if (b->br_number != barrier_nr) { | ||
283 | dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", | ||
284 | barrier_nr, b->br_number); | ||
285 | goto bail; | ||
286 | } | ||
287 | if (b->n_req != set_size) { | ||
288 | dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", | ||
289 | barrier_nr, set_size, b->n_req); | ||
290 | goto bail; | ||
291 | } | ||
292 | |||
293 | /* Clean up list of requests processed during current epoch */ | ||
294 | list_for_each_safe(le, tle, &b->requests) { | ||
295 | r = list_entry(le, struct drbd_request, tl_requests); | ||
296 | _req_mod(r, barrier_acked); | ||
297 | } | ||
298 | /* There could be requests on the list waiting for completion | ||
299 | of the write to the local disk. To avoid corruptions of | ||
300 | slab's data structures we have to remove the list's head. | ||
301 | |||
302 | Also there could have been a barrier ack out of sequence, overtaking | ||
303 | the write acks - which would be a bug and would violate write ordering. | ||
304 | To not deadlock in case we lose connection while such requests are | ||
305 | still pending, we need some way to find them for the | ||
306 | _req_mod(connection_lost_while_pending). | ||
307 | |||
308 | These have been list_move'd to the out_of_sequence_requests list in | ||
309 | _req_mod(, barrier_acked) above. | ||
310 | */ | ||
311 | list_del_init(&b->requests); | ||
312 | |||
313 | nob = b->next; | ||
314 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
315 | _tl_add_barrier(mdev, b); | ||
316 | if (nob) | ||
317 | mdev->oldest_tle = nob; | ||
318 | /* if nob == NULL b was the only barrier, and becomes the new | ||
319 | barrier. Therefore mdev->oldest_tle points already to b */ | ||
320 | } else { | ||
321 | D_ASSERT(nob != NULL); | ||
322 | mdev->oldest_tle = nob; | ||
323 | kfree(b); | ||
324 | } | ||
325 | |||
326 | spin_unlock_irq(&mdev->req_lock); | ||
327 | dec_ap_pending(mdev); | ||
328 | |||
329 | return; | ||
330 | |||
331 | bail: | ||
332 | spin_unlock_irq(&mdev->req_lock); | ||
333 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
334 | } | ||
335 | |||
336 | |||
337 | /** | ||
338 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL | ||
339 | * @mdev: DRBD device. | ||
340 | * | ||
341 | * This is called after the connection to the peer was lost. The storage covered | ||
342 | * by the requests on the transfer log gets marked as out of sync. Called from the | ||
343 | * receiver thread and the worker thread. | ||
344 | */ | ||
345 | void tl_clear(struct drbd_conf *mdev) | ||
346 | { | ||
347 | struct drbd_tl_epoch *b, *tmp; | ||
348 | struct list_head *le, *tle; | ||
349 | struct drbd_request *r; | ||
350 | int new_initial_bnr = net_random(); | ||
351 | |||
352 | spin_lock_irq(&mdev->req_lock); | ||
353 | |||
354 | b = mdev->oldest_tle; | ||
355 | while (b) { | ||
356 | list_for_each_safe(le, tle, &b->requests) { | ||
357 | r = list_entry(le, struct drbd_request, tl_requests); | ||
358 | /* It would be nice to complete outside of spinlock. | ||
359 | * But this is easier for now. */ | ||
360 | _req_mod(r, connection_lost_while_pending); | ||
361 | } | ||
362 | tmp = b->next; | ||
363 | |||
364 | /* there could still be requests on that ring list, | ||
365 | * in case local io is still pending */ | ||
366 | list_del(&b->requests); | ||
367 | |||
368 | /* dec_ap_pending corresponding to queue_barrier. | ||
369 | * the newest barrier may not have been queued yet, | ||
370 | * in which case w.cb is still NULL. */ | ||
371 | if (b->w.cb != NULL) | ||
372 | dec_ap_pending(mdev); | ||
373 | |||
374 | if (b == mdev->newest_tle) { | ||
375 | /* recycle, but reinit! */ | ||
376 | D_ASSERT(tmp == NULL); | ||
377 | INIT_LIST_HEAD(&b->requests); | ||
378 | INIT_LIST_HEAD(&b->w.list); | ||
379 | b->w.cb = NULL; | ||
380 | b->br_number = new_initial_bnr; | ||
381 | b->n_req = 0; | ||
382 | |||
383 | mdev->oldest_tle = b; | ||
384 | break; | ||
385 | } | ||
386 | kfree(b); | ||
387 | b = tmp; | ||
388 | } | ||
389 | |||
390 | /* we expect this list to be empty. */ | ||
391 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
392 | |||
393 | /* but just in case, clean it up anyway! */ | ||
394 | list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { | ||
395 | r = list_entry(le, struct drbd_request, tl_requests); | ||
396 | /* It would be nice to complete outside of spinlock. | ||
397 | * But this is easier for now. */ | ||
398 | _req_mod(r, connection_lost_while_pending); | ||
399 | } | ||
400 | |||
401 | /* ensure bit indicating barrier is required is clear */ | ||
402 | clear_bit(CREATE_BARRIER, &mdev->flags); | ||
403 | |||
404 | spin_unlock_irq(&mdev->req_lock); | ||
405 | } | ||
406 | |||
407 | /** | ||
408 | * cl_wide_st_chg() - TRUE if the state change is a cluster wide one | ||
409 | * @mdev: DRBD device. | ||
410 | * @os: old (current) state. | ||
411 | * @ns: new (wanted) state. | ||
412 | */ | ||
413 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
414 | union drbd_state os, union drbd_state ns) | ||
415 | { | ||
416 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
417 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
418 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
419 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
420 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || | ||
421 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
422 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); | ||
423 | } | ||
424 | |||
425 | int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
426 | union drbd_state mask, union drbd_state val) | ||
427 | { | ||
428 | unsigned long flags; | ||
429 | union drbd_state os, ns; | ||
430 | int rv; | ||
431 | |||
432 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
433 | os = mdev->state; | ||
434 | ns.i = (os.i & ~mask.i) | val.i; | ||
435 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
436 | ns = mdev->state; | ||
437 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
438 | |||
439 | return rv; | ||
440 | } | ||
441 | |||
442 | /** | ||
443 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
444 | * @mdev: DRBD device. | ||
445 | * @mask: mask of state bits to change. | ||
446 | * @val: value of new state bits. | ||
447 | */ | ||
448 | void drbd_force_state(struct drbd_conf *mdev, | ||
449 | union drbd_state mask, union drbd_state val) | ||
450 | { | ||
451 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
452 | } | ||
453 | |||
454 | static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); | ||
455 | static int is_valid_state_transition(struct drbd_conf *, | ||
456 | union drbd_state, union drbd_state); | ||
457 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
458 | union drbd_state ns, int *warn_sync_abort); | ||
459 | int drbd_send_state_req(struct drbd_conf *, | ||
460 | union drbd_state, union drbd_state); | ||
461 | |||
462 | static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, | ||
463 | union drbd_state mask, union drbd_state val) | ||
464 | { | ||
465 | union drbd_state os, ns; | ||
466 | unsigned long flags; | ||
467 | int rv; | ||
468 | |||
469 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
470 | return SS_CW_SUCCESS; | ||
471 | |||
472 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
473 | return SS_CW_FAILED_BY_PEER; | ||
474 | |||
475 | rv = 0; | ||
476 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
477 | os = mdev->state; | ||
478 | ns.i = (os.i & ~mask.i) | val.i; | ||
479 | ns = sanitize_state(mdev, os, ns, NULL); | ||
480 | |||
481 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
482 | rv = SS_CW_NO_NEED; | ||
483 | if (!rv) { | ||
484 | rv = is_valid_state(mdev, ns); | ||
485 | if (rv == SS_SUCCESS) { | ||
486 | rv = is_valid_state_transition(mdev, ns, os); | ||
487 | if (rv == SS_SUCCESS) | ||
488 | rv = 0; /* cont waiting, otherwise fail. */ | ||
489 | } | ||
490 | } | ||
491 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
492 | |||
493 | return rv; | ||
494 | } | ||
495 | |||
496 | /** | ||
497 | * drbd_req_state() - Perform an eventually cluster wide state change | ||
498 | * @mdev: DRBD device. | ||
499 | * @mask: mask of state bits to change. | ||
500 | * @val: value of new state bits. | ||
501 | * @f: flags | ||
502 | * | ||
503 | * Should not be called directly, use drbd_request_state() or | ||
504 | * _drbd_request_state(). | ||
505 | */ | ||
506 | static int drbd_req_state(struct drbd_conf *mdev, | ||
507 | union drbd_state mask, union drbd_state val, | ||
508 | enum chg_state_flags f) | ||
509 | { | ||
510 | struct completion done; | ||
511 | unsigned long flags; | ||
512 | union drbd_state os, ns; | ||
513 | int rv; | ||
514 | |||
515 | init_completion(&done); | ||
516 | |||
517 | if (f & CS_SERIALIZE) | ||
518 | mutex_lock(&mdev->state_mutex); | ||
519 | |||
520 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
521 | os = mdev->state; | ||
522 | ns.i = (os.i & ~mask.i) | val.i; | ||
523 | ns = sanitize_state(mdev, os, ns, NULL); | ||
524 | |||
525 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
526 | rv = is_valid_state(mdev, ns); | ||
527 | if (rv == SS_SUCCESS) | ||
528 | rv = is_valid_state_transition(mdev, ns, os); | ||
529 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
530 | |||
531 | if (rv < SS_SUCCESS) { | ||
532 | if (f & CS_VERBOSE) | ||
533 | print_st_err(mdev, os, ns, rv); | ||
534 | goto abort; | ||
535 | } | ||
536 | |||
537 | drbd_state_lock(mdev); | ||
538 | if (!drbd_send_state_req(mdev, mask, val)) { | ||
539 | drbd_state_unlock(mdev); | ||
540 | rv = SS_CW_FAILED_BY_PEER; | ||
541 | if (f & CS_VERBOSE) | ||
542 | print_st_err(mdev, os, ns, rv); | ||
543 | goto abort; | ||
544 | } | ||
545 | |||
546 | wait_event(mdev->state_wait, | ||
547 | (rv = _req_st_cond(mdev, mask, val))); | ||
548 | |||
549 | if (rv < SS_SUCCESS) { | ||
550 | drbd_state_unlock(mdev); | ||
551 | if (f & CS_VERBOSE) | ||
552 | print_st_err(mdev, os, ns, rv); | ||
553 | goto abort; | ||
554 | } | ||
555 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
556 | os = mdev->state; | ||
557 | ns.i = (os.i & ~mask.i) | val.i; | ||
558 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
559 | drbd_state_unlock(mdev); | ||
560 | } else { | ||
561 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
562 | } | ||
563 | |||
564 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
565 | |||
566 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
567 | D_ASSERT(current != mdev->worker.task); | ||
568 | wait_for_completion(&done); | ||
569 | } | ||
570 | |||
571 | abort: | ||
572 | if (f & CS_SERIALIZE) | ||
573 | mutex_unlock(&mdev->state_mutex); | ||
574 | |||
575 | return rv; | ||
576 | } | ||
577 | |||
578 | /** | ||
579 | * _drbd_request_state() - Request a state change (with flags) | ||
580 | * @mdev: DRBD device. | ||
581 | * @mask: mask of state bits to change. | ||
582 | * @val: value of new state bits. | ||
583 | * @f: flags | ||
584 | * | ||
585 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
586 | * flag, or when logging of failed state change requests is not desired. | ||
587 | */ | ||
588 | int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
589 | union drbd_state val, enum chg_state_flags f) | ||
590 | { | ||
591 | int rv; | ||
592 | |||
593 | wait_event(mdev->state_wait, | ||
594 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
595 | |||
596 | return rv; | ||
597 | } | ||
598 | |||
599 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
600 | { | ||
601 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", | ||
602 | name, | ||
603 | drbd_conn_str(ns.conn), | ||
604 | drbd_role_str(ns.role), | ||
605 | drbd_role_str(ns.peer), | ||
606 | drbd_disk_str(ns.disk), | ||
607 | drbd_disk_str(ns.pdsk), | ||
608 | ns.susp ? 's' : 'r', | ||
609 | ns.aftr_isp ? 'a' : '-', | ||
610 | ns.peer_isp ? 'p' : '-', | ||
611 | ns.user_isp ? 'u' : '-' | ||
612 | ); | ||
613 | } | ||
614 | |||
615 | void print_st_err(struct drbd_conf *mdev, | ||
616 | union drbd_state os, union drbd_state ns, int err) | ||
617 | { | ||
618 | if (err == SS_IN_TRANSIENT_STATE) | ||
619 | return; | ||
620 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
621 | print_st(mdev, " state", os); | ||
622 | print_st(mdev, "wanted", ns); | ||
623 | } | ||
624 | |||
625 | |||
626 | #define drbd_peer_str drbd_role_str | ||
627 | #define drbd_pdsk_str drbd_disk_str | ||
628 | |||
629 | #define drbd_susp_str(A) ((A) ? "1" : "0") | ||
630 | #define drbd_aftr_isp_str(A) ((A) ? "1" : "0") | ||
631 | #define drbd_peer_isp_str(A) ((A) ? "1" : "0") | ||
632 | #define drbd_user_isp_str(A) ((A) ? "1" : "0") | ||
633 | |||
634 | #define PSC(A) \ | ||
635 | ({ if (ns.A != os.A) { \ | ||
636 | pbp += sprintf(pbp, #A "( %s -> %s ) ", \ | ||
637 | drbd_##A##_str(os.A), \ | ||
638 | drbd_##A##_str(ns.A)); \ | ||
639 | } }) | ||
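Roughly, PSC(conn) expands to the following (sketch of the macro expansion, ignoring the statement-expression wrapper):

	if (ns.conn != os.conn) {
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	}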
640 | |||
641 | /** | ||
642 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
643 | * @mdev: DRBD device. | ||
644 | * @ns: State to consider. | ||
645 | */ | ||
646 | static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
647 | { | ||
648 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
649 | |||
650 | enum drbd_fencing_p fp; | ||
651 | int rv = SS_SUCCESS; | ||
652 | |||
653 | fp = FP_DONT_CARE; | ||
654 | if (get_ldev(mdev)) { | ||
655 | fp = mdev->ldev->dc.fencing; | ||
656 | put_ldev(mdev); | ||
657 | } | ||
658 | |||
659 | if (get_net_conf(mdev)) { | ||
660 | if (!mdev->net_conf->two_primaries && | ||
661 | ns.role == R_PRIMARY && ns.peer == R_PRIMARY) | ||
662 | rv = SS_TWO_PRIMARIES; | ||
663 | put_net_conf(mdev); | ||
664 | } | ||
665 | |||
666 | if (rv <= 0) | ||
667 | /* already found a reason to abort */; | ||
668 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
669 | rv = SS_DEVICE_IN_USE; | ||
670 | |||
671 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
672 | rv = SS_NO_UP_TO_DATE_DISK; | ||
673 | |||
674 | else if (fp >= FP_RESOURCE && | ||
675 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
676 | rv = SS_PRIMARY_NOP; | ||
677 | |||
678 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
679 | rv = SS_NO_UP_TO_DATE_DISK; | ||
680 | |||
681 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
682 | rv = SS_NO_LOCAL_DISK; | ||
683 | |||
684 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
685 | rv = SS_NO_REMOTE_DISK; | ||
686 | |||
687 | else if ((ns.conn == C_CONNECTED || | ||
688 | ns.conn == C_WF_BITMAP_S || | ||
689 | ns.conn == C_SYNC_SOURCE || | ||
690 | ns.conn == C_PAUSED_SYNC_S) && | ||
691 | ns.disk == D_OUTDATED) | ||
692 | rv = SS_CONNECTED_OUTDATES; | ||
693 | |||
694 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
695 | (mdev->sync_conf.verify_alg[0] == 0)) | ||
696 | rv = SS_NO_VERIFY_ALG; | ||
697 | |||
698 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
699 | mdev->agreed_pro_version < 88) | ||
700 | rv = SS_NOT_SUPPORTED; | ||
701 | |||
702 | return rv; | ||
703 | } | ||
704 | |||
705 | /** | ||
706 | * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible | ||
707 | * @mdev: DRBD device. | ||
708 | * @ns: new state. | ||
709 | * @os: old state. | ||
710 | */ | ||
711 | static int is_valid_state_transition(struct drbd_conf *mdev, | ||
712 | union drbd_state ns, union drbd_state os) | ||
713 | { | ||
714 | int rv = SS_SUCCESS; | ||
715 | |||
716 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
717 | os.conn > C_CONNECTED) | ||
718 | rv = SS_RESYNC_RUNNING; | ||
719 | |||
720 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
721 | rv = SS_ALREADY_STANDALONE; | ||
722 | |||
723 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
724 | rv = SS_IS_DISKLESS; | ||
725 | |||
726 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
727 | rv = SS_NO_NET_CONFIG; | ||
728 | |||
729 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
730 | rv = SS_LOWER_THAN_OUTDATED; | ||
731 | |||
732 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
733 | rv = SS_IN_TRANSIENT_STATE; | ||
734 | |||
735 | if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
736 | rv = SS_IN_TRANSIENT_STATE; | ||
737 | |||
738 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
739 | rv = SS_NEED_CONNECTION; | ||
740 | |||
741 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
742 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
743 | rv = SS_RESYNC_RUNNING; | ||
744 | |||
745 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
746 | os.conn < C_CONNECTED) | ||
747 | rv = SS_NEED_CONNECTION; | ||
748 | |||
749 | return rv; | ||
750 | } | ||
751 | |||
752 | /** | ||
753 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
754 | * @mdev: DRBD device. | ||
755 | * @os: old state. | ||
756 | * @ns: new state. | ||
757 | * @warn_sync_abort: Output; set if an ongoing resync had to be aborted implicitly. | ||
758 | * | ||
759 | * When we lose the connection, we have to set the state of the peer's disk (pdsk) | ||
760 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
761 | */ | ||
762 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
763 | union drbd_state ns, int *warn_sync_abort) | ||
764 | { | ||
765 | enum drbd_fencing_p fp; | ||
766 | |||
767 | fp = FP_DONT_CARE; | ||
768 | if (get_ldev(mdev)) { | ||
769 | fp = mdev->ldev->dc.fencing; | ||
770 | put_ldev(mdev); | ||
771 | } | ||
772 | |||
773 | /* Disallow Network errors to configure a device's network part */ | ||
774 | if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && | ||
775 | os.conn <= C_DISCONNECTING) | ||
776 | ns.conn = os.conn; | ||
777 | |||
778 | /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ | ||
779 | if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && | ||
780 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) | ||
781 | ns.conn = os.conn; | ||
782 | |||
783 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
784 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | ||
785 | ns.conn = os.conn; | ||
786 | |||
787 | if (ns.conn < C_CONNECTED) { | ||
788 | ns.peer_isp = 0; | ||
789 | ns.peer = R_UNKNOWN; | ||
790 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
791 | ns.pdsk = D_UNKNOWN; | ||
792 | } | ||
793 | |||
794 | /* Clear the aftr_isp when becoming unconfigured */ | ||
795 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
796 | ns.aftr_isp = 0; | ||
797 | |||
798 | if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) | ||
799 | ns.pdsk = D_UNKNOWN; | ||
800 | |||
801 | /* Abort resync if a disk fails/detaches */ | ||
802 | if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && | ||
803 | (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
804 | if (warn_sync_abort) | ||
805 | *warn_sync_abort = 1; | ||
806 | ns.conn = C_CONNECTED; | ||
807 | } | ||
808 | |||
809 | if (ns.conn >= C_CONNECTED && | ||
810 | ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || | ||
811 | (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { | ||
812 | switch (ns.conn) { | ||
813 | case C_WF_BITMAP_T: | ||
814 | case C_PAUSED_SYNC_T: | ||
815 | ns.disk = D_OUTDATED; | ||
816 | break; | ||
817 | case C_CONNECTED: | ||
818 | case C_WF_BITMAP_S: | ||
819 | case C_SYNC_SOURCE: | ||
820 | case C_PAUSED_SYNC_S: | ||
821 | ns.disk = D_UP_TO_DATE; | ||
822 | break; | ||
823 | case C_SYNC_TARGET: | ||
824 | ns.disk = D_INCONSISTENT; | ||
825 | dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); | ||
826 | break; | ||
827 | } | ||
828 | if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE) | ||
829 | dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); | ||
830 | } | ||
831 | |||
832 | if (ns.conn >= C_CONNECTED && | ||
833 | (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { | ||
834 | switch (ns.conn) { | ||
835 | case C_CONNECTED: | ||
836 | case C_WF_BITMAP_T: | ||
837 | case C_PAUSED_SYNC_T: | ||
838 | case C_SYNC_TARGET: | ||
839 | ns.pdsk = D_UP_TO_DATE; | ||
840 | break; | ||
841 | case C_WF_BITMAP_S: | ||
842 | case C_PAUSED_SYNC_S: | ||
843 | ns.pdsk = D_OUTDATED; | ||
844 | break; | ||
845 | case C_SYNC_SOURCE: | ||
846 | ns.pdsk = D_INCONSISTENT; | ||
847 | dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); | ||
848 | break; | ||
849 | } | ||
850 | if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) | ||
851 | dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); | ||
852 | } | ||
853 | |||
854 | /* Connection breaks down before we finished "Negotiating" */ | ||
855 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
856 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
857 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
858 | ns.disk = mdev->new_state_tmp.disk; | ||
859 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
860 | } else { | ||
861 | dev_alert(DEV, "Connection lost while negotiating, no data!\n"); | ||
862 | ns.disk = D_DISKLESS; | ||
863 | ns.pdsk = D_UNKNOWN; | ||
864 | } | ||
865 | put_ldev(mdev); | ||
866 | } | ||
867 | |||
868 | if (fp == FP_STONITH && | ||
869 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && | ||
870 | !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) | ||
871 | ns.susp = 1; | ||
872 | |||
873 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
874 | if (ns.conn == C_SYNC_SOURCE) | ||
875 | ns.conn = C_PAUSED_SYNC_S; | ||
876 | if (ns.conn == C_SYNC_TARGET) | ||
877 | ns.conn = C_PAUSED_SYNC_T; | ||
878 | } else { | ||
879 | if (ns.conn == C_PAUSED_SYNC_S) | ||
880 | ns.conn = C_SYNC_SOURCE; | ||
881 | if (ns.conn == C_PAUSED_SYNC_T) | ||
882 | ns.conn = C_SYNC_TARGET; | ||
883 | } | ||
884 | |||
885 | return ns; | ||
886 | } | ||
887 | |||
888 | /* helper for __drbd_set_state */ | ||
889 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
890 | { | ||
891 | if (cs == C_VERIFY_T) { | ||
892 | /* starting online verify from an arbitrary position | ||
893 | * does not fit well into the existing protocol. | ||
894 | * on C_VERIFY_T, we initialize ov_left and friends | ||
895 | * implicitly in receive_DataRequest once the | ||
896 | * first P_OV_REQUEST is received */ | ||
897 | mdev->ov_start_sector = ~(sector_t)0; | ||
898 | } else { | ||
899 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
900 | if (bit >= mdev->rs_total) | ||
901 | mdev->ov_start_sector = | ||
902 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
903 | mdev->ov_position = mdev->ov_start_sector; | ||
904 | } | ||
905 | } | ||
906 | |||
907 | /** | ||
908 | * __drbd_set_state() - Set a new DRBD state | ||
909 | * @mdev: DRBD device. | ||
910 | * @ns: new state. | ||
911 | * @flags: Flags | ||
912 | * @done: Optional completion, that will get completed after the after_state_ch() finished | ||
913 | * | ||
914 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
915 | */ | ||
916 | int __drbd_set_state(struct drbd_conf *mdev, | ||
917 | union drbd_state ns, enum chg_state_flags flags, | ||
918 | struct completion *done) | ||
919 | { | ||
920 | union drbd_state os; | ||
921 | int rv = SS_SUCCESS; | ||
922 | int warn_sync_abort = 0; | ||
923 | struct after_state_chg_work *ascw; | ||
924 | |||
925 | os = mdev->state; | ||
926 | |||
927 | ns = sanitize_state(mdev, os, ns, &warn_sync_abort); | ||
928 | |||
929 | if (ns.i == os.i) | ||
930 | return SS_NOTHING_TO_DO; | ||
931 | |||
932 | if (!(flags & CS_HARD)) { | ||
933 | /* pre-state-change checks ; only look at ns */ | ||
934 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
935 | |||
936 | rv = is_valid_state(mdev, ns); | ||
937 | if (rv < SS_SUCCESS) { | ||
938 | /* If the old state was illegal as well, then let | ||
939 | this happen...*/ | ||
940 | |||
941 | if (is_valid_state(mdev, os) == rv) { | ||
942 | dev_err(DEV, "Considering state change from bad state. " | ||
943 | "Error would be: '%s'\n", | ||
944 | drbd_set_st_err_str(rv)); | ||
945 | print_st(mdev, "old", os); | ||
946 | print_st(mdev, "new", ns); | ||
947 | rv = is_valid_state_transition(mdev, ns, os); | ||
948 | } | ||
949 | } else | ||
950 | rv = is_valid_state_transition(mdev, ns, os); | ||
951 | } | ||
952 | |||
953 | if (rv < SS_SUCCESS) { | ||
954 | if (flags & CS_VERBOSE) | ||
955 | print_st_err(mdev, os, ns, rv); | ||
956 | return rv; | ||
957 | } | ||
958 | |||
959 | if (warn_sync_abort) | ||
960 | dev_warn(DEV, "Resync aborted.\n"); | ||
961 | |||
962 | { | ||
963 | char *pbp, pb[300]; | ||
964 | pbp = pb; | ||
965 | *pbp = 0; | ||
966 | PSC(role); | ||
967 | PSC(peer); | ||
968 | PSC(conn); | ||
969 | PSC(disk); | ||
970 | PSC(pdsk); | ||
971 | PSC(susp); | ||
972 | PSC(aftr_isp); | ||
973 | PSC(peer_isp); | ||
974 | PSC(user_isp); | ||
975 | dev_info(DEV, "%s\n", pb); | ||
976 | } | ||
977 | |||
978 | /* solve the race between becoming unconfigured, | ||
979 | * worker doing the cleanup, and | ||
980 | * admin reconfiguring us: | ||
981 | * on (re)configure, first set CONFIG_PENDING, | ||
982 | * then wait for a potentially exiting worker, | ||
983 | * start the worker, and schedule one no_op. | ||
984 | * then proceed with configuration. | ||
985 | */ | ||
986 | if (ns.disk == D_DISKLESS && | ||
987 | ns.conn == C_STANDALONE && | ||
988 | ns.role == R_SECONDARY && | ||
989 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) | ||
990 | set_bit(DEVICE_DYING, &mdev->flags); | ||
991 | |||
992 | mdev->state.i = ns.i; | ||
993 | wake_up(&mdev->misc_wait); | ||
994 | wake_up(&mdev->state_wait); | ||
995 | |||
996 | /* post-state-change actions */ | ||
997 | if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { | ||
998 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
999 | mod_timer(&mdev->resync_timer, jiffies); | ||
1000 | } | ||
1001 | |||
1002 | /* aborted verify run. log the last position */ | ||
1003 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1004 | ns.conn < C_CONNECTED) { | ||
1005 | mdev->ov_start_sector = | ||
1006 | BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); | ||
1007 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1008 | (unsigned long long)mdev->ov_start_sector); | ||
1009 | } | ||
1010 | |||
1011 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1012 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1013 | dev_info(DEV, "Syncer continues.\n"); | ||
1014 | mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; | ||
1015 | if (ns.conn == C_SYNC_TARGET) { | ||
1016 | if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) | ||
1017 | mod_timer(&mdev->resync_timer, jiffies); | ||
1018 | /* This if (!test_bit) is only needed for the case | ||
1019 | that a device that has ceased to use its timer, | ||
1020 | i.e. it is already in drbd_resync_finished(), gets | ||
1021 | paused and resumed. */ | ||
1022 | } | ||
1023 | } | ||
1024 | |||
1025 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1026 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1027 | dev_info(DEV, "Resync suspended\n"); | ||
1028 | mdev->rs_mark_time = jiffies; | ||
1029 | if (ns.conn == C_PAUSED_SYNC_T) | ||
1030 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
1031 | } | ||
1032 | |||
1033 | if (os.conn == C_CONNECTED && | ||
1034 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1035 | mdev->ov_position = 0; | ||
1036 | mdev->rs_total = | ||
1037 | mdev->rs_mark_left = drbd_bm_bits(mdev); | ||
1038 | if (mdev->agreed_pro_version >= 90) | ||
1039 | set_ov_position(mdev, ns.conn); | ||
1040 | else | ||
1041 | mdev->ov_start_sector = 0; | ||
1042 | mdev->ov_left = mdev->rs_total | ||
1043 | - BM_SECT_TO_BIT(mdev->ov_position); | ||
1044 | mdev->rs_start = | ||
1045 | mdev->rs_mark_time = jiffies; | ||
1046 | mdev->ov_last_oos_size = 0; | ||
1047 | mdev->ov_last_oos_start = 0; | ||
1048 | |||
1049 | if (ns.conn == C_VERIFY_S) { | ||
1050 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1051 | (unsigned long long)mdev->ov_position); | ||
1052 | mod_timer(&mdev->resync_timer, jiffies); | ||
1053 | } | ||
1054 | } | ||
1055 | |||
1056 | if (get_ldev(mdev)) { | ||
1057 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1058 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1059 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1060 | |||
1061 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1062 | mdf |= MDF_CRASHED_PRIMARY; | ||
1063 | if (mdev->state.role == R_PRIMARY || | ||
1064 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1065 | mdf |= MDF_PRIMARY_IND; | ||
1066 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1067 | mdf |= MDF_CONNECTED_IND; | ||
1068 | if (mdev->state.disk > D_INCONSISTENT) | ||
1069 | mdf |= MDF_CONSISTENT; | ||
1070 | if (mdev->state.disk > D_OUTDATED) | ||
1071 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1072 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1073 | mdf |= MDF_PEER_OUT_DATED; | ||
1074 | if (mdf != mdev->ldev->md.flags) { | ||
1075 | mdev->ldev->md.flags = mdf; | ||
1076 | drbd_md_mark_dirty(mdev); | ||
1077 | } | ||
1078 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1079 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1080 | put_ldev(mdev); | ||
1081 | } | ||
1082 | |||
1083 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1084 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1085 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1086 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1087 | |||
1088 | /* Receiver should clean up itself */ | ||
1089 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1090 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1091 | |||
1092 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1093 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1094 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1095 | |||
1096 | /* Upon network failure, we need to restart the receiver. */ | ||
1097 | if (os.conn > C_TEAR_DOWN && | ||
1098 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1099 | drbd_thread_restart_nowait(&mdev->receiver); | ||
1100 | |||
1101 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1102 | if (ascw) { | ||
1103 | ascw->os = os; | ||
1104 | ascw->ns = ns; | ||
1105 | ascw->flags = flags; | ||
1106 | ascw->w.cb = w_after_state_ch; | ||
1107 | ascw->done = done; | ||
1108 | drbd_queue_work(&mdev->data.work, &ascw->w); | ||
1109 | } else { | ||
1110 | dev_warn(DEV, "Could not kmalloc an ascw\n"); | ||
1111 | } | ||
1112 | |||
1113 | return rv; | ||
1114 | } | ||
1115 | |||
1116 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1117 | { | ||
1118 | struct after_state_chg_work *ascw = | ||
1119 | container_of(w, struct after_state_chg_work, w); | ||
1120 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1121 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1122 | D_ASSERT(ascw->done != NULL); | ||
1123 | complete(ascw->done); | ||
1124 | } | ||
1125 | kfree(ascw); | ||
1126 | |||
1127 | return 1; | ||
1128 | } | ||
1129 | |||
1130 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1131 | { | ||
1132 | if (rv) { | ||
1133 | dev_err(DEV, "Writing the bitmap failed, not starting resync.\n"); | ||
1134 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1135 | return; | ||
1136 | } | ||
1137 | |||
1138 | switch (mdev->state.conn) { | ||
1139 | case C_STARTING_SYNC_T: | ||
1140 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1141 | break; | ||
1142 | case C_STARTING_SYNC_S: | ||
1143 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1144 | break; | ||
1145 | } | ||
1146 | } | ||
1147 | |||
1148 | /** | ||
1149 | * after_state_ch() - Perform after state change actions that may sleep | ||
1150 | * @mdev: DRBD device. | ||
1151 | * @os: old state. | ||
1152 | * @ns: new state. | ||
1153 | * @flags: Flags | ||
1154 | */ | ||
1155 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1156 | union drbd_state ns, enum chg_state_flags flags) | ||
1157 | { | ||
1158 | enum drbd_fencing_p fp; | ||
1159 | |||
1160 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1161 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1162 | if (mdev->p_uuid) | ||
1163 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1164 | } | ||
1165 | |||
1166 | fp = FP_DONT_CARE; | ||
1167 | if (get_ldev(mdev)) { | ||
1168 | fp = mdev->ldev->dc.fencing; | ||
1169 | put_ldev(mdev); | ||
1170 | } | ||
1171 | |||
1172 | /* Inform userspace about the change... */ | ||
1173 | drbd_bcast_state(mdev, ns); | ||
1174 | |||
1175 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1176 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1177 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1178 | |||
1179 | /* Here we have the actions that are performed after a | ||
1180 | state change. This function might sleep */ | ||
1181 | |||
1182 | if (fp == FP_STONITH && ns.susp) { | ||
1183 | /* case1: The outdate peer handler is successful: | ||
1184 | * case2: The connection was established again: */ | ||
1185 | if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || | ||
1186 | (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { | ||
1187 | tl_clear(mdev); | ||
1188 | spin_lock_irq(&mdev->req_lock); | ||
1189 | _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); | ||
1190 | spin_unlock_irq(&mdev->req_lock); | ||
1191 | } | ||
1192 | } | ||
1193 | /* Do not change the order of the if above and the two below... */ | ||
1194 | if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ | ||
1195 | drbd_send_uuids(mdev); | ||
1196 | drbd_send_state(mdev); | ||
1197 | } | ||
1198 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) | ||
1199 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); | ||
1200 | |||
1201 | /* Lost contact to peer's copy of the data */ | ||
1202 | if ((os.pdsk >= D_INCONSISTENT && | ||
1203 | os.pdsk != D_UNKNOWN && | ||
1204 | os.pdsk != D_OUTDATED) | ||
1205 | && (ns.pdsk < D_INCONSISTENT || | ||
1206 | ns.pdsk == D_UNKNOWN || | ||
1207 | ns.pdsk == D_OUTDATED)) { | ||
1208 | kfree(mdev->p_uuid); | ||
1209 | mdev->p_uuid = NULL; | ||
1210 | if (get_ldev(mdev)) { | ||
1211 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1212 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1213 | drbd_uuid_new_current(mdev); | ||
1214 | drbd_send_uuids(mdev); | ||
1215 | } | ||
1216 | put_ldev(mdev); | ||
1217 | } | ||
1218 | } | ||
1219 | |||
1220 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1221 | if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) | ||
1222 | drbd_uuid_new_current(mdev); | ||
1223 | |||
1224 | /* D_DISKLESS Peer becomes secondary */ | ||
1225 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1226 | drbd_al_to_on_disk_bm(mdev); | ||
1227 | put_ldev(mdev); | ||
1228 | } | ||
1229 | |||
1230 | /* Last part of the attaching process ... */ | ||
1231 | if (ns.conn >= C_CONNECTED && | ||
1232 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1233 | kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ | ||
1234 | mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ | ||
1235 | drbd_send_sizes(mdev, 0); /* to start sync... */ | ||
1236 | drbd_send_uuids(mdev); | ||
1237 | drbd_send_state(mdev); | ||
1238 | } | ||
1239 | |||
1240 | /* We want to pause/continue resync, tell peer. */ | ||
1241 | if (ns.conn >= C_CONNECTED && | ||
1242 | ((os.aftr_isp != ns.aftr_isp) || | ||
1243 | (os.user_isp != ns.user_isp))) | ||
1244 | drbd_send_state(mdev); | ||
1245 | |||
1246 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1247 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1248 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1249 | suspend_other_sg(mdev); | ||
1250 | |||
1251 | /* Make sure the peer gets informed about possible state | ||
1252 | changes (ISP bits) while we were in WFReportParams. */ | ||
1253 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1254 | drbd_send_state(mdev); | ||
1255 | |||
1256 | /* We are in the process of starting a full sync... */ | ||
1257 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1258 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1259 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); | ||
1260 | |||
1261 | /* We are invalidating ourselves... */ | ||
1262 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1263 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1264 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); | ||
1265 | |||
1266 | if (os.disk > D_FAILED && ns.disk == D_FAILED) { | ||
1267 | enum drbd_io_error_p eh; | ||
1268 | |||
1269 | eh = EP_PASS_ON; | ||
1270 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
1271 | eh = mdev->ldev->dc.on_io_error; | ||
1272 | put_ldev(mdev); | ||
1273 | } | ||
1274 | |||
1275 | drbd_rs_cancel_all(mdev); | ||
1276 | /* since get_ldev() only works as long as disk>=D_INCONSISTENT, | ||
1277 | and it is D_DISKLESS here, local_cnt can only go down, it can | ||
1278 | not increase... It will reach zero */ | ||
1279 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | ||
1280 | mdev->rs_total = 0; | ||
1281 | mdev->rs_failed = 0; | ||
1282 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1283 | |||
1284 | spin_lock_irq(&mdev->req_lock); | ||
1285 | _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); | ||
1286 | spin_unlock_irq(&mdev->req_lock); | ||
1287 | |||
1288 | if (eh == EP_CALL_HELPER) | ||
1289 | drbd_khelper(mdev, "local-io-error"); | ||
1290 | } | ||
1291 | |||
1292 | if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1293 | |||
1294 | if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { | ||
1295 | if (drbd_send_state(mdev)) | ||
1296 | dev_warn(DEV, "Notified peer that my disk is broken.\n"); | ||
1297 | else | ||
1298 | dev_err(DEV, "Sending state in drbd_io_error() failed\n"); | ||
1299 | } | ||
1300 | |||
1301 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | ||
1302 | lc_destroy(mdev->resync); | ||
1303 | mdev->resync = NULL; | ||
1304 | lc_destroy(mdev->act_log); | ||
1305 | mdev->act_log = NULL; | ||
1306 | __no_warn(local, | ||
1307 | drbd_free_bc(mdev->ldev); | ||
1308 | mdev->ldev = NULL;); | ||
1309 | |||
1310 | if (mdev->md_io_tmpp) | ||
1311 | __free_page(mdev->md_io_tmpp); | ||
1312 | } | ||
1313 | |||
1314 | /* Disks got bigger while they were detached */ | ||
1315 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1316 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1317 | if (ns.conn == C_CONNECTED) | ||
1318 | resync_after_online_grow(mdev); | ||
1319 | } | ||
1320 | |||
1321 | /* A resync finished or aborted, wake paused devices... */ | ||
1322 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1323 | (os.peer_isp && !ns.peer_isp) || | ||
1324 | (os.user_isp && !ns.user_isp)) | ||
1325 | resume_next_sg(mdev); | ||
1326 | |||
1327 | /* Upon network connection, we need to start the receiver */ | ||
1328 | if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) | ||
1329 | drbd_thread_start(&mdev->receiver); | ||
1330 | |||
1331 | /* Terminate worker thread if we are unconfigured - it will be | ||
1332 | restarted as needed... */ | ||
1333 | if (ns.disk == D_DISKLESS && | ||
1334 | ns.conn == C_STANDALONE && | ||
1335 | ns.role == R_SECONDARY) { | ||
1336 | if (os.aftr_isp != ns.aftr_isp) | ||
1337 | resume_next_sg(mdev); | ||
1338 | /* set in __drbd_set_state, unless CONFIG_PENDING was set */ | ||
1339 | if (test_bit(DEVICE_DYING, &mdev->flags)) | ||
1340 | drbd_thread_stop_nowait(&mdev->worker); | ||
1341 | } | ||
1342 | |||
1343 | drbd_md_sync(mdev); | ||
1344 | } | ||
1345 | |||
1346 | |||
1347 | static int drbd_thread_setup(void *arg) | ||
1348 | { | ||
1349 | struct drbd_thread *thi = (struct drbd_thread *) arg; | ||
1350 | struct drbd_conf *mdev = thi->mdev; | ||
1351 | unsigned long flags; | ||
1352 | int retval; | ||
1353 | |||
1354 | restart: | ||
1355 | retval = thi->function(thi); | ||
1356 | |||
1357 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1358 | |||
1359 | /* if the receiver has been "Exiting", the last thing it did | ||
1360 | * was set the conn state to "StandAlone", | ||
1361 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, | ||
1362 | * and receiver thread will be "started". | ||
1363 | * drbd_thread_start needs to set "Restarting" in that case. | ||
1364 | * t_state check and assignment needs to be within the same spinlock, | ||
1365 | * so either thread_start sees Exiting, and can remap to Restarting, | ||
1366 | * or thread_start sees None, and can proceed as normal. | ||
1367 | */ | ||
1368 | |||
1369 | if (thi->t_state == Restarting) { | ||
1370 | dev_info(DEV, "Restarting %s\n", current->comm); | ||
1371 | thi->t_state = Running; | ||
1372 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1373 | goto restart; | ||
1374 | } | ||
1375 | |||
1376 | thi->task = NULL; | ||
1377 | thi->t_state = None; | ||
1378 | smp_mb(); | ||
1379 | complete(&thi->stop); | ||
1380 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1381 | |||
1382 | dev_info(DEV, "Terminating %s\n", current->comm); | ||
1383 | |||
1384 | /* Release mod reference taken when thread was started */ | ||
1385 | module_put(THIS_MODULE); | ||
1386 | return retval; | ||
1387 | } | ||
1388 | |||
1389 | static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, | ||
1390 | int (*func) (struct drbd_thread *)) | ||
1391 | { | ||
1392 | spin_lock_init(&thi->t_lock); | ||
1393 | thi->task = NULL; | ||
1394 | thi->t_state = None; | ||
1395 | thi->function = func; | ||
1396 | thi->mdev = mdev; | ||
1397 | } | ||
1398 | |||
1399 | int drbd_thread_start(struct drbd_thread *thi) | ||
1400 | { | ||
1401 | struct drbd_conf *mdev = thi->mdev; | ||
1402 | struct task_struct *nt; | ||
1403 | unsigned long flags; | ||
1404 | |||
1405 | const char *me = | ||
1406 | thi == &mdev->receiver ? "receiver" : | ||
1407 | thi == &mdev->asender ? "asender" : | ||
1408 | thi == &mdev->worker ? "worker" : "NONSENSE"; | ||
1409 | |||
1410 | /* is used from state engine doing drbd_thread_stop_nowait, | ||
1411 | * while holding the req lock irqsave */ | ||
1412 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1413 | |||
1414 | switch (thi->t_state) { | ||
1415 | case None: | ||
1416 | dev_info(DEV, "Starting %s thread (from %s [%d])\n", | ||
1417 | me, current->comm, current->pid); | ||
1418 | |||
1419 | /* Get ref on module for thread - this is released when thread exits */ | ||
1420 | if (!try_module_get(THIS_MODULE)) { | ||
1421 | dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); | ||
1422 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1423 | return FALSE; | ||
1424 | } | ||
1425 | |||
1426 | init_completion(&thi->stop); | ||
1427 | D_ASSERT(thi->task == NULL); | ||
1428 | thi->reset_cpu_mask = 1; | ||
1429 | thi->t_state = Running; | ||
1430 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1431 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ | ||
1432 | |||
1433 | nt = kthread_create(drbd_thread_setup, (void *) thi, | ||
1434 | "drbd%d_%s", mdev_to_minor(mdev), me); | ||
1435 | |||
1436 | if (IS_ERR(nt)) { | ||
1437 | dev_err(DEV, "Couldn't start thread\n"); | ||
1438 | |||
1439 | module_put(THIS_MODULE); | ||
1440 | return FALSE; | ||
1441 | } | ||
1442 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1443 | thi->task = nt; | ||
1444 | thi->t_state = Running; | ||
1445 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1446 | wake_up_process(nt); | ||
1447 | break; | ||
1448 | case Exiting: | ||
1449 | thi->t_state = Restarting; | ||
1450 | dev_info(DEV, "Restarting %s thread (from %s [%d])\n", | ||
1451 | me, current->comm, current->pid); | ||
1452 | /* fall through */ | ||
1453 | case Running: | ||
1454 | case Restarting: | ||
1455 | default: | ||
1456 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1457 | break; | ||
1458 | } | ||
1459 | |||
1460 | return TRUE; | ||
1461 | } | ||
1462 | |||
1463 | |||
1464 | void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | ||
1465 | { | ||
1466 | unsigned long flags; | ||
1467 | |||
1468 | enum drbd_thread_state ns = restart ? Restarting : Exiting; | ||
1469 | |||
1470 | /* may be called from state engine, holding the req lock irqsave */ | ||
1471 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1472 | |||
1473 | if (thi->t_state == None) { | ||
1474 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1475 | if (restart) | ||
1476 | drbd_thread_start(thi); | ||
1477 | return; | ||
1478 | } | ||
1479 | |||
1480 | if (thi->t_state != ns) { | ||
1481 | if (thi->task == NULL) { | ||
1482 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1483 | return; | ||
1484 | } | ||
1485 | |||
1486 | thi->t_state = ns; | ||
1487 | smp_mb(); | ||
1488 | init_completion(&thi->stop); | ||
1489 | if (thi->task != current) | ||
1490 | force_sig(DRBD_SIGKILL, thi->task); | ||
1491 | |||
1492 | } | ||
1493 | |||
1494 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1495 | |||
1496 | if (wait) | ||
1497 | wait_for_completion(&thi->stop); | ||
1498 | } | ||
1499 | |||
1500 | #ifdef CONFIG_SMP | ||
1501 | /** | ||
1502 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs | ||
1503 | * @mdev: DRBD device. | ||
1504 | * | ||
1505 | * Forces all threads of a device onto the same CPU. This is beneficial for | ||
1506 | * DRBD's performance. May be overridden by the user's configuration. | ||
1507 | */ | ||
1508 | void drbd_calc_cpu_mask(struct drbd_conf *mdev) | ||
1509 | { | ||
1510 | int ord, cpu; | ||
1511 | |||
1512 | /* user override. */ | ||
1513 | if (cpumask_weight(mdev->cpu_mask)) | ||
1514 | return; | ||
1515 | |||
1516 | ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); | ||
1517 | for_each_online_cpu(cpu) { | ||
1518 | if (ord-- == 0) { | ||
1519 | cpumask_set_cpu(cpu, mdev->cpu_mask); | ||
1520 | return; | ||
1521 | } | ||
1522 | } | ||
1523 | /* should not be reached */ | ||
1524 | cpumask_setall(mdev->cpu_mask); | ||
1525 | } | ||
1526 | |||
1527 | /** | ||
1528 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread | ||
1529 | * @mdev: DRBD device. | ||
1530 | * | ||
1531 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die | ||
1532 | * prematurely. | ||
1533 | */ | ||
1534 | void drbd_thread_current_set_cpu(struct drbd_conf *mdev) | ||
1535 | { | ||
1536 | struct task_struct *p = current; | ||
1537 | struct drbd_thread *thi = | ||
1538 | p == mdev->asender.task ? &mdev->asender : | ||
1539 | p == mdev->receiver.task ? &mdev->receiver : | ||
1540 | p == mdev->worker.task ? &mdev->worker : | ||
1541 | NULL; | ||
1542 | ERR_IF(thi == NULL) | ||
1543 | return; | ||
1544 | if (!thi->reset_cpu_mask) | ||
1545 | return; | ||
1546 | thi->reset_cpu_mask = 0; | ||
1547 | set_cpus_allowed_ptr(p, mdev->cpu_mask); | ||
1548 | } | ||
1549 | #endif | ||
1550 | |||
1551 | /* the appropriate socket mutex must be held already */ | ||
1552 | int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | ||
1553 | enum drbd_packets cmd, struct p_header *h, | ||
1554 | size_t size, unsigned msg_flags) | ||
1555 | { | ||
1556 | int sent, ok; | ||
1557 | |||
1558 | ERR_IF(!h) return FALSE; | ||
1559 | ERR_IF(!size) return FALSE; | ||
1560 | |||
1561 | h->magic = BE_DRBD_MAGIC; | ||
1562 | h->command = cpu_to_be16(cmd); | ||
1563 | h->length = cpu_to_be16(size-sizeof(struct p_header)); | ||
1564 | |||
1565 | sent = drbd_send(mdev, sock, h, size, msg_flags); | ||
1566 | |||
1567 | ok = (sent == size); | ||
1568 | if (!ok) | ||
1569 | dev_err(DEV, "short sent %s size=%d sent=%d\n", | ||
1570 | cmdname(cmd), (int)size, sent); | ||
1571 | return ok; | ||
1572 | } | ||
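_drbd_send_cmd() above always fills the same fixed header: a 32-bit magic, a 16-bit command and a 16-bit payload length (packet size minus the header), all in big-endian byte order. A minimal user-space sketch of filling such a header; the struct layout and the magic value are assumptions inferred from the fields used above, not the driver's definitions:

#include <arpa/inet.h>   /* htonl, htons, ntohs */
#include <stdint.h>
#include <stdio.h>

/* Assumed wire layout, inferred from the header fields used above. */
struct ex_p_header {
	uint32_t magic;      /* protocol magic, big endian */
	uint16_t command;    /* packet type, big endian */
	uint16_t length;     /* payload length, excludes this header */
} __attribute__((packed));

static void ex_fill_header(struct ex_p_header *h, uint16_t cmd, size_t total_size)
{
	h->magic   = htonl(0x12345678u);     /* placeholder magic, not DRBD's */
	h->command = htons(cmd);
	/* as in _drbd_send_cmd(): length = total packet size minus the header */
	h->length  = htons((uint16_t)(total_size - sizeof(*h)));
}

int main(void)
{
	struct ex_p_header h;
	size_t payload = 16;                 /* hypothetical payload size */

	ex_fill_header(&h, 7 /* hypothetical command code */, sizeof(h) + payload);
	printf("header is %zu bytes, length field = %u\n",
	       sizeof(h), (unsigned)ntohs(h.length));
	return 0;
}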
1573 | |||
1574 | /* don't pass the socket. we may only look at it | ||
1575 | * when we hold the appropriate socket mutex. | ||
1576 | */ | ||
1577 | int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1578 | enum drbd_packets cmd, struct p_header *h, size_t size) | ||
1579 | { | ||
1580 | int ok = 0; | ||
1581 | struct socket *sock; | ||
1582 | |||
1583 | if (use_data_socket) { | ||
1584 | mutex_lock(&mdev->data.mutex); | ||
1585 | sock = mdev->data.socket; | ||
1586 | } else { | ||
1587 | mutex_lock(&mdev->meta.mutex); | ||
1588 | sock = mdev->meta.socket; | ||
1589 | } | ||
1590 | |||
1591 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1592 | * while we were waiting in down()... */ | ||
1593 | if (likely(sock != NULL)) | ||
1594 | ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); | ||
1595 | |||
1596 | if (use_data_socket) | ||
1597 | mutex_unlock(&mdev->data.mutex); | ||
1598 | else | ||
1599 | mutex_unlock(&mdev->meta.mutex); | ||
1600 | return ok; | ||
1601 | } | ||
1602 | |||
1603 | int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, | ||
1604 | size_t size) | ||
1605 | { | ||
1606 | struct p_header h; | ||
1607 | int ok; | ||
1608 | |||
1609 | h.magic = BE_DRBD_MAGIC; | ||
1610 | h.command = cpu_to_be16(cmd); | ||
1611 | h.length = cpu_to_be16(size); | ||
1612 | |||
1613 | if (!drbd_get_data_sock(mdev)) | ||
1614 | return 0; | ||
1615 | |||
1616 | ok = (sizeof(h) == | ||
1617 | drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); | ||
1618 | ok = ok && (size == | ||
1619 | drbd_send(mdev, mdev->data.socket, data, size, 0)); | ||
1620 | |||
1621 | drbd_put_data_sock(mdev); | ||
1622 | |||
1623 | return ok; | ||
1624 | } | ||
1625 | |||
1626 | int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) | ||
1627 | { | ||
1628 | struct p_rs_param_89 *p; | ||
1629 | struct socket *sock; | ||
1630 | int size, rv; | ||
1631 | const int apv = mdev->agreed_pro_version; | ||
1632 | |||
1633 | size = apv <= 87 ? sizeof(struct p_rs_param) | ||
1634 | : apv == 88 ? sizeof(struct p_rs_param) | ||
1635 | + strlen(mdev->sync_conf.verify_alg) + 1 | ||
1636 | : /* 89 */ sizeof(struct p_rs_param_89); | ||
1637 | |||
1638 | /* used from admin command context and receiver/worker context. | ||
1639 | * to avoid kmalloc, grab the socket right here, | ||
1640 | * then use the pre-allocated sbuf there */ | ||
1641 | mutex_lock(&mdev->data.mutex); | ||
1642 | sock = mdev->data.socket; | ||
1643 | |||
1644 | if (likely(sock != NULL)) { | ||
1645 | enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; | ||
1646 | |||
1647 | p = &mdev->data.sbuf.rs_param_89; | ||
1648 | |||
1649 | /* initialize verify_alg and csums_alg */ | ||
1650 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
1651 | |||
1652 | p->rate = cpu_to_be32(sc->rate); | ||
1653 | |||
1654 | if (apv >= 88) | ||
1655 | strcpy(p->verify_alg, mdev->sync_conf.verify_alg); | ||
1656 | if (apv >= 89) | ||
1657 | strcpy(p->csums_alg, mdev->sync_conf.csums_alg); | ||
1658 | |||
1659 | rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); | ||
1660 | } else | ||
1661 | rv = 0; /* not ok */ | ||
1662 | |||
1663 | mutex_unlock(&mdev->data.mutex); | ||
1664 | |||
1665 | return rv; | ||
1666 | } | ||
1667 | |||
1668 | int drbd_send_protocol(struct drbd_conf *mdev) | ||
1669 | { | ||
1670 | struct p_protocol *p; | ||
1671 | int size, cf, rv; | ||
1672 | |||
1673 | size = sizeof(struct p_protocol); | ||
1674 | |||
1675 | if (mdev->agreed_pro_version >= 87) | ||
1676 | size += strlen(mdev->net_conf->integrity_alg) + 1; | ||
1677 | |||
1678 | /* we must not recurse into our own queue, | ||
1679 | * as that is blocked during handshake */ | ||
1680 | p = kmalloc(size, GFP_NOIO); | ||
1681 | if (p == NULL) | ||
1682 | return 0; | ||
1683 | |||
1684 | p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); | ||
1685 | p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); | ||
1686 | p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); | ||
1687 | p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); | ||
1688 | p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); | ||
1689 | |||
1690 | cf = 0; | ||
1691 | if (mdev->net_conf->want_lose) | ||
1692 | cf |= CF_WANT_LOSE; | ||
1693 | if (mdev->net_conf->dry_run) { | ||
1694 | if (mdev->agreed_pro_version >= 92) | ||
1695 | cf |= CF_DRY_RUN; | ||
1696 | else { | ||
1697 | dev_err(DEV, "--dry-run is not supported by peer"); | ||
1698 | kfree(p); | ||
1699 | return 0; | ||
1700 | } | ||
1701 | } | ||
1702 | p->conn_flags = cpu_to_be32(cf); | ||
1703 | |||
1704 | if (mdev->agreed_pro_version >= 87) | ||
1705 | strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); | ||
1706 | |||
1707 | rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, | ||
1708 | (struct p_header *)p, size); | ||
1709 | kfree(p); | ||
1710 | return rv; | ||
1711 | } | ||
1712 | |||
1713 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) | ||
1714 | { | ||
1715 | struct p_uuids p; | ||
1716 | int i; | ||
1717 | |||
1718 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | ||
1719 | return 1; | ||
1720 | |||
1721 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
1722 | p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; | ||
1723 | |||
1724 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); | ||
1725 | p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); | ||
1726 | uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; | ||
1727 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; | ||
1728 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; | ||
1729 | p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); | ||
1730 | |||
1731 | put_ldev(mdev); | ||
1732 | |||
1733 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, | ||
1734 | (struct p_header *)&p, sizeof(p)); | ||
1735 | } | ||
1736 | |||
1737 | int drbd_send_uuids(struct drbd_conf *mdev) | ||
1738 | { | ||
1739 | return _drbd_send_uuids(mdev, 0); | ||
1740 | } | ||
1741 | |||
1742 | int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev) | ||
1743 | { | ||
1744 | return _drbd_send_uuids(mdev, 8); | ||
1745 | } | ||
1746 | |||
1747 | |||
1748 | int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) | ||
1749 | { | ||
1750 | struct p_rs_uuid p; | ||
1751 | |||
1752 | p.uuid = cpu_to_be64(val); | ||
1753 | |||
1754 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, | ||
1755 | (struct p_header *)&p, sizeof(p)); | ||
1756 | } | ||
1757 | |||
1758 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) | ||
1759 | { | ||
1760 | struct p_sizes p; | ||
1761 | sector_t d_size, u_size; | ||
1762 | int q_order_type; | ||
1763 | int ok; | ||
1764 | |||
1765 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1766 | D_ASSERT(mdev->ldev->backing_bdev); | ||
1767 | d_size = drbd_get_max_capacity(mdev->ldev); | ||
1768 | u_size = mdev->ldev->dc.disk_size; | ||
1769 | q_order_type = drbd_queue_order_type(mdev); | ||
1770 | p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); | ||
1771 | put_ldev(mdev); | ||
1772 | } else { | ||
1773 | d_size = 0; | ||
1774 | u_size = 0; | ||
1775 | q_order_type = QUEUE_ORDERED_NONE; | ||
1776 | } | ||
1777 | |||
1778 | p.d_size = cpu_to_be64(d_size); | ||
1779 | p.u_size = cpu_to_be64(u_size); | ||
1780 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | ||
1781 | p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); | ||
1782 | p.queue_order_type = cpu_to_be32(q_order_type); | ||
1783 | |||
1784 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | ||
1785 | (struct p_header *)&p, sizeof(p)); | ||
1786 | return ok; | ||
1787 | } | ||
1788 | |||
1789 | /** | ||
1790 | * drbd_send_state() - Sends the drbd state to the peer | ||
1791 | * @mdev: DRBD device. | ||
1792 | */ | ||
1793 | int drbd_send_state(struct drbd_conf *mdev) | ||
1794 | { | ||
1795 | struct socket *sock; | ||
1796 | struct p_state p; | ||
1797 | int ok = 0; | ||
1798 | |||
1799 | /* Grab state lock so we won't send state if we're in the middle | ||
1800 | * of a cluster wide state change on another thread */ | ||
1801 | drbd_state_lock(mdev); | ||
1802 | |||
1803 | mutex_lock(&mdev->data.mutex); | ||
1804 | |||
1805 | p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | ||
1806 | sock = mdev->data.socket; | ||
1807 | |||
1808 | if (likely(sock != NULL)) { | ||
1809 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | ||
1810 | (struct p_header *)&p, sizeof(p), 0); | ||
1811 | } | ||
1812 | |||
1813 | mutex_unlock(&mdev->data.mutex); | ||
1814 | |||
1815 | drbd_state_unlock(mdev); | ||
1816 | return ok; | ||
1817 | } | ||
1818 | |||
1819 | int drbd_send_state_req(struct drbd_conf *mdev, | ||
1820 | union drbd_state mask, union drbd_state val) | ||
1821 | { | ||
1822 | struct p_req_state p; | ||
1823 | |||
1824 | p.mask = cpu_to_be32(mask.i); | ||
1825 | p.val = cpu_to_be32(val.i); | ||
1826 | |||
1827 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, | ||
1828 | (struct p_header *)&p, sizeof(p)); | ||
1829 | } | ||
1830 | |||
1831 | int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) | ||
1832 | { | ||
1833 | struct p_req_state_reply p; | ||
1834 | |||
1835 | p.retcode = cpu_to_be32(retcode); | ||
1836 | |||
1837 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, | ||
1838 | (struct p_header *)&p, sizeof(p)); | ||
1839 | } | ||
1840 | |||
1841 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, | ||
1842 | struct p_compressed_bm *p, | ||
1843 | struct bm_xfer_ctx *c) | ||
1844 | { | ||
1845 | struct bitstream bs; | ||
1846 | unsigned long plain_bits; | ||
1847 | unsigned long tmp; | ||
1848 | unsigned long rl; | ||
1849 | unsigned len; | ||
1850 | unsigned toggle; | ||
1851 | int bits; | ||
1852 | |||
1853 | /* may we use this feature? */ | ||
1854 | if ((mdev->sync_conf.use_rle == 0) || | ||
1855 | (mdev->agreed_pro_version < 90)) | ||
1856 | return 0; | ||
1857 | |||
1858 | if (c->bit_offset >= c->bm_bits) | ||
1859 | return 0; /* nothing to do. */ | ||
1860 | |||
1861 | /* use at most this many bytes */ | ||
1862 | bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); | ||
1863 | memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); | ||
1864 | /* plain bits covered in this code string */ | ||
1865 | plain_bits = 0; | ||
1866 | |||
1867 | /* p->encoding & 0x80 stores whether the first run length is set. | ||
1868 | * bit offset is implicit. | ||
1869 | * start with toggle == 2 to be able to tell the first iteration */ | ||
1870 | toggle = 2; | ||
1871 | |||
1872 | /* see how many plain bits we can stuff into one packet | ||
1873 | * using RLE and VLI. */ | ||
1874 | do { | ||
1875 | tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset) | ||
1876 | : _drbd_bm_find_next(mdev, c->bit_offset); | ||
1877 | if (tmp == -1UL) | ||
1878 | tmp = c->bm_bits; | ||
1879 | rl = tmp - c->bit_offset; | ||
1880 | |||
1881 | if (toggle == 2) { /* first iteration */ | ||
1882 | if (rl == 0) { | ||
1883 | /* the first checked bit was set, | ||
1884 | * store start value, */ | ||
1885 | DCBP_set_start(p, 1); | ||
1886 | /* but skip encoding of zero run length */ | ||
1887 | toggle = !toggle; | ||
1888 | continue; | ||
1889 | } | ||
1890 | DCBP_set_start(p, 0); | ||
1891 | } | ||
1892 | |||
1893 | /* paranoia: catch zero runlength. | ||
1894 | * can only happen if bitmap is modified while we scan it. */ | ||
1895 | if (rl == 0) { | ||
1896 | dev_err(DEV, "unexpected zero runlength while encoding bitmap " | ||
1897 | "t:%u bo:%lu\n", toggle, c->bit_offset); | ||
1898 | return -1; | ||
1899 | } | ||
1900 | |||
1901 | bits = vli_encode_bits(&bs, rl); | ||
1902 | if (bits == -ENOBUFS) /* buffer full */ | ||
1903 | break; | ||
1904 | if (bits <= 0) { | ||
1905 | dev_err(DEV, "error while encoding bitmap: %d\n", bits); | ||
1906 | return 0; | ||
1907 | } | ||
1908 | |||
1909 | toggle = !toggle; | ||
1910 | plain_bits += rl; | ||
1911 | c->bit_offset = tmp; | ||
1912 | } while (c->bit_offset < c->bm_bits); | ||
1913 | |||
1914 | len = bs.cur.b - p->code + !!bs.cur.bit; | ||
1915 | |||
1916 | if (plain_bits < (len << 3)) { | ||
1917 | /* incompressible with this method. | ||
1918 | * we need to rewind both word and bit position. */ | ||
1919 | c->bit_offset -= plain_bits; | ||
1920 | bm_xfer_ctx_bit_to_word_offset(c); | ||
1921 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
1922 | return 0; | ||
1923 | } | ||
1924 | |||
1925 | /* RLE + VLI was able to compress it just fine. | ||
1926 | * update c->word_offset. */ | ||
1927 | bm_xfer_ctx_bit_to_word_offset(c); | ||
1928 | |||
1929 | /* store pad_bits */ | ||
1930 | DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); | ||
1931 | |||
1932 | return len; | ||
1933 | } | ||
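fill_bitmap_rle_bits() above walks the bitmap as alternating runs: the length of the next run of zeros, then the next run of ones, and so on, toggling each time; those run lengths are then variable-length encoded into the packet. A simplified standalone sketch of that toggle walk, leaving out the VLI bit packing and the first-run start flag:

#include <stddef.h>
#include <stdio.h>

/* Print the lengths of alternating 0-runs and 1-runs of a small bit array,
 * mirroring the toggle loop above (VLI encoding and start flag omitted). */
static void ex_rle_runs(const unsigned char *bits, size_t nbits)
{
	int want = 0;            /* current run value; start with the zero run */
	size_t i = 0;

	while (i < nbits) {
		size_t start = i;
		while (i < nbits && ((bits[i / 8] >> (i % 8)) & 1) == want)
			i++;
		printf("run of %d: length %zu\n", want, i - start);
		want = !want;    /* toggle, as the encoder does */
	}
}

int main(void)
{
	/* hypothetical 16-bit bitmap, LSB first: 0000 1111 1100 0011 */
	unsigned char bm[2] = { 0xF0, 0xC3 };

	ex_rle_runs(bm, 16);     /* -> runs of length 4, 6, 4, 2 */
	return 0;
}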
1934 | |||
1935 | enum { OK, FAILED, DONE } | ||
1936 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, | ||
1937 | struct p_header *h, struct bm_xfer_ctx *c) | ||
1938 | { | ||
1939 | struct p_compressed_bm *p = (void*)h; | ||
1940 | unsigned long num_words; | ||
1941 | int len; | ||
1942 | int ok; | ||
1943 | |||
1944 | len = fill_bitmap_rle_bits(mdev, p, c); | ||
1945 | |||
1946 | if (len < 0) | ||
1947 | return FAILED; | ||
1948 | |||
1949 | if (len) { | ||
1950 | DCBP_set_code(p, RLE_VLI_Bits); | ||
1951 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, | ||
1952 | sizeof(*p) + len, 0); | ||
1953 | |||
1954 | c->packets[0]++; | ||
1955 | c->bytes[0] += sizeof(*p) + len; | ||
1956 | |||
1957 | if (c->bit_offset >= c->bm_bits) | ||
1958 | len = 0; /* DONE */ | ||
1959 | } else { | ||
1960 | /* was not compressible. | ||
1961 | * send a buffer full of plain text bits instead. */ | ||
1962 | num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | ||
1963 | len = num_words * sizeof(long); | ||
1964 | if (len) | ||
1965 | drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); | ||
1966 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, | ||
1967 | h, sizeof(struct p_header) + len, 0); | ||
1968 | c->word_offset += num_words; | ||
1969 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
1970 | |||
1971 | c->packets[1]++; | ||
1972 | c->bytes[1] += sizeof(struct p_header) + len; | ||
1973 | |||
1974 | if (c->bit_offset > c->bm_bits) | ||
1975 | c->bit_offset = c->bm_bits; | ||
1976 | } | ||
1977 | ok = ok ? ((len == 0) ? DONE : OK) : FAILED; | ||
1978 | |||
1979 | if (ok == DONE) | ||
1980 | INFO_bm_xfer_stats(mdev, "send", c); | ||
1981 | return ok; | ||
1982 | } | ||
1983 | |||
1984 | /* See the comment at receive_bitmap() */ | ||
1985 | int _drbd_send_bitmap(struct drbd_conf *mdev) | ||
1986 | { | ||
1987 | struct bm_xfer_ctx c; | ||
1988 | struct p_header *p; | ||
1989 | int ret; | ||
1990 | |||
1991 | ERR_IF(!mdev->bitmap) return FALSE; | ||
1992 | |||
1993 | /* maybe we should use some per thread scratch page, | ||
1994 | * and allocate that during initial device creation? */ | ||
1995 | p = (struct p_header *) __get_free_page(GFP_NOIO); | ||
1996 | if (!p) { | ||
1997 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
1998 | return FALSE; | ||
1999 | } | ||
2000 | |||
2001 | if (get_ldev(mdev)) { | ||
2002 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | ||
2003 | dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n"); | ||
2004 | drbd_bm_set_all(mdev); | ||
2005 | if (drbd_bm_write(mdev)) { | ||
2006 | /* write_bm did fail! Leave full sync flag set in meta data | ||
2007 | * but otherwise process as per normal - need to tell other | ||
2008 | * side that a full resync is required! */ | ||
2009 | dev_err(DEV, "Failed to write bitmap to disk!\n"); | ||
2010 | } else { | ||
2011 | drbd_md_clear_flag(mdev, MDF_FULL_SYNC); | ||
2012 | drbd_md_sync(mdev); | ||
2013 | } | ||
2014 | } | ||
2015 | put_ldev(mdev); | ||
2016 | } | ||
2017 | |||
2018 | c = (struct bm_xfer_ctx) { | ||
2019 | .bm_bits = drbd_bm_bits(mdev), | ||
2020 | .bm_words = drbd_bm_words(mdev), | ||
2021 | }; | ||
2022 | |||
2023 | do { | ||
2024 | ret = send_bitmap_rle_or_plain(mdev, p, &c); | ||
2025 | } while (ret == OK); | ||
2026 | |||
2027 | free_page((unsigned long) p); | ||
2028 | return (ret == DONE); | ||
2029 | } | ||
2030 | |||
2031 | int drbd_send_bitmap(struct drbd_conf *mdev) | ||
2032 | { | ||
2033 | int err; | ||
2034 | |||
2035 | if (!drbd_get_data_sock(mdev)) | ||
2036 | return -1; | ||
2037 | err = !_drbd_send_bitmap(mdev); | ||
2038 | drbd_put_data_sock(mdev); | ||
2039 | return err; | ||
2040 | } | ||
2041 | |||
2042 | int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | ||
2043 | { | ||
2044 | int ok; | ||
2045 | struct p_barrier_ack p; | ||
2046 | |||
2047 | p.barrier = barrier_nr; | ||
2048 | p.set_size = cpu_to_be32(set_size); | ||
2049 | |||
2050 | if (mdev->state.conn < C_CONNECTED) | ||
2051 | return FALSE; | ||
2052 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, | ||
2053 | (struct p_header *)&p, sizeof(p)); | ||
2054 | return ok; | ||
2055 | } | ||
2056 | |||
2057 | /** | ||
2058 | * _drbd_send_ack() - Sends an ack packet | ||
2059 | * @mdev: DRBD device. | ||
2060 | * @cmd: Packet command code. | ||
2061 | * @sector: sector, needs to be in big endian byte order | ||
2062 | * @blksize: size in byte, needs to be in big endian byte order | ||
2063 | * @block_id: Id, big endian byte order | ||
2064 | */ | ||
2065 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2066 | u64 sector, | ||
2067 | u32 blksize, | ||
2068 | u64 block_id) | ||
2069 | { | ||
2070 | int ok; | ||
2071 | struct p_block_ack p; | ||
2072 | |||
2073 | p.sector = sector; | ||
2074 | p.block_id = block_id; | ||
2075 | p.blksize = blksize; | ||
2076 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2077 | |||
2078 | if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) | ||
2079 | return FALSE; | ||
2080 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, | ||
2081 | (struct p_header *)&p, sizeof(p)); | ||
2082 | return ok; | ||
2083 | } | ||
2084 | |||
2085 | int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2086 | struct p_data *dp) | ||
2087 | { | ||
2088 | const int header_size = sizeof(struct p_data) | ||
2089 | - sizeof(struct p_header); | ||
2090 | int data_size = ((struct p_header *)dp)->length - header_size; | ||
2091 | |||
2092 | return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), | ||
2093 | dp->block_id); | ||
2094 | } | ||
2095 | |||
2096 | int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2097 | struct p_block_req *rp) | ||
2098 | { | ||
2099 | return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); | ||
2100 | } | ||
2101 | |||
2102 | /** | ||
2103 | * drbd_send_ack() - Sends an ack packet | ||
2104 | * @mdev: DRBD device. | ||
2105 | * @cmd: Packet command code. | ||
2106 | * @e: Epoch entry. | ||
2107 | */ | ||
2108 | int drbd_send_ack(struct drbd_conf *mdev, | ||
2109 | enum drbd_packets cmd, struct drbd_epoch_entry *e) | ||
2110 | { | ||
2111 | return _drbd_send_ack(mdev, cmd, | ||
2112 | cpu_to_be64(e->sector), | ||
2113 | cpu_to_be32(e->size), | ||
2114 | e->block_id); | ||
2115 | } | ||
2116 | |||
2117 | /* This function misuses the block_id field to signal if the blocks | ||
2118 | * are in sync or not. */ | ||
2119 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2120 | sector_t sector, int blksize, u64 block_id) | ||
2121 | { | ||
2122 | return _drbd_send_ack(mdev, cmd, | ||
2123 | cpu_to_be64(sector), | ||
2124 | cpu_to_be32(blksize), | ||
2125 | cpu_to_be64(block_id)); | ||
2126 | } | ||
2127 | |||
2128 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | ||
2129 | sector_t sector, int size, u64 block_id) | ||
2130 | { | ||
2131 | int ok; | ||
2132 | struct p_block_req p; | ||
2133 | |||
2134 | p.sector = cpu_to_be64(sector); | ||
2135 | p.block_id = block_id; | ||
2136 | p.blksize = cpu_to_be32(size); | ||
2137 | |||
2138 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, | ||
2139 | (struct p_header *)&p, sizeof(p)); | ||
2140 | return ok; | ||
2141 | } | ||
2142 | |||
2143 | int drbd_send_drequest_csum(struct drbd_conf *mdev, | ||
2144 | sector_t sector, int size, | ||
2145 | void *digest, int digest_size, | ||
2146 | enum drbd_packets cmd) | ||
2147 | { | ||
2148 | int ok; | ||
2149 | struct p_block_req p; | ||
2150 | |||
2151 | p.sector = cpu_to_be64(sector); | ||
2152 | p.block_id = BE_DRBD_MAGIC + 0xbeef; | ||
2153 | p.blksize = cpu_to_be32(size); | ||
2154 | |||
2155 | p.head.magic = BE_DRBD_MAGIC; | ||
2156 | p.head.command = cpu_to_be16(cmd); | ||
2157 | p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); | ||
2158 | |||
2159 | mutex_lock(&mdev->data.mutex); | ||
2160 | |||
2161 | ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); | ||
2162 | ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); | ||
2163 | |||
2164 | mutex_unlock(&mdev->data.mutex); | ||
2165 | |||
2166 | return ok; | ||
2167 | } | ||
2168 | |||
2169 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | ||
2170 | { | ||
2171 | int ok; | ||
2172 | struct p_block_req p; | ||
2173 | |||
2174 | p.sector = cpu_to_be64(sector); | ||
2175 | p.block_id = BE_DRBD_MAGIC + 0xbabe; | ||
2176 | p.blksize = cpu_to_be32(size); | ||
2177 | |||
2178 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, | ||
2179 | (struct p_header *)&p, sizeof(p)); | ||
2180 | return ok; | ||
2181 | } | ||
2182 | |||
2183 | /* called on sndtimeo | ||
2184 | * returns FALSE if we should retry, | ||
2185 | * TRUE if we think connection is dead | ||
2186 | */ | ||
2187 | static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) | ||
2188 | { | ||
2189 | int drop_it; | ||
2190 | /* long elapsed = (long)(jiffies - mdev->last_received); */ | ||
2191 | |||
2192 | drop_it = mdev->meta.socket == sock | ||
2193 | || !mdev->asender.task | ||
2194 | || get_t_state(&mdev->asender) != Running | ||
2195 | || mdev->state.conn < C_CONNECTED; | ||
2196 | |||
2197 | if (drop_it) | ||
2198 | return TRUE; | ||
2199 | |||
2200 | drop_it = !--mdev->ko_count; | ||
2201 | if (!drop_it) { | ||
2202 | dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", | ||
2203 | current->comm, current->pid, mdev->ko_count); | ||
2204 | request_ping(mdev); | ||
2205 | } | ||
2206 | |||
2207 | return drop_it; /* && (mdev->state == R_PRIMARY) */; | ||
2208 | } | ||
2209 | |||
2210 | /* The idea of sendpage seems to be to put some kind of reference | ||
2211 | * to the page into the skb, and to hand it over to the NIC. In | ||
2212 | * this process get_page() gets called. | ||
2213 | * | ||
2214 | * As soon as the page was really sent over the network put_page() | ||
2215 | * gets called by some part of the network layer. [ NIC driver? ] | ||
2216 | * | ||
2217 | * [ get_page() / put_page() increment/decrement the count. If count | ||
2218 | * reaches 0 the page will be freed. ] | ||
2219 | * | ||
2220 | * This works nicely with pages from FSs. | ||
2221 | * But this means that in protocol A we might signal IO completion too early! | ||
2222 | * | ||
2223 | * In order not to corrupt data during a resync we must make sure | ||
2224 | * that we do not reuse our own buffer pages (EEs) too early, therefore | ||
2225 | * we have the net_ee list. | ||
2226 | * | ||
2227 | * XFS seems to have problems, still, it submits pages with page_count == 0! | ||
2228 | * As a workaround, we disable sendpage on pages | ||
2229 | * with page_count == 0 or PageSlab. | ||
2230 | */ | ||
2231 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, | ||
2232 | int offset, size_t size) | ||
2233 | { | ||
2234 | int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); | ||
2235 | kunmap(page); | ||
2236 | if (sent == size) | ||
2237 | mdev->send_cnt += size>>9; | ||
2238 | return sent == size; | ||
2239 | } | ||
2240 | |||
2241 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | ||
2242 | int offset, size_t size) | ||
2243 | { | ||
2244 | mm_segment_t oldfs = get_fs(); | ||
2245 | int sent, ok; | ||
2246 | int len = size; | ||
2247 | |||
2248 | /* e.g. XFS meta- & log-data is in slab pages, which have a | ||
2249 | * page_count of 0 and/or have PageSlab() set. | ||
2250 | * we cannot use send_page for those, as that does get_page(); | ||
2251 | * put_page(); and would cause either a VM_BUG directly, or | ||
2252 | * __page_cache_release of a page that is actually still referenced | ||
2253 | * by someone, leading to some obscure delayed Oops somewhere else. */ | ||
2254 | if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) | ||
2255 | return _drbd_no_send_page(mdev, page, offset, size); | ||
2256 | |||
2257 | drbd_update_congested(mdev); | ||
2258 | set_fs(KERNEL_DS); | ||
2259 | do { | ||
2260 | sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, | ||
2261 | offset, len, | ||
2262 | MSG_NOSIGNAL); | ||
2263 | if (sent == -EAGAIN) { | ||
2264 | if (we_should_drop_the_connection(mdev, | ||
2265 | mdev->data.socket)) | ||
2266 | break; | ||
2267 | else | ||
2268 | continue; | ||
2269 | } | ||
2270 | if (sent <= 0) { | ||
2271 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", | ||
2272 | __func__, (int)size, len, sent); | ||
2273 | break; | ||
2274 | } | ||
2275 | len -= sent; | ||
2276 | offset += sent; | ||
2277 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); | ||
2278 | set_fs(oldfs); | ||
2279 | clear_bit(NET_CONGESTED, &mdev->flags); | ||
2280 | |||
2281 | ok = (len == 0); | ||
2282 | if (likely(ok)) | ||
2283 | mdev->send_cnt += size>>9; | ||
2284 | return ok; | ||
2285 | } | ||
2286 | |||
2287 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | ||
2288 | { | ||
2289 | struct bio_vec *bvec; | ||
2290 | int i; | ||
2291 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
2292 | if (!_drbd_no_send_page(mdev, bvec->bv_page, | ||
2293 | bvec->bv_offset, bvec->bv_len)) | ||
2294 | return 0; | ||
2295 | } | ||
2296 | return 1; | ||
2297 | } | ||
2298 | |||
2299 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | ||
2300 | { | ||
2301 | struct bio_vec *bvec; | ||
2302 | int i; | ||
2303 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
2304 | if (!_drbd_send_page(mdev, bvec->bv_page, | ||
2305 | bvec->bv_offset, bvec->bv_len)) | ||
2306 | return 0; | ||
2307 | } | ||
2308 | |||
2309 | return 1; | ||
2310 | } | ||
2311 | |||
2312 | /* Used to send write requests | ||
2313 | * R_PRIMARY -> Peer (P_DATA) | ||
2314 | */ | ||
2315 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | ||
2316 | { | ||
2317 | int ok = 1; | ||
2318 | struct p_data p; | ||
2319 | unsigned int dp_flags = 0; | ||
2320 | void *dgb; | ||
2321 | int dgs; | ||
2322 | |||
2323 | if (!drbd_get_data_sock(mdev)) | ||
2324 | return 0; | ||
2325 | |||
2326 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2327 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2328 | |||
2329 | p.head.magic = BE_DRBD_MAGIC; | ||
2330 | p.head.command = cpu_to_be16(P_DATA); | ||
2331 | p.head.length = | ||
2332 | cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); | ||
2333 | |||
2334 | p.sector = cpu_to_be64(req->sector); | ||
2335 | p.block_id = (unsigned long)req; | ||
2336 | p.seq_num = cpu_to_be32(req->seq_num = | ||
2337 | atomic_add_return(1, &mdev->packet_seq)); | ||
2338 | dp_flags = 0; | ||
2339 | |||
2340 | /* NOTE: no need to check if barriers are supported here as we would | ||
2341 | * not pass the test in make_request_common in that case | ||
2342 | */ | ||
2343 | if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { | ||
2344 | dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); | ||
2345 | /* dp_flags |= DP_HARDBARRIER; */ | ||
2346 | } | ||
2347 | if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) | ||
2348 | dp_flags |= DP_RW_SYNC; | ||
2349 | /* for now handle SYNCIO and UNPLUG | ||
2350 | * as if they still were one and the same flag */ | ||
2351 | if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) | ||
2352 | dp_flags |= DP_RW_SYNC; | ||
2353 | if (mdev->state.conn >= C_SYNC_SOURCE && | ||
2354 | mdev->state.conn <= C_PAUSED_SYNC_T) | ||
2355 | dp_flags |= DP_MAY_SET_IN_SYNC; | ||
2356 | |||
2357 | p.dp_flags = cpu_to_be32(dp_flags); | ||
2358 | set_bit(UNPLUG_REMOTE, &mdev->flags); | ||
2359 | ok = (sizeof(p) == | ||
2360 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); | ||
2361 | if (ok && dgs) { | ||
2362 | dgb = mdev->int_dig_out; | ||
2363 | drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | ||
2364 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | ||
2365 | } | ||
2366 | if (ok) { | ||
2367 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A) | ||
2368 | ok = _drbd_send_bio(mdev, req->master_bio); | ||
2369 | else | ||
2370 | ok = _drbd_send_zc_bio(mdev, req->master_bio); | ||
2371 | } | ||
2372 | |||
2373 | drbd_put_data_sock(mdev); | ||
2374 | return ok; | ||
2375 | } | ||
2376 | |||
2377 | /* answer packet, used to send data back for read requests: | ||
2378 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) | ||
2379 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) | ||
2380 | */ | ||
2381 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2382 | struct drbd_epoch_entry *e) | ||
2383 | { | ||
2384 | int ok; | ||
2385 | struct p_data p; | ||
2386 | void *dgb; | ||
2387 | int dgs; | ||
2388 | |||
2389 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2390 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2391 | |||
2392 | p.head.magic = BE_DRBD_MAGIC; | ||
2393 | p.head.command = cpu_to_be16(cmd); | ||
2394 | p.head.length = | ||
2395 | cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); | ||
2396 | |||
2397 | p.sector = cpu_to_be64(e->sector); | ||
2398 | p.block_id = e->block_id; | ||
2399 | /* p.seq_num = 0; No sequence numbers here.. */ | ||
2400 | |||
2401 | /* Only called by our kernel thread. | ||
2402 | * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL | ||
2403 | * in response to admin command or module unload. | ||
2404 | */ | ||
2405 | if (!drbd_get_data_sock(mdev)) | ||
2406 | return 0; | ||
2407 | |||
2408 | ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, | ||
2409 | sizeof(p), MSG_MORE); | ||
2410 | if (ok && dgs) { | ||
2411 | dgb = mdev->int_dig_out; | ||
2412 | drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); | ||
2413 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | ||
2414 | } | ||
2415 | if (ok) | ||
2416 | ok = _drbd_send_zc_bio(mdev, e->private_bio); | ||
2417 | |||
2418 | drbd_put_data_sock(mdev); | ||
2419 | return ok; | ||
2420 | } | ||
2421 | |||
2422 | /* | ||
2423 | drbd_send distinguishes two cases: | ||
2424 | |||
2425 | Packets sent via the data socket "sock" | ||
2426 | and packets sent via the meta data socket "msock" | ||
2427 | |||
2428 | sock msock | ||
2429 | -----------------+-------------------------+------------------------------ | ||
2430 | timeout conf.timeout / 2 conf.timeout / 2 | ||
2431 | timeout action send a ping via msock Abort communication | ||
2432 | and close all sockets | ||
2433 | */ | ||
2434 | |||
2435 | /* | ||
2436 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! | ||
2437 | */ | ||
2438 | int drbd_send(struct drbd_conf *mdev, struct socket *sock, | ||
2439 | void *buf, size_t size, unsigned msg_flags) | ||
2440 | { | ||
2441 | struct kvec iov; | ||
2442 | struct msghdr msg; | ||
2443 | int rv, sent = 0; | ||
2444 | |||
2445 | if (!sock) | ||
2446 | return -1000; | ||
2447 | |||
2448 | /* THINK if (signal_pending) return ... ? */ | ||
2449 | |||
2450 | iov.iov_base = buf; | ||
2451 | iov.iov_len = size; | ||
2452 | |||
2453 | msg.msg_name = NULL; | ||
2454 | msg.msg_namelen = 0; | ||
2455 | msg.msg_control = NULL; | ||
2456 | msg.msg_controllen = 0; | ||
2457 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; | ||
2458 | |||
2459 | if (sock == mdev->data.socket) { | ||
2460 | mdev->ko_count = mdev->net_conf->ko_count; | ||
2461 | drbd_update_congested(mdev); | ||
2462 | } | ||
2463 | do { | ||
2464 | /* STRANGE | ||
2465 | * tcp_sendmsg does _not_ use its size parameter at all ? | ||
2466 | * | ||
2467 | * -EAGAIN on timeout, -EINTR on signal. | ||
2468 | */ | ||
2469 | /* THINK | ||
2470 | * do we need to block DRBD_SIG if sock == &meta.socket ?? | ||
2471 | * otherwise wake_asender() might interrupt some send_*Ack ! | ||
2472 | */ | ||
2473 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); | ||
2474 | if (rv == -EAGAIN) { | ||
2475 | if (we_should_drop_the_connection(mdev, sock)) | ||
2476 | break; | ||
2477 | else | ||
2478 | continue; | ||
2479 | } | ||
2480 | D_ASSERT(rv != 0); | ||
2481 | if (rv == -EINTR) { | ||
2482 | flush_signals(current); | ||
2483 | rv = 0; | ||
2484 | } | ||
2485 | if (rv < 0) | ||
2486 | break; | ||
2487 | sent += rv; | ||
2488 | iov.iov_base += rv; | ||
2489 | iov.iov_len -= rv; | ||
2490 | } while (sent < size); | ||
2491 | |||
2492 | if (sock == mdev->data.socket) | ||
2493 | clear_bit(NET_CONGESTED, &mdev->flags); | ||
2494 | |||
2495 | if (rv <= 0) { | ||
2496 | if (rv != -EAGAIN) { | ||
2497 | dev_err(DEV, "%s_sendmsg returned %d\n", | ||
2498 | sock == mdev->meta.socket ? "msock" : "sock", | ||
2499 | rv); | ||
2500 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | ||
2501 | } else | ||
2502 | drbd_force_state(mdev, NS(conn, C_TIMEOUT)); | ||
2503 | } | ||
2504 | |||
2505 | return sent; | ||
2506 | } | ||
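The send loop above is the classic "write it all" pattern: loop until every byte is out, treat -EINTR as retriable, and let any other error abort so the caller can choose between C_BROKEN_PIPE and C_TIMEOUT. A minimal user-space analogue over an ordinary connected socket looks like this (a sketch only, not drbd code; send_all() is an invented helper name):

    #include <errno.h>
    #include <sys/socket.h>

    /* Send the whole buffer, retrying short writes and EINTR. */
    static ssize_t send_all(int fd, const void *buf, size_t size)
    {
            size_t sent = 0;

            while (sent < size) {
                    ssize_t rv = send(fd, (const char *)buf + sent,
                                      size - sent, MSG_NOSIGNAL);
                    if (rv < 0) {
                            if (errno == EINTR)
                                    continue;       /* signal: retry */
                            return -1;              /* real error: give up */
                    }
                    sent += rv;
            }
            return (ssize_t)sent;
    }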
2507 | |||
2508 | static int drbd_open(struct block_device *bdev, fmode_t mode) | ||
2509 | { | ||
2510 | struct drbd_conf *mdev = bdev->bd_disk->private_data; | ||
2511 | unsigned long flags; | ||
2512 | int rv = 0; | ||
2513 | |||
2514 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
2515 | /* to have a stable mdev->state.role | ||
2516 | * and no race with updating open_cnt */ | ||
2517 | |||
2518 | if (mdev->state.role != R_PRIMARY) { | ||
2519 | if (mode & FMODE_WRITE) | ||
2520 | rv = -EROFS; | ||
2521 | else if (!allow_oos) | ||
2522 | rv = -EMEDIUMTYPE; | ||
2523 | } | ||
2524 | |||
2525 | if (!rv) | ||
2526 | mdev->open_cnt++; | ||
2527 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
2528 | |||
2529 | return rv; | ||
2530 | } | ||
2531 | |||
2532 | static int drbd_release(struct gendisk *gd, fmode_t mode) | ||
2533 | { | ||
2534 | struct drbd_conf *mdev = gd->private_data; | ||
2535 | mdev->open_cnt--; | ||
2536 | return 0; | ||
2537 | } | ||
2538 | |||
2539 | static void drbd_unplug_fn(struct request_queue *q) | ||
2540 | { | ||
2541 | struct drbd_conf *mdev = q->queuedata; | ||
2542 | |||
2543 | /* unplug FIRST */ | ||
2544 | spin_lock_irq(q->queue_lock); | ||
2545 | blk_remove_plug(q); | ||
2546 | spin_unlock_irq(q->queue_lock); | ||
2547 | |||
2548 | /* only if connected */ | ||
2549 | spin_lock_irq(&mdev->req_lock); | ||
2550 | if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { | ||
2551 | D_ASSERT(mdev->state.role == R_PRIMARY); | ||
2552 | if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { | ||
2553 | /* add to the data.work queue, | ||
2554 | * unless already queued. | ||
2555 | * XXX this might be a good addition to drbd_queue_work | ||
2556 | * anyways, to detect "double queuing" ... */ | ||
2557 | if (list_empty(&mdev->unplug_work.list)) | ||
2558 | drbd_queue_work(&mdev->data.work, | ||
2559 | &mdev->unplug_work); | ||
2560 | } | ||
2561 | } | ||
2562 | spin_unlock_irq(&mdev->req_lock); | ||
2563 | |||
2564 | if (mdev->state.disk >= D_INCONSISTENT) | ||
2565 | drbd_kick_lo(mdev); | ||
2566 | } | ||
2567 | |||
2568 | static void drbd_set_defaults(struct drbd_conf *mdev) | ||
2569 | { | ||
2570 | mdev->sync_conf.after = DRBD_AFTER_DEF; | ||
2571 | mdev->sync_conf.rate = DRBD_RATE_DEF; | ||
2572 | mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; | ||
2573 | mdev->state = (union drbd_state) { | ||
2574 | { .role = R_SECONDARY, | ||
2575 | .peer = R_UNKNOWN, | ||
2576 | .conn = C_STANDALONE, | ||
2577 | .disk = D_DISKLESS, | ||
2578 | .pdsk = D_UNKNOWN, | ||
2579 | .susp = 0 | ||
2580 | } }; | ||
2581 | } | ||
2582 | |||
2583 | void drbd_init_set_defaults(struct drbd_conf *mdev) | ||
2584 | { | ||
2585 | /* the memset(,0,) did most of this. | ||
2586 | * note: only assignments, no allocation in here */ | ||
2587 | |||
2588 | drbd_set_defaults(mdev); | ||
2589 | |||
2590 | /* for now, we do NOT yet support it, | ||
2591 | * even though we start some framework | ||
2592 | * to eventually support barriers */ | ||
2593 | set_bit(NO_BARRIER_SUPP, &mdev->flags); | ||
2594 | |||
2595 | atomic_set(&mdev->ap_bio_cnt, 0); | ||
2596 | atomic_set(&mdev->ap_pending_cnt, 0); | ||
2597 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
2598 | atomic_set(&mdev->unacked_cnt, 0); | ||
2599 | atomic_set(&mdev->local_cnt, 0); | ||
2600 | atomic_set(&mdev->net_cnt, 0); | ||
2601 | atomic_set(&mdev->packet_seq, 0); | ||
2602 | atomic_set(&mdev->pp_in_use, 0); | ||
2603 | |||
2604 | mutex_init(&mdev->md_io_mutex); | ||
2605 | mutex_init(&mdev->data.mutex); | ||
2606 | mutex_init(&mdev->meta.mutex); | ||
2607 | sema_init(&mdev->data.work.s, 0); | ||
2608 | sema_init(&mdev->meta.work.s, 0); | ||
2609 | mutex_init(&mdev->state_mutex); | ||
2610 | |||
2611 | spin_lock_init(&mdev->data.work.q_lock); | ||
2612 | spin_lock_init(&mdev->meta.work.q_lock); | ||
2613 | |||
2614 | spin_lock_init(&mdev->al_lock); | ||
2615 | spin_lock_init(&mdev->req_lock); | ||
2616 | spin_lock_init(&mdev->peer_seq_lock); | ||
2617 | spin_lock_init(&mdev->epoch_lock); | ||
2618 | |||
2619 | INIT_LIST_HEAD(&mdev->active_ee); | ||
2620 | INIT_LIST_HEAD(&mdev->sync_ee); | ||
2621 | INIT_LIST_HEAD(&mdev->done_ee); | ||
2622 | INIT_LIST_HEAD(&mdev->read_ee); | ||
2623 | INIT_LIST_HEAD(&mdev->net_ee); | ||
2624 | INIT_LIST_HEAD(&mdev->resync_reads); | ||
2625 | INIT_LIST_HEAD(&mdev->data.work.q); | ||
2626 | INIT_LIST_HEAD(&mdev->meta.work.q); | ||
2627 | INIT_LIST_HEAD(&mdev->resync_work.list); | ||
2628 | INIT_LIST_HEAD(&mdev->unplug_work.list); | ||
2629 | INIT_LIST_HEAD(&mdev->md_sync_work.list); | ||
2630 | INIT_LIST_HEAD(&mdev->bm_io_work.w.list); | ||
2631 | mdev->resync_work.cb = w_resync_inactive; | ||
2632 | mdev->unplug_work.cb = w_send_write_hint; | ||
2633 | mdev->md_sync_work.cb = w_md_sync; | ||
2634 | mdev->bm_io_work.w.cb = w_bitmap_io; | ||
2635 | init_timer(&mdev->resync_timer); | ||
2636 | init_timer(&mdev->md_sync_timer); | ||
2637 | mdev->resync_timer.function = resync_timer_fn; | ||
2638 | mdev->resync_timer.data = (unsigned long) mdev; | ||
2639 | mdev->md_sync_timer.function = md_sync_timer_fn; | ||
2640 | mdev->md_sync_timer.data = (unsigned long) mdev; | ||
2641 | |||
2642 | init_waitqueue_head(&mdev->misc_wait); | ||
2643 | init_waitqueue_head(&mdev->state_wait); | ||
2644 | init_waitqueue_head(&mdev->ee_wait); | ||
2645 | init_waitqueue_head(&mdev->al_wait); | ||
2646 | init_waitqueue_head(&mdev->seq_wait); | ||
2647 | |||
2648 | drbd_thread_init(mdev, &mdev->receiver, drbdd_init); | ||
2649 | drbd_thread_init(mdev, &mdev->worker, drbd_worker); | ||
2650 | drbd_thread_init(mdev, &mdev->asender, drbd_asender); | ||
2651 | |||
2652 | mdev->agreed_pro_version = PRO_VERSION_MAX; | ||
2653 | mdev->write_ordering = WO_bio_barrier; | ||
2654 | mdev->resync_wenr = LC_FREE; | ||
2655 | } | ||
2656 | |||
2657 | void drbd_mdev_cleanup(struct drbd_conf *mdev) | ||
2658 | { | ||
2659 | if (mdev->receiver.t_state != None) | ||
2660 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", | ||
2661 | mdev->receiver.t_state); | ||
2662 | |||
2663 | /* no need to lock it, I'm the only thread alive */ | ||
2664 | if (atomic_read(&mdev->current_epoch->epoch_size) != 0) | ||
2665 | dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); | ||
2666 | mdev->al_writ_cnt = | ||
2667 | mdev->bm_writ_cnt = | ||
2668 | mdev->read_cnt = | ||
2669 | mdev->recv_cnt = | ||
2670 | mdev->send_cnt = | ||
2671 | mdev->writ_cnt = | ||
2672 | mdev->p_size = | ||
2673 | mdev->rs_start = | ||
2674 | mdev->rs_total = | ||
2675 | mdev->rs_failed = | ||
2676 | mdev->rs_mark_left = | ||
2677 | mdev->rs_mark_time = 0; | ||
2678 | D_ASSERT(mdev->net_conf == NULL); | ||
2679 | |||
2680 | drbd_set_my_capacity(mdev, 0); | ||
2681 | if (mdev->bitmap) { | ||
2682 | /* maybe never allocated. */ | ||
2683 | drbd_bm_resize(mdev, 0); | ||
2684 | drbd_bm_cleanup(mdev); | ||
2685 | } | ||
2686 | |||
2687 | drbd_free_resources(mdev); | ||
2688 | |||
2689 | /* | ||
2690 | * currently we call drbd_init_ee only on module load, so | ||
2691 | * we may do drbd_release_ee only on module unload! | ||
2692 | */ | ||
2693 | D_ASSERT(list_empty(&mdev->active_ee)); | ||
2694 | D_ASSERT(list_empty(&mdev->sync_ee)); | ||
2695 | D_ASSERT(list_empty(&mdev->done_ee)); | ||
2696 | D_ASSERT(list_empty(&mdev->read_ee)); | ||
2697 | D_ASSERT(list_empty(&mdev->net_ee)); | ||
2698 | D_ASSERT(list_empty(&mdev->resync_reads)); | ||
2699 | D_ASSERT(list_empty(&mdev->data.work.q)); | ||
2700 | D_ASSERT(list_empty(&mdev->meta.work.q)); | ||
2701 | D_ASSERT(list_empty(&mdev->resync_work.list)); | ||
2702 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | ||
2703 | |||
2704 | } | ||
2705 | |||
2706 | |||
2707 | static void drbd_destroy_mempools(void) | ||
2708 | { | ||
2709 | struct page *page; | ||
2710 | |||
2711 | while (drbd_pp_pool) { | ||
2712 | page = drbd_pp_pool; | ||
2713 | drbd_pp_pool = (struct page *)page_private(page); | ||
2714 | __free_page(page); | ||
2715 | drbd_pp_vacant--; | ||
2716 | } | ||
2717 | |||
2718 | /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ | ||
2719 | |||
2720 | if (drbd_ee_mempool) | ||
2721 | mempool_destroy(drbd_ee_mempool); | ||
2722 | if (drbd_request_mempool) | ||
2723 | mempool_destroy(drbd_request_mempool); | ||
2724 | if (drbd_ee_cache) | ||
2725 | kmem_cache_destroy(drbd_ee_cache); | ||
2726 | if (drbd_request_cache) | ||
2727 | kmem_cache_destroy(drbd_request_cache); | ||
2728 | if (drbd_bm_ext_cache) | ||
2729 | kmem_cache_destroy(drbd_bm_ext_cache); | ||
2730 | if (drbd_al_ext_cache) | ||
2731 | kmem_cache_destroy(drbd_al_ext_cache); | ||
2732 | |||
2733 | drbd_ee_mempool = NULL; | ||
2734 | drbd_request_mempool = NULL; | ||
2735 | drbd_ee_cache = NULL; | ||
2736 | drbd_request_cache = NULL; | ||
2737 | drbd_bm_ext_cache = NULL; | ||
2738 | drbd_al_ext_cache = NULL; | ||
2739 | |||
2740 | return; | ||
2741 | } | ||
2742 | |||
2743 | static int drbd_create_mempools(void) | ||
2744 | { | ||
2745 | struct page *page; | ||
2746 | const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; | ||
2747 | int i; | ||
2748 | |||
2749 | /* prepare our caches and mempools */ | ||
2750 | drbd_request_mempool = NULL; | ||
2751 | drbd_ee_cache = NULL; | ||
2752 | drbd_request_cache = NULL; | ||
2753 | drbd_bm_ext_cache = NULL; | ||
2754 | drbd_al_ext_cache = NULL; | ||
2755 | drbd_pp_pool = NULL; | ||
2756 | |||
2757 | /* caches */ | ||
2758 | drbd_request_cache = kmem_cache_create( | ||
2759 | "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); | ||
2760 | if (drbd_request_cache == NULL) | ||
2761 | goto Enomem; | ||
2762 | |||
2763 | drbd_ee_cache = kmem_cache_create( | ||
2764 | "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); | ||
2765 | if (drbd_ee_cache == NULL) | ||
2766 | goto Enomem; | ||
2767 | |||
2768 | drbd_bm_ext_cache = kmem_cache_create( | ||
2769 | "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); | ||
2770 | if (drbd_bm_ext_cache == NULL) | ||
2771 | goto Enomem; | ||
2772 | |||
2773 | drbd_al_ext_cache = kmem_cache_create( | ||
2774 | "drbd_al", sizeof(struct lc_element), 0, 0, NULL); | ||
2775 | if (drbd_al_ext_cache == NULL) | ||
2776 | goto Enomem; | ||
2777 | |||
2778 | /* mempools */ | ||
2779 | drbd_request_mempool = mempool_create(number, | ||
2780 | mempool_alloc_slab, mempool_free_slab, drbd_request_cache); | ||
2781 | if (drbd_request_mempool == NULL) | ||
2782 | goto Enomem; | ||
2783 | |||
2784 | drbd_ee_mempool = mempool_create(number, | ||
2785 | mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); | ||
2786 | if (drbd_ee_mempool == NULL) | ||
2787 | goto Enomem; | ||
2788 | |||
2789 | /* drbd's page pool */ | ||
2790 | spin_lock_init(&drbd_pp_lock); | ||
2791 | |||
2792 | for (i = 0; i < number; i++) { | ||
2793 | page = alloc_page(GFP_HIGHUSER); | ||
2794 | if (!page) | ||
2795 | goto Enomem; | ||
2796 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
2797 | drbd_pp_pool = page; | ||
2798 | } | ||
2799 | drbd_pp_vacant = number; | ||
2800 | |||
2801 | return 0; | ||
2802 | |||
2803 | Enomem: | ||
2804 | drbd_destroy_mempools(); /* in case we allocated some */ | ||
2805 | return -ENOMEM; | ||
2806 | } | ||
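The preallocation size works out to DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE pages per configured minor. As a rough, purely illustrative calculation (assuming a 32 KiB maximum segment size and 4 KiB pages, neither of which is stated here): 32 KiB / 4 KiB = 8 pages per minor, so minor_count = 32 would leave 8 * 32 = 256 pages, i.e. 1 MiB, sitting in drbd_pp_pool after module load.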
2807 | |||
2808 | static int drbd_notify_sys(struct notifier_block *this, unsigned long code, | ||
2809 | void *unused) | ||
2810 | { | ||
2811 | /* just so we have it. you never know what interesting things we | ||
2812 | * might want to do here some day... | ||
2813 | */ | ||
2814 | |||
2815 | return NOTIFY_DONE; | ||
2816 | } | ||
2817 | |||
2818 | static struct notifier_block drbd_notifier = { | ||
2819 | .notifier_call = drbd_notify_sys, | ||
2820 | }; | ||
2821 | |||
2822 | static void drbd_release_ee_lists(struct drbd_conf *mdev) | ||
2823 | { | ||
2824 | int rr; | ||
2825 | |||
2826 | rr = drbd_release_ee(mdev, &mdev->active_ee); | ||
2827 | if (rr) | ||
2828 | dev_err(DEV, "%d EEs in active list found!\n", rr); | ||
2829 | |||
2830 | rr = drbd_release_ee(mdev, &mdev->sync_ee); | ||
2831 | if (rr) | ||
2832 | dev_err(DEV, "%d EEs in sync list found!\n", rr); | ||
2833 | |||
2834 | rr = drbd_release_ee(mdev, &mdev->read_ee); | ||
2835 | if (rr) | ||
2836 | dev_err(DEV, "%d EEs in read list found!\n", rr); | ||
2837 | |||
2838 | rr = drbd_release_ee(mdev, &mdev->done_ee); | ||
2839 | if (rr) | ||
2840 | dev_err(DEV, "%d EEs in done list found!\n", rr); | ||
2841 | |||
2842 | rr = drbd_release_ee(mdev, &mdev->net_ee); | ||
2843 | if (rr) | ||
2844 | dev_err(DEV, "%d EEs in net list found!\n", rr); | ||
2845 | } | ||
2846 | |||
2847 | /* caution. no locking. | ||
2848 | * currently only used from module cleanup code. */ | ||
2849 | static void drbd_delete_device(unsigned int minor) | ||
2850 | { | ||
2851 | struct drbd_conf *mdev = minor_to_mdev(minor); | ||
2852 | |||
2853 | if (!mdev) | ||
2854 | return; | ||
2855 | |||
2856 | /* paranoia asserts */ | ||
2857 | if (mdev->open_cnt != 0) | ||
2858 | dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, | ||
2859 | __FILE__ , __LINE__); | ||
2860 | |||
2861 | ERR_IF (!list_empty(&mdev->data.work.q)) { | ||
2862 | struct list_head *lp; | ||
2863 | list_for_each(lp, &mdev->data.work.q) { | ||
2864 | dev_err(DEV, "lp = %p\n", lp); | ||
2865 | } | ||
2866 | }; | ||
2867 | /* end paranoia asserts */ | ||
2868 | |||
2869 | del_gendisk(mdev->vdisk); | ||
2870 | |||
2871 | /* cleanup stuff that may have been allocated during | ||
2872 | * device (re-)configuration or state changes */ | ||
2873 | |||
2874 | if (mdev->this_bdev) | ||
2875 | bdput(mdev->this_bdev); | ||
2876 | |||
2877 | drbd_free_resources(mdev); | ||
2878 | |||
2879 | drbd_release_ee_lists(mdev); | ||
2880 | |||
2881 | /* should be free'd on disconnect? */ | ||
2882 | kfree(mdev->ee_hash); | ||
2883 | /* | ||
2884 | mdev->ee_hash_s = 0; | ||
2885 | mdev->ee_hash = NULL; | ||
2886 | */ | ||
2887 | |||
2888 | lc_destroy(mdev->act_log); | ||
2889 | lc_destroy(mdev->resync); | ||
2890 | |||
2891 | kfree(mdev->p_uuid); | ||
2892 | /* mdev->p_uuid = NULL; */ | ||
2893 | |||
2894 | kfree(mdev->int_dig_out); | ||
2895 | kfree(mdev->int_dig_in); | ||
2896 | kfree(mdev->int_dig_vv); | ||
2897 | |||
2898 | /* cleanup the rest that has been | ||
2899 | * allocated from drbd_new_device | ||
2900 | * and actually free the mdev itself */ | ||
2901 | drbd_free_mdev(mdev); | ||
2902 | } | ||
2903 | |||
2904 | static void drbd_cleanup(void) | ||
2905 | { | ||
2906 | unsigned int i; | ||
2907 | |||
2908 | unregister_reboot_notifier(&drbd_notifier); | ||
2909 | |||
2910 | drbd_nl_cleanup(); | ||
2911 | |||
2912 | if (minor_table) { | ||
2913 | if (drbd_proc) | ||
2914 | remove_proc_entry("drbd", NULL); | ||
2915 | i = minor_count; | ||
2916 | while (i--) | ||
2917 | drbd_delete_device(i); | ||
2918 | drbd_destroy_mempools(); | ||
2919 | } | ||
2920 | |||
2921 | kfree(minor_table); | ||
2922 | |||
2923 | unregister_blkdev(DRBD_MAJOR, "drbd"); | ||
2924 | |||
2925 | printk(KERN_INFO "drbd: module cleanup done.\n"); | ||
2926 | } | ||
2927 | |||
2928 | /** | ||
2929 | * drbd_congested() - Callback for pdflush | ||
2930 | * @congested_data: User data | ||
2931 | * @bdi_bits: Bits pdflush is currently interested in | ||
2932 | * | ||
2933 | * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. | ||
2934 | */ | ||
2935 | static int drbd_congested(void *congested_data, int bdi_bits) | ||
2936 | { | ||
2937 | struct drbd_conf *mdev = congested_data; | ||
2938 | struct request_queue *q; | ||
2939 | char reason = '-'; | ||
2940 | int r = 0; | ||
2941 | |||
2942 | if (!__inc_ap_bio_cond(mdev)) { | ||
2943 | /* DRBD has frozen IO */ | ||
2944 | r = bdi_bits; | ||
2945 | reason = 'd'; | ||
2946 | goto out; | ||
2947 | } | ||
2948 | |||
2949 | if (get_ldev(mdev)) { | ||
2950 | q = bdev_get_queue(mdev->ldev->backing_bdev); | ||
2951 | r = bdi_congested(&q->backing_dev_info, bdi_bits); | ||
2952 | put_ldev(mdev); | ||
2953 | if (r) | ||
2954 | reason = 'b'; | ||
2955 | } | ||
2956 | |||
2957 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { | ||
2958 | r |= (1 << BDI_async_congested); | ||
2959 | reason = reason == 'b' ? 'a' : 'n'; | ||
2960 | } | ||
2961 | |||
2962 | out: | ||
2963 | mdev->congestion_reason = reason; | ||
2964 | return r; | ||
2965 | } | ||
2966 | |||
2967 | struct drbd_conf *drbd_new_device(unsigned int minor) | ||
2968 | { | ||
2969 | struct drbd_conf *mdev; | ||
2970 | struct gendisk *disk; | ||
2971 | struct request_queue *q; | ||
2972 | |||
2973 | /* GFP_KERNEL, we are outside of all write-out paths */ | ||
2974 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); | ||
2975 | if (!mdev) | ||
2976 | return NULL; | ||
2977 | if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) | ||
2978 | goto out_no_cpumask; | ||
2979 | |||
2980 | mdev->minor = minor; | ||
2981 | |||
2982 | drbd_init_set_defaults(mdev); | ||
2983 | |||
2984 | q = blk_alloc_queue(GFP_KERNEL); | ||
2985 | if (!q) | ||
2986 | goto out_no_q; | ||
2987 | mdev->rq_queue = q; | ||
2988 | q->queuedata = mdev; | ||
2989 | |||
2990 | disk = alloc_disk(1); | ||
2991 | if (!disk) | ||
2992 | goto out_no_disk; | ||
2993 | mdev->vdisk = disk; | ||
2994 | |||
2995 | set_disk_ro(disk, TRUE); | ||
2996 | |||
2997 | disk->queue = q; | ||
2998 | disk->major = DRBD_MAJOR; | ||
2999 | disk->first_minor = minor; | ||
3000 | disk->fops = &drbd_ops; | ||
3001 | sprintf(disk->disk_name, "drbd%d", minor); | ||
3002 | disk->private_data = mdev; | ||
3003 | |||
3004 | mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); | ||
3005 | /* we have no partitions. we contain only ourselves. */ | ||
3006 | mdev->this_bdev->bd_contains = mdev->this_bdev; | ||
3007 | |||
3008 | q->backing_dev_info.congested_fn = drbd_congested; | ||
3009 | q->backing_dev_info.congested_data = mdev; | ||
3010 | |||
3011 | blk_queue_make_request(q, drbd_make_request_26); | ||
3012 | blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); | ||
3013 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); | ||
3014 | blk_queue_merge_bvec(q, drbd_merge_bvec); | ||
3015 | q->queue_lock = &mdev->req_lock; /* needed since we use */ | ||
3016 | /* plugging on a queue, that actually has no requests! */ | ||
3017 | q->unplug_fn = drbd_unplug_fn; | ||
3018 | |||
3019 | mdev->md_io_page = alloc_page(GFP_KERNEL); | ||
3020 | if (!mdev->md_io_page) | ||
3021 | goto out_no_io_page; | ||
3022 | |||
3023 | if (drbd_bm_init(mdev)) | ||
3024 | goto out_no_bitmap; | ||
3025 | /* no need to lock access, we are still initializing this minor device. */ | ||
3026 | if (!tl_init(mdev)) | ||
3027 | goto out_no_tl; | ||
3028 | |||
3029 | mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); | ||
3030 | if (!mdev->app_reads_hash) | ||
3031 | goto out_no_app_reads; | ||
3032 | |||
3033 | mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | ||
3034 | if (!mdev->current_epoch) | ||
3035 | goto out_no_epoch; | ||
3036 | |||
3037 | INIT_LIST_HEAD(&mdev->current_epoch->list); | ||
3038 | mdev->epochs = 1; | ||
3039 | |||
3040 | return mdev; | ||
3041 | |||
3042 | /* out_whatever_else: | ||
3043 | kfree(mdev->current_epoch); */ | ||
3044 | out_no_epoch: | ||
3045 | kfree(mdev->app_reads_hash); | ||
3046 | out_no_app_reads: | ||
3047 | tl_cleanup(mdev); | ||
3048 | out_no_tl: | ||
3049 | drbd_bm_cleanup(mdev); | ||
3050 | out_no_bitmap: | ||
3051 | __free_page(mdev->md_io_page); | ||
3052 | out_no_io_page: | ||
3053 | put_disk(disk); | ||
3054 | out_no_disk: | ||
3055 | blk_cleanup_queue(q); | ||
3056 | out_no_q: | ||
3057 | free_cpumask_var(mdev->cpu_mask); | ||
3058 | out_no_cpumask: | ||
3059 | kfree(mdev); | ||
3060 | return NULL; | ||
3061 | } | ||
3062 | |||
3063 | /* counterpart of drbd_new_device. | ||
3064 | * last part of drbd_delete_device. */ | ||
3065 | void drbd_free_mdev(struct drbd_conf *mdev) | ||
3066 | { | ||
3067 | kfree(mdev->current_epoch); | ||
3068 | kfree(mdev->app_reads_hash); | ||
3069 | tl_cleanup(mdev); | ||
3070 | if (mdev->bitmap) /* should no longer be there. */ | ||
3071 | drbd_bm_cleanup(mdev); | ||
3072 | __free_page(mdev->md_io_page); | ||
3073 | put_disk(mdev->vdisk); | ||
3074 | blk_cleanup_queue(mdev->rq_queue); | ||
3075 | free_cpumask_var(mdev->cpu_mask); | ||
3076 | kfree(mdev); | ||
3077 | } | ||
3078 | |||
3079 | |||
3080 | int __init drbd_init(void) | ||
3081 | { | ||
3082 | int err; | ||
3083 | |||
3084 | if (sizeof(struct p_handshake) != 80) { | ||
3085 | printk(KERN_ERR | ||
3086 | "drbd: never change the size or layout " | ||
3087 | "of the HandShake packet.\n"); | ||
3088 | return -EINVAL; | ||
3089 | } | ||
3090 | |||
3091 | if (1 > minor_count || minor_count > 255) { | ||
3092 | printk(KERN_ERR | ||
3093 | "drbd: invalid minor_count (%d)\n", minor_count); | ||
3094 | #ifdef MODULE | ||
3095 | return -EINVAL; | ||
3096 | #else | ||
3097 | minor_count = 8; | ||
3098 | #endif | ||
3099 | } | ||
3100 | |||
3101 | err = drbd_nl_init(); | ||
3102 | if (err) | ||
3103 | return err; | ||
3104 | |||
3105 | err = register_blkdev(DRBD_MAJOR, "drbd"); | ||
3106 | if (err) { | ||
3107 | printk(KERN_ERR | ||
3108 | "drbd: unable to register block device major %d\n", | ||
3109 | DRBD_MAJOR); | ||
3110 | return err; | ||
3111 | } | ||
3112 | |||
3113 | register_reboot_notifier(&drbd_notifier); | ||
3114 | |||
3115 | /* | ||
3116 | * allocate all necessary structs | ||
3117 | */ | ||
3118 | err = -ENOMEM; | ||
3119 | |||
3120 | init_waitqueue_head(&drbd_pp_wait); | ||
3121 | |||
3122 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | ||
3123 | minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, | ||
3124 | GFP_KERNEL); | ||
3125 | if (!minor_table) | ||
3126 | goto Enomem; | ||
3127 | |||
3128 | err = drbd_create_mempools(); | ||
3129 | if (err) | ||
3130 | goto Enomem; | ||
3131 | |||
3132 | drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); | ||
3133 | if (!drbd_proc) { | ||
3134 | printk(KERN_ERR "drbd: unable to register proc file\n"); | ||
3135 | goto Enomem; | ||
3136 | } | ||
3137 | |||
3138 | rwlock_init(&global_state_lock); | ||
3139 | |||
3140 | printk(KERN_INFO "drbd: initialized. " | ||
3141 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", | ||
3142 | API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); | ||
3143 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); | ||
3144 | printk(KERN_INFO "drbd: registered as block device major %d\n", | ||
3145 | DRBD_MAJOR); | ||
3146 | printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); | ||
3147 | |||
3148 | return 0; /* Success! */ | ||
3149 | |||
3150 | Enomem: | ||
3151 | drbd_cleanup(); | ||
3152 | if (err == -ENOMEM) | ||
3153 | /* currently always the case */ | ||
3154 | printk(KERN_ERR "drbd: ran out of memory\n"); | ||
3155 | else | ||
3156 | printk(KERN_ERR "drbd: initialization failure\n"); | ||
3157 | return err; | ||
3158 | } | ||
3159 | |||
3160 | void drbd_free_bc(struct drbd_backing_dev *ldev) | ||
3161 | { | ||
3162 | if (ldev == NULL) | ||
3163 | return; | ||
3164 | |||
3165 | bd_release(ldev->backing_bdev); | ||
3166 | bd_release(ldev->md_bdev); | ||
3167 | |||
3168 | fput(ldev->lo_file); | ||
3169 | fput(ldev->md_file); | ||
3170 | |||
3171 | kfree(ldev); | ||
3172 | } | ||
3173 | |||
3174 | void drbd_free_sock(struct drbd_conf *mdev) | ||
3175 | { | ||
3176 | if (mdev->data.socket) { | ||
3177 | mutex_lock(&mdev->data.mutex); | ||
3178 | kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); | ||
3179 | sock_release(mdev->data.socket); | ||
3180 | mdev->data.socket = NULL; | ||
3181 | mutex_unlock(&mdev->data.mutex); | ||
3182 | } | ||
3183 | if (mdev->meta.socket) { | ||
3184 | mutex_lock(&mdev->meta.mutex); | ||
3185 | kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); | ||
3186 | sock_release(mdev->meta.socket); | ||
3187 | mdev->meta.socket = NULL; | ||
3188 | mutex_unlock(&mdev->meta.mutex); | ||
3189 | } | ||
3190 | } | ||
3191 | |||
3192 | |||
3193 | void drbd_free_resources(struct drbd_conf *mdev) | ||
3194 | { | ||
3195 | crypto_free_hash(mdev->csums_tfm); | ||
3196 | mdev->csums_tfm = NULL; | ||
3197 | crypto_free_hash(mdev->verify_tfm); | ||
3198 | mdev->verify_tfm = NULL; | ||
3199 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3200 | mdev->cram_hmac_tfm = NULL; | ||
3201 | crypto_free_hash(mdev->integrity_w_tfm); | ||
3202 | mdev->integrity_w_tfm = NULL; | ||
3203 | crypto_free_hash(mdev->integrity_r_tfm); | ||
3204 | mdev->integrity_r_tfm = NULL; | ||
3205 | |||
3206 | drbd_free_sock(mdev); | ||
3207 | |||
3208 | __no_warn(local, | ||
3209 | drbd_free_bc(mdev->ldev); | ||
3210 | mdev->ldev = NULL;); | ||
3211 | } | ||
3212 | |||
3213 | /* meta data management */ | ||
3214 | |||
3215 | struct meta_data_on_disk { | ||
3216 | u64 la_size; /* last agreed size. */ | ||
3217 | u64 uuid[UI_SIZE]; /* UUIDs. */ | ||
3218 | u64 device_uuid; | ||
3219 | u64 reserved_u64_1; | ||
3220 | u32 flags; /* MDF */ | ||
3221 | u32 magic; | ||
3222 | u32 md_size_sect; | ||
3223 | u32 al_offset; /* offset to this block */ | ||
3224 | u32 al_nr_extents; /* important for restoring the AL */ | ||
3225 | /* `-- act_log->nr_elements <-- sync_conf.al_extents */ | ||
3226 | u32 bm_offset; /* offset to the bitmap, from here */ | ||
3227 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ | ||
3228 | u32 reserved_u32[4]; | ||
3229 | |||
3230 | } __packed; | ||
3231 | |||
3232 | /** | ||
3233 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | ||
3234 | * @mdev: DRBD device. | ||
3235 | */ | ||
3236 | void drbd_md_sync(struct drbd_conf *mdev) | ||
3237 | { | ||
3238 | struct meta_data_on_disk *buffer; | ||
3239 | sector_t sector; | ||
3240 | int i; | ||
3241 | |||
3242 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
3243 | return; | ||
3244 | del_timer(&mdev->md_sync_timer); | ||
3245 | |||
3246 | /* We use D_FAILED here and not D_ATTACHING because we try to write | ||
3247 | * metadata even if we detach due to a disk failure! */ | ||
3248 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
3249 | return; | ||
3250 | |||
3251 | mutex_lock(&mdev->md_io_mutex); | ||
3252 | buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); | ||
3253 | memset(buffer, 0, 512); | ||
3254 | |||
3255 | buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | ||
3256 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3257 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | ||
3258 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | ||
3259 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); | ||
3260 | |||
3261 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); | ||
3262 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); | ||
3263 | buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); | ||
3264 | buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); | ||
3265 | buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); | ||
3266 | |||
3267 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); | ||
3268 | |||
3269 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | ||
3270 | sector = mdev->ldev->md.md_offset; | ||
3271 | |||
3272 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | ||
3273 | clear_bit(MD_DIRTY, &mdev->flags); | ||
3274 | } else { | ||
3275 | /* this was a try anyways ... */ | ||
3276 | dev_err(DEV, "meta data update failed!\n"); | ||
3277 | |||
3278 | drbd_chk_io_error(mdev, 1, TRUE); | ||
3279 | } | ||
3280 | |||
3281 | /* Update mdev->ldev->md.la_size_sect, | ||
3282 | * since we updated it on metadata. */ | ||
3283 | mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); | ||
3284 | |||
3285 | mutex_unlock(&mdev->md_io_mutex); | ||
3286 | put_ldev(mdev); | ||
3287 | } | ||
3288 | |||
3289 | /** | ||
3290 | * drbd_md_read() - Reads in the meta data super block | ||
3291 | * @mdev: DRBD device. | ||
3292 | * @bdev: Device from which the meta data should be read in. | ||
3293 | * | ||
3294 | * Return NO_ERROR on success, and an enum drbd_ret_codes value in case | ||
3295 | * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. | ||
3296 | */ | ||
3297 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
3298 | { | ||
3299 | struct meta_data_on_disk *buffer; | ||
3300 | int i, rv = NO_ERROR; | ||
3301 | |||
3302 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
3303 | return ERR_IO_MD_DISK; | ||
3304 | |||
3305 | mutex_lock(&mdev->md_io_mutex); | ||
3306 | buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); | ||
3307 | |||
3308 | if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | ||
3309 | /* NOTE: can't do normal error processing here as this is | ||
3310 | called BEFORE disk is attached */ | ||
3311 | dev_err(DEV, "Error while reading metadata.\n"); | ||
3312 | rv = ERR_IO_MD_DISK; | ||
3313 | goto err; | ||
3314 | } | ||
3315 | |||
3316 | if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { | ||
3317 | dev_err(DEV, "Error while reading metadata, magic not found.\n"); | ||
3318 | rv = ERR_MD_INVALID; | ||
3319 | goto err; | ||
3320 | } | ||
3321 | if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { | ||
3322 | dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", | ||
3323 | be32_to_cpu(buffer->al_offset), bdev->md.al_offset); | ||
3324 | rv = ERR_MD_INVALID; | ||
3325 | goto err; | ||
3326 | } | ||
3327 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { | ||
3328 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", | ||
3329 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); | ||
3330 | rv = ERR_MD_INVALID; | ||
3331 | goto err; | ||
3332 | } | ||
3333 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { | ||
3334 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", | ||
3335 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); | ||
3336 | rv = ERR_MD_INVALID; | ||
3337 | goto err; | ||
3338 | } | ||
3339 | |||
3340 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { | ||
3341 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", | ||
3342 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); | ||
3343 | rv = ERR_MD_INVALID; | ||
3344 | goto err; | ||
3345 | } | ||
3346 | |||
3347 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); | ||
3348 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3349 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
3350 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
3351 | mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); | ||
3352 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
3353 | |||
3354 | if (mdev->sync_conf.al_extents < 7) | ||
3355 | mdev->sync_conf.al_extents = 127; | ||
3356 | |||
3357 | err: | ||
3358 | mutex_unlock(&mdev->md_io_mutex); | ||
3359 | put_ldev(mdev); | ||
3360 | |||
3361 | return rv; | ||
3362 | } | ||
3363 | |||
3364 | /** | ||
3365 | * drbd_md_mark_dirty() - Mark meta data super block as dirty | ||
3366 | * @mdev: DRBD device. | ||
3367 | * | ||
3368 | * Call this function if you change anything that should be written to | ||
3369 | * the meta-data super block. This function sets MD_DIRTY, and starts a | ||
3370 | * timer that ensures that within five seconds you have to call drbd_md_sync(). | ||
3371 | */ | ||
3372 | void drbd_md_mark_dirty(struct drbd_conf *mdev) | ||
3373 | { | ||
3374 | set_bit(MD_DIRTY, &mdev->flags); | ||
3375 | mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); | ||
3376 | } | ||
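In practice the callers further down (for example _drbd_uuid_set() and drbd_md_set_flag()) follow the same three-step pattern; a condensed sketch of that pattern, not a new API:

    /* 1. change the in-core meta data */
    mdev->ldev->md.flags |= MDF_FULL_SYNC;
    /* 2. mark it dirty; arms the 5 second md_sync_timer */
    drbd_md_mark_dirty(mdev);
    /* 3. either let w_md_sync() run from the timer, or sync now */
    drbd_md_sync(mdev);    /* no-op unless MD_DIRTY is set */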
3377 | |||
3378 | |||
3379 | static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | ||
3380 | { | ||
3381 | int i; | ||
3382 | |||
3383 | for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) | ||
3384 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; | ||
3385 | } | ||
3386 | |||
3387 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3388 | { | ||
3389 | if (idx == UI_CURRENT) { | ||
3390 | if (mdev->state.role == R_PRIMARY) | ||
3391 | val |= 1; | ||
3392 | else | ||
3393 | val &= ~((u64)1); | ||
3394 | |||
3395 | drbd_set_ed_uuid(mdev, val); | ||
3396 | } | ||
3397 | |||
3398 | mdev->ldev->md.uuid[idx] = val; | ||
3399 | drbd_md_mark_dirty(mdev); | ||
3400 | } | ||
3401 | |||
3402 | |||
3403 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3404 | { | ||
3405 | if (mdev->ldev->md.uuid[idx]) { | ||
3406 | drbd_uuid_move_history(mdev); | ||
3407 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; | ||
3408 | } | ||
3409 | _drbd_uuid_set(mdev, idx, val); | ||
3410 | } | ||
3411 | |||
3412 | /** | ||
3413 | * drbd_uuid_new_current() - Creates a new current UUID | ||
3414 | * @mdev: DRBD device. | ||
3415 | * | ||
3416 | * Creates a new current UUID, and rotates the old current UUID into | ||
3417 | * the bitmap slot. Causes an incremental resync upon next connect. | ||
3418 | */ | ||
3419 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | ||
3420 | { | ||
3421 | u64 val; | ||
3422 | |||
3423 | dev_info(DEV, "Creating new current UUID\n"); | ||
3424 | D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); | ||
3425 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; | ||
3426 | |||
3427 | get_random_bytes(&val, sizeof(u64)); | ||
3428 | _drbd_uuid_set(mdev, UI_CURRENT, val); | ||
3429 | } | ||
3430 | |||
3431 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | ||
3432 | { | ||
3433 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) | ||
3434 | return; | ||
3435 | |||
3436 | if (val == 0) { | ||
3437 | drbd_uuid_move_history(mdev); | ||
3438 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | ||
3439 | mdev->ldev->md.uuid[UI_BITMAP] = 0; | ||
3440 | } else { | ||
3441 | if (mdev->ldev->md.uuid[UI_BITMAP]) | ||
3442 | dev_warn(DEV, "bm UUID already set\n"); | ||
3443 | |||
3444 | mdev->ldev->md.uuid[UI_BITMAP] = val; | ||
3445 | mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); | ||
3446 | |||
3447 | } | ||
3448 | drbd_md_mark_dirty(mdev); | ||
3449 | } | ||
3450 | |||
3451 | /** | ||
3452 | * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() | ||
3453 | * @mdev: DRBD device. | ||
3454 | * | ||
3455 | * Sets all bits in the bitmap and writes the whole bitmap to stable storage. | ||
3456 | */ | ||
3457 | int drbd_bmio_set_n_write(struct drbd_conf *mdev) | ||
3458 | { | ||
3459 | int rv = -EIO; | ||
3460 | |||
3461 | if (get_ldev_if_state(mdev, D_ATTACHING)) { | ||
3462 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | ||
3463 | drbd_md_sync(mdev); | ||
3464 | drbd_bm_set_all(mdev); | ||
3465 | |||
3466 | rv = drbd_bm_write(mdev); | ||
3467 | |||
3468 | if (!rv) { | ||
3469 | drbd_md_clear_flag(mdev, MDF_FULL_SYNC); | ||
3470 | drbd_md_sync(mdev); | ||
3471 | } | ||
3472 | |||
3473 | put_ldev(mdev); | ||
3474 | } | ||
3475 | |||
3476 | return rv; | ||
3477 | } | ||
3478 | |||
3479 | /** | ||
3480 | * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() | ||
3481 | * @mdev: DRBD device. | ||
3482 | * | ||
3483 | * Clears all bits in the bitmap and writes the whole bitmap to stable storage. | ||
3484 | */ | ||
3485 | int drbd_bmio_clear_n_write(struct drbd_conf *mdev) | ||
3486 | { | ||
3487 | int rv = -EIO; | ||
3488 | |||
3489 | if (get_ldev_if_state(mdev, D_ATTACHING)) { | ||
3490 | drbd_bm_clear_all(mdev); | ||
3491 | rv = drbd_bm_write(mdev); | ||
3492 | put_ldev(mdev); | ||
3493 | } | ||
3494 | |||
3495 | return rv; | ||
3496 | } | ||
3497 | |||
3498 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
3499 | { | ||
3500 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); | ||
3501 | int rv; | ||
3502 | |||
3503 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); | ||
3504 | |||
3505 | drbd_bm_lock(mdev, work->why); | ||
3506 | rv = work->io_fn(mdev); | ||
3507 | drbd_bm_unlock(mdev); | ||
3508 | |||
3509 | clear_bit(BITMAP_IO, &mdev->flags); | ||
3510 | wake_up(&mdev->misc_wait); | ||
3511 | |||
3512 | if (work->done) | ||
3513 | work->done(mdev, rv); | ||
3514 | |||
3515 | clear_bit(BITMAP_IO_QUEUED, &mdev->flags); | ||
3516 | work->why = NULL; | ||
3517 | |||
3518 | return 1; | ||
3519 | } | ||
3520 | |||
3521 | /** | ||
3522 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap | ||
3523 | * @mdev: DRBD device. | ||
3524 | * @io_fn: IO callback to be called when bitmap IO is possible | ||
3525 | * @done: callback to be called after the bitmap IO was performed | ||
3526 | * @why: Descriptive text of the reason for doing the IO | ||
3527 | * | ||
3528 | * While IO on the bitmap happens we freeze application IO, thus ensuring | ||
3529 | * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be | ||
3530 | * called from worker context. It MUST NOT be used while a previous such | ||
3531 | * work is still pending! | ||
3532 | */ | ||
3533 | void drbd_queue_bitmap_io(struct drbd_conf *mdev, | ||
3534 | int (*io_fn)(struct drbd_conf *), | ||
3535 | void (*done)(struct drbd_conf *, int), | ||
3536 | char *why) | ||
3537 | { | ||
3538 | D_ASSERT(current == mdev->worker.task); | ||
3539 | |||
3540 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); | ||
3541 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); | ||
3542 | D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); | ||
3543 | if (mdev->bm_io_work.why) | ||
3544 | dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", | ||
3545 | why, mdev->bm_io_work.why); | ||
3546 | |||
3547 | mdev->bm_io_work.io_fn = io_fn; | ||
3548 | mdev->bm_io_work.done = done; | ||
3549 | mdev->bm_io_work.why = why; | ||
3550 | |||
3551 | set_bit(BITMAP_IO, &mdev->flags); | ||
3552 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { | ||
3553 | if (list_empty(&mdev->bm_io_work.w.list)) { | ||
3554 | set_bit(BITMAP_IO_QUEUED, &mdev->flags); | ||
3555 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | ||
3556 | } else | ||
3557 | dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); | ||
3558 | } | ||
3559 | } | ||
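A minimal usage sketch (worker context only, as required above); drbd_bmio_set_n_write() is the io_fn defined earlier in this file, while the completion callback name is invented for illustration:

    static void after_set_n_write(struct drbd_conf *mdev, int rv)  /* hypothetical */
    {
            dev_info(DEV, "full sync bitmap written, rv=%d\n", rv);
    }

    /* from worker context: */
    drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
                         after_set_n_write, "set_n_write sketch");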
3560 | |||
3561 | /** | ||
3562 | * drbd_bitmap_io() - Does an IO operation on the whole bitmap | ||
3563 | * @mdev: DRBD device. | ||
3564 | * @io_fn: IO callback to be called when bitmap IO is possible | ||
3565 | * @why: Descriptive text of the reason for doing the IO | ||
3566 | * | ||
3567 | * Freezes application IO while the actual IO operation runs. This | ||
3568 | * function MAY NOT be called from worker context. | ||
3569 | */ | ||
3570 | int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) | ||
3571 | { | ||
3572 | int rv; | ||
3573 | |||
3574 | D_ASSERT(current != mdev->worker.task); | ||
3575 | |||
3576 | drbd_suspend_io(mdev); | ||
3577 | |||
3578 | drbd_bm_lock(mdev, why); | ||
3579 | rv = io_fn(mdev); | ||
3580 | drbd_bm_unlock(mdev); | ||
3581 | |||
3582 | drbd_resume_io(mdev); | ||
3583 | |||
3584 | return rv; | ||
3585 | } | ||
3586 | |||
3587 | void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local) | ||
3588 | { | ||
3589 | if ((mdev->ldev->md.flags & flag) != flag) { | ||
3590 | drbd_md_mark_dirty(mdev); | ||
3591 | mdev->ldev->md.flags |= flag; | ||
3592 | } | ||
3593 | } | ||
3594 | |||
3595 | void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local) | ||
3596 | { | ||
3597 | if ((mdev->ldev->md.flags & flag) != 0) { | ||
3598 | drbd_md_mark_dirty(mdev); | ||
3599 | mdev->ldev->md.flags &= ~flag; | ||
3600 | } | ||
3601 | } | ||
3602 | int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) | ||
3603 | { | ||
3604 | return (bdev->md.flags & flag) != 0; | ||
3605 | } | ||
3606 | |||
3607 | static void md_sync_timer_fn(unsigned long data) | ||
3608 | { | ||
3609 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
3610 | |||
3611 | drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); | ||
3612 | } | ||
3613 | |||
3614 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
3615 | { | ||
3616 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); | ||
3617 | drbd_md_sync(mdev); | ||
3618 | |||
3619 | return 1; | ||
3620 | } | ||
3621 | |||
3622 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
3623 | /* Fault insertion support including random number generator shamelessly | ||
3624 | * stolen from kernel/rcutorture.c */ | ||
3625 | struct fault_random_state { | ||
3626 | unsigned long state; | ||
3627 | unsigned long count; | ||
3628 | }; | ||
3629 | |||
3630 | #define FAULT_RANDOM_MULT 39916801 /* prime */ | ||
3631 | #define FAULT_RANDOM_ADD 479001701 /* prime */ | ||
3632 | #define FAULT_RANDOM_REFRESH 10000 | ||
3633 | |||
3634 | /* | ||
3635 | * Crude but fast random-number generator. Uses a linear congruential | ||
3636 | * generator, with occasional help from get_random_bytes(). | ||
3637 | */ | ||
3638 | static unsigned long | ||
3639 | _drbd_fault_random(struct fault_random_state *rsp) | ||
3640 | { | ||
3641 | long refresh; | ||
3642 | |||
3643 | if (!rsp->count--) { | ||
3644 | get_random_bytes(&refresh, sizeof(refresh)); | ||
3645 | rsp->state += refresh; | ||
3646 | rsp->count = FAULT_RANDOM_REFRESH; | ||
3647 | } | ||
3648 | rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; | ||
3649 | return swahw32(rsp->state); | ||
3650 | } | ||
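The same generator can be reproduced stand-alone; the sketch below assumes 32-bit state (the kernel version keeps an unsigned long and periodically reseeds it from get_random_bytes(), omitted here) and writes the swahw32() halfword swap out by hand:

    #include <stdint.h>

    #define FAULT_RANDOM_MULT 39916801u   /* prime */
    #define FAULT_RANDOM_ADD  479001701u  /* prime */

    /* One step of the linear congruential generator, halfword-swapped. */
    static uint32_t fault_random_step(uint32_t *state)
    {
            *state = *state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
            return (*state << 16) | (*state >> 16);   /* swahw32() by hand */
    }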
3651 | |||
3652 | static char * | ||
3653 | _drbd_fault_str(unsigned int type) { | ||
3654 | static char *_faults[] = { | ||
3655 | [DRBD_FAULT_MD_WR] = "Meta-data write", | ||
3656 | [DRBD_FAULT_MD_RD] = "Meta-data read", | ||
3657 | [DRBD_FAULT_RS_WR] = "Resync write", | ||
3658 | [DRBD_FAULT_RS_RD] = "Resync read", | ||
3659 | [DRBD_FAULT_DT_WR] = "Data write", | ||
3660 | [DRBD_FAULT_DT_RD] = "Data read", | ||
3661 | [DRBD_FAULT_DT_RA] = "Data read ahead", | ||
3662 | [DRBD_FAULT_BM_ALLOC] = "BM allocation", | ||
3663 | [DRBD_FAULT_AL_EE] = "EE allocation" | ||
3664 | }; | ||
3665 | |||
3666 | return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; | ||
3667 | } | ||
3668 | |||
3669 | unsigned int | ||
3670 | _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) | ||
3671 | { | ||
3672 | static struct fault_random_state rrs = {0, 0}; | ||
3673 | |||
3674 | unsigned int ret = ( | ||
3675 | (fault_devs == 0 || | ||
3676 | ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && | ||
3677 | (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); | ||
3678 | |||
3679 | if (ret) { | ||
3680 | fault_count++; | ||
3681 | |||
3682 | if (printk_ratelimit()) | ||
3683 | dev_warn(DEV, "***Simulating %s failure\n", | ||
3684 | _drbd_fault_str(type)); | ||
3685 | } | ||
3686 | |||
3687 | return ret; | ||
3688 | } | ||
3689 | #endif | ||
3690 | |||
3691 | const char *drbd_buildtag(void) | ||
3692 | { | ||
3693 | /* DRBD built from external sources has here a reference to the | ||
3694 | git hash of the source code. */ | ||
3695 | |||
3696 | static char buildtag[38] = "\0uilt-in"; | ||
3697 | |||
3698 | if (buildtag[0] == 0) { | ||
3699 | #ifdef CONFIG_MODULES | ||
3700 | if (THIS_MODULE != NULL) | ||
3701 | sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); | ||
3702 | else | ||
3703 | #endif | ||
3704 | buildtag[0] = 'b'; | ||
3705 | } | ||
3706 | |||
3707 | return buildtag; | ||
3708 | } | ||
3709 | |||
3710 | module_init(drbd_init) | ||
3711 | module_exit(drbd_cleanup) | ||
3712 | |||
3713 | EXPORT_SYMBOL(drbd_conn_str); | ||
3714 | EXPORT_SYMBOL(drbd_role_str); | ||
3715 | EXPORT_SYMBOL(drbd_disk_str); | ||
3716 | EXPORT_SYMBOL(drbd_set_st_err_str); | ||
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 000000000000..6429d2b19e06 --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -0,0 +1,2367 @@ | |||
1 | /* | ||
2 | drbd_nl.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include <linux/in.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/file.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/connector.h> | ||
33 | #include <linux/blkpg.h> | ||
34 | #include <linux/cpumask.h> | ||
35 | #include "drbd_int.h" | ||
36 | #include "drbd_wrappers.h" | ||
37 | #include <asm/unaligned.h> | ||
38 | #include <linux/drbd_tag_magic.h> | ||
39 | #include <linux/drbd_limits.h> | ||
40 | |||
41 | static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); | ||
42 | static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); | ||
43 | static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); | ||
44 | |||
45 | /* see get_sb_bdev and bd_claim */ | ||
46 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; | ||
47 | |||
48 | /* Generate the tag_list to struct functions */ | ||
49 | #define NL_PACKET(name, number, fields) \ | ||
50 | static int name ## _from_tags(struct drbd_conf *mdev, \ | ||
51 | unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ | ||
52 | static int name ## _from_tags(struct drbd_conf *mdev, \ | ||
53 | unsigned short *tags, struct name *arg) \ | ||
54 | { \ | ||
55 | int tag; \ | ||
56 | int dlen; \ | ||
57 | \ | ||
58 | while ((tag = get_unaligned(tags++)) != TT_END) { \ | ||
59 | dlen = get_unaligned(tags++); \ | ||
60 | switch (tag_number(tag)) { \ | ||
61 | fields \ | ||
62 | default: \ | ||
63 | if (tag & T_MANDATORY) { \ | ||
64 | dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ | ||
65 | return 0; \ | ||
66 | } \ | ||
67 | } \ | ||
68 | tags = (unsigned short *)((char *)tags + dlen); \ | ||
69 | } \ | ||
70 | return 1; \ | ||
71 | } | ||
72 | #define NL_INTEGER(pn, pr, member) \ | ||
73 | case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ | ||
74 | arg->member = get_unaligned((int *)(tags)); \ | ||
75 | break; | ||
76 | #define NL_INT64(pn, pr, member) \ | ||
77 | case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ | ||
78 | arg->member = get_unaligned((u64 *)(tags)); \ | ||
79 | break; | ||
80 | #define NL_BIT(pn, pr, member) \ | ||
81 | case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ | ||
82 | arg->member = *(char *)(tags) ? 1 : 0; \ | ||
83 | break; | ||
84 | #define NL_STRING(pn, pr, member, len) \ | ||
85 | case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ | ||
86 | if (dlen > len) { \ | ||
87 | dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ | ||
88 | #member, dlen, (unsigned int)len); \ | ||
89 | return 0; \ | ||
90 | } \ | ||
91 | arg->member ## _len = dlen; \ | ||
92 | memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ | ||
93 | break; | ||
94 | #include "linux/drbd_nl.h" | ||
95 | |||
96 | /* Generate the struct to tag_list functions */ | ||
97 | #define NL_PACKET(name, number, fields) \ | ||
98 | static unsigned short* \ | ||
99 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
100 | struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ | ||
101 | static unsigned short* \ | ||
102 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
103 | struct name *arg, unsigned short *tags) \ | ||
104 | { \ | ||
105 | fields \ | ||
106 | return tags; \ | ||
107 | } | ||
108 | |||
109 | #define NL_INTEGER(pn, pr, member) \ | ||
110 | put_unaligned(pn | pr | TT_INTEGER, tags++); \ | ||
111 | put_unaligned(sizeof(int), tags++); \ | ||
112 | put_unaligned(arg->member, (int *)tags); \ | ||
113 | tags = (unsigned short *)((char *)tags+sizeof(int)); | ||
114 | #define NL_INT64(pn, pr, member) \ | ||
115 | put_unaligned(pn | pr | TT_INT64, tags++); \ | ||
116 | put_unaligned(sizeof(u64), tags++); \ | ||
117 | put_unaligned(arg->member, (u64 *)tags); \ | ||
118 | tags = (unsigned short *)((char *)tags+sizeof(u64)); | ||
119 | #define NL_BIT(pn, pr, member) \ | ||
120 | put_unaligned(pn | pr | TT_BIT, tags++); \ | ||
121 | put_unaligned(sizeof(char), tags++); \ | ||
122 | *(char *)tags = arg->member; \ | ||
123 | tags = (unsigned short *)((char *)tags+sizeof(char)); | ||
124 | #define NL_STRING(pn, pr, member, len) \ | ||
125 | put_unaligned(pn | pr | TT_STRING, tags++); \ | ||
126 | put_unaligned(arg->member ## _len, tags++); \ | ||
127 | memcpy(tags, arg->member, arg->member ## _len); \ | ||
128 | tags = (unsigned short *)((char *)tags + arg->member ## _len); | ||
129 | #include "linux/drbd_nl.h" | ||
130 | |||
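The generated *_to_tags() functions therefore produce a flat array of 16-bit words: for every field a <tag> word, a <length> word and the payload bytes, with the list terminated by TT_END, which is exactly what the *_from_tags() loop above walks. A stand-alone sketch of the integer case (the tag value and helper name are placeholders, not the real drbd definitions):

    #include <stdint.h>
    #include <string.h>

    /* Append one integer field as <tag><length><payload> to a tag list. */
    static uint16_t *put_int_field(uint16_t *tags, uint16_t tag, int value)
    {
            *tags++ = tag;                         /* number | type | flags */
            *tags++ = (uint16_t)sizeof(int);       /* payload length in bytes */
            memcpy(tags, &value, sizeof(int));     /* memcpy handles any alignment */
            return (uint16_t *)((char *)tags + sizeof(int));
    }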
131 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); | ||
132 | void drbd_nl_send_reply(struct cn_msg *, int); | ||
133 | |||
134 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) | ||
135 | { | ||
136 | char *envp[] = { "HOME=/", | ||
137 | "TERM=linux", | ||
138 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
139 | NULL, /* Will be set to address family */ | ||
140 | NULL, /* Will be set to address */ | ||
141 | NULL }; | ||
142 | |||
143 | char mb[12], af[20], ad[60], *afs; | ||
144 | char *argv[] = {usermode_helper, cmd, mb, NULL }; | ||
145 | int ret; | ||
146 | |||
147 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); | ||
148 | |||
149 | if (get_net_conf(mdev)) { | ||
150 | switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { | ||
151 | case AF_INET6: | ||
152 | afs = "ipv6"; | ||
153 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", | ||
154 | &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); | ||
155 | break; | ||
156 | case AF_INET: | ||
157 | afs = "ipv4"; | ||
158 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
159 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
160 | break; | ||
161 | default: | ||
162 | afs = "ssocks"; | ||
163 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
164 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
165 | } | ||
166 | snprintf(af, 20, "DRBD_PEER_AF=%s", afs); | ||
167 | envp[3]=af; | ||
168 | envp[4]=ad; | ||
169 | put_net_conf(mdev); | ||
170 | } | ||
171 | |||
172 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); | ||
173 | |||
174 | drbd_bcast_ev_helper(mdev, cmd); | ||
175 | ret = call_usermodehelper(usermode_helper, argv, envp, 1); | ||
176 | if (ret) | ||
177 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
178 | usermode_helper, cmd, mb, | ||
179 | (ret >> 8) & 0xff, ret); | ||
180 | else | ||
181 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
182 | usermode_helper, cmd, mb, | ||
183 | (ret >> 8) & 0xff, ret); | ||
184 | |||
185 | if (ret < 0) /* Ignore any ERRNOs we got. */ | ||
186 | ret = 0; | ||
187 | |||
188 | return ret; | ||
189 | } | ||
190 | |||
191 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) | ||
192 | { | ||
193 | char *ex_to_string; | ||
194 | int r; | ||
195 | enum drbd_disk_state nps; | ||
196 | enum drbd_fencing_p fp; | ||
197 | |||
198 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | ||
199 | |||
200 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | ||
201 | fp = mdev->ldev->dc.fencing; | ||
202 | put_ldev(mdev); | ||
203 | } else { | ||
204 | dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); | ||
205 | return mdev->state.pdsk; | ||
206 | } | ||
207 | |||
208 | if (fp == FP_STONITH) | ||
209 | _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); | ||
210 | |||
211 | r = drbd_khelper(mdev, "fence-peer"); | ||
212 | |||
213 | switch ((r>>8) & 0xff) { | ||
214 | case 3: /* peer is inconsistent */ | ||
215 | ex_to_string = "peer is inconsistent or worse"; | ||
216 | nps = D_INCONSISTENT; | ||
217 | break; | ||
218 | case 4: /* peer got outdated, or was already outdated */ | ||
219 | ex_to_string = "peer was fenced"; | ||
220 | nps = D_OUTDATED; | ||
221 | break; | ||
222 | case 5: /* peer was down */ | ||
223 | if (mdev->state.disk == D_UP_TO_DATE) { | ||
224 | /* we will(have) create(d) a new UUID anyways... */ | ||
225 | ex_to_string = "peer is unreachable, assumed to be dead"; | ||
226 | nps = D_OUTDATED; | ||
227 | } else { | ||
228 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; | ||
229 | nps = mdev->state.pdsk; | ||
230 | } | ||
231 | break; | ||
232 | case 6: /* Peer is primary, voluntarily outdate myself. | ||
233 | * This is useful when an unconnected R_SECONDARY is asked to | ||
234 | * become R_PRIMARY, but finds the other peer is active. */ | ||
235 | ex_to_string = "peer is active"; | ||
236 | dev_warn(DEV, "Peer is primary, outdating myself.\n"); | ||
237 | nps = D_UNKNOWN; | ||
238 | _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); | ||
239 | break; | ||
240 | case 7: | ||
241 | if (fp != FP_STONITH) | ||
242 | dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); | ||
243 | ex_to_string = "peer was stonithed"; | ||
244 | nps = D_OUTDATED; | ||
245 | break; | ||
246 | default: | ||
247 | /* The script is broken ... */ | ||
248 | nps = D_UNKNOWN; | ||
249 | dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); | ||
250 | return nps; | ||
251 | } | ||
252 | |||
253 | dev_info(DEV, "fence-peer helper returned %d (%s)\n", | ||
254 | (r>>8) & 0xff, ex_to_string); | ||
255 | return nps; | ||
256 | } | ||
257 | |||
258 | |||
259 | int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | ||
260 | { | ||
261 | const int max_tries = 4; | ||
262 | int r = 0; | ||
263 | int try = 0; | ||
264 | int forced = 0; | ||
265 | union drbd_state mask, val; | ||
266 | enum drbd_disk_state nps; | ||
267 | |||
268 | if (new_role == R_PRIMARY) | ||
269 | request_ping(mdev); /* Detect a dead peer ASAP */ | ||
270 | |||
271 | mutex_lock(&mdev->state_mutex); | ||
272 | |||
273 | mask.i = 0; mask.role = R_MASK; | ||
274 | val.i = 0; val.role = new_role; | ||
275 | |||
276 | while (try++ < max_tries) { | ||
277 | r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); | ||
278 | |||
279 | /* in case we first succeeded to outdate, | ||
280 | * but now suddenly could establish a connection */ | ||
281 | if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { | ||
282 | val.pdsk = 0; | ||
283 | mask.pdsk = 0; | ||
284 | continue; | ||
285 | } | ||
286 | |||
287 | if (r == SS_NO_UP_TO_DATE_DISK && force && | ||
288 | (mdev->state.disk < D_UP_TO_DATE && | ||
289 | mdev->state.disk >= D_INCONSISTENT)) { | ||
290 | mask.disk = D_MASK; | ||
291 | val.disk = D_UP_TO_DATE; | ||
292 | forced = 1; | ||
293 | continue; | ||
294 | } | ||
295 | |||
296 | if (r == SS_NO_UP_TO_DATE_DISK && | ||
297 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { | ||
298 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | ||
299 | nps = drbd_try_outdate_peer(mdev); | ||
300 | |||
301 | if (nps == D_OUTDATED || nps == D_INCONSISTENT) { | ||
302 | val.disk = D_UP_TO_DATE; | ||
303 | mask.disk = D_MASK; | ||
304 | } | ||
305 | |||
306 | val.pdsk = nps; | ||
307 | mask.pdsk = D_MASK; | ||
308 | |||
309 | continue; | ||
310 | } | ||
311 | |||
312 | if (r == SS_NOTHING_TO_DO) | ||
313 | goto fail; | ||
314 | if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { | ||
315 | nps = drbd_try_outdate_peer(mdev); | ||
316 | |||
317 | if (force && nps > D_OUTDATED) { | ||
318 | dev_warn(DEV, "Forced into split brain situation!\n"); | ||
319 | nps = D_OUTDATED; | ||
320 | } | ||
321 | |||
322 | mask.pdsk = D_MASK; | ||
323 | val.pdsk = nps; | ||
324 | |||
325 | continue; | ||
326 | } | ||
327 | if (r == SS_TWO_PRIMARIES) { | ||
328 | /* Maybe the peer is detected as dead very soon... | ||
329 | retry at most once more in this case. */ | ||
330 | __set_current_state(TASK_INTERRUPTIBLE); | ||
331 | schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); | ||
332 | if (try < max_tries) | ||
333 | try = max_tries - 1; | ||
334 | continue; | ||
335 | } | ||
336 | if (r < SS_SUCCESS) { | ||
337 | r = _drbd_request_state(mdev, mask, val, | ||
338 | CS_VERBOSE + CS_WAIT_COMPLETE); | ||
339 | if (r < SS_SUCCESS) | ||
340 | goto fail; | ||
341 | } | ||
342 | break; | ||
343 | } | ||
344 | |||
345 | if (r < SS_SUCCESS) | ||
346 | goto fail; | ||
347 | |||
348 | if (forced) | ||
349 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); | ||
350 | |||
351 | /* Wait until nothing is on the fly :) */ | ||
352 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); | ||
353 | |||
354 | if (new_role == R_SECONDARY) { | ||
355 | set_disk_ro(mdev->vdisk, TRUE); | ||
356 | if (get_ldev(mdev)) { | ||
357 | mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; | ||
358 | put_ldev(mdev); | ||
359 | } | ||
360 | } else { | ||
361 | if (get_net_conf(mdev)) { | ||
362 | mdev->net_conf->want_lose = 0; | ||
363 | put_net_conf(mdev); | ||
364 | } | ||
365 | set_disk_ro(mdev->vdisk, FALSE); | ||
366 | if (get_ldev(mdev)) { | ||
367 | if (((mdev->state.conn < C_CONNECTED || | ||
368 | mdev->state.pdsk <= D_FAILED) | ||
369 | && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced) | ||
370 | drbd_uuid_new_current(mdev); | ||
371 | |||
372 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | ||
373 | put_ldev(mdev); | ||
374 | } | ||
375 | } | ||
376 | |||
377 | if ((new_role == R_SECONDARY) && get_ldev(mdev)) { | ||
378 | drbd_al_to_on_disk_bm(mdev); | ||
379 | put_ldev(mdev); | ||
380 | } | ||
381 | |||
382 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) { | ||
383 | /* if this was forced, we should consider sync */ | ||
384 | if (forced) | ||
385 | drbd_send_uuids(mdev); | ||
386 | drbd_send_state(mdev); | ||
387 | } | ||
388 | |||
389 | drbd_md_sync(mdev); | ||
390 | |||
391 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
392 | fail: | ||
393 | mutex_unlock(&mdev->state_mutex); | ||
394 | return r; | ||
395 | } | ||
396 | |||
397 | |||
398 | static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
399 | struct drbd_nl_cfg_reply *reply) | ||
400 | { | ||
401 | struct primary primary_args; | ||
402 | |||
403 | memset(&primary_args, 0, sizeof(struct primary)); | ||
404 | if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { | ||
405 | reply->ret_code = ERR_MANDATORY_TAG; | ||
406 | return 0; | ||
407 | } | ||
408 | |||
409 | reply->ret_code = | ||
410 | drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force); | ||
411 | |||
412 | return 0; | ||
413 | } | ||
414 | |||
415 | static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
416 | struct drbd_nl_cfg_reply *reply) | ||
417 | { | ||
418 | reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); | ||
419 | |||
420 | return 0; | ||
421 | } | ||
422 | |||
423 | /* initializes the md.*_offset members, so we are able to find | ||
424 | * the on disk meta data */ | ||
425 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | ||
426 | struct drbd_backing_dev *bdev) | ||
427 | { | ||
428 | sector_t md_size_sect = 0; | ||
429 | switch (bdev->dc.meta_dev_idx) { | ||
430 | default: | ||
431 | /* v07 style fixed size indexed meta data */ | ||
432 | bdev->md.md_size_sect = MD_RESERVED_SECT; | ||
433 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
434 | bdev->md.al_offset = MD_AL_OFFSET; | ||
435 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
436 | break; | ||
437 | case DRBD_MD_INDEX_FLEX_EXT: | ||
438 | /* just occupy the full device; unit: sectors */ | ||
439 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); | ||
440 | bdev->md.md_offset = 0; | ||
441 | bdev->md.al_offset = MD_AL_OFFSET; | ||
442 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
443 | break; | ||
444 | case DRBD_MD_INDEX_INTERNAL: | ||
445 | case DRBD_MD_INDEX_FLEX_INT: | ||
446 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
447 | /* al size is still fixed */ | ||
448 | bdev->md.al_offset = -MD_AL_MAX_SIZE; | ||
449 | /* we need (slightly less than) ~ this many bitmap sectors: */ | ||
450 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | ||
451 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | ||
452 | md_size_sect = BM_SECT_TO_EXT(md_size_sect); | ||
453 | md_size_sect = ALIGN(md_size_sect, 8); | ||
454 | |||
455 | /* plus the "drbd meta data super block", | ||
456 | * and the activity log; */ | ||
457 | md_size_sect += MD_BM_OFFSET; | ||
458 | |||
459 | bdev->md.md_size_sect = md_size_sect; | ||
460 | /* bitmap offset is adjusted by 'super' block size */ | ||
461 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | ||
462 | break; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | char *ppsize(char *buf, unsigned long long size) | ||
467 | { | ||
468 | /* Needs 9 bytes at max. */ | ||
469 | static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; | ||
470 | int base = 0; | ||
471 | while (size >= 10000) { | ||
472 | /* shift + round */ | ||
473 | size = (size >> 10) + !!(size & (1<<9)); | ||
474 | base++; | ||
475 | } | ||
476 | sprintf(buf, "%lu %cB", (long)size, units[base]); | ||
477 | |||
478 | return buf; | ||
479 | } | ||
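The rounding in ppsize() keeps at most four significant digits: while the value has five or more digits it is divided by 1024, rounding to nearest (bit 9 of the discarded remainder decides), and the unit letter is bumped. A minimal user-space sketch of the same rule; ppsize_demo() and the sample value are illustrative and not part of the commit.

#include <stdio.h>

/* User-space sketch of the ppsize() rounding rule: divide by 1024 while
 * the value still has five or more digits, rounding up when the remainder
 * is at least 512 (bit 9 set), then print with the matching unit. */
static char *ppsize_demo(char *buf, unsigned long long size)
{
	static const char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	int base = 0;

	while (size >= 10000) {
		size = (size >> 10) + !!(size & (1 << 9));	/* shift + round */
		base++;
	}
	sprintf(buf, "%llu %cB", size, units[base]);
	return buf;
}

int main(void)
{
	char buf[16];

	printf("%s\n", ppsize_demo(buf, 1048576ULL));	/* 1 GiB given in KB -> "1024 MB" */
	return 0;
}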
480 | |||
481 | /* there is still a theoretical deadlock when called from receiver | ||
482 | * on a D_INCONSISTENT R_PRIMARY: | ||
483 | * remote READ does inc_ap_bio, receiver would need to receive answer | ||
484 | * packet from remote to dec_ap_bio again. | ||
485 | * receiver receive_sizes(), comes here, | ||
486 | * waits for ap_bio_cnt == 0. -> deadlock. | ||
487 | * but this cannot happen, actually, because: | ||
488 | * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable | ||
489 | * (not connected, or bad/no disk on peer): | ||
490 | * see drbd_fail_request_early, ap_bio_cnt is zero. | ||
491 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: | ||
492 | * peer may not initiate a resize. | ||
493 | */ | ||
494 | void drbd_suspend_io(struct drbd_conf *mdev) | ||
495 | { | ||
496 | set_bit(SUSPEND_IO, &mdev->flags); | ||
497 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | ||
498 | } | ||
499 | |||
500 | void drbd_resume_io(struct drbd_conf *mdev) | ||
501 | { | ||
502 | clear_bit(SUSPEND_IO, &mdev->flags); | ||
503 | wake_up(&mdev->misc_wait); | ||
504 | } | ||
505 | |||
506 | /** | ||
507 | * drbd_determin_dev_size() - Sets the right device size obeying all constraints | ||
508 | * @mdev: DRBD device. | ||
509 | * | ||
510 | * Returns 0 on success, negative return values indicate errors. | ||
511 | * You should call drbd_md_sync() after calling this function. | ||
512 | */ | ||
513 | enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) | ||
514 | { | ||
515 | sector_t prev_first_sect, prev_size; /* previous meta location */ | ||
516 | sector_t la_size; | ||
517 | sector_t size; | ||
518 | char ppb[10]; | ||
519 | |||
520 | int md_moved, la_size_changed; | ||
521 | enum determine_dev_size rv = unchanged; | ||
522 | |||
523 | /* race: | ||
524 | * application request passes inc_ap_bio, | ||
525 | * but then cannot get an AL-reference. | ||
526 | * this function later may wait on ap_bio_cnt == 0. -> deadlock. | ||
527 | * | ||
528 | * to avoid that: | ||
529 | * Suspend IO right here. | ||
530 | * still lock the act_log to not trigger ASSERTs there. | ||
531 | */ | ||
532 | drbd_suspend_io(mdev); | ||
533 | |||
534 | /* no wait necessary anymore, actually we could assert that */ | ||
535 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
536 | |||
537 | prev_first_sect = drbd_md_first_sector(mdev->ldev); | ||
538 | prev_size = mdev->ldev->md.md_size_sect; | ||
539 | la_size = mdev->ldev->md.la_size_sect; | ||
540 | |||
541 | /* TODO: should only be some assert here, not (re)init... */ | ||
542 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | ||
543 | |||
544 | size = drbd_new_dev_size(mdev, mdev->ldev, force); | ||
545 | |||
546 | if (drbd_get_capacity(mdev->this_bdev) != size || | ||
547 | drbd_bm_capacity(mdev) != size) { | ||
548 | int err; | ||
549 | err = drbd_bm_resize(mdev, size); | ||
550 | if (unlikely(err)) { | ||
551 | /* currently there is only one error: ENOMEM! */ | ||
552 | size = drbd_bm_capacity(mdev)>>1; | ||
553 | if (size == 0) { | ||
554 | dev_err(DEV, "OUT OF MEMORY! " | ||
555 | "Could not allocate bitmap!\n"); | ||
556 | } else { | ||
557 | dev_err(DEV, "BM resizing failed. " | ||
558 | "Leaving size unchanged at size = %lu KB\n", | ||
559 | (unsigned long)size); | ||
560 | } | ||
561 | rv = dev_size_error; | ||
562 | } | ||
563 | /* racy, see comments above. */ | ||
564 | drbd_set_my_capacity(mdev, size); | ||
565 | mdev->ldev->md.la_size_sect = size; | ||
566 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), | ||
567 | (unsigned long long)size>>1); | ||
568 | } | ||
569 | if (rv == dev_size_error) | ||
570 | goto out; | ||
571 | |||
572 | la_size_changed = (la_size != mdev->ldev->md.la_size_sect); | ||
573 | |||
574 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | ||
575 | || prev_size != mdev->ldev->md.md_size_sect; | ||
576 | |||
577 | if (la_size_changed || md_moved) { | ||
578 | drbd_al_shrink(mdev); /* All extents inactive. */ | ||
579 | dev_info(DEV, "Writing the whole bitmap, %s\n", | ||
580 | la_size_changed && md_moved ? "size changed and md moved" : | ||
581 | la_size_changed ? "size changed" : "md moved"); | ||
582 | rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ | ||
583 | drbd_md_mark_dirty(mdev); | ||
584 | } | ||
585 | |||
586 | if (size > la_size) | ||
587 | rv = grew; | ||
588 | if (size < la_size) | ||
589 | rv = shrunk; | ||
590 | out: | ||
591 | lc_unlock(mdev->act_log); | ||
592 | wake_up(&mdev->al_wait); | ||
593 | drbd_resume_io(mdev); | ||
594 | |||
595 | return rv; | ||
596 | } | ||
597 | |||
598 | sector_t | ||
599 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) | ||
600 | { | ||
601 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | ||
602 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | ||
603 | sector_t m_size; /* my size */ | ||
604 | sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ | ||
605 | sector_t size = 0; | ||
606 | |||
607 | m_size = drbd_get_max_capacity(bdev); | ||
608 | |||
609 | if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) { | ||
610 | dev_warn(DEV, "Resize while not connected was forced by the user!\n"); | ||
611 | p_size = m_size; | ||
612 | } | ||
613 | |||
614 | if (p_size && m_size) { | ||
615 | size = min_t(sector_t, p_size, m_size); | ||
616 | } else { | ||
617 | if (la_size) { | ||
618 | size = la_size; | ||
619 | if (m_size && m_size < size) | ||
620 | size = m_size; | ||
621 | if (p_size && p_size < size) | ||
622 | size = p_size; | ||
623 | } else { | ||
624 | if (m_size) | ||
625 | size = m_size; | ||
626 | if (p_size) | ||
627 | size = p_size; | ||
628 | } | ||
629 | } | ||
630 | |||
631 | if (size == 0) | ||
632 | dev_err(DEV, "Both nodes diskless!\n"); | ||
633 | |||
634 | if (u_size) { | ||
635 | if (u_size > size) | ||
636 | dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n", | ||
637 | (unsigned long)u_size>>1, (unsigned long)size>>1); | ||
638 | else | ||
639 | size = u_size; | ||
640 | } | ||
641 | |||
642 | return size; | ||
643 | } | ||
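The negotiation in drbd_new_dev_size() boils down to: when both the peer's size and the local capacity are known, the minimum wins; otherwise the last agreed size is used, capped by whichever single size is known; an explicit user setting may only ever shrink the result. A condensed user-space sketch of that decision order; negotiate_size() and the sample sector counts are illustrative, not from the commit.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Condensed restatement of the drbd_new_dev_size() decision order. */
static sector_t negotiate_size(sector_t p_size, sector_t m_size,
			       sector_t la_size, sector_t u_size)
{
	sector_t size;

	if (p_size && m_size) {
		size = p_size < m_size ? p_size : m_size;	/* both known: minimum */
	} else if (la_size) {
		size = la_size;					/* fall back to last agreed size */
		if (m_size && m_size < size)
			size = m_size;
		if (p_size && p_size < size)
			size = p_size;
	} else {
		size = m_size ? m_size : p_size;		/* whichever single size is known */
	}

	if (u_size && u_size <= size)				/* user setting may only shrink */
		size = u_size;

	return size;
}

int main(void)
{
	/* hypothetical: peer 2 TiB, local 1 TiB, no last-agreed size, no user override */
	printf("%llu sectors\n", negotiate_size(4294967296ULL, 2147483648ULL, 0, 0));
	return 0;
}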
644 | |||
645 | /** | ||
646 | * drbd_check_al_size() - Ensures that the AL is of the right size | ||
647 | * @mdev: DRBD device. | ||
648 | * | ||
649 | * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation | ||
650 | * failed, and 0 on success. You should call drbd_md_sync() after you called | ||
651 | * this function. | ||
652 | */ | ||
653 | static int drbd_check_al_size(struct drbd_conf *mdev) | ||
654 | { | ||
655 | struct lru_cache *n, *t; | ||
656 | struct lc_element *e; | ||
657 | unsigned int in_use; | ||
658 | int i; | ||
659 | |||
660 | ERR_IF(mdev->sync_conf.al_extents < 7) | ||
661 | mdev->sync_conf.al_extents = 127; | ||
662 | |||
663 | if (mdev->act_log && | ||
664 | mdev->act_log->nr_elements == mdev->sync_conf.al_extents) | ||
665 | return 0; | ||
666 | |||
667 | in_use = 0; | ||
668 | t = mdev->act_log; | ||
669 | n = lc_create("act_log", drbd_al_ext_cache, | ||
670 | mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); | ||
671 | |||
672 | if (n == NULL) { | ||
673 | dev_err(DEV, "Cannot allocate act_log lru!\n"); | ||
674 | return -ENOMEM; | ||
675 | } | ||
676 | spin_lock_irq(&mdev->al_lock); | ||
677 | if (t) { | ||
678 | for (i = 0; i < t->nr_elements; i++) { | ||
679 | e = lc_element_by_index(t, i); | ||
680 | if (e->refcnt) | ||
681 | dev_err(DEV, "refcnt(%d)==%d\n", | ||
682 | e->lc_number, e->refcnt); | ||
683 | in_use += e->refcnt; | ||
684 | } | ||
685 | } | ||
686 | if (!in_use) | ||
687 | mdev->act_log = n; | ||
688 | spin_unlock_irq(&mdev->al_lock); | ||
689 | if (in_use) { | ||
690 | dev_err(DEV, "Activity log still in use!\n"); | ||
691 | lc_destroy(n); | ||
692 | return -EBUSY; | ||
693 | } else { | ||
694 | if (t) | ||
695 | lc_destroy(t); | ||
696 | } | ||
697 | drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elements */ | ||
698 | return 0; | ||
699 | } | ||
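drbd_check_al_size() replaces the activity-log LRU with a resized one using a common pattern: allocate the replacement outside the lock, verify under the lock that no element of the old cache is still referenced, install the new one, and free the loser after dropping the lock. A generic pthread sketch of that pattern; the struct and function names are made up and this is not the kernel lc_* API.

#include <pthread.h>
#include <stdlib.h>

/* Generic sketch of the swap-if-unused pattern from drbd_check_al_size(). */
struct cache {
	int nr_elements;
	int *refcnt;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cache *active_cache;

static int resize_cache(int new_nr)
{
	struct cache *n, *t;
	int i, in_use = 0;

	n = calloc(1, sizeof(*n));		/* allocate outside the lock */
	if (!n)
		return -1;
	n->refcnt = calloc(new_nr, sizeof(int));
	if (!n->refcnt) {
		free(n);
		return -1;
	}
	n->nr_elements = new_nr;

	pthread_mutex_lock(&cache_lock);
	t = active_cache;
	if (t)
		for (i = 0; i < t->nr_elements; i++)
			in_use += t->refcnt[i];	/* anything still referenced? */
	if (!in_use)
		active_cache = n;		/* install the replacement */
	pthread_mutex_unlock(&cache_lock);

	if (in_use) {				/* old cache still busy: keep it */
		free(n->refcnt);
		free(n);
		return -1;
	}
	if (t) {
		free(t->refcnt);
		free(t);
	}
	return 0;
}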
700 | |||
701 | void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) | ||
702 | { | ||
703 | struct request_queue * const q = mdev->rq_queue; | ||
704 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | ||
705 | int max_segments = mdev->ldev->dc.max_bio_bvecs; | ||
706 | |||
707 | if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) | ||
708 | max_seg_s = PAGE_SIZE; | ||
709 | |||
710 | max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); | ||
711 | |||
712 | blk_queue_max_hw_sectors(q, max_seg_s >> 9); | ||
713 | blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); | ||
714 | blk_queue_max_segment_size(q, max_seg_s); | ||
715 | blk_queue_logical_block_size(q, 512); | ||
716 | blk_queue_segment_boundary(q, PAGE_SIZE-1); | ||
717 | blk_stack_limits(&q->limits, &b->limits, 0); | ||
718 | |||
719 | if (b->merge_bvec_fn) | ||
720 | dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", | ||
721 | b->merge_bvec_fn); | ||
722 | dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); | ||
723 | |||
724 | if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { | ||
725 | dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", | ||
726 | q->backing_dev_info.ra_pages, | ||
727 | b->backing_dev_info.ra_pages); | ||
728 | q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; | ||
729 | } | ||
730 | } | ||
731 | |||
732 | /* serialize deconfig (worker exiting, doing cleanup) | ||
733 | * and reconfig (drbdsetup disk, drbdsetup net) | ||
734 | * | ||
735 | * wait for a potentially exiting worker, then restart it, | ||
736 | * or start a new one. | ||
737 | */ | ||
738 | static void drbd_reconfig_start(struct drbd_conf *mdev) | ||
739 | { | ||
740 | wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); | ||
741 | wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); | ||
742 | drbd_thread_start(&mdev->worker); | ||
743 | } | ||
744 | |||
745 | /* if still unconfigured, stops worker again. | ||
746 | * if configured now, clears CONFIG_PENDING. | ||
747 | * wakes potential waiters */ | ||
748 | static void drbd_reconfig_done(struct drbd_conf *mdev) | ||
749 | { | ||
750 | spin_lock_irq(&mdev->req_lock); | ||
751 | if (mdev->state.disk == D_DISKLESS && | ||
752 | mdev->state.conn == C_STANDALONE && | ||
753 | mdev->state.role == R_SECONDARY) { | ||
754 | set_bit(DEVICE_DYING, &mdev->flags); | ||
755 | drbd_thread_stop_nowait(&mdev->worker); | ||
756 | } else | ||
757 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
758 | spin_unlock_irq(&mdev->req_lock); | ||
759 | wake_up(&mdev->state_wait); | ||
760 | } | ||
761 | |||
762 | /* does always return 0; | ||
763 | * interesting return code is in reply->ret_code */ | ||
764 | static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
765 | struct drbd_nl_cfg_reply *reply) | ||
766 | { | ||
767 | enum drbd_ret_codes retcode; | ||
768 | enum determine_dev_size dd; | ||
769 | sector_t max_possible_sectors; | ||
770 | sector_t min_md_device_sectors; | ||
771 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ | ||
772 | struct inode *inode, *inode2; | ||
773 | struct lru_cache *resync_lru = NULL; | ||
774 | union drbd_state ns, os; | ||
775 | int rv; | ||
776 | int cp_discovered = 0; | ||
777 | int logical_block_size; | ||
778 | |||
779 | drbd_reconfig_start(mdev); | ||
780 | |||
781 | /* if you want to reconfigure, please tear down first */ | ||
782 | if (mdev->state.disk > D_DISKLESS) { | ||
783 | retcode = ERR_DISK_CONFIGURED; | ||
784 | goto fail; | ||
785 | } | ||
786 | |||
787 | /* allocation not in the IO path, cqueue thread context */ | ||
788 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | ||
789 | if (!nbc) { | ||
790 | retcode = ERR_NOMEM; | ||
791 | goto fail; | ||
792 | } | ||
793 | |||
794 | nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; | ||
795 | nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; | ||
796 | nbc->dc.fencing = DRBD_FENCING_DEF; | ||
797 | nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; | ||
798 | |||
799 | if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { | ||
800 | retcode = ERR_MANDATORY_TAG; | ||
801 | goto fail; | ||
802 | } | ||
803 | |||
804 | if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | ||
805 | retcode = ERR_MD_IDX_INVALID; | ||
806 | goto fail; | ||
807 | } | ||
808 | |||
809 | nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); | ||
810 | if (IS_ERR(nbc->lo_file)) { | ||
811 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, | ||
812 | PTR_ERR(nbc->lo_file)); | ||
813 | nbc->lo_file = NULL; | ||
814 | retcode = ERR_OPEN_DISK; | ||
815 | goto fail; | ||
816 | } | ||
817 | |||
818 | inode = nbc->lo_file->f_dentry->d_inode; | ||
819 | |||
820 | if (!S_ISBLK(inode->i_mode)) { | ||
821 | retcode = ERR_DISK_NOT_BDEV; | ||
822 | goto fail; | ||
823 | } | ||
824 | |||
825 | nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); | ||
826 | if (IS_ERR(nbc->md_file)) { | ||
827 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, | ||
828 | PTR_ERR(nbc->md_file)); | ||
829 | nbc->md_file = NULL; | ||
830 | retcode = ERR_OPEN_MD_DISK; | ||
831 | goto fail; | ||
832 | } | ||
833 | |||
834 | inode2 = nbc->md_file->f_dentry->d_inode; | ||
835 | |||
836 | if (!S_ISBLK(inode2->i_mode)) { | ||
837 | retcode = ERR_MD_NOT_BDEV; | ||
838 | goto fail; | ||
839 | } | ||
840 | |||
841 | nbc->backing_bdev = inode->i_bdev; | ||
842 | if (bd_claim(nbc->backing_bdev, mdev)) { | ||
843 | printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", | ||
844 | nbc->backing_bdev, mdev, | ||
845 | nbc->backing_bdev->bd_holder, | ||
846 | nbc->backing_bdev->bd_contains->bd_holder, | ||
847 | nbc->backing_bdev->bd_holders); | ||
848 | retcode = ERR_BDCLAIM_DISK; | ||
849 | goto fail; | ||
850 | } | ||
851 | |||
852 | resync_lru = lc_create("resync", drbd_bm_ext_cache, | ||
853 | 61, sizeof(struct bm_extent), | ||
854 | offsetof(struct bm_extent, lce)); | ||
855 | if (!resync_lru) { | ||
856 | retcode = ERR_NOMEM; | ||
857 | goto release_bdev_fail; | ||
858 | } | ||
859 | |||
860 | /* meta_dev_idx >= 0: external fixed size, | ||
861 | * possibly multiple drbd sharing one meta device. | ||
862 | * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is | ||
863 | * not yet used by some other drbd minor! | ||
864 | * (if you use drbd.conf + drbdadm, | ||
865 | * that should check it for you already; but if you don't, or someone | ||
866 | * fooled it, we need to double check here) */ | ||
867 | nbc->md_bdev = inode2->i_bdev; | ||
868 | if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev | ||
869 | : (void *) drbd_m_holder)) { | ||
870 | retcode = ERR_BDCLAIM_MD_DISK; | ||
871 | goto release_bdev_fail; | ||
872 | } | ||
873 | |||
874 | if ((nbc->backing_bdev == nbc->md_bdev) != | ||
875 | (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | ||
876 | nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { | ||
877 | retcode = ERR_MD_IDX_INVALID; | ||
878 | goto release_bdev2_fail; | ||
879 | } | ||
880 | |||
881 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | ||
882 | drbd_md_set_sector_offsets(mdev, nbc); | ||
883 | |||
884 | if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { | ||
885 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | ||
886 | (unsigned long long) drbd_get_max_capacity(nbc), | ||
887 | (unsigned long long) nbc->dc.disk_size); | ||
888 | retcode = ERR_DISK_TO_SMALL; | ||
889 | goto release_bdev2_fail; | ||
890 | } | ||
891 | |||
892 | if (nbc->dc.meta_dev_idx < 0) { | ||
893 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; | ||
894 | /* at least one MB, otherwise it does not make sense */ | ||
895 | min_md_device_sectors = (2<<10); | ||
896 | } else { | ||
897 | max_possible_sectors = DRBD_MAX_SECTORS; | ||
898 | min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); | ||
899 | } | ||
900 | |||
901 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | ||
902 | retcode = ERR_MD_DISK_TO_SMALL; | ||
903 | dev_warn(DEV, "refusing attach: md-device too small, " | ||
904 | "at least %llu sectors needed for this meta-disk type\n", | ||
905 | (unsigned long long) min_md_device_sectors); | ||
906 | goto release_bdev2_fail; | ||
907 | } | ||
908 | |||
909 | /* Make sure the new disk is big enough | ||
910 | * (we may currently be R_PRIMARY with no local disk...) */ | ||
911 | if (drbd_get_max_capacity(nbc) < | ||
912 | drbd_get_capacity(mdev->this_bdev)) { | ||
913 | retcode = ERR_DISK_TO_SMALL; | ||
914 | goto release_bdev2_fail; | ||
915 | } | ||
916 | |||
917 | nbc->known_size = drbd_get_capacity(nbc->backing_bdev); | ||
918 | |||
919 | if (nbc->known_size > max_possible_sectors) { | ||
920 | dev_warn(DEV, "==> truncating very big lower level device " | ||
921 | "to currently maximum possible %llu sectors <==\n", | ||
922 | (unsigned long long) max_possible_sectors); | ||
923 | if (nbc->dc.meta_dev_idx >= 0) | ||
924 | dev_warn(DEV, "==>> using internal or flexible " | ||
925 | "meta data may help <<==\n"); | ||
926 | } | ||
927 | |||
928 | drbd_suspend_io(mdev); | ||
929 | /* also wait for the last barrier ack. */ | ||
930 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); | ||
931 | /* and for any other previously queued work */ | ||
932 | drbd_flush_workqueue(mdev); | ||
933 | |||
934 | retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); | ||
935 | drbd_resume_io(mdev); | ||
936 | if (retcode < SS_SUCCESS) | ||
937 | goto release_bdev2_fail; | ||
938 | |||
939 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
940 | goto force_diskless; | ||
941 | |||
942 | drbd_md_set_sector_offsets(mdev, nbc); | ||
943 | |||
944 | /* allocate a second IO page if logical_block_size != 512 */ | ||
945 | logical_block_size = bdev_logical_block_size(nbc->md_bdev); | ||
946 | if (logical_block_size == 0) | ||
947 | logical_block_size = MD_SECTOR_SIZE; | ||
948 | |||
949 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
950 | if (!mdev->md_io_tmpp) { | ||
951 | struct page *page = alloc_page(GFP_NOIO); | ||
952 | if (!page) | ||
953 | goto force_diskless_dec; | ||
954 | |||
955 | dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", | ||
956 | logical_block_size, MD_SECTOR_SIZE); | ||
957 | dev_warn(DEV, "Workaround engaged (has performance impact).\n"); | ||
958 | |||
959 | mdev->md_io_tmpp = page; | ||
960 | } | ||
961 | } | ||
962 | |||
963 | if (!mdev->bitmap) { | ||
964 | if (drbd_bm_init(mdev)) { | ||
965 | retcode = ERR_NOMEM; | ||
966 | goto force_diskless_dec; | ||
967 | } | ||
968 | } | ||
969 | |||
970 | retcode = drbd_md_read(mdev, nbc); | ||
971 | if (retcode != NO_ERROR) | ||
972 | goto force_diskless_dec; | ||
973 | |||
974 | if (mdev->state.conn < C_CONNECTED && | ||
975 | mdev->state.role == R_PRIMARY && | ||
976 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { | ||
977 | dev_err(DEV, "Can only attach to data with current UUID=%016llX\n", | ||
978 | (unsigned long long)mdev->ed_uuid); | ||
979 | retcode = ERR_DATA_NOT_CURRENT; | ||
980 | goto force_diskless_dec; | ||
981 | } | ||
982 | |||
983 | /* Since we are diskless, fix the activity log first... */ | ||
984 | if (drbd_check_al_size(mdev)) { | ||
985 | retcode = ERR_NOMEM; | ||
986 | goto force_diskless_dec; | ||
987 | } | ||
988 | |||
989 | /* Prevent shrinking of consistent devices ! */ | ||
990 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && | ||
991 | drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { | ||
992 | dev_warn(DEV, "refusing to truncate a consistent device\n"); | ||
993 | retcode = ERR_DISK_TO_SMALL; | ||
994 | goto force_diskless_dec; | ||
995 | } | ||
996 | |||
997 | if (!drbd_al_read_log(mdev, nbc)) { | ||
998 | retcode = ERR_IO_MD_DISK; | ||
999 | goto force_diskless_dec; | ||
1000 | } | ||
1001 | |||
1002 | /* Reset the "barriers don't work" bits here, then force meta data to | ||
1003 | * be written, to ensure we determine if barriers are supported. */ | ||
1004 | if (nbc->dc.no_md_flush) | ||
1005 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
1006 | else | ||
1007 | clear_bit(MD_NO_BARRIER, &mdev->flags); | ||
1008 | |||
1009 | /* Point of no return reached. | ||
1010 | * Devices and memory are no longer released by error cleanup below. | ||
1011 | * now mdev takes over responsibility, and the state engine should | ||
1012 | * clean it up somewhere. */ | ||
1013 | D_ASSERT(mdev->ldev == NULL); | ||
1014 | mdev->ldev = nbc; | ||
1015 | mdev->resync = resync_lru; | ||
1016 | nbc = NULL; | ||
1017 | resync_lru = NULL; | ||
1018 | |||
1019 | mdev->write_ordering = WO_bio_barrier; | ||
1020 | drbd_bump_write_ordering(mdev, WO_bio_barrier); | ||
1021 | |||
1022 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) | ||
1023 | set_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1024 | else | ||
1025 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1026 | |||
1027 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { | ||
1028 | set_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1029 | cp_discovered = 1; | ||
1030 | } | ||
1031 | |||
1032 | mdev->send_cnt = 0; | ||
1033 | mdev->recv_cnt = 0; | ||
1034 | mdev->read_cnt = 0; | ||
1035 | mdev->writ_cnt = 0; | ||
1036 | |||
1037 | drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); | ||
1038 | |||
1039 | /* If I am currently not R_PRIMARY, | ||
1040 | * but meta data primary indicator is set, | ||
1041 | * I just now recover from a hard crash, | ||
1042 | * and have been R_PRIMARY before that crash. | ||
1043 | * | ||
1044 | * Now, if I had no connection before that crash | ||
1045 | * (have been degraded R_PRIMARY), chances are that | ||
1046 | * I won't find my peer now either. | ||
1047 | * | ||
1048 | * In that case, and _only_ in that case, | ||
1049 | * we use the degr-wfc-timeout instead of the default, | ||
1050 | * so we can automatically recover from a crash of a | ||
1051 | * degraded but active "cluster" after a certain timeout. | ||
1052 | */ | ||
1053 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
1054 | if (mdev->state.role != R_PRIMARY && | ||
1055 | drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | ||
1056 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) | ||
1057 | set_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
1058 | |||
1059 | dd = drbd_determin_dev_size(mdev, 0); | ||
1060 | if (dd == dev_size_error) { | ||
1061 | retcode = ERR_NOMEM_BITMAP; | ||
1062 | goto force_diskless_dec; | ||
1063 | } else if (dd == grew) | ||
1064 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | ||
1065 | |||
1066 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | ||
1067 | dev_info(DEV, "Assuming that all blocks are out of sync " | ||
1068 | "(aka FullSync)\n"); | ||
1069 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { | ||
1070 | retcode = ERR_IO_MD_DISK; | ||
1071 | goto force_diskless_dec; | ||
1072 | } | ||
1073 | } else { | ||
1074 | if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { | ||
1075 | retcode = ERR_IO_MD_DISK; | ||
1076 | goto force_diskless_dec; | ||
1077 | } | ||
1078 | } | ||
1079 | |||
1080 | if (cp_discovered) { | ||
1081 | drbd_al_apply_to_bm(mdev); | ||
1082 | drbd_al_to_on_disk_bm(mdev); | ||
1083 | } | ||
1084 | |||
1085 | spin_lock_irq(&mdev->req_lock); | ||
1086 | os = mdev->state; | ||
1087 | ns.i = os.i; | ||
1088 | /* If MDF_CONSISTENT is not set go into inconsistent state, | ||
1089 | otherwise investigate MDF_WAS_UP_TO_DATE... | ||
1090 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, | ||
1091 | otherwise into D_CONSISTENT state. | ||
1092 | */ | ||
1093 | if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) { | ||
1094 | if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE)) | ||
1095 | ns.disk = D_CONSISTENT; | ||
1096 | else | ||
1097 | ns.disk = D_OUTDATED; | ||
1098 | } else { | ||
1099 | ns.disk = D_INCONSISTENT; | ||
1100 | } | ||
1101 | |||
1102 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) | ||
1103 | ns.pdsk = D_OUTDATED; | ||
1104 | |||
1105 | if ( ns.disk == D_CONSISTENT && | ||
1106 | (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) | ||
1107 | ns.disk = D_UP_TO_DATE; | ||
1108 | |||
1109 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, | ||
1110 | MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before | ||
1111 | this point, because drbd_request_state() modifies these | ||
1112 | flags. */ | ||
1113 | |||
1114 | /* In case we are C_CONNECTED postpone any decision on the new disk | ||
1115 | state until after the negotiation phase. */ | ||
1116 | if (mdev->state.conn == C_CONNECTED) { | ||
1117 | mdev->new_state_tmp.i = ns.i; | ||
1118 | ns.i = os.i; | ||
1119 | ns.disk = D_NEGOTIATING; | ||
1120 | } | ||
1121 | |||
1122 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
1123 | ns = mdev->state; | ||
1124 | spin_unlock_irq(&mdev->req_lock); | ||
1125 | |||
1126 | if (rv < SS_SUCCESS) | ||
1127 | goto force_diskless_dec; | ||
1128 | |||
1129 | if (mdev->state.role == R_PRIMARY) | ||
1130 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | ||
1131 | else | ||
1132 | mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; | ||
1133 | |||
1134 | drbd_md_mark_dirty(mdev); | ||
1135 | drbd_md_sync(mdev); | ||
1136 | |||
1137 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1138 | put_ldev(mdev); | ||
1139 | reply->ret_code = retcode; | ||
1140 | drbd_reconfig_done(mdev); | ||
1141 | return 0; | ||
1142 | |||
1143 | force_diskless_dec: | ||
1144 | put_ldev(mdev); | ||
1145 | force_diskless: | ||
1146 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | ||
1147 | drbd_md_sync(mdev); | ||
1148 | release_bdev2_fail: | ||
1149 | if (nbc) | ||
1150 | bd_release(nbc->md_bdev); | ||
1151 | release_bdev_fail: | ||
1152 | if (nbc) | ||
1153 | bd_release(nbc->backing_bdev); | ||
1154 | fail: | ||
1155 | if (nbc) { | ||
1156 | if (nbc->lo_file) | ||
1157 | fput(nbc->lo_file); | ||
1158 | if (nbc->md_file) | ||
1159 | fput(nbc->md_file); | ||
1160 | kfree(nbc); | ||
1161 | } | ||
1162 | lc_destroy(resync_lru); | ||
1163 | |||
1164 | reply->ret_code = retcode; | ||
1165 | drbd_reconfig_done(mdev); | ||
1166 | return 0; | ||
1167 | } | ||
1168 | |||
1169 | static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1170 | struct drbd_nl_cfg_reply *reply) | ||
1171 | { | ||
1172 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); | ||
1173 | return 0; | ||
1174 | } | ||
1175 | |||
1176 | static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1177 | struct drbd_nl_cfg_reply *reply) | ||
1178 | { | ||
1179 | int i, ns; | ||
1180 | enum drbd_ret_codes retcode; | ||
1181 | struct net_conf *new_conf = NULL; | ||
1182 | struct crypto_hash *tfm = NULL; | ||
1183 | struct crypto_hash *integrity_w_tfm = NULL; | ||
1184 | struct crypto_hash *integrity_r_tfm = NULL; | ||
1185 | struct hlist_head *new_tl_hash = NULL; | ||
1186 | struct hlist_head *new_ee_hash = NULL; | ||
1187 | struct drbd_conf *odev; | ||
1188 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1189 | void *int_dig_out = NULL; | ||
1190 | void *int_dig_in = NULL; | ||
1191 | void *int_dig_vv = NULL; | ||
1192 | struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; | ||
1193 | |||
1194 | drbd_reconfig_start(mdev); | ||
1195 | |||
1196 | if (mdev->state.conn > C_STANDALONE) { | ||
1197 | retcode = ERR_NET_CONFIGURED; | ||
1198 | goto fail; | ||
1199 | } | ||
1200 | |||
1201 | /* allocation not in the IO path, cqueue thread context */ | ||
1202 | new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); | ||
1203 | if (!new_conf) { | ||
1204 | retcode = ERR_NOMEM; | ||
1205 | goto fail; | ||
1206 | } | ||
1207 | |||
1208 | memset(new_conf, 0, sizeof(struct net_conf)); | ||
1209 | new_conf->timeout = DRBD_TIMEOUT_DEF; | ||
1210 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | ||
1211 | new_conf->ping_int = DRBD_PING_INT_DEF; | ||
1212 | new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; | ||
1213 | new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; | ||
1214 | new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; | ||
1215 | new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; | ||
1216 | new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; | ||
1217 | new_conf->ko_count = DRBD_KO_COUNT_DEF; | ||
1218 | new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; | ||
1219 | new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; | ||
1220 | new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; | ||
1221 | new_conf->want_lose = 0; | ||
1222 | new_conf->two_primaries = 0; | ||
1223 | new_conf->wire_protocol = DRBD_PROT_C; | ||
1224 | new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; | ||
1225 | new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; | ||
1226 | |||
1227 | if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { | ||
1228 | retcode = ERR_MANDATORY_TAG; | ||
1229 | goto fail; | ||
1230 | } | ||
1231 | |||
1232 | if (new_conf->two_primaries | ||
1233 | && (new_conf->wire_protocol != DRBD_PROT_C)) { | ||
1234 | retcode = ERR_NOT_PROTO_C; | ||
1235 | goto fail; | ||
1236 | } | ||
1237 | |||
1238 | if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { | ||
1239 | retcode = ERR_DISCARD; | ||
1240 | goto fail; | ||
1241 | } | ||
1242 | |||
1243 | retcode = NO_ERROR; | ||
1244 | |||
1245 | new_my_addr = (struct sockaddr *)&new_conf->my_addr; | ||
1246 | new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; | ||
1247 | for (i = 0; i < minor_count; i++) { | ||
1248 | odev = minor_to_mdev(i); | ||
1249 | if (!odev || odev == mdev) | ||
1250 | continue; | ||
1251 | if (get_net_conf(odev)) { | ||
1252 | taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; | ||
1253 | if (new_conf->my_addr_len == odev->net_conf->my_addr_len && | ||
1254 | !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) | ||
1255 | retcode = ERR_LOCAL_ADDR; | ||
1256 | |||
1257 | taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; | ||
1258 | if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && | ||
1259 | !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) | ||
1260 | retcode = ERR_PEER_ADDR; | ||
1261 | |||
1262 | put_net_conf(odev); | ||
1263 | if (retcode != NO_ERROR) | ||
1264 | goto fail; | ||
1265 | } | ||
1266 | } | ||
1267 | |||
1268 | if (new_conf->cram_hmac_alg[0] != 0) { | ||
1269 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | ||
1270 | new_conf->cram_hmac_alg); | ||
1271 | tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); | ||
1272 | if (IS_ERR(tfm)) { | ||
1273 | tfm = NULL; | ||
1274 | retcode = ERR_AUTH_ALG; | ||
1275 | goto fail; | ||
1276 | } | ||
1277 | |||
1278 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
1279 | retcode = ERR_AUTH_ALG_ND; | ||
1280 | goto fail; | ||
1281 | } | ||
1282 | } | ||
1283 | |||
1284 | if (new_conf->integrity_alg[0]) { | ||
1285 | integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | ||
1286 | if (IS_ERR(integrity_w_tfm)) { | ||
1287 | integrity_w_tfm = NULL; | ||
1288 | retcode = ERR_INTEGRITY_ALG; | ||
1289 | goto fail; | ||
1290 | } | ||
1291 | |||
1292 | if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { | ||
1293 | retcode = ERR_INTEGRITY_ALG_ND; | ||
1294 | goto fail; | ||
1295 | } | ||
1296 | |||
1297 | integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | ||
1298 | if (IS_ERR(integrity_r_tfm)) { | ||
1299 | integrity_r_tfm = NULL; | ||
1300 | retcode = ERR_INTEGRITY_ALG; | ||
1301 | goto fail; | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | ns = new_conf->max_epoch_size/8; | ||
1306 | if (mdev->tl_hash_s != ns) { | ||
1307 | new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | ||
1308 | if (!new_tl_hash) { | ||
1309 | retcode = ERR_NOMEM; | ||
1310 | goto fail; | ||
1311 | } | ||
1312 | } | ||
1313 | |||
1314 | ns = new_conf->max_buffers/8; | ||
1315 | if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { | ||
1316 | new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | ||
1317 | if (!new_ee_hash) { | ||
1318 | retcode = ERR_NOMEM; | ||
1319 | goto fail; | ||
1320 | } | ||
1321 | } | ||
1322 | |||
1323 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; | ||
1324 | |||
1325 | if (integrity_w_tfm) { | ||
1326 | i = crypto_hash_digestsize(integrity_w_tfm); | ||
1327 | int_dig_out = kmalloc(i, GFP_KERNEL); | ||
1328 | if (!int_dig_out) { | ||
1329 | retcode = ERR_NOMEM; | ||
1330 | goto fail; | ||
1331 | } | ||
1332 | int_dig_in = kmalloc(i, GFP_KERNEL); | ||
1333 | if (!int_dig_in) { | ||
1334 | retcode = ERR_NOMEM; | ||
1335 | goto fail; | ||
1336 | } | ||
1337 | int_dig_vv = kmalloc(i, GFP_KERNEL); | ||
1338 | if (!int_dig_vv) { | ||
1339 | retcode = ERR_NOMEM; | ||
1340 | goto fail; | ||
1341 | } | ||
1342 | } | ||
1343 | |||
1344 | if (!mdev->bitmap) { | ||
1345 | if (drbd_bm_init(mdev)) { | ||
1346 | retcode = ERR_NOMEM; | ||
1347 | goto fail; | ||
1348 | } | ||
1349 | } | ||
1350 | |||
1351 | spin_lock_irq(&mdev->req_lock); | ||
1352 | if (mdev->net_conf != NULL) { | ||
1353 | retcode = ERR_NET_CONFIGURED; | ||
1354 | spin_unlock_irq(&mdev->req_lock); | ||
1355 | goto fail; | ||
1356 | } | ||
1357 | mdev->net_conf = new_conf; | ||
1358 | |||
1359 | mdev->send_cnt = 0; | ||
1360 | mdev->recv_cnt = 0; | ||
1361 | |||
1362 | if (new_tl_hash) { | ||
1363 | kfree(mdev->tl_hash); | ||
1364 | mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; | ||
1365 | mdev->tl_hash = new_tl_hash; | ||
1366 | } | ||
1367 | |||
1368 | if (new_ee_hash) { | ||
1369 | kfree(mdev->ee_hash); | ||
1370 | mdev->ee_hash_s = mdev->net_conf->max_buffers/8; | ||
1371 | mdev->ee_hash = new_ee_hash; | ||
1372 | } | ||
1373 | |||
1374 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
1375 | mdev->cram_hmac_tfm = tfm; | ||
1376 | |||
1377 | crypto_free_hash(mdev->integrity_w_tfm); | ||
1378 | mdev->integrity_w_tfm = integrity_w_tfm; | ||
1379 | |||
1380 | crypto_free_hash(mdev->integrity_r_tfm); | ||
1381 | mdev->integrity_r_tfm = integrity_r_tfm; | ||
1382 | |||
1383 | kfree(mdev->int_dig_out); | ||
1384 | kfree(mdev->int_dig_in); | ||
1385 | kfree(mdev->int_dig_vv); | ||
1386 | mdev->int_dig_out = int_dig_out; | ||
1387 | mdev->int_dig_in = int_dig_in; | ||
1388 | mdev->int_dig_vv = int_dig_vv; | ||
1389 | spin_unlock_irq(&mdev->req_lock); | ||
1390 | |||
1391 | retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ||
1392 | |||
1393 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1394 | reply->ret_code = retcode; | ||
1395 | drbd_reconfig_done(mdev); | ||
1396 | return 0; | ||
1397 | |||
1398 | fail: | ||
1399 | kfree(int_dig_out); | ||
1400 | kfree(int_dig_in); | ||
1401 | kfree(int_dig_vv); | ||
1402 | crypto_free_hash(tfm); | ||
1403 | crypto_free_hash(integrity_w_tfm); | ||
1404 | crypto_free_hash(integrity_r_tfm); | ||
1405 | kfree(new_tl_hash); | ||
1406 | kfree(new_ee_hash); | ||
1407 | kfree(new_conf); | ||
1408 | |||
1409 | reply->ret_code = retcode; | ||
1410 | drbd_reconfig_done(mdev); | ||
1411 | return 0; | ||
1412 | } | ||
1413 | |||
1414 | static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1415 | struct drbd_nl_cfg_reply *reply) | ||
1416 | { | ||
1417 | int retcode; | ||
1418 | |||
1419 | retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); | ||
1420 | |||
1421 | if (retcode == SS_NOTHING_TO_DO) | ||
1422 | goto done; | ||
1423 | else if (retcode == SS_ALREADY_STANDALONE) | ||
1424 | goto done; | ||
1425 | else if (retcode == SS_PRIMARY_NOP) { | ||
1426 | /* Our state checking code wants to see the peer outdated. */ | ||
1427 | retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | ||
1428 | pdsk, D_OUTDATED)); | ||
1429 | } else if (retcode == SS_CW_FAILED_BY_PEER) { | ||
1430 | /* The peer probably wants to see us outdated. */ | ||
1431 | retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | ||
1432 | disk, D_OUTDATED), | ||
1433 | CS_ORDERED); | ||
1434 | if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { | ||
1435 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1436 | retcode = SS_SUCCESS; | ||
1437 | } | ||
1438 | } | ||
1439 | |||
1440 | if (retcode < SS_SUCCESS) | ||
1441 | goto fail; | ||
1442 | |||
1443 | if (wait_event_interruptible(mdev->state_wait, | ||
1444 | mdev->state.conn != C_DISCONNECTING)) { | ||
1445 | /* Do not test for mdev->state.conn == C_STANDALONE, since | ||
1446 | someone else might connect us in the meantime! */ | ||
1447 | retcode = ERR_INTR; | ||
1448 | goto fail; | ||
1449 | } | ||
1450 | |||
1451 | done: | ||
1452 | retcode = NO_ERROR; | ||
1453 | fail: | ||
1454 | drbd_md_sync(mdev); | ||
1455 | reply->ret_code = retcode; | ||
1456 | return 0; | ||
1457 | } | ||
1458 | |||
1459 | void resync_after_online_grow(struct drbd_conf *mdev) | ||
1460 | { | ||
1461 | int iass; /* I am sync source */ | ||
1462 | |||
1463 | dev_info(DEV, "Resync of new storage after online grow\n"); | ||
1464 | if (mdev->state.role != mdev->state.peer) | ||
1465 | iass = (mdev->state.role == R_PRIMARY); | ||
1466 | else | ||
1467 | iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1468 | |||
1469 | if (iass) | ||
1470 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1471 | else | ||
1472 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); | ||
1473 | } | ||
1474 | |||
1475 | static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1476 | struct drbd_nl_cfg_reply *reply) | ||
1477 | { | ||
1478 | struct resize rs; | ||
1479 | int retcode = NO_ERROR; | ||
1480 | int ldsc = 0; /* local disk size changed */ | ||
1481 | enum determine_dev_size dd; | ||
1482 | |||
1483 | memset(&rs, 0, sizeof(struct resize)); | ||
1484 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | ||
1485 | retcode = ERR_MANDATORY_TAG; | ||
1486 | goto fail; | ||
1487 | } | ||
1488 | |||
1489 | if (mdev->state.conn > C_CONNECTED) { | ||
1490 | retcode = ERR_RESIZE_RESYNC; | ||
1491 | goto fail; | ||
1492 | } | ||
1493 | |||
1494 | if (mdev->state.role == R_SECONDARY && | ||
1495 | mdev->state.peer == R_SECONDARY) { | ||
1496 | retcode = ERR_NO_PRIMARY; | ||
1497 | goto fail; | ||
1498 | } | ||
1499 | |||
1500 | if (!get_ldev(mdev)) { | ||
1501 | retcode = ERR_NO_DISK; | ||
1502 | goto fail; | ||
1503 | } | ||
1504 | |||
1505 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | ||
1506 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
1507 | ldsc = 1; | ||
1508 | } | ||
1509 | |||
1510 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | ||
1511 | dd = drbd_determin_dev_size(mdev, rs.resize_force); | ||
1512 | drbd_md_sync(mdev); | ||
1513 | put_ldev(mdev); | ||
1514 | if (dd == dev_size_error) { | ||
1515 | retcode = ERR_NOMEM_BITMAP; | ||
1516 | goto fail; | ||
1517 | } | ||
1518 | |||
1519 | if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { | ||
1520 | if (dd == grew) | ||
1521 | set_bit(RESIZE_PENDING, &mdev->flags); | ||
1522 | |||
1523 | drbd_send_uuids(mdev); | ||
1524 | drbd_send_sizes(mdev, 1); | ||
1525 | } | ||
1526 | |||
1527 | fail: | ||
1528 | reply->ret_code = retcode; | ||
1529 | return 0; | ||
1530 | } | ||
1531 | |||
1532 | static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1533 | struct drbd_nl_cfg_reply *reply) | ||
1534 | { | ||
1535 | int retcode = NO_ERROR; | ||
1536 | int err; | ||
1537 | int ovr; /* online verify running */ | ||
1538 | int rsr; /* re-sync running */ | ||
1539 | struct crypto_hash *verify_tfm = NULL; | ||
1540 | struct crypto_hash *csums_tfm = NULL; | ||
1541 | struct syncer_conf sc; | ||
1542 | cpumask_var_t new_cpu_mask; | ||
1543 | |||
1544 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { | ||
1545 | retcode = ERR_NOMEM; | ||
1546 | goto fail; | ||
1547 | } | ||
1548 | |||
1549 | if (nlp->flags & DRBD_NL_SET_DEFAULTS) { | ||
1550 | memset(&sc, 0, sizeof(struct syncer_conf)); | ||
1551 | sc.rate = DRBD_RATE_DEF; | ||
1552 | sc.after = DRBD_AFTER_DEF; | ||
1553 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | ||
1554 | } else | ||
1555 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | ||
1556 | |||
1557 | if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { | ||
1558 | retcode = ERR_MANDATORY_TAG; | ||
1559 | goto fail; | ||
1560 | } | ||
1561 | |||
1562 | /* re-sync running */ | ||
1563 | rsr = ( mdev->state.conn == C_SYNC_SOURCE || | ||
1564 | mdev->state.conn == C_SYNC_TARGET || | ||
1565 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1566 | mdev->state.conn == C_PAUSED_SYNC_T ); | ||
1567 | |||
1568 | if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { | ||
1569 | retcode = ERR_CSUMS_RESYNC_RUNNING; | ||
1570 | goto fail; | ||
1571 | } | ||
1572 | |||
1573 | if (!rsr && sc.csums_alg[0]) { | ||
1574 | csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); | ||
1575 | if (IS_ERR(csums_tfm)) { | ||
1576 | csums_tfm = NULL; | ||
1577 | retcode = ERR_CSUMS_ALG; | ||
1578 | goto fail; | ||
1579 | } | ||
1580 | |||
1581 | if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { | ||
1582 | retcode = ERR_CSUMS_ALG_ND; | ||
1583 | goto fail; | ||
1584 | } | ||
1585 | } | ||
1586 | |||
1587 | /* online verify running */ | ||
1588 | ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); | ||
1589 | |||
1590 | if (ovr) { | ||
1591 | if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { | ||
1592 | retcode = ERR_VERIFY_RUNNING; | ||
1593 | goto fail; | ||
1594 | } | ||
1595 | } | ||
1596 | |||
1597 | if (!ovr && sc.verify_alg[0]) { | ||
1598 | verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); | ||
1599 | if (IS_ERR(verify_tfm)) { | ||
1600 | verify_tfm = NULL; | ||
1601 | retcode = ERR_VERIFY_ALG; | ||
1602 | goto fail; | ||
1603 | } | ||
1604 | |||
1605 | if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { | ||
1606 | retcode = ERR_VERIFY_ALG_ND; | ||
1607 | goto fail; | ||
1608 | } | ||
1609 | } | ||
1610 | |||
1611 | /* silently ignore cpu mask on UP kernel */ | ||
1612 | if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { | ||
1613 | err = __bitmap_parse(sc.cpu_mask, 32, 0, | ||
1614 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
1615 | if (err) { | ||
1616 | dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); | ||
1617 | retcode = ERR_CPU_MASK_PARSE; | ||
1618 | goto fail; | ||
1619 | } | ||
1620 | } | ||
1621 | |||
1622 | ERR_IF (sc.rate < 1) sc.rate = 1; | ||
1623 | ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ | ||
1624 | #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) | ||
1625 | if (sc.al_extents > AL_MAX) { | ||
1626 | dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); | ||
1627 | sc.al_extents = AL_MAX; | ||
1628 | } | ||
1629 | #undef AL_MAX | ||
1630 | |||
1631 | /* most sanity checks done, try to assign the new sync-after | ||
1632 | * dependency. need to hold the global lock in there, | ||
1633 | * to avoid a race in the dependency loop check. */ | ||
1634 | retcode = drbd_alter_sa(mdev, sc.after); | ||
1635 | if (retcode != NO_ERROR) | ||
1636 | goto fail; | ||
1637 | |||
1638 | /* ok, assign the rest of it as well. | ||
1639 | * lock against receive_SyncParam() */ | ||
1640 | spin_lock(&mdev->peer_seq_lock); | ||
1641 | mdev->sync_conf = sc; | ||
1642 | |||
1643 | if (!rsr) { | ||
1644 | crypto_free_hash(mdev->csums_tfm); | ||
1645 | mdev->csums_tfm = csums_tfm; | ||
1646 | csums_tfm = NULL; | ||
1647 | } | ||
1648 | |||
1649 | if (!ovr) { | ||
1650 | crypto_free_hash(mdev->verify_tfm); | ||
1651 | mdev->verify_tfm = verify_tfm; | ||
1652 | verify_tfm = NULL; | ||
1653 | } | ||
1654 | spin_unlock(&mdev->peer_seq_lock); | ||
1655 | |||
1656 | if (get_ldev(mdev)) { | ||
1657 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
1658 | drbd_al_shrink(mdev); | ||
1659 | err = drbd_check_al_size(mdev); | ||
1660 | lc_unlock(mdev->act_log); | ||
1661 | wake_up(&mdev->al_wait); | ||
1662 | |||
1663 | put_ldev(mdev); | ||
1664 | drbd_md_sync(mdev); | ||
1665 | |||
1666 | if (err) { | ||
1667 | retcode = ERR_NOMEM; | ||
1668 | goto fail; | ||
1669 | } | ||
1670 | } | ||
1671 | |||
1672 | if (mdev->state.conn >= C_CONNECTED) | ||
1673 | drbd_send_sync_param(mdev, &sc); | ||
1674 | |||
1675 | if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { | ||
1676 | cpumask_copy(mdev->cpu_mask, new_cpu_mask); | ||
1677 | drbd_calc_cpu_mask(mdev); | ||
1678 | mdev->receiver.reset_cpu_mask = 1; | ||
1679 | mdev->asender.reset_cpu_mask = 1; | ||
1680 | mdev->worker.reset_cpu_mask = 1; | ||
1681 | } | ||
1682 | |||
1683 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1684 | fail: | ||
1685 | free_cpumask_var(new_cpu_mask); | ||
1686 | crypto_free_hash(csums_tfm); | ||
1687 | crypto_free_hash(verify_tfm); | ||
1688 | reply->ret_code = retcode; | ||
1689 | return 0; | ||
1690 | } | ||
1691 | |||
1692 | static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1693 | struct drbd_nl_cfg_reply *reply) | ||
1694 | { | ||
1695 | int retcode; | ||
1696 | |||
1697 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); | ||
1698 | |||
1699 | if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) | ||
1700 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | ||
1701 | |||
1702 | while (retcode == SS_NEED_CONNECTION) { | ||
1703 | spin_lock_irq(&mdev->req_lock); | ||
1704 | if (mdev->state.conn < C_CONNECTED) | ||
1705 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | ||
1706 | spin_unlock_irq(&mdev->req_lock); | ||
1707 | |||
1708 | if (retcode != SS_NEED_CONNECTION) | ||
1709 | break; | ||
1710 | |||
1711 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | ||
1712 | } | ||
1713 | |||
1714 | reply->ret_code = retcode; | ||
1715 | return 0; | ||
1716 | } | ||
1717 | |||
1718 | static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1719 | struct drbd_nl_cfg_reply *reply) | ||
1720 | { | ||
1721 | |||
1722 | reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | ||
1723 | |||
1724 | return 0; | ||
1725 | } | ||
1726 | |||
1727 | static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1728 | struct drbd_nl_cfg_reply *reply) | ||
1729 | { | ||
1730 | int retcode = NO_ERROR; | ||
1731 | |||
1732 | if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) | ||
1733 | retcode = ERR_PAUSE_IS_SET; | ||
1734 | |||
1735 | reply->ret_code = retcode; | ||
1736 | return 0; | ||
1737 | } | ||
1738 | |||
1739 | static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1740 | struct drbd_nl_cfg_reply *reply) | ||
1741 | { | ||
1742 | int retcode = NO_ERROR; | ||
1743 | |||
1744 | if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) | ||
1745 | retcode = ERR_PAUSE_IS_CLEAR; | ||
1746 | |||
1747 | reply->ret_code = retcode; | ||
1748 | return 0; | ||
1749 | } | ||
1750 | |||
1751 | static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1752 | struct drbd_nl_cfg_reply *reply) | ||
1753 | { | ||
1754 | reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); | ||
1755 | |||
1756 | return 0; | ||
1757 | } | ||
1758 | |||
1759 | static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1760 | struct drbd_nl_cfg_reply *reply) | ||
1761 | { | ||
1762 | reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); | ||
1763 | return 0; | ||
1764 | } | ||
1765 | |||
1766 | static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1767 | struct drbd_nl_cfg_reply *reply) | ||
1768 | { | ||
1769 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); | ||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1773 | static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1774 | struct drbd_nl_cfg_reply *reply) | ||
1775 | { | ||
1776 | unsigned short *tl; | ||
1777 | |||
1778 | tl = reply->tag_list; | ||
1779 | |||
1780 | if (get_ldev(mdev)) { | ||
1781 | tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); | ||
1782 | put_ldev(mdev); | ||
1783 | } | ||
1784 | |||
1785 | if (get_net_conf(mdev)) { | ||
1786 | tl = net_conf_to_tags(mdev, mdev->net_conf, tl); | ||
1787 | put_net_conf(mdev); | ||
1788 | } | ||
1789 | tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); | ||
1790 | |||
1791 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1792 | |||
1793 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1794 | } | ||
1795 | |||
1796 | static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1797 | struct drbd_nl_cfg_reply *reply) | ||
1798 | { | ||
1799 | unsigned short *tl = reply->tag_list; | ||
1800 | union drbd_state s = mdev->state; | ||
1801 | unsigned long rs_left; | ||
1802 | unsigned int res; | ||
1803 | |||
1804 | tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); | ||
1805 | |||
1806 | /* no local ref, no bitmap, no syncer progress. */ | ||
1807 | if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { | ||
1808 | if (get_ldev(mdev)) { | ||
1809 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
1810 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
1811 | put_ldev(mdev); | ||
1812 | } | ||
1813 | } | ||
1814 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1815 | |||
1816 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1817 | } | ||
1818 | |||
1819 | static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1820 | struct drbd_nl_cfg_reply *reply) | ||
1821 | { | ||
1822 | unsigned short *tl; | ||
1823 | |||
1824 | tl = reply->tag_list; | ||
1825 | |||
1826 | if (get_ldev(mdev)) { | ||
1827 | tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); | ||
1828 | tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); | ||
1829 | put_ldev(mdev); | ||
1830 | } | ||
1831 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1832 | |||
1833 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1834 | } | ||
1835 | |||
1836 | /** | ||
1837 | * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use | ||
1838 | * @mdev: DRBD device. | ||
1839 | * @nlp: Netlink/connector packet from drbdsetup | ||
1840 | * @reply: Reply packet for drbdsetup | ||
1841 | */ | ||
1842 | static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1843 | struct drbd_nl_cfg_reply *reply) | ||
1844 | { | ||
1845 | unsigned short *tl; | ||
1846 | char rv; | ||
1847 | |||
1848 | tl = reply->tag_list; | ||
1849 | |||
1850 | rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : | ||
1851 | test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; | ||
1852 | |||
1853 | tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); | ||
1854 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1855 | |||
1856 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1857 | } | ||
1858 | |||
1859 | static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1860 | struct drbd_nl_cfg_reply *reply) | ||
1861 | { | ||
1862 | /* default to resume from last known position, if possible */ | ||
1863 | struct start_ov args = | ||
1864 | { .start_sector = mdev->ov_start_sector }; | ||
1865 | |||
1866 | if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { | ||
1867 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1868 | return 0; | ||
1869 | } | ||
1870 | /* w_make_ov_request expects position to be aligned */ | ||
1871 | mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; | ||
1872 | reply->ret_code = drbd_request_state(mdev, NS(conn, C_VERIFY_S)); | ||
1873 | return 0; | ||
1874 | } | ||
1875 | |||
1876 | |||
1877 | static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1878 | struct drbd_nl_cfg_reply *reply) | ||
1879 | { | ||
1880 | int retcode = NO_ERROR; | ||
1881 | int skip_initial_sync = 0; | ||
1882 | int err; | ||
1883 | |||
1884 | struct new_c_uuid args; | ||
1885 | |||
1886 | memset(&args, 0, sizeof(struct new_c_uuid)); | ||
1887 | if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { | ||
1888 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1889 | return 0; | ||
1890 | } | ||
1891 | |||
1892 | mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ | ||
1893 | |||
1894 | if (!get_ldev(mdev)) { | ||
1895 | retcode = ERR_NO_DISK; | ||
1896 | goto out; | ||
1897 | } | ||
1898 | |||
1899 | /* this is "skip initial sync", assume to be clean */ | ||
1900 | if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && | ||
1901 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { | ||
1902 | dev_info(DEV, "Preparing to skip initial sync\n"); | ||
1903 | skip_initial_sync = 1; | ||
1904 | } else if (mdev->state.conn != C_STANDALONE) { | ||
1905 | retcode = ERR_CONNECTED; | ||
1906 | goto out_dec; | ||
1907 | } | ||
1908 | |||
1909 | drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ | ||
1910 | drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ | ||
1911 | |||
1912 | if (args.clear_bm) { | ||
1913 | err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); | ||
1914 | if (err) { | ||
1915 | dev_err(DEV, "Writing bitmap failed with %d\n",err); | ||
1916 | retcode = ERR_IO_MD_DISK; | ||
1917 | } | ||
1918 | if (skip_initial_sync) { | ||
1919 | drbd_send_uuids_skip_initial_sync(mdev); | ||
1920 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | ||
1921 | spin_lock_irq(&mdev->req_lock); | ||
1922 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | ||
1923 | CS_VERBOSE, NULL); | ||
1924 | spin_unlock_irq(&mdev->req_lock); | ||
1925 | } | ||
1926 | } | ||
1927 | |||
1928 | drbd_md_sync(mdev); | ||
1929 | out_dec: | ||
1930 | put_ldev(mdev); | ||
1931 | out: | ||
1932 | mutex_unlock(&mdev->state_mutex); | ||
1933 | |||
1934 | reply->ret_code = retcode; | ||
1935 | return 0; | ||
1936 | } | ||
1937 | |||
1938 | static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) | ||
1939 | { | ||
1940 | struct drbd_conf *mdev; | ||
1941 | |||
1942 | if (nlp->drbd_minor >= minor_count) | ||
1943 | return NULL; | ||
1944 | |||
1945 | mdev = minor_to_mdev(nlp->drbd_minor); | ||
1946 | |||
1947 | if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { | ||
1948 | struct gendisk *disk = NULL; | ||
1949 | mdev = drbd_new_device(nlp->drbd_minor); | ||
1950 | |||
1951 | spin_lock_irq(&drbd_pp_lock); | ||
1952 | if (minor_table[nlp->drbd_minor] == NULL) { | ||
1953 | minor_table[nlp->drbd_minor] = mdev; | ||
1954 | disk = mdev->vdisk; | ||
1955 | mdev = NULL; | ||
1956 | } /* else: we lost the race */ | ||
1957 | spin_unlock_irq(&drbd_pp_lock); | ||
1958 | |||
1959 | if (disk) /* we won the race above */ | ||
1960 | /* in case we ever add a drbd_delete_device(), | ||
1961 | * don't forget the del_gendisk! */ | ||
1962 | add_disk(disk); | ||
1963 | else /* we lost the race above */ | ||
1964 | drbd_free_mdev(mdev); | ||
1965 | |||
1966 | mdev = minor_to_mdev(nlp->drbd_minor); | ||
1967 | } | ||
1968 | |||
1969 | return mdev; | ||
1970 | } | ||
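
ensure_mdev() allocates the new device before taking drbd_pp_lock, publishes it into minor_table only if the slot is still empty, and frees it again if another thread got there first. A compact sketch of that "allocate outside the lock, publish under the lock, free the loser" pattern follows; a pthread mutex stands in for the kernel spinlock and the table and object are purely illustrative.

	/* Hedged sketch of the optimistic-create race handling used by ensure_mdev(). */
	#include <pthread.h>
	#include <stdlib.h>

	#define MAX_SLOTS 16

	static void *slot_table[MAX_SLOTS];
	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

	static void *ensure_slot(unsigned int idx)
	{
		void *obj, *winner;

		if (idx >= MAX_SLOTS)
			return NULL;

		obj = malloc(64);                 /* allocation done without the lock held */
		if (!obj)
			return NULL;

		pthread_mutex_lock(&table_lock);
		if (slot_table[idx] == NULL) {
			slot_table[idx] = obj;    /* we won the race: publish */
			obj = NULL;
		}                                 /* else: somebody else published first */
		winner = slot_table[idx];
		pthread_mutex_unlock(&table_lock);

		free(obj);                        /* frees only if we lost the race */
		return winner;
	}
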
1971 | |||
1972 | struct cn_handler_struct { | ||
1973 | int (*function)(struct drbd_conf *, | ||
1974 | struct drbd_nl_cfg_req *, | ||
1975 | struct drbd_nl_cfg_reply *); | ||
1976 | int reply_body_size; | ||
1977 | }; | ||
1978 | |||
1979 | static struct cn_handler_struct cnd_table[] = { | ||
1980 | [ P_primary ] = { &drbd_nl_primary, 0 }, | ||
1981 | [ P_secondary ] = { &drbd_nl_secondary, 0 }, | ||
1982 | [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, | ||
1983 | [ P_detach ] = { &drbd_nl_detach, 0 }, | ||
1984 | [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, | ||
1985 | [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, | ||
1986 | [ P_resize ] = { &drbd_nl_resize, 0 }, | ||
1987 | [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, | ||
1988 | [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, | ||
1989 | [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, | ||
1990 | [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, | ||
1991 | [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, | ||
1992 | [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, | ||
1993 | [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, | ||
1994 | [ P_outdate ] = { &drbd_nl_outdate, 0 }, | ||
1995 | [ P_get_config ] = { &drbd_nl_get_config, | ||
1996 | sizeof(struct syncer_conf_tag_len_struct) + | ||
1997 | sizeof(struct disk_conf_tag_len_struct) + | ||
1998 | sizeof(struct net_conf_tag_len_struct) }, | ||
1999 | [ P_get_state ] = { &drbd_nl_get_state, | ||
2000 | sizeof(struct get_state_tag_len_struct) + | ||
2001 | sizeof(struct sync_progress_tag_len_struct) }, | ||
2002 | [ P_get_uuids ] = { &drbd_nl_get_uuids, | ||
2003 | sizeof(struct get_uuids_tag_len_struct) }, | ||
2004 | [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, | ||
2005 | sizeof(struct get_timeout_flag_tag_len_struct)}, | ||
2006 | [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, | ||
2007 | [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, | ||
2008 | }; | ||
2009 | |||
2010 | static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) | ||
2011 | { | ||
2012 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; | ||
2013 | struct cn_handler_struct *cm; | ||
2014 | struct cn_msg *cn_reply; | ||
2015 | struct drbd_nl_cfg_reply *reply; | ||
2016 | struct drbd_conf *mdev; | ||
2017 | int retcode, rr; | ||
2018 | int reply_size = sizeof(struct cn_msg) | ||
2019 | + sizeof(struct drbd_nl_cfg_reply) | ||
2020 | + sizeof(short int); | ||
2021 | |||
2022 | if (!try_module_get(THIS_MODULE)) { | ||
2023 | printk(KERN_ERR "drbd: try_module_get() failed!\n"); | ||
2024 | return; | ||
2025 | } | ||
2026 | |||
2027 | if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { | ||
2028 | retcode = ERR_PERM; | ||
2029 | goto fail; | ||
2030 | } | ||
2031 | |||
2032 | mdev = ensure_mdev(nlp); | ||
2033 | if (!mdev) { | ||
2034 | retcode = ERR_MINOR_INVALID; | ||
2035 | goto fail; | ||
2036 | } | ||
2037 | |||
2038 | if (nlp->packet_type >= P_nl_after_last_packet) { | ||
2039 | retcode = ERR_PACKET_NR; | ||
2040 | goto fail; | ||
2041 | } | ||
2042 | |||
2043 | cm = cnd_table + nlp->packet_type; | ||
2044 | |||
2045 | /* This may happen if packet number is 0: */ | ||
2046 | if (cm->function == NULL) { | ||
2047 | retcode = ERR_PACKET_NR; | ||
2048 | goto fail; | ||
2049 | } | ||
2050 | |||
2051 | reply_size += cm->reply_body_size; | ||
2052 | |||
2053 | /* allocation not in the IO path, cqueue thread context */ | ||
2054 | cn_reply = kmalloc(reply_size, GFP_KERNEL); | ||
2055 | if (!cn_reply) { | ||
2056 | retcode = ERR_NOMEM; | ||
2057 | goto fail; | ||
2058 | } | ||
2059 | reply = (struct drbd_nl_cfg_reply *) cn_reply->data; | ||
2060 | |||
2061 | reply->packet_type = | ||
2062 | cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; | ||
2063 | reply->minor = nlp->drbd_minor; | ||
2064 | reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */ | ||
2065 | /* reply->tag_list; might be modified by cm->function. */ | ||
2066 | |||
2067 | rr = cm->function(mdev, nlp, reply); | ||
2068 | |||
2069 | cn_reply->id = req->id; | ||
2070 | cn_reply->seq = req->seq; | ||
2071 | cn_reply->ack = req->ack + 1; | ||
2072 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; | ||
2073 | cn_reply->flags = 0; | ||
2074 | |||
2075 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); | ||
2076 | if (rr && rr != -ESRCH) | ||
2077 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2078 | |||
2079 | kfree(cn_reply); | ||
2080 | module_put(THIS_MODULE); | ||
2081 | return; | ||
2082 | fail: | ||
2083 | drbd_nl_send_reply(req, retcode); | ||
2084 | module_put(THIS_MODULE); | ||
2085 | } | ||
2086 | |||
2087 | static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ | ||
2088 | |||
2089 | static unsigned short * | ||
2090 | __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, | ||
2091 | unsigned short len, int nul_terminated) | ||
2092 | { | ||
2093 | unsigned short l = tag_descriptions[tag_number(tag)].max_len; | ||
2094 | len = (len < l) ? len : l; | ||
2095 | put_unaligned(tag, tl++); | ||
2096 | put_unaligned(len, tl++); | ||
2097 | memcpy(tl, data, len); | ||
2098 | tl = (unsigned short *)((char *)tl + len); | ||
2099 | if (nul_terminated) | ||
2100 | *((char *)tl - 1) = 0; | ||
2101 | return tl; | ||
2102 | } | ||
2103 | |||
2104 | static unsigned short * | ||
2105 | tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) | ||
2106 | { | ||
2107 | return __tl_add_blob(tl, tag, data, len, 0); | ||
2108 | } | ||
2109 | |||
2110 | static unsigned short * | ||
2111 | tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) | ||
2112 | { | ||
2113 | return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); | ||
2114 | } | ||
2115 | |||
2116 | static unsigned short * | ||
2117 | tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) | ||
2118 | { | ||
2119 | put_unaligned(tag, tl++); | ||
2120 | switch(tag_type(tag)) { | ||
2121 | case TT_INTEGER: | ||
2122 | put_unaligned(sizeof(int), tl++); | ||
2123 | put_unaligned(*(int *)val, (int *)tl); | ||
2124 | tl = (unsigned short *)((char *)tl + sizeof(int)); | ||
2125 | break; | ||
2126 | case TT_INT64: | ||
2127 | put_unaligned(sizeof(u64), tl++); | ||
2128 | put_unaligned(*(u64 *)val, (u64 *)tl); | ||
2129 | tl = (unsigned short *)((char *)tl + sizeof(u64)); | ||
2130 | break; | ||
2131 | default: | ||
2132 | /* someone did something stupid. */ | ||
2133 | ; | ||
2134 | } | ||
2135 | return tl; | ||
2136 | } | ||
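
The tl_add_* helpers above always emit a 16-bit tag, a 16-bit length, and then the payload, and every handler closes the list with a bare TT_END tag (no length). A consumer such as drbdsetup can therefore walk the buffer generically. A hedged sketch of such a walker follows; TT_END_EXAMPLE stands in for the real TT_END sentinel and the layout details beyond what is visible above are assumptions.

	/* Hedged sketch of walking a tag list produced by the tl_add_* helpers. */
	#include <stddef.h>
	#include <string.h>

	enum { TT_END_EXAMPLE = 0 };

	static void walk_tag_list(const unsigned char *buf, size_t len)
	{
		size_t off = 0;

		while (off + sizeof(unsigned short) <= len) {
			unsigned short tag, tlen;

			memcpy(&tag, buf + off, sizeof(tag));
			off += sizeof(tag);
			if (tag == TT_END_EXAMPLE)
				break;                          /* list closed */
			if (off + sizeof(tlen) > len)
				break;                          /* truncated record */
			memcpy(&tlen, buf + off, sizeof(tlen));
			off += sizeof(tlen);
			/* buf + off .. buf + off + tlen holds the value for this tag */
			off += tlen;
		}
	}
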
2137 | |||
2138 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) | ||
2139 | { | ||
2140 | char buffer[sizeof(struct cn_msg)+ | ||
2141 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2142 | sizeof(struct get_state_tag_len_struct)+ | ||
2143 | sizeof(short int)]; | ||
2144 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2145 | struct drbd_nl_cfg_reply *reply = | ||
2146 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2147 | unsigned short *tl = reply->tag_list; | ||
2148 | |||
2149 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2150 | |||
2151 | tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); | ||
2152 | |||
2153 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2154 | |||
2155 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2156 | cn_reply->id.val = CN_VAL_DRBD; | ||
2157 | |||
2158 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2159 | cn_reply->ack = 0; /* not used here. */ | ||
2160 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2161 | (int)((char *)tl - (char *)reply->tag_list); | ||
2162 | cn_reply->flags = 0; | ||
2163 | |||
2164 | reply->packet_type = P_get_state; | ||
2165 | reply->minor = mdev_to_minor(mdev); | ||
2166 | reply->ret_code = NO_ERROR; | ||
2167 | |||
2168 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2169 | } | ||
2170 | |||
2171 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) | ||
2172 | { | ||
2173 | char buffer[sizeof(struct cn_msg)+ | ||
2174 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2175 | sizeof(struct call_helper_tag_len_struct)+ | ||
2176 | sizeof(short int)]; | ||
2177 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2178 | struct drbd_nl_cfg_reply *reply = | ||
2179 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2180 | unsigned short *tl = reply->tag_list; | ||
2181 | |||
2182 | /* dev_warn(DEV, "drbd_bcast_ev_helper() got called\n"); */ | ||
2183 | |||
2184 | tl = tl_add_str(tl, T_helper, helper_name); | ||
2185 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2186 | |||
2187 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2188 | cn_reply->id.val = CN_VAL_DRBD; | ||
2189 | |||
2190 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2191 | cn_reply->ack = 0; /* not used here. */ | ||
2192 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2193 | (int)((char *)tl - (char *)reply->tag_list); | ||
2194 | cn_reply->flags = 0; | ||
2195 | |||
2196 | reply->packet_type = P_call_helper; | ||
2197 | reply->minor = mdev_to_minor(mdev); | ||
2198 | reply->ret_code = NO_ERROR; | ||
2199 | |||
2200 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2201 | } | ||
2202 | |||
2203 | void drbd_bcast_ee(struct drbd_conf *mdev, | ||
2204 | const char *reason, const int dgs, | ||
2205 | const char* seen_hash, const char* calc_hash, | ||
2206 | const struct drbd_epoch_entry* e) | ||
2207 | { | ||
2208 | struct cn_msg *cn_reply; | ||
2209 | struct drbd_nl_cfg_reply *reply; | ||
2210 | struct bio_vec *bvec; | ||
2211 | unsigned short *tl; | ||
2212 | int i; | ||
2213 | |||
2214 | if (!e) | ||
2215 | return; | ||
2216 | if (!reason || !reason[0]) | ||
2217 | return; | ||
2218 | |||
2219 | /* apparently we have to memcpy twice, first to prepare the data for the | ||
2220 | * struct cn_msg, then within cn_netlink_send from the cn_msg to the | ||
2221 | * netlink skb. */ | ||
2222 | /* receiver thread context, which is not in the writeout path (of this node), | ||
2223 | * but may be in the writeout path of the _other_ node. | ||
2224 | * GFP_NOIO to avoid potential "distributed deadlock". */ | ||
2225 | cn_reply = kmalloc( | ||
2226 | sizeof(struct cn_msg)+ | ||
2227 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2228 | sizeof(struct dump_ee_tag_len_struct)+ | ||
2229 | sizeof(short int), | ||
2230 | GFP_NOIO); | ||
2231 | |||
2232 | if (!cn_reply) { | ||
2233 | dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", | ||
2234 | (unsigned long long)e->sector, e->size); | ||
2235 | return; | ||
2236 | } | ||
2237 | |||
2238 | reply = (struct drbd_nl_cfg_reply*)cn_reply->data; | ||
2239 | tl = reply->tag_list; | ||
2240 | |||
2241 | tl = tl_add_str(tl, T_dump_ee_reason, reason); | ||
2242 | tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); | ||
2243 | tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); | ||
2244 | tl = tl_add_int(tl, T_ee_sector, &e->sector); | ||
2245 | tl = tl_add_int(tl, T_ee_block_id, &e->block_id); | ||
2246 | |||
2247 | put_unaligned(T_ee_data, tl++); | ||
2248 | put_unaligned(e->size, tl++); | ||
2249 | |||
2250 | __bio_for_each_segment(bvec, e->private_bio, i, 0) { | ||
2251 | void *d = kmap(bvec->bv_page); | ||
2252 | memcpy(tl, d + bvec->bv_offset, bvec->bv_len); | ||
2253 | kunmap(bvec->bv_page); | ||
2254 | tl = (unsigned short *)((char *)tl + bvec->bv_len); | ||
2255 | } | ||
2256 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2257 | |||
2258 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2259 | cn_reply->id.val = CN_VAL_DRBD; | ||
2260 | |||
2261 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2262 | cn_reply->ack = 0; /* not used here. */ | ||
2263 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2264 | (int)((char *)tl - (char *)reply->tag_list); | ||
2265 | cn_reply->flags = 0; | ||
2266 | |||
2267 | reply->packet_type = P_dump_ee; | ||
2268 | reply->minor = mdev_to_minor(mdev); | ||
2269 | reply->ret_code = NO_ERROR; | ||
2270 | |||
2271 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2272 | kfree(cn_reply); | ||
2273 | } | ||
2274 | |||
2275 | void drbd_bcast_sync_progress(struct drbd_conf *mdev) | ||
2276 | { | ||
2277 | char buffer[sizeof(struct cn_msg)+ | ||
2278 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2279 | sizeof(struct sync_progress_tag_len_struct)+ | ||
2280 | sizeof(short int)]; | ||
2281 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2282 | struct drbd_nl_cfg_reply *reply = | ||
2283 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2284 | unsigned short *tl = reply->tag_list; | ||
2285 | unsigned long rs_left; | ||
2286 | unsigned int res; | ||
2287 | |||
2288 | /* no local ref, no bitmap, no syncer progress, no broadcast. */ | ||
2289 | if (!get_ldev(mdev)) | ||
2290 | return; | ||
2291 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
2292 | put_ldev(mdev); | ||
2293 | |||
2294 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
2295 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2296 | |||
2297 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2298 | cn_reply->id.val = CN_VAL_DRBD; | ||
2299 | |||
2300 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2301 | cn_reply->ack = 0; /* not used here. */ | ||
2302 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2303 | (int)((char *)tl - (char *)reply->tag_list); | ||
2304 | cn_reply->flags = 0; | ||
2305 | |||
2306 | reply->packet_type = P_sync_progress; | ||
2307 | reply->minor = mdev_to_minor(mdev); | ||
2308 | reply->ret_code = NO_ERROR; | ||
2309 | |||
2310 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2311 | } | ||
2312 | |||
2313 | int __init drbd_nl_init(void) | ||
2314 | { | ||
2315 | static struct cb_id cn_id_drbd; | ||
2316 | int err, try = 10; | ||
2317 | |||
2318 | cn_id_drbd.val = CN_VAL_DRBD; | ||
2319 | do { | ||
2320 | cn_id_drbd.idx = cn_idx; | ||
2321 | err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); | ||
2322 | if (!err) | ||
2323 | break; | ||
2324 | cn_idx = (cn_idx + CN_IDX_STEP); | ||
2325 | } while (try--); | ||
2326 | |||
2327 | if (err) { | ||
2328 | printk(KERN_ERR "drbd: cn_drbd failed to register\n"); | ||
2329 | return err; | ||
2330 | } | ||
2331 | |||
2332 | return 0; | ||
2333 | } | ||
2334 | |||
2335 | void drbd_nl_cleanup(void) | ||
2336 | { | ||
2337 | static struct cb_id cn_id_drbd; | ||
2338 | |||
2339 | cn_id_drbd.idx = cn_idx; | ||
2340 | cn_id_drbd.val = CN_VAL_DRBD; | ||
2341 | |||
2342 | cn_del_callback(&cn_id_drbd); | ||
2343 | } | ||
2344 | |||
2345 | void drbd_nl_send_reply(struct cn_msg *req, int ret_code) | ||
2346 | { | ||
2347 | char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; | ||
2348 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2349 | struct drbd_nl_cfg_reply *reply = | ||
2350 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2351 | int rr; | ||
2352 | |||
2353 | cn_reply->id = req->id; | ||
2354 | |||
2355 | cn_reply->seq = req->seq; | ||
2356 | cn_reply->ack = req->ack + 1; | ||
2357 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply); | ||
2358 | cn_reply->flags = 0; | ||
2359 | |||
2360 | reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; | ||
2361 | reply->ret_code = ret_code; | ||
2362 | |||
2363 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2364 | if (rr && rr != -ESRCH) | ||
2365 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2366 | } | ||
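
Both reply paths above set cn_reply->seq = req->seq and cn_reply->ack = req->ack + 1, which is the connector convention that lets a sender correlate replies with its requests. A hedged sketch of the check a userspace caller might perform follows; only the seq/ack fields visible in this code are modeled, not the full struct cn_msg layout.

	/* Hedged sketch of matching a connector reply to the request that caused it. */
	struct cn_msg_min {
		unsigned int seq;
		unsigned int ack;
	};

	static int reply_matches_request(const struct cn_msg_min *req,
					 const struct cn_msg_min *reply)
	{
		/* drbd copies the request's seq and acknowledges with req->ack + 1 */
		return reply->seq == req->seq && reply->ack == req->ack + 1;
	}
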
2367 | |||
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 000000000000..be3374b68460 --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -0,0 +1,264 @@ | |||
1 | /* | ||
2 | drbd_proc.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/file.h> | ||
31 | #include <linux/proc_fs.h> | ||
32 | #include <linux/seq_file.h> | ||
33 | #include <linux/drbd.h> | ||
34 | #include "drbd_int.h" | ||
35 | |||
36 | static int drbd_proc_open(struct inode *inode, struct file *file); | ||
37 | |||
38 | |||
39 | struct proc_dir_entry *drbd_proc; | ||
40 | const struct file_operations drbd_proc_fops = { | ||
41 | .owner = THIS_MODULE, | ||
42 | .open = drbd_proc_open, | ||
43 | .read = seq_read, | ||
44 | .llseek = seq_lseek, | ||
45 | .release = single_release, | ||
46 | }; | ||
47 | |||
48 | |||
49 | /*lge | ||
50 | * progress bars shamelessly adapted from drivers/md/md.c | ||
51 | * output looks like | ||
52 | * [=====>..............] 33.5% (23456/123456) | ||
53 | * finish: 2:20:20 speed: 6,345 (6,456) K/sec | ||
54 | */ | ||
55 | static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | ||
56 | { | ||
57 | unsigned long db, dt, dbdt, rt, rs_left; | ||
58 | unsigned int res; | ||
59 | int i, x, y; | ||
60 | |||
61 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
62 | |||
63 | x = res/50; | ||
64 | y = 20-x; | ||
65 | seq_printf(seq, "\t["); | ||
66 | for (i = 1; i < x; i++) | ||
67 | seq_printf(seq, "="); | ||
68 | seq_printf(seq, ">"); | ||
69 | for (i = 0; i < y; i++) | ||
70 | seq_printf(seq, "."); | ||
71 | seq_printf(seq, "] "); | ||
72 | |||
73 | seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); | ||
74 | /* if more than 1 GB display in MB */ | ||
75 | if (mdev->rs_total > 0x100000L) | ||
76 | seq_printf(seq, "(%lu/%lu)M\n\t", | ||
77 | (unsigned long) Bit2KB(rs_left >> 10), | ||
78 | (unsigned long) Bit2KB(mdev->rs_total >> 10)); | ||
79 | else | ||
80 | seq_printf(seq, "(%lu/%lu)K\n\t", | ||
81 | (unsigned long) Bit2KB(rs_left), | ||
82 | (unsigned long) Bit2KB(mdev->rs_total)); | ||
83 | |||
84 | /* see drivers/md/md.c | ||
85 | * We do not want to overflow, so the order of operands and | ||
86 | * the * 100 / 100 trick are important. We do a +1 to be | ||
87 | * safe against division by zero. We only estimate anyway. | ||
88 | * | ||
89 | * dt: time from mark until now | ||
90 | * db: blocks written from mark until now | ||
91 | * rt: remaining time | ||
92 | */ | ||
93 | dt = (jiffies - mdev->rs_mark_time) / HZ; | ||
94 | |||
95 | if (dt > 20) { | ||
96 | /* if we made no update to rs_mark_time for too long, | ||
97 | * we are stalled. show that. */ | ||
98 | seq_printf(seq, "stalled\n"); | ||
99 | return; | ||
100 | } | ||
101 | |||
102 | if (!dt) | ||
103 | dt++; | ||
104 | db = mdev->rs_mark_left - rs_left; | ||
105 | rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ | ||
106 | |||
107 | seq_printf(seq, "finish: %lu:%02lu:%02lu", | ||
108 | rt / 3600, (rt % 3600) / 60, rt % 60); | ||
109 | |||
110 | /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ | ||
111 | dbdt = Bit2KB(db/dt); | ||
112 | if (dbdt > 1000) | ||
113 | seq_printf(seq, " speed: %ld,%03ld", | ||
114 | dbdt/1000, dbdt % 1000); | ||
115 | else | ||
116 | seq_printf(seq, " speed: %ld", dbdt); | ||
117 | |||
118 | /* mean speed since syncer started | ||
119 | * we do account for PausedSync periods */ | ||
120 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | ||
121 | if (dt <= 0) | ||
122 | dt = 1; | ||
123 | db = mdev->rs_total - rs_left; | ||
124 | dbdt = Bit2KB(db/dt); | ||
125 | if (dbdt > 1000) | ||
126 | seq_printf(seq, " (%ld,%03ld)", | ||
127 | dbdt/1000, dbdt % 1000); | ||
128 | else | ||
129 | seq_printf(seq, " (%ld)", dbdt); | ||
130 | |||
131 | seq_printf(seq, " K/sec\n"); | ||
132 | } | ||
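
The finish-time estimate above is rt = dt * (rs_left / (db/100 + 1)) / 100: the seconds elapsed since the last mark, scaled by how much work remains relative to what that interval completed, with the split /100 … *100 keeping the intermediate product small and the +1 guarding against division by zero. A small worked sketch with invented sample values:

	/* Hedged sketch of the ETA arithmetic used above, with made-up numbers. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long dt = 30;           /* seconds since the last rs_mark_time */
		unsigned long db = 60000;        /* bits resynced during those 30 seconds */
		unsigned long rs_left = 600000;  /* bits still out of sync */
		unsigned long rt;

		/* same order of operations as the driver: scale before multiplying */
		rt = (dt * (rs_left / (db / 100 + 1))) / 100;

		printf("estimated %lu:%02lu:%02lu remaining\n",
		       rt / 3600, (rt % 3600) / 60, rt % 60);   /* ~0:04:59 here */
		return 0;
	}
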
133 | |||
134 | static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) | ||
135 | { | ||
136 | struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); | ||
137 | |||
138 | seq_printf(seq, "%5d %s %s\n", bme->rs_left, | ||
139 | bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", | ||
140 | bme->flags & BME_LOCKED ? "LOCKED" : "------" | ||
141 | ); | ||
142 | } | ||
143 | |||
144 | static int drbd_seq_show(struct seq_file *seq, void *v) | ||
145 | { | ||
146 | int i, hole = 0; | ||
147 | const char *sn; | ||
148 | struct drbd_conf *mdev; | ||
149 | |||
150 | static char write_ordering_chars[] = { | ||
151 | [WO_none] = 'n', | ||
152 | [WO_drain_io] = 'd', | ||
153 | [WO_bdev_flush] = 'f', | ||
154 | [WO_bio_barrier] = 'b', | ||
155 | }; | ||
156 | |||
157 | seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", | ||
158 | API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); | ||
159 | |||
160 | /* | ||
161 | cs .. connection state | ||
162 | ro .. node role (local/remote) | ||
163 | ds .. disk state (local/remote) | ||
164 | protocol | ||
165 | various flags | ||
166 | ns .. network send | ||
167 | nr .. network receive | ||
168 | dw .. disk write | ||
169 | dr .. disk read | ||
170 | al .. activity log write count | ||
171 | bm .. bitmap update write count | ||
172 | pe .. pending (waiting for ack or data reply) | ||
173 | ua .. unack'd (still need to send ack or data reply) | ||
174 | ap .. application requests accepted, but not yet completed | ||
175 | ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending | ||
176 | wo .. write ordering mode currently in use | ||
177 | oos .. known out-of-sync kB | ||
178 | */ | ||
179 | |||
180 | for (i = 0; i < minor_count; i++) { | ||
181 | mdev = minor_to_mdev(i); | ||
182 | if (!mdev) { | ||
183 | hole = 1; | ||
184 | continue; | ||
185 | } | ||
186 | if (hole) { | ||
187 | hole = 0; | ||
188 | seq_printf(seq, "\n"); | ||
189 | } | ||
190 | |||
191 | sn = drbd_conn_str(mdev->state.conn); | ||
192 | |||
193 | if (mdev->state.conn == C_STANDALONE && | ||
194 | mdev->state.disk == D_DISKLESS && | ||
195 | mdev->state.role == R_SECONDARY) { | ||
196 | seq_printf(seq, "%2d: cs:Unconfigured\n", i); | ||
197 | } else { | ||
198 | seq_printf(seq, | ||
199 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" | ||
200 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " | ||
201 | "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", | ||
202 | i, sn, | ||
203 | drbd_role_str(mdev->state.role), | ||
204 | drbd_role_str(mdev->state.peer), | ||
205 | drbd_disk_str(mdev->state.disk), | ||
206 | drbd_disk_str(mdev->state.pdsk), | ||
207 | (mdev->net_conf == NULL ? ' ' : | ||
208 | (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), | ||
209 | mdev->state.susp ? 's' : 'r', | ||
210 | mdev->state.aftr_isp ? 'a' : '-', | ||
211 | mdev->state.peer_isp ? 'p' : '-', | ||
212 | mdev->state.user_isp ? 'u' : '-', | ||
213 | mdev->congestion_reason ?: '-', | ||
214 | mdev->send_cnt/2, | ||
215 | mdev->recv_cnt/2, | ||
216 | mdev->writ_cnt/2, | ||
217 | mdev->read_cnt/2, | ||
218 | mdev->al_writ_cnt, | ||
219 | mdev->bm_writ_cnt, | ||
220 | atomic_read(&mdev->local_cnt), | ||
221 | atomic_read(&mdev->ap_pending_cnt) + | ||
222 | atomic_read(&mdev->rs_pending_cnt), | ||
223 | atomic_read(&mdev->unacked_cnt), | ||
224 | atomic_read(&mdev->ap_bio_cnt), | ||
225 | mdev->epochs, | ||
226 | write_ordering_chars[mdev->write_ordering] | ||
227 | ); | ||
228 | seq_printf(seq, " oos:%lu\n", | ||
229 | Bit2KB(drbd_bm_total_weight(mdev))); | ||
230 | } | ||
231 | if (mdev->state.conn == C_SYNC_SOURCE || | ||
232 | mdev->state.conn == C_SYNC_TARGET) | ||
233 | drbd_syncer_progress(mdev, seq); | ||
234 | |||
235 | if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) | ||
236 | seq_printf(seq, "\t%3d%% %lu/%lu\n", | ||
237 | (int)((mdev->rs_total-mdev->ov_left) / | ||
238 | (mdev->rs_total/100+1)), | ||
239 | mdev->rs_total - mdev->ov_left, | ||
240 | mdev->rs_total); | ||
241 | |||
242 | if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { | ||
243 | lc_seq_printf_stats(seq, mdev->resync); | ||
244 | lc_seq_printf_stats(seq, mdev->act_log); | ||
245 | put_ldev(mdev); | ||
246 | } | ||
247 | |||
248 | if (proc_details >= 2) { | ||
249 | if (mdev->resync) { | ||
250 | lc_seq_dump_details(seq, mdev->resync, "rs_left", | ||
251 | resync_dump_detail); | ||
252 | } | ||
253 | } | ||
254 | } | ||
255 | |||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static int drbd_proc_open(struct inode *inode, struct file *file) | ||
260 | { | ||
261 | return single_open(file, drbd_seq_show, PDE(inode)->data); | ||
262 | } | ||
263 | |||
264 | /* PROC FS stuff end */ | ||
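
This file only defines drbd_proc_fops and the seq_file callbacks; the /proc/drbd entry itself is created elsewhere in the series (presumably drbd_main.c). A hedged sketch of what that registration typically looks like with the fops table above; proc_create() is the standard kernel API of this era, but the exact call site and flags here are assumptions, not taken from this patch.

	/* Hedged sketch: creating /proc/drbd with the fops defined above. */
	#include <linux/module.h>
	#include <linux/proc_fs.h>

	extern struct proc_dir_entry *drbd_proc;
	extern const struct file_operations drbd_proc_fops;

	static int __init example_register_proc(void)
	{
		drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO, NULL, &drbd_proc_fops);
		if (!drbd_proc)
			return -ENOMEM;
		return 0;
	}
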
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 000000000000..3f096e7959b4 --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -0,0 +1,4462 @@ | |||
1 | /* | ||
2 | drbd_receiver.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <net/sock.h> | ||
30 | |||
31 | #include <linux/drbd.h> | ||
32 | #include <linux/fs.h> | ||
33 | #include <linux/file.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/memcontrol.h> | ||
37 | #include <linux/mm_inline.h> | ||
38 | #include <linux/slab.h> | ||
39 | #include <linux/smp_lock.h> | ||
40 | #include <linux/pkt_sched.h> | ||
41 | #define __KERNEL_SYSCALLS__ | ||
42 | #include <linux/unistd.h> | ||
43 | #include <linux/vmalloc.h> | ||
44 | #include <linux/random.h> | ||
45 | #include <linux/mm.h> | ||
46 | #include <linux/string.h> | ||
47 | #include <linux/scatterlist.h> | ||
48 | #include "drbd_int.h" | ||
49 | #include "drbd_req.h" | ||
50 | |||
51 | #include "drbd_vli.h" | ||
52 | |||
53 | struct flush_work { | ||
54 | struct drbd_work w; | ||
55 | struct drbd_epoch *epoch; | ||
56 | }; | ||
57 | |||
58 | enum finish_epoch { | ||
59 | FE_STILL_LIVE, | ||
60 | FE_DESTROYED, | ||
61 | FE_RECYCLED, | ||
62 | }; | ||
63 | |||
64 | static int drbd_do_handshake(struct drbd_conf *mdev); | ||
65 | static int drbd_do_auth(struct drbd_conf *mdev); | ||
66 | |||
67 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); | ||
68 | static int e_end_block(struct drbd_conf *, struct drbd_work *, int); | ||
69 | |||
70 | static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) | ||
71 | { | ||
72 | struct drbd_epoch *prev; | ||
73 | spin_lock(&mdev->epoch_lock); | ||
74 | prev = list_entry(epoch->list.prev, struct drbd_epoch, list); | ||
75 | if (prev == epoch || prev == mdev->current_epoch) | ||
76 | prev = NULL; | ||
77 | spin_unlock(&mdev->epoch_lock); | ||
78 | return prev; | ||
79 | } | ||
80 | |||
81 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | ||
82 | |||
83 | static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) | ||
84 | { | ||
85 | struct page *page = NULL; | ||
86 | |||
87 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | ||
88 | * So what. It saves a spin_lock. */ | ||
89 | if (drbd_pp_vacant > 0) { | ||
90 | spin_lock(&drbd_pp_lock); | ||
91 | page = drbd_pp_pool; | ||
92 | if (page) { | ||
93 | drbd_pp_pool = (struct page *)page_private(page); | ||
94 | set_page_private(page, 0); /* just to be polite */ | ||
95 | drbd_pp_vacant--; | ||
96 | } | ||
97 | spin_unlock(&drbd_pp_lock); | ||
98 | } | ||
99 | /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD | ||
100 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
101 | * which in turn might block on the other node at this very place. */ | ||
102 | if (!page) | ||
103 | page = alloc_page(GFP_TRY); | ||
104 | if (page) | ||
105 | atomic_inc(&mdev->pp_in_use); | ||
106 | return page; | ||
107 | } | ||
108 | |||
109 | /* kick lower level device, if we have more than (arbitrary number) | ||
110 | * reference counts on it, which typically are locally submitted io | ||
111 | * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ | ||
112 | static void maybe_kick_lo(struct drbd_conf *mdev) | ||
113 | { | ||
114 | if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) | ||
115 | drbd_kick_lo(mdev); | ||
116 | } | ||
117 | |||
118 | static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) | ||
119 | { | ||
120 | struct drbd_epoch_entry *e; | ||
121 | struct list_head *le, *tle; | ||
122 | |||
123 | /* The EEs are always appended to the end of the list. Since | ||
124 | they are sent in order over the wire, they have to finish | ||
125 | in order. As soon as we see the first not finished we can | ||
126 | stop to examine the list... */ | ||
127 | |||
128 | list_for_each_safe(le, tle, &mdev->net_ee) { | ||
129 | e = list_entry(le, struct drbd_epoch_entry, w.list); | ||
130 | if (drbd_bio_has_active_page(e->private_bio)) | ||
131 | break; | ||
132 | list_move(le, to_be_freed); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | ||
137 | { | ||
138 | LIST_HEAD(reclaimed); | ||
139 | struct drbd_epoch_entry *e, *t; | ||
140 | |||
141 | maybe_kick_lo(mdev); | ||
142 | spin_lock_irq(&mdev->req_lock); | ||
143 | reclaim_net_ee(mdev, &reclaimed); | ||
144 | spin_unlock_irq(&mdev->req_lock); | ||
145 | |||
146 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | ||
147 | drbd_free_ee(mdev, e); | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * drbd_pp_alloc() - Returns a page, fails only if a signal comes in | ||
152 | * @mdev: DRBD device. | ||
153 | * @retry: whether or not to retry allocation forever (or until signalled) | ||
154 | * | ||
155 | * Tries to allocate a page, first from our own page pool, then from the | ||
156 | * kernel, unless this allocation would exceed the max_buffers setting. | ||
157 | * If @retry is non-zero, retry until DRBD frees a page somewhere else. | ||
158 | */ | ||
159 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) | ||
160 | { | ||
161 | struct page *page = NULL; | ||
162 | DEFINE_WAIT(wait); | ||
163 | |||
164 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | ||
165 | page = drbd_pp_first_page_or_try_alloc(mdev); | ||
166 | if (page) | ||
167 | return page; | ||
168 | } | ||
169 | |||
170 | for (;;) { | ||
171 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | ||
172 | |||
173 | drbd_kick_lo_and_reclaim_net(mdev); | ||
174 | |||
175 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | ||
176 | page = drbd_pp_first_page_or_try_alloc(mdev); | ||
177 | if (page) | ||
178 | break; | ||
179 | } | ||
180 | |||
181 | if (!retry) | ||
182 | break; | ||
183 | |||
184 | if (signal_pending(current)) { | ||
185 | dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); | ||
186 | break; | ||
187 | } | ||
188 | |||
189 | schedule(); | ||
190 | } | ||
191 | finish_wait(&drbd_pp_wait, &wait); | ||
192 | |||
193 | return page; | ||
194 | } | ||
195 | |||
196 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | ||
197 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ | ||
198 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) | ||
199 | { | ||
200 | int free_it; | ||
201 | |||
202 | spin_lock(&drbd_pp_lock); | ||
203 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
204 | free_it = 1; | ||
205 | } else { | ||
206 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
207 | drbd_pp_pool = page; | ||
208 | drbd_pp_vacant++; | ||
209 | free_it = 0; | ||
210 | } | ||
211 | spin_unlock(&drbd_pp_lock); | ||
212 | |||
213 | atomic_dec(&mdev->pp_in_use); | ||
214 | |||
215 | if (free_it) | ||
216 | __free_page(page); | ||
217 | |||
218 | wake_up(&drbd_pp_wait); | ||
219 | } | ||
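
drbd_pp_first_page_or_try_alloc() and drbd_pp_free() keep spare pages on a singly linked free list threaded through page_private(), pushing and popping under drbd_pp_lock and only falling back to the page allocator (or __free_page) once the vacancy limit is exceeded. A generic sketch of that intrusive free-list idea follows; a plain "buffer" struct stands in for struct page and a pthread mutex for the spinlock, so none of these names are drbd symbols.

	/* Hedged sketch of the intrusive free list behind the drbd page pool. */
	#include <pthread.h>
	#include <stdlib.h>

	struct buffer {
		struct buffer *next_free;   /* plays the role of page_private() */
		char data[4096];
	};

	static struct buffer *pool_head;
	static int pool_vacant;
	static const int pool_limit = 128;
	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

	static struct buffer *pool_get(void)
	{
		struct buffer *b = NULL;

		pthread_mutex_lock(&pool_lock);
		if (pool_head) {
			b = pool_head;              /* pop from the free list */
			pool_head = b->next_free;
			pool_vacant--;
		}
		pthread_mutex_unlock(&pool_lock);

		return b ? b : malloc(sizeof(*b));  /* fall back to the allocator */
	}

	static void pool_put(struct buffer *b)
	{
		int free_it;

		pthread_mutex_lock(&pool_lock);
		if (pool_vacant > pool_limit) {
			free_it = 1;                /* pool full enough: really free */
		} else {
			b->next_free = pool_head;   /* push back onto the free list */
			pool_head = b;
			pool_vacant++;
			free_it = 0;
		}
		pthread_mutex_unlock(&pool_lock);

		if (free_it)
			free(b);
	}
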
220 | |||
221 | static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) | ||
222 | { | ||
223 | struct page *p_to_be_freed = NULL; | ||
224 | struct page *page; | ||
225 | struct bio_vec *bvec; | ||
226 | int i; | ||
227 | |||
228 | spin_lock(&drbd_pp_lock); | ||
229 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
230 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
231 | set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); | ||
232 | p_to_be_freed = bvec->bv_page; | ||
233 | } else { | ||
234 | set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); | ||
235 | drbd_pp_pool = bvec->bv_page; | ||
236 | drbd_pp_vacant++; | ||
237 | } | ||
238 | } | ||
239 | spin_unlock(&drbd_pp_lock); | ||
240 | atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); | ||
241 | |||
242 | while (p_to_be_freed) { | ||
243 | page = p_to_be_freed; | ||
244 | p_to_be_freed = (struct page *)page_private(page); | ||
245 | set_page_private(page, 0); /* just to be polite */ | ||
246 | put_page(page); | ||
247 | } | ||
248 | |||
249 | wake_up(&drbd_pp_wait); | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | You need to hold the req_lock: | ||
254 | _drbd_wait_ee_list_empty() | ||
255 | |||
256 | You must not have the req_lock: | ||
257 | drbd_free_ee() | ||
258 | drbd_alloc_ee() | ||
259 | drbd_init_ee() | ||
260 | drbd_release_ee() | ||
261 | drbd_ee_fix_bhs() | ||
262 | drbd_process_done_ee() | ||
263 | drbd_clear_done_ee() | ||
264 | drbd_wait_ee_list_empty() | ||
265 | */ | ||
266 | |||
267 | struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | ||
268 | u64 id, | ||
269 | sector_t sector, | ||
270 | unsigned int data_size, | ||
271 | gfp_t gfp_mask) __must_hold(local) | ||
272 | { | ||
273 | struct request_queue *q; | ||
274 | struct drbd_epoch_entry *e; | ||
275 | struct page *page; | ||
276 | struct bio *bio; | ||
277 | unsigned int ds; | ||
278 | |||
279 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) | ||
280 | return NULL; | ||
281 | |||
282 | e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); | ||
283 | if (!e) { | ||
284 | if (!(gfp_mask & __GFP_NOWARN)) | ||
285 | dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); | ||
286 | return NULL; | ||
287 | } | ||
288 | |||
289 | bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); | ||
290 | if (!bio) { | ||
291 | if (!(gfp_mask & __GFP_NOWARN)) | ||
292 | dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); | ||
293 | goto fail1; | ||
294 | } | ||
295 | |||
296 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
297 | bio->bi_sector = sector; | ||
298 | |||
299 | ds = data_size; | ||
300 | while (ds) { | ||
301 | page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); | ||
302 | if (!page) { | ||
303 | if (!(gfp_mask & __GFP_NOWARN)) | ||
304 | dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); | ||
305 | goto fail2; | ||
306 | } | ||
307 | if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { | ||
308 | drbd_pp_free(mdev, page); | ||
309 | dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," | ||
310 | "data_size=%u,ds=%u) failed\n", | ||
311 | (unsigned long long)sector, data_size, ds); | ||
312 | |||
313 | q = bdev_get_queue(bio->bi_bdev); | ||
314 | if (q->merge_bvec_fn) { | ||
315 | struct bvec_merge_data bvm = { | ||
316 | .bi_bdev = bio->bi_bdev, | ||
317 | .bi_sector = bio->bi_sector, | ||
318 | .bi_size = bio->bi_size, | ||
319 | .bi_rw = bio->bi_rw, | ||
320 | }; | ||
321 | int l = q->merge_bvec_fn(q, &bvm, | ||
322 | &bio->bi_io_vec[bio->bi_vcnt]); | ||
323 | dev_err(DEV, "merge_bvec_fn() = %d\n", l); | ||
324 | } | ||
325 | |||
326 | /* dump more of the bio. */ | ||
327 | dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); | ||
328 | dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); | ||
329 | dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); | ||
330 | dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); | ||
331 | |||
332 | goto fail2; | ||
333 | break; | ||
334 | } | ||
335 | ds -= min_t(int, ds, PAGE_SIZE); | ||
336 | } | ||
337 | |||
338 | D_ASSERT(data_size == bio->bi_size); | ||
339 | |||
340 | bio->bi_private = e; | ||
341 | e->mdev = mdev; | ||
342 | e->sector = sector; | ||
343 | e->size = bio->bi_size; | ||
344 | |||
345 | e->private_bio = bio; | ||
346 | e->block_id = id; | ||
347 | INIT_HLIST_NODE(&e->colision); | ||
348 | e->epoch = NULL; | ||
349 | e->flags = 0; | ||
350 | |||
351 | return e; | ||
352 | |||
353 | fail2: | ||
354 | drbd_pp_free_bio_pages(mdev, bio); | ||
355 | bio_put(bio); | ||
356 | fail1: | ||
357 | mempool_free(e, drbd_ee_mempool); | ||
358 | |||
359 | return NULL; | ||
360 | } | ||
361 | |||
362 | void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
363 | { | ||
364 | struct bio *bio = e->private_bio; | ||
365 | drbd_pp_free_bio_pages(mdev, bio); | ||
366 | bio_put(bio); | ||
367 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
368 | mempool_free(e, drbd_ee_mempool); | ||
369 | } | ||
370 | |||
371 | int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) | ||
372 | { | ||
373 | LIST_HEAD(work_list); | ||
374 | struct drbd_epoch_entry *e, *t; | ||
375 | int count = 0; | ||
376 | |||
377 | spin_lock_irq(&mdev->req_lock); | ||
378 | list_splice_init(list, &work_list); | ||
379 | spin_unlock_irq(&mdev->req_lock); | ||
380 | |||
381 | list_for_each_entry_safe(e, t, &work_list, w.list) { | ||
382 | drbd_free_ee(mdev, e); | ||
383 | count++; | ||
384 | } | ||
385 | return count; | ||
386 | } | ||
387 | |||
388 | |||
389 | /* | ||
390 | * This function is called from _asender only_ | ||
391 | * but see also comments in _req_mod(,barrier_acked) | ||
392 | * and receive_Barrier. | ||
393 | * | ||
394 | * Move entries from net_ee to done_ee, if ready. | ||
395 | * Grab done_ee, call all callbacks, free the entries. | ||
396 | * The callbacks typically send out ACKs. | ||
397 | */ | ||
398 | static int drbd_process_done_ee(struct drbd_conf *mdev) | ||
399 | { | ||
400 | LIST_HEAD(work_list); | ||
401 | LIST_HEAD(reclaimed); | ||
402 | struct drbd_epoch_entry *e, *t; | ||
403 | int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); | ||
404 | |||
405 | spin_lock_irq(&mdev->req_lock); | ||
406 | reclaim_net_ee(mdev, &reclaimed); | ||
407 | list_splice_init(&mdev->done_ee, &work_list); | ||
408 | spin_unlock_irq(&mdev->req_lock); | ||
409 | |||
410 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | ||
411 | drbd_free_ee(mdev, e); | ||
412 | |||
413 | /* possible callbacks here: | ||
414 | * e_end_block, and e_end_resync_block, e_send_discard_ack. | ||
415 | * all ignore the last argument. | ||
416 | */ | ||
417 | list_for_each_entry_safe(e, t, &work_list, w.list) { | ||
418 | /* list_del not necessary, next/prev members not touched */ | ||
419 | ok = e->w.cb(mdev, &e->w, !ok) && ok; | ||
420 | drbd_free_ee(mdev, e); | ||
421 | } | ||
422 | wake_up(&mdev->ee_wait); | ||
423 | |||
424 | return ok; | ||
425 | } | ||
426 | |||
427 | void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | ||
428 | { | ||
429 | DEFINE_WAIT(wait); | ||
430 | |||
431 | /* avoids spin_lock/unlock | ||
432 | * and calling prepare_to_wait in the fast path */ | ||
433 | while (!list_empty(head)) { | ||
434 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
435 | spin_unlock_irq(&mdev->req_lock); | ||
436 | drbd_kick_lo(mdev); | ||
437 | schedule(); | ||
438 | finish_wait(&mdev->ee_wait, &wait); | ||
439 | spin_lock_irq(&mdev->req_lock); | ||
440 | } | ||
441 | } | ||
442 | |||
443 | void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | ||
444 | { | ||
445 | spin_lock_irq(&mdev->req_lock); | ||
446 | _drbd_wait_ee_list_empty(mdev, head); | ||
447 | spin_unlock_irq(&mdev->req_lock); | ||
448 | } | ||
449 | |||
450 | /* see also kernel_accept; which is only present since 2.6.18. | ||
451 | * also we want to log which part of it failed, exactly */ | ||
452 | static int drbd_accept(struct drbd_conf *mdev, const char **what, | ||
453 | struct socket *sock, struct socket **newsock) | ||
454 | { | ||
455 | struct sock *sk = sock->sk; | ||
456 | int err = 0; | ||
457 | |||
458 | *what = "listen"; | ||
459 | err = sock->ops->listen(sock, 5); | ||
460 | if (err < 0) | ||
461 | goto out; | ||
462 | |||
463 | *what = "sock_create_lite"; | ||
464 | err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, | ||
465 | newsock); | ||
466 | if (err < 0) | ||
467 | goto out; | ||
468 | |||
469 | *what = "accept"; | ||
470 | err = sock->ops->accept(sock, *newsock, 0); | ||
471 | if (err < 0) { | ||
472 | sock_release(*newsock); | ||
473 | *newsock = NULL; | ||
474 | goto out; | ||
475 | } | ||
476 | (*newsock)->ops = sock->ops; | ||
477 | |||
478 | out: | ||
479 | return err; | ||
480 | } | ||
481 | |||
482 | static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | ||
483 | void *buf, size_t size, int flags) | ||
484 | { | ||
485 | mm_segment_t oldfs; | ||
486 | struct kvec iov = { | ||
487 | .iov_base = buf, | ||
488 | .iov_len = size, | ||
489 | }; | ||
490 | struct msghdr msg = { | ||
491 | .msg_iovlen = 1, | ||
492 | .msg_iov = (struct iovec *)&iov, | ||
493 | .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) | ||
494 | }; | ||
495 | int rv; | ||
496 | |||
497 | oldfs = get_fs(); | ||
498 | set_fs(KERNEL_DS); | ||
499 | rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); | ||
500 | set_fs(oldfs); | ||
501 | |||
502 | return rv; | ||
503 | } | ||
504 | |||
505 | static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) | ||
506 | { | ||
507 | mm_segment_t oldfs; | ||
508 | struct kvec iov = { | ||
509 | .iov_base = buf, | ||
510 | .iov_len = size, | ||
511 | }; | ||
512 | struct msghdr msg = { | ||
513 | .msg_iovlen = 1, | ||
514 | .msg_iov = (struct iovec *)&iov, | ||
515 | .msg_flags = MSG_WAITALL | MSG_NOSIGNAL | ||
516 | }; | ||
517 | int rv; | ||
518 | |||
519 | oldfs = get_fs(); | ||
520 | set_fs(KERNEL_DS); | ||
521 | |||
522 | for (;;) { | ||
523 | rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); | ||
524 | if (rv == size) | ||
525 | break; | ||
526 | |||
527 | /* Note: | ||
528 | * ECONNRESET other side closed the connection | ||
529 | * ERESTARTSYS (on sock) we got a signal | ||
530 | */ | ||
531 | |||
532 | if (rv < 0) { | ||
533 | if (rv == -ECONNRESET) | ||
534 | dev_info(DEV, "sock was reset by peer\n"); | ||
535 | else if (rv != -ERESTARTSYS) | ||
536 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
537 | break; | ||
538 | } else if (rv == 0) { | ||
539 | dev_info(DEV, "sock was shut down by peer\n"); | ||
540 | break; | ||
541 | } else { | ||
542 | /* signal came in, or peer/link went down, | ||
543 | * after we read a partial message | ||
544 | */ | ||
545 | /* D_ASSERT(signal_pending(current)); */ | ||
546 | break; | ||
547 | } | ||
548 | } | ||
549 | |||
550 | set_fs(oldfs); | ||
551 | |||
552 | if (rv != size) | ||
553 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | ||
554 | |||
555 | return rv; | ||
556 | } | ||
557 | |||
558 | static struct socket *drbd_try_connect(struct drbd_conf *mdev) | ||
559 | { | ||
560 | const char *what; | ||
561 | struct socket *sock; | ||
562 | struct sockaddr_in6 src_in6; | ||
563 | int err; | ||
564 | int disconnect_on_error = 1; | ||
565 | |||
566 | if (!get_net_conf(mdev)) | ||
567 | return NULL; | ||
568 | |||
569 | what = "sock_create_kern"; | ||
570 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | ||
571 | SOCK_STREAM, IPPROTO_TCP, &sock); | ||
572 | if (err < 0) { | ||
573 | sock = NULL; | ||
574 | goto out; | ||
575 | } | ||
576 | |||
577 | sock->sk->sk_rcvtimeo = | ||
578 | sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; | ||
579 | |||
580 | /* explicitly bind to the configured IP as source IP | ||
581 | * for the outgoing connections. | ||
582 | * This is needed for multihomed hosts and to be | ||
583 | * able to use lo: interfaces for drbd. | ||
584 | * Make sure to use 0 as port number, so linux selects | ||
585 | * a free one dynamically. | ||
586 | */ | ||
587 | memcpy(&src_in6, mdev->net_conf->my_addr, | ||
588 | min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); | ||
589 | if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) | ||
590 | src_in6.sin6_port = 0; | ||
591 | else | ||
592 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
593 | |||
594 | what = "bind before connect"; | ||
595 | err = sock->ops->bind(sock, | ||
596 | (struct sockaddr *) &src_in6, | ||
597 | mdev->net_conf->my_addr_len); | ||
598 | if (err < 0) | ||
599 | goto out; | ||
600 | |||
601 | /* connect may fail, peer not yet available. | ||
602 | * stay C_WF_CONNECTION, don't go Disconnecting! */ | ||
603 | disconnect_on_error = 0; | ||
604 | what = "connect"; | ||
605 | err = sock->ops->connect(sock, | ||
606 | (struct sockaddr *)mdev->net_conf->peer_addr, | ||
607 | mdev->net_conf->peer_addr_len, 0); | ||
608 | |||
609 | out: | ||
610 | if (err < 0) { | ||
611 | if (sock) { | ||
612 | sock_release(sock); | ||
613 | sock = NULL; | ||
614 | } | ||
615 | switch (-err) { | ||
616 | /* timeout, busy, signal pending */ | ||
617 | case ETIMEDOUT: case EAGAIN: case EINPROGRESS: | ||
618 | case EINTR: case ERESTARTSYS: | ||
619 | /* peer not (yet) available, network problem */ | ||
620 | case ECONNREFUSED: case ENETUNREACH: | ||
621 | case EHOSTDOWN: case EHOSTUNREACH: | ||
622 | disconnect_on_error = 0; | ||
623 | break; | ||
624 | default: | ||
625 | dev_err(DEV, "%s failed, err = %d\n", what, err); | ||
626 | } | ||
627 | if (disconnect_on_error) | ||
628 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
629 | } | ||
630 | put_net_conf(mdev); | ||
631 | return sock; | ||
632 | } | ||
633 | |||
634 | static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) | ||
635 | { | ||
636 | int timeo, err; | ||
637 | struct socket *s_estab = NULL, *s_listen; | ||
638 | const char *what; | ||
639 | |||
640 | if (!get_net_conf(mdev)) | ||
641 | return NULL; | ||
642 | |||
643 | what = "sock_create_kern"; | ||
644 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | ||
645 | SOCK_STREAM, IPPROTO_TCP, &s_listen); | ||
646 | if (err) { | ||
647 | s_listen = NULL; | ||
648 | goto out; | ||
649 | } | ||
650 | |||
651 | timeo = mdev->net_conf->try_connect_int * HZ; | ||
652 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | ||
653 | |||
654 | s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
655 | s_listen->sk->sk_rcvtimeo = timeo; | ||
656 | s_listen->sk->sk_sndtimeo = timeo; | ||
657 | |||
658 | what = "bind before listen"; | ||
659 | err = s_listen->ops->bind(s_listen, | ||
660 | (struct sockaddr *) mdev->net_conf->my_addr, | ||
661 | mdev->net_conf->my_addr_len); | ||
662 | if (err < 0) | ||
663 | goto out; | ||
664 | |||
665 | err = drbd_accept(mdev, &what, s_listen, &s_estab); | ||
666 | |||
667 | out: | ||
668 | if (s_listen) | ||
669 | sock_release(s_listen); | ||
670 | if (err < 0) { | ||
671 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | ||
672 | dev_err(DEV, "%s failed, err = %d\n", what, err); | ||
673 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
674 | } | ||
675 | } | ||
676 | put_net_conf(mdev); | ||
677 | |||
678 | return s_estab; | ||
679 | } | ||
680 | |||
681 | static int drbd_send_fp(struct drbd_conf *mdev, | ||
682 | struct socket *sock, enum drbd_packets cmd) | ||
683 | { | ||
684 | struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; | ||
685 | |||
686 | return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); | ||
687 | } | ||
688 | |||
689 | static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) | ||
690 | { | ||
691 | struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; | ||
692 | int rr; | ||
693 | |||
694 | rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); | ||
695 | |||
696 | if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) | ||
697 | return be16_to_cpu(h->command); | ||
698 | |||
699 | return 0xffff; | ||
700 | } | ||
701 | |||
702 | /** | ||
703 | * drbd_socket_okay() - Free the socket if its connection is not okay | ||
704 | * @mdev: DRBD device. | ||
705 | * @sock: pointer to the pointer to the socket. | ||
706 | */ | ||
707 | static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | ||
708 | { | ||
709 | int rr; | ||
710 | char tb[4]; | ||
711 | |||
712 | if (!*sock) | ||
713 | return FALSE; | ||
714 | |||
715 | rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); | ||
716 | |||
717 | if (rr > 0 || rr == -EAGAIN) { | ||
718 | return TRUE; | ||
719 | } else { | ||
720 | sock_release(*sock); | ||
721 | *sock = NULL; | ||
722 | return FALSE; | ||
723 | } | ||
724 | } | ||
725 | |||
726 | /* | ||
727 | * return values: | ||
728 | * 1 yes, we have a valid connection | ||
729 | * 0 oops, did not work out, please try again | ||
730 | * -1 peer talks different language, | ||
731 | * no point in trying again, please go standalone. | ||
732 | * -2 We do not have a network config... | ||
733 | */ | ||
734 | static int drbd_connect(struct drbd_conf *mdev) | ||
735 | { | ||
736 | struct socket *s, *sock, *msock; | ||
737 | int try, h, ok; | ||
738 | |||
739 | D_ASSERT(!mdev->data.socket); | ||
740 | |||
741 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) | ||
742 | dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); | ||
743 | |||
744 | if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) | ||
745 | return -2; | ||
746 | |||
747 | clear_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
748 | |||
749 | sock = NULL; | ||
750 | msock = NULL; | ||
751 | |||
752 | do { | ||
753 | for (try = 0;;) { | ||
754 | /* 3 tries, this should take less than a second! */ | ||
755 | s = drbd_try_connect(mdev); | ||
756 | if (s || ++try >= 3) | ||
757 | break; | ||
758 | /* give the other side time to call bind() & listen() */ | ||
759 | __set_current_state(TASK_INTERRUPTIBLE); | ||
760 | schedule_timeout(HZ / 10); | ||
761 | } | ||
762 | |||
763 | if (s) { | ||
764 | if (!sock) { | ||
765 | drbd_send_fp(mdev, s, P_HAND_SHAKE_S); | ||
766 | sock = s; | ||
767 | s = NULL; | ||
768 | } else if (!msock) { | ||
769 | drbd_send_fp(mdev, s, P_HAND_SHAKE_M); | ||
770 | msock = s; | ||
771 | s = NULL; | ||
772 | } else { | ||
773 | dev_err(DEV, "Logic error in drbd_connect()\n"); | ||
774 | goto out_release_sockets; | ||
775 | } | ||
776 | } | ||
777 | |||
778 | if (sock && msock) { | ||
779 | __set_current_state(TASK_INTERRUPTIBLE); | ||
780 | schedule_timeout(HZ / 10); | ||
781 | ok = drbd_socket_okay(mdev, &sock); | ||
782 | ok = drbd_socket_okay(mdev, &msock) && ok; | ||
783 | if (ok) | ||
784 | break; | ||
785 | } | ||
786 | |||
787 | retry: | ||
788 | s = drbd_wait_for_connect(mdev); | ||
789 | if (s) { | ||
790 | try = drbd_recv_fp(mdev, s); | ||
791 | drbd_socket_okay(mdev, &sock); | ||
792 | drbd_socket_okay(mdev, &msock); | ||
793 | switch (try) { | ||
794 | case P_HAND_SHAKE_S: | ||
795 | if (sock) { | ||
796 | dev_warn(DEV, "initial packet S crossed\n"); | ||
797 | sock_release(sock); | ||
798 | } | ||
799 | sock = s; | ||
800 | break; | ||
801 | case P_HAND_SHAKE_M: | ||
802 | if (msock) { | ||
803 | dev_warn(DEV, "initial packet M crossed\n"); | ||
804 | sock_release(msock); | ||
805 | } | ||
806 | msock = s; | ||
807 | set_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
808 | break; | ||
809 | default: | ||
810 | dev_warn(DEV, "Error receiving initial packet\n"); | ||
811 | sock_release(s); | ||
812 | if (random32() & 1) | ||
813 | goto retry; | ||
814 | } | ||
815 | } | ||
816 | |||
817 | if (mdev->state.conn <= C_DISCONNECTING) | ||
818 | goto out_release_sockets; | ||
819 | if (signal_pending(current)) { | ||
820 | flush_signals(current); | ||
821 | smp_rmb(); | ||
822 | if (get_t_state(&mdev->receiver) == Exiting) | ||
823 | goto out_release_sockets; | ||
824 | } | ||
825 | |||
826 | if (sock && msock) { | ||
827 | ok = drbd_socket_okay(mdev, &sock); | ||
828 | ok = drbd_socket_okay(mdev, &msock) && ok; | ||
829 | if (ok) | ||
830 | break; | ||
831 | } | ||
832 | } while (1); | ||
833 | |||
834 | msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
835 | sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
836 | |||
837 | sock->sk->sk_allocation = GFP_NOIO; | ||
838 | msock->sk->sk_allocation = GFP_NOIO; | ||
839 | |||
840 | sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | ||
841 | msock->sk->sk_priority = TC_PRIO_INTERACTIVE; | ||
842 | |||
843 | if (mdev->net_conf->sndbuf_size) { | ||
844 | sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; | ||
845 | sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; | ||
846 | } | ||
847 | |||
848 | if (mdev->net_conf->rcvbuf_size) { | ||
849 | sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; | ||
850 | sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; | ||
851 | } | ||
852 | |||
853 | /* NOT YET ... | ||
854 | * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
855 | * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | ||
856 | * first set it to the P_HAND_SHAKE timeout, | ||
857 | * which we set to 4x the configured ping_timeout. */ | ||
858 | sock->sk->sk_sndtimeo = | ||
859 | sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; | ||
860 | |||
861 | msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
862 | msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
863 | |||
864 | /* we don't want delays. | ||
865 | * we use TCP_CORK where appropriate, though */ | ||
866 | drbd_tcp_nodelay(sock); | ||
867 | drbd_tcp_nodelay(msock); | ||
868 | |||
869 | mdev->data.socket = sock; | ||
870 | mdev->meta.socket = msock; | ||
871 | mdev->last_received = jiffies; | ||
872 | |||
873 | D_ASSERT(mdev->asender.task == NULL); | ||
874 | |||
875 | h = drbd_do_handshake(mdev); | ||
876 | if (h <= 0) | ||
877 | return h; | ||
878 | |||
879 | if (mdev->cram_hmac_tfm) { | ||
880 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ | ||
881 | switch (drbd_do_auth(mdev)) { | ||
882 | case -1: | ||
883 | dev_err(DEV, "Authentication of peer failed\n"); | ||
884 | return -1; | ||
885 | case 0: | ||
886 | dev_err(DEV, "Authentication of peer failed, trying again.\n"); | ||
887 | return 0; | ||
888 | } | ||
889 | } | ||
890 | |||
891 | if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) | ||
892 | return 0; | ||
893 | |||
894 | sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
895 | sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | ||
896 | |||
897 | atomic_set(&mdev->packet_seq, 0); | ||
898 | mdev->peer_seq = 0; | ||
899 | |||
900 | drbd_thread_start(&mdev->asender); | ||
901 | |||
902 | if (!drbd_send_protocol(mdev)) | ||
903 | return -1; | ||
904 | drbd_send_sync_param(mdev, &mdev->sync_conf); | ||
905 | drbd_send_sizes(mdev, 0); | ||
906 | drbd_send_uuids(mdev); | ||
907 | drbd_send_state(mdev); | ||
908 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
909 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
910 | |||
911 | return 1; | ||
912 | |||
913 | out_release_sockets: | ||
914 | if (sock) | ||
915 | sock_release(sock); | ||
916 | if (msock) | ||
917 | sock_release(msock); | ||
918 | return -1; | ||
919 | } | ||
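/*
 * Editor's sketch (not part of this patch): how a caller is assumed to
 * react to the drbd_connect() return values documented above.  The name
 * example_connect_loop() and the retry delay are illustrative assumptions;
 * the real receiver thread may structure this loop differently.
 */
static void example_connect_loop(struct drbd_conf *mdev)
{
	int h;

	do {
		h = drbd_connect(mdev);
		if (h == 0) {
			/* transient failure: wait a moment, then retry */
			schedule_timeout_interruptible(HZ);
		}
	} while (h == 0);

	/* h == 1: connected, start receiving packets.
	 * h < 0: incompatible peer or no network config, go standalone. */
}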
920 | |||
921 | static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) | ||
922 | { | ||
923 | int r; | ||
924 | |||
925 | r = drbd_recv(mdev, h, sizeof(*h)); | ||
926 | |||
927 | if (unlikely(r != sizeof(*h))) { | ||
928 | dev_err(DEV, "short read expecting header on sock: r=%d\n", r); | ||
929 | return FALSE; | ||
930 | } | ||
931 | h->command = be16_to_cpu(h->command); | ||
932 | h->length = be16_to_cpu(h->length); | ||
933 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | ||
934 | dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", | ||
935 | (long)be32_to_cpu(h->magic), | ||
936 | h->command, h->length); | ||
937 | return FALSE; | ||
938 | } | ||
939 | mdev->last_received = jiffies; | ||
940 | |||
941 | return TRUE; | ||
942 | } | ||
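/*
 * Editor's sketch (assumption, not from this patch): the fixed-size,
 * big-endian on-the-wire header that drbd_recv_header() above expects.
 * The struct name is made up for illustration; the real definition lives
 * in the DRBD headers.
 */
struct example_p_header {
	u32 magic;	/* must match BE_DRBD_MAGIC */
	u16 command;	/* packet type, byte-swapped with be16_to_cpu() */
	u16 length;	/* payload length in bytes, also byte-swapped */
	u8  payload[0];	/* 'length' bytes of packet-specific data follow */
} __packed;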
943 | |||
944 | static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) | ||
945 | { | ||
946 | int rv; | ||
947 | |||
948 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | ||
949 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); | ||
950 | if (rv) { | ||
951 | dev_err(DEV, "local disk flush failed with status %d\n", rv); | ||
952 | /* would rather check on EOPNOTSUPP, but that is not reliable. | ||
953 | * don't try again for ANY return value != 0 | ||
954 | * if (rv == -EOPNOTSUPP) */ | ||
955 | drbd_bump_write_ordering(mdev, WO_drain_io); | ||
956 | } | ||
957 | put_ldev(mdev); | ||
958 | } | ||
959 | |||
960 | return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); | ||
961 | } | ||
962 | |||
963 | static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
964 | { | ||
965 | struct flush_work *fw = (struct flush_work *)w; | ||
966 | struct drbd_epoch *epoch = fw->epoch; | ||
967 | |||
968 | kfree(w); | ||
969 | |||
970 | if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) | ||
971 | drbd_flush_after_epoch(mdev, epoch); | ||
972 | |||
973 | drbd_may_finish_epoch(mdev, epoch, EV_PUT | | ||
974 | (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); | ||
975 | |||
976 | return 1; | ||
977 | } | ||
978 | |||
979 | /** | ||
980 | * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, possibly finishing it. | ||
981 | * @mdev: DRBD device. | ||
982 | * @epoch: Epoch object. | ||
983 | * @ev: Epoch event. | ||
984 | */ | ||
985 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | ||
986 | struct drbd_epoch *epoch, | ||
987 | enum epoch_event ev) | ||
988 | { | ||
989 | int finish, epoch_size; | ||
990 | struct drbd_epoch *next_epoch; | ||
991 | int schedule_flush = 0; | ||
992 | enum finish_epoch rv = FE_STILL_LIVE; | ||
993 | |||
994 | spin_lock(&mdev->epoch_lock); | ||
995 | do { | ||
996 | next_epoch = NULL; | ||
997 | finish = 0; | ||
998 | |||
999 | epoch_size = atomic_read(&epoch->epoch_size); | ||
1000 | |||
1001 | switch (ev & ~EV_CLEANUP) { | ||
1002 | case EV_PUT: | ||
1003 | atomic_dec(&epoch->active); | ||
1004 | break; | ||
1005 | case EV_GOT_BARRIER_NR: | ||
1006 | set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); | ||
1007 | |||
1008 | /* Special case: If we just switched from WO_bio_barrier to | ||
1009 | WO_bdev_flush we should not finish the current epoch */ | ||
1010 | if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && | ||
1011 | mdev->write_ordering != WO_bio_barrier && | ||
1012 | epoch == mdev->current_epoch) | ||
1013 | clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); | ||
1014 | break; | ||
1015 | case EV_BARRIER_DONE: | ||
1016 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); | ||
1017 | break; | ||
1018 | case EV_BECAME_LAST: | ||
1019 | /* nothing to do */ | ||
1020 | break; | ||
1021 | } | ||
1022 | |||
1023 | if (epoch_size != 0 && | ||
1024 | atomic_read(&epoch->active) == 0 && | ||
1025 | test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && | ||
1026 | epoch->list.prev == &mdev->current_epoch->list && | ||
1027 | !test_bit(DE_IS_FINISHING, &epoch->flags)) { | ||
1028 | /* Nearly all conditions are met to finish that epoch... */ | ||
1029 | if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || | ||
1030 | mdev->write_ordering == WO_none || | ||
1031 | (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || | ||
1032 | ev & EV_CLEANUP) { | ||
1033 | finish = 1; | ||
1034 | set_bit(DE_IS_FINISHING, &epoch->flags); | ||
1035 | } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && | ||
1036 | mdev->write_ordering == WO_bio_barrier) { | ||
1037 | atomic_inc(&epoch->active); | ||
1038 | schedule_flush = 1; | ||
1039 | } | ||
1040 | } | ||
1041 | if (finish) { | ||
1042 | if (!(ev & EV_CLEANUP)) { | ||
1043 | spin_unlock(&mdev->epoch_lock); | ||
1044 | drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); | ||
1045 | spin_lock(&mdev->epoch_lock); | ||
1046 | } | ||
1047 | dec_unacked(mdev); | ||
1048 | |||
1049 | if (mdev->current_epoch != epoch) { | ||
1050 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); | ||
1051 | list_del(&epoch->list); | ||
1052 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); | ||
1053 | mdev->epochs--; | ||
1054 | kfree(epoch); | ||
1055 | |||
1056 | if (rv == FE_STILL_LIVE) | ||
1057 | rv = FE_DESTROYED; | ||
1058 | } else { | ||
1059 | epoch->flags = 0; | ||
1060 | atomic_set(&epoch->epoch_size, 0); | ||
1061 | /* atomic_set(&epoch->active, 0); is already zero */ | ||
1062 | if (rv == FE_STILL_LIVE) | ||
1063 | rv = FE_RECYCLED; | ||
1064 | } | ||
1065 | } | ||
1066 | |||
1067 | if (!next_epoch) | ||
1068 | break; | ||
1069 | |||
1070 | epoch = next_epoch; | ||
1071 | } while (1); | ||
1072 | |||
1073 | spin_unlock(&mdev->epoch_lock); | ||
1074 | |||
1075 | if (schedule_flush) { | ||
1076 | struct flush_work *fw; | ||
1077 | fw = kmalloc(sizeof(*fw), GFP_ATOMIC); | ||
1078 | if (fw) { | ||
1079 | fw->w.cb = w_flush; | ||
1080 | fw->epoch = epoch; | ||
1081 | drbd_queue_work(&mdev->data.work, &fw->w); | ||
1082 | } else { | ||
1083 | dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); | ||
1084 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1085 | /* That is not a recursion, it only goes one level deep */ | ||
1086 | drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); | ||
1087 | drbd_may_finish_epoch(mdev, epoch, EV_PUT); | ||
1088 | } | ||
1089 | } | ||
1090 | |||
1091 | return rv; | ||
1092 | } | ||
1093 | |||
1094 | /** | ||
1095 | * drbd_bump_write_ordering() - Fall back to another write ordering method | ||
1096 | * @mdev: DRBD device. | ||
1097 | * @wo: Write ordering method to try. | ||
1098 | */ | ||
1099 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) | ||
1100 | { | ||
1101 | enum write_ordering_e pwo; | ||
1102 | static char *write_ordering_str[] = { | ||
1103 | [WO_none] = "none", | ||
1104 | [WO_drain_io] = "drain", | ||
1105 | [WO_bdev_flush] = "flush", | ||
1106 | [WO_bio_barrier] = "barrier", | ||
1107 | }; | ||
1108 | |||
1109 | pwo = mdev->write_ordering; | ||
1110 | wo = min(pwo, wo); | ||
1111 | if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) | ||
1112 | wo = WO_bdev_flush; | ||
1113 | if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) | ||
1114 | wo = WO_drain_io; | ||
1115 | if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) | ||
1116 | wo = WO_none; | ||
1117 | mdev->write_ordering = wo; | ||
1118 | if (pwo != mdev->write_ordering || wo == WO_bio_barrier) | ||
1119 | dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); | ||
1120 | } | ||
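/*
 * Editor's note (illustration, not part of the patch): because the new
 * method is clamped with min(pwo, wo) and then degraded further by the
 * no_disk_* options, drbd_bump_write_ordering() can only ever move down
 * the chain
 *
 *	WO_bio_barrier -> WO_bdev_flush -> WO_drain_io -> WO_none
 *
 * e.g. after a failed local flush, drbd_flush_after_epoch() above calls
 *
 *	drbd_bump_write_ordering(mdev, WO_drain_io);
 *
 * and the device stays at WO_drain_io or below from then on; it is never
 * promoted back up to a stricter method.
 */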
1121 | |||
1122 | /** | ||
1123 | * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set | ||
1124 | * @mdev: DRBD device. | ||
1125 | * @w: work object. | ||
1126 | * @cancel: The connection will be closed anyways (unused in this callback) | ||
1127 | */ | ||
1128 | int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) | ||
1129 | { | ||
1130 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1131 | struct bio *bio = e->private_bio; | ||
1132 | |||
1133 | /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, | ||
1134 | (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) | ||
1135 | so that we can finish that epoch in drbd_may_finish_epoch(). | ||
1136 | That is necessary if we already have a long chain of Epochs, before | ||
1137 | we realize that BIO_RW_BARRIER is actually not supported */ | ||
1138 | |||
1139 | /* As long as the -ENOTSUPP on the barrier is reported immediately | ||
1140 | that will never trigger. If it is reported late, we will just | ||
1141 | print that warning and continue correctly for all future requests | ||
1142 | with WO_bdev_flush */ | ||
1143 | if (previous_epoch(mdev, e->epoch)) | ||
1144 | dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); | ||
1145 | |||
1146 | /* prepare bio for re-submit, | ||
1147 | * re-init volatile members */ | ||
1148 | /* we still have a local reference, | ||
1149 | * get_ldev was done in receive_Data. */ | ||
1150 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1151 | bio->bi_sector = e->sector; | ||
1152 | bio->bi_size = e->size; | ||
1153 | bio->bi_idx = 0; | ||
1154 | |||
1155 | bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1156 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1157 | |||
1158 | /* don't know whether this is necessary: */ | ||
1159 | bio->bi_phys_segments = 0; | ||
1160 | bio->bi_next = NULL; | ||
1161 | |||
1162 | /* these should be unchanged: */ | ||
1163 | /* bio->bi_end_io = drbd_endio_write_sec; */ | ||
1164 | /* bio->bi_vcnt = whatever; */ | ||
1165 | |||
1166 | e->w.cb = e_end_block; | ||
1167 | |||
1168 | /* This is no longer a barrier request. */ | ||
1169 | bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); | ||
1170 | |||
1171 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); | ||
1172 | |||
1173 | return 1; | ||
1174 | } | ||
1175 | |||
1176 | static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) | ||
1177 | { | ||
1178 | int rv, issue_flush; | ||
1179 | struct p_barrier *p = (struct p_barrier *)h; | ||
1180 | struct drbd_epoch *epoch; | ||
1181 | |||
1182 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
1183 | |||
1184 | rv = drbd_recv(mdev, h->payload, h->length); | ||
1185 | ERR_IF(rv != h->length) return FALSE; | ||
1186 | |||
1187 | inc_unacked(mdev); | ||
1188 | |||
1189 | if (mdev->net_conf->wire_protocol != DRBD_PROT_C) | ||
1190 | drbd_kick_lo(mdev); | ||
1191 | |||
1192 | mdev->current_epoch->barrier_nr = p->barrier; | ||
1193 | rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); | ||
1194 | |||
1195 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from | ||
1196 | * the activity log, which means it would not be resynced in case the | ||
1197 | * R_PRIMARY crashes now. | ||
1198 | * Therefore we must send the barrier_ack after the barrier request was | ||
1199 | * completed. */ | ||
1200 | switch (mdev->write_ordering) { | ||
1201 | case WO_bio_barrier: | ||
1202 | case WO_none: | ||
1203 | if (rv == FE_RECYCLED) | ||
1204 | return TRUE; | ||
1205 | break; | ||
1206 | |||
1207 | case WO_bdev_flush: | ||
1208 | case WO_drain_io: | ||
1209 | if (rv == FE_STILL_LIVE) { | ||
1210 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); | ||
1211 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1212 | rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); | ||
1213 | } | ||
1214 | if (rv == FE_RECYCLED) | ||
1215 | return TRUE; | ||
1216 | |||
1217 | /* The asender will send all the ACKs and barrier ACKs out, since | ||
1218 | all EEs moved from the active_ee to the done_ee. We need to | ||
1219 | provide a new epoch object for the EEs that come in soon */ | ||
1220 | break; | ||
1221 | } | ||
1222 | |||
1223 | /* receiver context, in the writeout path of the other node. | ||
1224 | * avoid potential distributed deadlock */ | ||
1225 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); | ||
1226 | if (!epoch) { | ||
1227 | dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); | ||
1228 | issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); | ||
1229 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1230 | if (issue_flush) { | ||
1231 | rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); | ||
1232 | if (rv == FE_RECYCLED) | ||
1233 | return TRUE; | ||
1234 | } | ||
1235 | |||
1236 | drbd_wait_ee_list_empty(mdev, &mdev->done_ee); | ||
1237 | |||
1238 | return TRUE; | ||
1239 | } | ||
1240 | |||
1241 | epoch->flags = 0; | ||
1242 | atomic_set(&epoch->epoch_size, 0); | ||
1243 | atomic_set(&epoch->active, 0); | ||
1244 | |||
1245 | spin_lock(&mdev->epoch_lock); | ||
1246 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | ||
1247 | list_add(&epoch->list, &mdev->current_epoch->list); | ||
1248 | mdev->current_epoch = epoch; | ||
1249 | mdev->epochs++; | ||
1250 | } else { | ||
1251 | /* The current_epoch got recycled while we allocated this one... */ | ||
1252 | kfree(epoch); | ||
1253 | } | ||
1254 | spin_unlock(&mdev->epoch_lock); | ||
1255 | |||
1256 | return TRUE; | ||
1257 | } | ||
1258 | |||
1259 | /* used from receive_RSDataReply (recv_resync_read) | ||
1260 | * and from receive_Data */ | ||
1261 | static struct drbd_epoch_entry * | ||
1262 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | ||
1263 | { | ||
1264 | struct drbd_epoch_entry *e; | ||
1265 | struct bio_vec *bvec; | ||
1266 | struct page *page; | ||
1267 | struct bio *bio; | ||
1268 | int dgs, ds, i, rr; | ||
1269 | void *dig_in = mdev->int_dig_in; | ||
1270 | void *dig_vv = mdev->int_dig_vv; | ||
1271 | |||
1272 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1273 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1274 | |||
1275 | if (dgs) { | ||
1276 | rr = drbd_recv(mdev, dig_in, dgs); | ||
1277 | if (rr != dgs) { | ||
1278 | dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", | ||
1279 | rr, dgs); | ||
1280 | return NULL; | ||
1281 | } | ||
1282 | } | ||
1283 | |||
1284 | data_size -= dgs; | ||
1285 | |||
1286 | ERR_IF(data_size & 0x1ff) return NULL; | ||
1287 | ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; | ||
1288 | |||
1289 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | ||
1290 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
1291 | * which in turn might block on the other node at this very place. */ | ||
1292 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | ||
1293 | if (!e) | ||
1294 | return NULL; | ||
1295 | bio = e->private_bio; | ||
1296 | ds = data_size; | ||
1297 | bio_for_each_segment(bvec, bio, i) { | ||
1298 | page = bvec->bv_page; | ||
1299 | rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); | ||
1300 | kunmap(page); | ||
1301 | if (rr != min_t(int, ds, PAGE_SIZE)) { | ||
1302 | drbd_free_ee(mdev, e); | ||
1303 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1304 | rr, min_t(int, ds, PAGE_SIZE)); | ||
1305 | return NULL; | ||
1306 | } | ||
1307 | ds -= rr; | ||
1308 | } | ||
1309 | |||
1310 | if (dgs) { | ||
1311 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | ||
1312 | if (memcmp(dig_in, dig_vv, dgs)) { | ||
1313 | dev_err(DEV, "Digest integrity check FAILED.\n"); | ||
1314 | drbd_bcast_ee(mdev, "digest failed", | ||
1315 | dgs, dig_in, dig_vv, e); | ||
1316 | drbd_free_ee(mdev, e); | ||
1317 | return NULL; | ||
1318 | } | ||
1319 | } | ||
1320 | mdev->recv_cnt += data_size>>9; | ||
1321 | return e; | ||
1322 | } | ||
1323 | |||
1324 | /* drbd_drain_block() just takes a data block | ||
1325 | * out of the socket input buffer, and discards it. | ||
1326 | */ | ||
1327 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | ||
1328 | { | ||
1329 | struct page *page; | ||
1330 | int rr, rv = 1; | ||
1331 | void *data; | ||
1332 | |||
1333 | page = drbd_pp_alloc(mdev, 1); | ||
1334 | |||
1335 | data = kmap(page); | ||
1336 | while (data_size) { | ||
1337 | rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); | ||
1338 | if (rr != min_t(int, data_size, PAGE_SIZE)) { | ||
1339 | rv = 0; | ||
1340 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1341 | rr, min_t(int, data_size, PAGE_SIZE)); | ||
1342 | break; | ||
1343 | } | ||
1344 | data_size -= rr; | ||
1345 | } | ||
1346 | kunmap(page); | ||
1347 | drbd_pp_free(mdev, page); | ||
1348 | return rv; | ||
1349 | } | ||
1350 | |||
1351 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | ||
1352 | sector_t sector, int data_size) | ||
1353 | { | ||
1354 | struct bio_vec *bvec; | ||
1355 | struct bio *bio; | ||
1356 | int dgs, rr, i, expect; | ||
1357 | void *dig_in = mdev->int_dig_in; | ||
1358 | void *dig_vv = mdev->int_dig_vv; | ||
1359 | |||
1360 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1361 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1362 | |||
1363 | if (dgs) { | ||
1364 | rr = drbd_recv(mdev, dig_in, dgs); | ||
1365 | if (rr != dgs) { | ||
1366 | dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", | ||
1367 | rr, dgs); | ||
1368 | return 0; | ||
1369 | } | ||
1370 | } | ||
1371 | |||
1372 | data_size -= dgs; | ||
1373 | |||
1374 | /* optimistically update recv_cnt. if receiving fails below, | ||
1375 | * we disconnect anyways, and counters will be reset. */ | ||
1376 | mdev->recv_cnt += data_size>>9; | ||
1377 | |||
1378 | bio = req->master_bio; | ||
1379 | D_ASSERT(sector == bio->bi_sector); | ||
1380 | |||
1381 | bio_for_each_segment(bvec, bio, i) { | ||
1382 | expect = min_t(int, data_size, bvec->bv_len); | ||
1383 | rr = drbd_recv(mdev, | ||
1384 | kmap(bvec->bv_page)+bvec->bv_offset, | ||
1385 | expect); | ||
1386 | kunmap(bvec->bv_page); | ||
1387 | if (rr != expect) { | ||
1388 | dev_warn(DEV, "short read receiving data reply: " | ||
1389 | "read %d expected %d\n", | ||
1390 | rr, expect); | ||
1391 | return 0; | ||
1392 | } | ||
1393 | data_size -= rr; | ||
1394 | } | ||
1395 | |||
1396 | if (dgs) { | ||
1397 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | ||
1398 | if (memcmp(dig_in, dig_vv, dgs)) { | ||
1399 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | ||
1400 | return 0; | ||
1401 | } | ||
1402 | } | ||
1403 | |||
1404 | D_ASSERT(data_size == 0); | ||
1405 | return 1; | ||
1406 | } | ||
1407 | |||
1408 | /* e_end_resync_block() is called via | ||
1409 | * drbd_process_done_ee() by asender only */ | ||
1410 | static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1411 | { | ||
1412 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1413 | sector_t sector = e->sector; | ||
1414 | int ok; | ||
1415 | |||
1416 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
1417 | |||
1418 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1419 | drbd_set_in_sync(mdev, sector, e->size); | ||
1420 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | ||
1421 | } else { | ||
1422 | /* Record failure to sync */ | ||
1423 | drbd_rs_failed_io(mdev, sector, e->size); | ||
1424 | |||
1425 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | ||
1426 | } | ||
1427 | dec_unacked(mdev); | ||
1428 | |||
1429 | return ok; | ||
1430 | } | ||
1431 | |||
1432 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) | ||
1433 | { | ||
1434 | struct drbd_epoch_entry *e; | ||
1435 | |||
1436 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | ||
1437 | if (!e) { | ||
1438 | put_ldev(mdev); | ||
1439 | return FALSE; | ||
1440 | } | ||
1441 | |||
1442 | dec_rs_pending(mdev); | ||
1443 | |||
1444 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1445 | e->private_bio->bi_rw = WRITE; | ||
1446 | e->w.cb = e_end_resync_block; | ||
1447 | |||
1448 | inc_unacked(mdev); | ||
1449 | /* corresponding dec_unacked() in e_end_resync_block() | ||
1450 | * respective _drbd_clear_done_ee */ | ||
1451 | |||
1452 | spin_lock_irq(&mdev->req_lock); | ||
1453 | list_add(&e->w.list, &mdev->sync_ee); | ||
1454 | spin_unlock_irq(&mdev->req_lock); | ||
1455 | |||
1456 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); | ||
1457 | /* accounting done in endio */ | ||
1458 | |||
1459 | maybe_kick_lo(mdev); | ||
1460 | return TRUE; | ||
1461 | } | ||
1462 | |||
1463 | static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) | ||
1464 | { | ||
1465 | struct drbd_request *req; | ||
1466 | sector_t sector; | ||
1467 | unsigned int header_size, data_size; | ||
1468 | int ok; | ||
1469 | struct p_data *p = (struct p_data *)h; | ||
1470 | |||
1471 | header_size = sizeof(*p) - sizeof(*h); | ||
1472 | data_size = h->length - header_size; | ||
1473 | |||
1474 | ERR_IF(data_size == 0) return FALSE; | ||
1475 | |||
1476 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1477 | return FALSE; | ||
1478 | |||
1479 | sector = be64_to_cpu(p->sector); | ||
1480 | |||
1481 | spin_lock_irq(&mdev->req_lock); | ||
1482 | req = _ar_id_to_req(mdev, p->block_id, sector); | ||
1483 | spin_unlock_irq(&mdev->req_lock); | ||
1484 | if (unlikely(!req)) { | ||
1485 | dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); | ||
1486 | return FALSE; | ||
1487 | } | ||
1488 | |||
1489 | /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid | ||
1490 | * special casing it there for the various failure cases. | ||
1491 | * still no race with drbd_fail_pending_reads */ | ||
1492 | ok = recv_dless_read(mdev, req, sector, data_size); | ||
1493 | |||
1494 | if (ok) | ||
1495 | req_mod(req, data_received); | ||
1496 | /* else: nothing. handled from drbd_disconnect... | ||
1497 | * I don't think we may complete this just yet | ||
1498 | * in case we are "on-disconnect: freeze" */ | ||
1499 | |||
1500 | return ok; | ||
1501 | } | ||
1502 | |||
1503 | static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) | ||
1504 | { | ||
1505 | sector_t sector; | ||
1506 | unsigned int header_size, data_size; | ||
1507 | int ok; | ||
1508 | struct p_data *p = (struct p_data *)h; | ||
1509 | |||
1510 | header_size = sizeof(*p) - sizeof(*h); | ||
1511 | data_size = h->length - header_size; | ||
1512 | |||
1513 | ERR_IF(data_size == 0) return FALSE; | ||
1514 | |||
1515 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1516 | return FALSE; | ||
1517 | |||
1518 | sector = be64_to_cpu(p->sector); | ||
1519 | D_ASSERT(p->block_id == ID_SYNCER); | ||
1520 | |||
1521 | if (get_ldev(mdev)) { | ||
1522 | /* data is submitted to disk within recv_resync_read. | ||
1523 | * corresponding put_ldev done below on error, | ||
1524 | * or in drbd_endio_write_sec. */ | ||
1525 | ok = recv_resync_read(mdev, sector, data_size); | ||
1526 | } else { | ||
1527 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1528 | dev_err(DEV, "Can not write resync data to local disk.\n"); | ||
1529 | |||
1530 | ok = drbd_drain_block(mdev, data_size); | ||
1531 | |||
1532 | drbd_send_ack_dp(mdev, P_NEG_ACK, p); | ||
1533 | } | ||
1534 | |||
1535 | return ok; | ||
1536 | } | ||
1537 | |||
1538 | /* e_end_block() is called via drbd_process_done_ee(). | ||
1539 | * this means this function only runs in the asender thread | ||
1540 | */ | ||
1541 | static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1542 | { | ||
1543 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1544 | sector_t sector = e->sector; | ||
1545 | struct drbd_epoch *epoch; | ||
1546 | int ok = 1, pcmd; | ||
1547 | |||
1548 | if (e->flags & EE_IS_BARRIER) { | ||
1549 | epoch = previous_epoch(mdev, e->epoch); | ||
1550 | if (epoch) | ||
1551 | drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); | ||
1552 | } | ||
1553 | |||
1554 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | ||
1555 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1556 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | ||
1557 | mdev->state.conn <= C_PAUSED_SYNC_T && | ||
1558 | e->flags & EE_MAY_SET_IN_SYNC) ? | ||
1559 | P_RS_WRITE_ACK : P_WRITE_ACK; | ||
1560 | ok &= drbd_send_ack(mdev, pcmd, e); | ||
1561 | if (pcmd == P_RS_WRITE_ACK) | ||
1562 | drbd_set_in_sync(mdev, sector, e->size); | ||
1563 | } else { | ||
1564 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | ||
1565 | /* we expect it to be marked out of sync anyways... | ||
1566 | * maybe assert this? */ | ||
1567 | } | ||
1568 | dec_unacked(mdev); | ||
1569 | } | ||
1570 | /* we delete from the conflict detection hash _after_ we sent out the | ||
1571 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ | ||
1572 | if (mdev->net_conf->two_primaries) { | ||
1573 | spin_lock_irq(&mdev->req_lock); | ||
1574 | D_ASSERT(!hlist_unhashed(&e->colision)); | ||
1575 | hlist_del_init(&e->colision); | ||
1576 | spin_unlock_irq(&mdev->req_lock); | ||
1577 | } else { | ||
1578 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
1579 | } | ||
1580 | |||
1581 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); | ||
1582 | |||
1583 | return ok; | ||
1584 | } | ||
1585 | |||
1586 | static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1587 | { | ||
1588 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1589 | int ok = 1; | ||
1590 | |||
1591 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1592 | ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); | ||
1593 | |||
1594 | spin_lock_irq(&mdev->req_lock); | ||
1595 | D_ASSERT(!hlist_unhashed(&e->colision)); | ||
1596 | hlist_del_init(&e->colision); | ||
1597 | spin_unlock_irq(&mdev->req_lock); | ||
1598 | |||
1599 | dec_unacked(mdev); | ||
1600 | |||
1601 | return ok; | ||
1602 | } | ||
1603 | |||
1604 | /* Called from receive_Data. | ||
1605 | * Synchronize packets on sock with packets on msock. | ||
1606 | * | ||
1607 | * This is here so even when a P_DATA packet traveling via sock overtook an Ack | ||
1608 | * packet traveling on msock, they are still processed in the order they have | ||
1609 | * been sent. | ||
1610 | * | ||
1611 | * Note: we don't care for Ack packets overtaking P_DATA packets. | ||
1612 | * | ||
1613 | * If packet_seq is larger than mdev->peer_seq, there are | ||
1614 | * outstanding packets on the msock. We wait for them to arrive. | ||
1615 | * If this is the logically next packet, we update mdev->peer_seq | ||
1616 | * ourselves. Correctly handles 32bit wrap around. | ||
1617 | * | ||
1618 | * Assume we have a 10 GBit connection, that is about 1<<30 bytes per second, | ||
1619 | * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds | ||
1620 | * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have | ||
1621 | * 1<<11 == 2048 seconds aka ages for the 32bit wrap around... | ||
1622 | * | ||
1623 | * returns 0 if we may process the packet, | ||
1624 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ | ||
1625 | static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) | ||
1626 | { | ||
1627 | DEFINE_WAIT(wait); | ||
1628 | unsigned int p_seq; | ||
1629 | long timeout; | ||
1630 | int ret = 0; | ||
1631 | spin_lock(&mdev->peer_seq_lock); | ||
1632 | for (;;) { | ||
1633 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | ||
1634 | if (seq_le(packet_seq, mdev->peer_seq+1)) | ||
1635 | break; | ||
1636 | if (signal_pending(current)) { | ||
1637 | ret = -ERESTARTSYS; | ||
1638 | break; | ||
1639 | } | ||
1640 | p_seq = mdev->peer_seq; | ||
1641 | spin_unlock(&mdev->peer_seq_lock); | ||
1642 | timeout = schedule_timeout(30*HZ); | ||
1643 | spin_lock(&mdev->peer_seq_lock); | ||
1644 | if (timeout == 0 && p_seq == mdev->peer_seq) { | ||
1645 | ret = -ETIMEDOUT; | ||
1646 | dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); | ||
1647 | break; | ||
1648 | } | ||
1649 | } | ||
1650 | finish_wait(&mdev->seq_wait, &wait); | ||
1651 | if (mdev->peer_seq+1 == packet_seq) | ||
1652 | mdev->peer_seq++; | ||
1653 | spin_unlock(&mdev->peer_seq_lock); | ||
1654 | return ret; | ||
1655 | } | ||
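/*
 * Editor's sketch (assumption, not this driver's code): a wrap-around
 * safe "less or equal" on 32-bit sequence numbers, as relied upon by the
 * seq_le() use in drbd_wait_peer_seq() above.  Interpreting the unsigned
 * difference as signed orders any two values that are less than 2^31
 * apart, so the counter may wrap without confusing old and new.
 */
static inline int example_seq_le(u32 a, u32 b)
{
	return (s32)(a - b) <= 0;
}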
1656 | |||
1657 | /* mirrored write */ | ||
1658 | static int receive_Data(struct drbd_conf *mdev, struct p_header *h) | ||
1659 | { | ||
1660 | sector_t sector; | ||
1661 | struct drbd_epoch_entry *e; | ||
1662 | struct p_data *p = (struct p_data *)h; | ||
1663 | int header_size, data_size; | ||
1664 | int rw = WRITE; | ||
1665 | u32 dp_flags; | ||
1666 | |||
1667 | header_size = sizeof(*p) - sizeof(*h); | ||
1668 | data_size = h->length - header_size; | ||
1669 | |||
1670 | ERR_IF(data_size == 0) return FALSE; | ||
1671 | |||
1672 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1673 | return FALSE; | ||
1674 | |||
1675 | if (!get_ldev(mdev)) { | ||
1676 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1677 | dev_err(DEV, "Can not write mirrored data block " | ||
1678 | "to local disk.\n"); | ||
1679 | spin_lock(&mdev->peer_seq_lock); | ||
1680 | if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) | ||
1681 | mdev->peer_seq++; | ||
1682 | spin_unlock(&mdev->peer_seq_lock); | ||
1683 | |||
1684 | drbd_send_ack_dp(mdev, P_NEG_ACK, p); | ||
1685 | atomic_inc(&mdev->current_epoch->epoch_size); | ||
1686 | return drbd_drain_block(mdev, data_size); | ||
1687 | } | ||
1688 | |||
1689 | /* get_ldev(mdev) successful. | ||
1690 | * Corresponding put_ldev done either below (on various errors), | ||
1691 | * or in drbd_endio_write_sec, if we successfully submit the data at | ||
1692 | * the end of this function. */ | ||
1693 | |||
1694 | sector = be64_to_cpu(p->sector); | ||
1695 | e = read_in_block(mdev, p->block_id, sector, data_size); | ||
1696 | if (!e) { | ||
1697 | put_ldev(mdev); | ||
1698 | return FALSE; | ||
1699 | } | ||
1700 | |||
1701 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1702 | e->w.cb = e_end_block; | ||
1703 | |||
1704 | spin_lock(&mdev->epoch_lock); | ||
1705 | e->epoch = mdev->current_epoch; | ||
1706 | atomic_inc(&e->epoch->epoch_size); | ||
1707 | atomic_inc(&e->epoch->active); | ||
1708 | |||
1709 | if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { | ||
1710 | struct drbd_epoch *epoch; | ||
1711 | /* Issue a barrier if we start a new epoch, and the previous epoch | ||
1712 | was not an epoch containing a single request which already was | ||
1713 | a Barrier. */ | ||
1714 | epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); | ||
1715 | if (epoch == e->epoch) { | ||
1716 | set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); | ||
1717 | rw |= (1<<BIO_RW_BARRIER); | ||
1718 | e->flags |= EE_IS_BARRIER; | ||
1719 | } else { | ||
1720 | if (atomic_read(&epoch->epoch_size) > 1 || | ||
1721 | !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { | ||
1722 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1723 | set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); | ||
1724 | rw |= (1<<BIO_RW_BARRIER); | ||
1725 | e->flags |= EE_IS_BARRIER; | ||
1726 | } | ||
1727 | } | ||
1728 | } | ||
1729 | spin_unlock(&mdev->epoch_lock); | ||
1730 | |||
1731 | dp_flags = be32_to_cpu(p->dp_flags); | ||
1732 | if (dp_flags & DP_HARDBARRIER) { | ||
1733 | dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); | ||
1734 | /* rw |= (1<<BIO_RW_BARRIER); */ | ||
1735 | } | ||
1736 | if (dp_flags & DP_RW_SYNC) | ||
1737 | rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); | ||
1738 | if (dp_flags & DP_MAY_SET_IN_SYNC) | ||
1739 | e->flags |= EE_MAY_SET_IN_SYNC; | ||
1740 | |||
1741 | /* I'm the receiver, I do hold a net_cnt reference. */ | ||
1742 | if (!mdev->net_conf->two_primaries) { | ||
1743 | spin_lock_irq(&mdev->req_lock); | ||
1744 | } else { | ||
1745 | /* don't get the req_lock yet, | ||
1746 | * we may sleep in drbd_wait_peer_seq */ | ||
1747 | const int size = e->size; | ||
1748 | const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1749 | DEFINE_WAIT(wait); | ||
1750 | struct drbd_request *i; | ||
1751 | struct hlist_node *n; | ||
1752 | struct hlist_head *slot; | ||
1753 | int first; | ||
1754 | |||
1755 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1756 | BUG_ON(mdev->ee_hash == NULL); | ||
1757 | BUG_ON(mdev->tl_hash == NULL); | ||
1758 | |||
1759 | /* conflict detection and handling: | ||
1760 | * 1. wait on the sequence number, | ||
1761 | * in case this data packet overtook ACK packets. | ||
1762 | * 2. check our hash tables for conflicting requests. | ||
1763 | * we only need to walk the tl_hash, since an ee cannot | ||
1764 | * have a conflict with another ee: on the submitting | ||
1765 | * node, the corresponding req had already been conflicting, | ||
1766 | * and a conflicting req is never sent. | ||
1767 | * | ||
1768 | * Note: for two_primaries, we are protocol C, | ||
1769 | * so there cannot be any request that is DONE | ||
1770 | * but still on the transfer log. | ||
1771 | * | ||
1772 | * unconditionally add to the ee_hash. | ||
1773 | * | ||
1774 | * if no conflicting request is found: | ||
1775 | * submit. | ||
1776 | * | ||
1777 | * if any conflicting request is found | ||
1778 | * that has not yet been acked, | ||
1779 | * AND I have the "discard concurrent writes" flag: | ||
1780 | * queue (via done_ee) the P_DISCARD_ACK; OUT. | ||
1781 | * | ||
1782 | * if any conflicting request is found: | ||
1783 | * block the receiver, waiting on misc_wait | ||
1784 | * until no more conflicting requests are there, | ||
1785 | * or we get interrupted (disconnect). | ||
1786 | * | ||
1787 | * we do not just write after local io completion of those | ||
1788 | * requests, but only after req is done completely, i.e. | ||
1789 | * we wait for the P_DISCARD_ACK to arrive! | ||
1790 | * | ||
1791 | * then proceed normally, i.e. submit. | ||
1792 | */ | ||
1793 | if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) | ||
1794 | goto out_interrupted; | ||
1795 | |||
1796 | spin_lock_irq(&mdev->req_lock); | ||
1797 | |||
1798 | hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); | ||
1799 | |||
1800 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
1801 | slot = tl_hash_slot(mdev, sector); | ||
1802 | first = 1; | ||
1803 | for (;;) { | ||
1804 | int have_unacked = 0; | ||
1805 | int have_conflict = 0; | ||
1806 | prepare_to_wait(&mdev->misc_wait, &wait, | ||
1807 | TASK_INTERRUPTIBLE); | ||
1808 | hlist_for_each_entry(i, n, slot, colision) { | ||
1809 | if (OVERLAPS) { | ||
1810 | /* only ALERT on first iteration, | ||
1811 | * we may be woken up early... */ | ||
1812 | if (first) | ||
1813 | dev_alert(DEV, "%s[%u] Concurrent local write detected!" | ||
1814 | " new: %llus +%u; pending: %llus +%u\n", | ||
1815 | current->comm, current->pid, | ||
1816 | (unsigned long long)sector, size, | ||
1817 | (unsigned long long)i->sector, i->size); | ||
1818 | if (i->rq_state & RQ_NET_PENDING) | ||
1819 | ++have_unacked; | ||
1820 | ++have_conflict; | ||
1821 | } | ||
1822 | } | ||
1823 | #undef OVERLAPS | ||
1824 | if (!have_conflict) | ||
1825 | break; | ||
1826 | |||
1827 | /* Discard Ack only for the _first_ iteration */ | ||
1828 | if (first && discard && have_unacked) { | ||
1829 | dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", | ||
1830 | (unsigned long long)sector); | ||
1831 | inc_unacked(mdev); | ||
1832 | e->w.cb = e_send_discard_ack; | ||
1833 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
1834 | |||
1835 | spin_unlock_irq(&mdev->req_lock); | ||
1836 | |||
1837 | /* we could probably send that P_DISCARD_ACK ourselves, | ||
1838 | * but I don't like the receiver using the msock */ | ||
1839 | |||
1840 | put_ldev(mdev); | ||
1841 | wake_asender(mdev); | ||
1842 | finish_wait(&mdev->misc_wait, &wait); | ||
1843 | return TRUE; | ||
1844 | } | ||
1845 | |||
1846 | if (signal_pending(current)) { | ||
1847 | hlist_del_init(&e->colision); | ||
1848 | |||
1849 | spin_unlock_irq(&mdev->req_lock); | ||
1850 | |||
1851 | finish_wait(&mdev->misc_wait, &wait); | ||
1852 | goto out_interrupted; | ||
1853 | } | ||
1854 | |||
1855 | spin_unlock_irq(&mdev->req_lock); | ||
1856 | if (first) { | ||
1857 | first = 0; | ||
1858 | dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " | ||
1859 | "sec=%llus\n", (unsigned long long)sector); | ||
1860 | } else if (discard) { | ||
1861 | /* we had none on the first iteration. | ||
1862 | * there must be none now. */ | ||
1863 | D_ASSERT(have_unacked == 0); | ||
1864 | } | ||
1865 | schedule(); | ||
1866 | spin_lock_irq(&mdev->req_lock); | ||
1867 | } | ||
1868 | finish_wait(&mdev->misc_wait, &wait); | ||
1869 | } | ||
1870 | |||
1871 | list_add(&e->w.list, &mdev->active_ee); | ||
1872 | spin_unlock_irq(&mdev->req_lock); | ||
1873 | |||
1874 | switch (mdev->net_conf->wire_protocol) { | ||
1875 | case DRBD_PROT_C: | ||
1876 | inc_unacked(mdev); | ||
1877 | /* corresponding dec_unacked() in e_end_block() | ||
1878 | * respective _drbd_clear_done_ee */ | ||
1879 | break; | ||
1880 | case DRBD_PROT_B: | ||
1881 | /* I really don't like it that the receiver thread | ||
1882 | * sends on the msock, but anyways */ | ||
1883 | drbd_send_ack(mdev, P_RECV_ACK, e); | ||
1884 | break; | ||
1885 | case DRBD_PROT_A: | ||
1886 | /* nothing to do */ | ||
1887 | break; | ||
1888 | } | ||
1889 | |||
1890 | if (mdev->state.pdsk == D_DISKLESS) { | ||
1891 | /* In case we have the only disk of the cluster, mark this block out of sync and cover it in the activity log */ | ||
1892 | drbd_set_out_of_sync(mdev, e->sector, e->size); | ||
1893 | e->flags |= EE_CALL_AL_COMPLETE_IO; | ||
1894 | drbd_al_begin_io(mdev, e->sector); | ||
1895 | } | ||
1896 | |||
1897 | e->private_bio->bi_rw = rw; | ||
1898 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); | ||
1899 | /* accounting done in endio */ | ||
1900 | |||
1901 | maybe_kick_lo(mdev); | ||
1902 | return TRUE; | ||
1903 | |||
1904 | out_interrupted: | ||
1905 | /* yes, the epoch_size now is imbalanced. | ||
1906 | * but we drop the connection anyways, so we don't have a chance to | ||
1907 | * receive a barrier... atomic_inc(&mdev->epoch_size); */ | ||
1908 | put_ldev(mdev); | ||
1909 | drbd_free_ee(mdev, e); | ||
1910 | return FALSE; | ||
1911 | } | ||
1912 | |||
1913 | static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) | ||
1914 | { | ||
1915 | sector_t sector; | ||
1916 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
1917 | struct drbd_epoch_entry *e; | ||
1918 | struct digest_info *di = NULL; | ||
1919 | int size, digest_size; | ||
1920 | unsigned int fault_type; | ||
1921 | struct p_block_req *p = | ||
1922 | (struct p_block_req *)h; | ||
1923 | const int brps = sizeof(*p)-sizeof(*h); | ||
1924 | |||
1925 | if (drbd_recv(mdev, h->payload, brps) != brps) | ||
1926 | return FALSE; | ||
1927 | |||
1928 | sector = be64_to_cpu(p->sector); | ||
1929 | size = be32_to_cpu(p->blksize); | ||
1930 | |||
1931 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1932 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | ||
1933 | (unsigned long long)sector, size); | ||
1934 | return FALSE; | ||
1935 | } | ||
1936 | if (sector + (size>>9) > capacity) { | ||
1937 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | ||
1938 | (unsigned long long)sector, size); | ||
1939 | return FALSE; | ||
1940 | } | ||
1941 | |||
1942 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { | ||
1943 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1944 | dev_err(DEV, "Can not satisfy peer's read request, " | ||
1945 | "no local data.\n"); | ||
1946 | drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : | ||
1947 | P_NEG_RS_DREPLY , p); | ||
1948 | return TRUE; | ||
1949 | } | ||
1950 | |||
1951 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | ||
1952 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
1953 | * which in turn might block on the other node at this very place. */ | ||
1954 | e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); | ||
1955 | if (!e) { | ||
1956 | put_ldev(mdev); | ||
1957 | return FALSE; | ||
1958 | } | ||
1959 | |||
1960 | e->private_bio->bi_rw = READ; | ||
1961 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
1962 | |||
1963 | switch (h->command) { | ||
1964 | case P_DATA_REQUEST: | ||
1965 | e->w.cb = w_e_end_data_req; | ||
1966 | fault_type = DRBD_FAULT_DT_RD; | ||
1967 | break; | ||
1968 | case P_RS_DATA_REQUEST: | ||
1969 | e->w.cb = w_e_end_rsdata_req; | ||
1970 | fault_type = DRBD_FAULT_RS_RD; | ||
1971 | /* Eventually this should become asynchronous. Currently it | ||
1972 | * blocks the whole receiver just to delay the reading of a | ||
1973 | * resync data block. | ||
1974 | * the drbd_work_queue mechanism is made for this... | ||
1975 | */ | ||
1976 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
1977 | /* we have been interrupted, | ||
1978 | * probably connection lost! */ | ||
1979 | D_ASSERT(signal_pending(current)); | ||
1980 | goto out_free_e; | ||
1981 | } | ||
1982 | break; | ||
1983 | |||
1984 | case P_OV_REPLY: | ||
1985 | case P_CSUM_RS_REQUEST: | ||
1986 | fault_type = DRBD_FAULT_RS_RD; | ||
1987 | digest_size = h->length - brps ; | ||
1988 | di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); | ||
1989 | if (!di) | ||
1990 | goto out_free_e; | ||
1991 | |||
1992 | di->digest_size = digest_size; | ||
1993 | di->digest = (((char *)di)+sizeof(struct digest_info)); | ||
1994 | |||
1995 | if (drbd_recv(mdev, di->digest, digest_size) != digest_size) | ||
1996 | goto out_free_e; | ||
1997 | |||
1998 | e->block_id = (u64)(unsigned long)di; | ||
1999 | if (h->command == P_CSUM_RS_REQUEST) { | ||
2000 | D_ASSERT(mdev->agreed_pro_version >= 89); | ||
2001 | e->w.cb = w_e_end_csum_rs_req; | ||
2002 | } else if (h->command == P_OV_REPLY) { | ||
2003 | e->w.cb = w_e_end_ov_reply; | ||
2004 | dec_rs_pending(mdev); | ||
2005 | break; | ||
2006 | } | ||
2007 | |||
2008 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
2009 | /* we have been interrupted, probably connection lost! */ | ||
2010 | D_ASSERT(signal_pending(current)); | ||
2011 | goto out_free_e; | ||
2012 | } | ||
2013 | break; | ||
2014 | |||
2015 | case P_OV_REQUEST: | ||
2016 | if (mdev->state.conn >= C_CONNECTED && | ||
2017 | mdev->state.conn != C_VERIFY_T) | ||
2018 | dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", | ||
2019 | drbd_conn_str(mdev->state.conn)); | ||
2020 | if (mdev->ov_start_sector == ~(sector_t)0 && | ||
2021 | mdev->agreed_pro_version >= 90) { | ||
2022 | mdev->ov_start_sector = sector; | ||
2023 | mdev->ov_position = sector; | ||
2024 | mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); | ||
2025 | dev_info(DEV, "Online Verify start sector: %llu\n", | ||
2026 | (unsigned long long)sector); | ||
2027 | } | ||
2028 | e->w.cb = w_e_end_ov_req; | ||
2029 | fault_type = DRBD_FAULT_RS_RD; | ||
2030 | /* Eventually this should become asynchronous. Currently it | ||
2031 | * blocks the whole receiver just to delay the reading of a | ||
2032 | * resync data block. | ||
2033 | * the drbd_work_queue mechanism is made for this... | ||
2034 | */ | ||
2035 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
2036 | /* we have been interrupted, | ||
2037 | * probably connection lost! */ | ||
2038 | D_ASSERT(signal_pending(current)); | ||
2039 | goto out_free_e; | ||
2040 | } | ||
2041 | break; | ||
2042 | |||
2043 | |||
2044 | default: | ||
2045 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | ||
2046 | cmdname(h->command)); | ||
2047 | fault_type = DRBD_FAULT_MAX; | ||
2048 | } | ||
2049 | |||
2050 | spin_lock_irq(&mdev->req_lock); | ||
2051 | list_add(&e->w.list, &mdev->read_ee); | ||
2052 | spin_unlock_irq(&mdev->req_lock); | ||
2053 | |||
2054 | inc_unacked(mdev); | ||
2055 | |||
2056 | drbd_generic_make_request(mdev, fault_type, e->private_bio); | ||
2057 | maybe_kick_lo(mdev); | ||
2058 | |||
2059 | return TRUE; | ||
2060 | |||
2061 | out_free_e: | ||
2062 | kfree(di); | ||
2063 | put_ldev(mdev); | ||
2064 | drbd_free_ee(mdev, e); | ||
2065 | return FALSE; | ||
2066 | } | ||
2067 | |||
2068 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | ||
2069 | { | ||
2070 | int self, peer, rv = -100; | ||
2071 | unsigned long ch_self, ch_peer; | ||
2072 | |||
2073 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2074 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2075 | |||
2076 | ch_peer = mdev->p_uuid[UI_SIZE]; | ||
2077 | ch_self = mdev->comm_bm_set; | ||
2078 | |||
2079 | switch (mdev->net_conf->after_sb_0p) { | ||
2080 | case ASB_CONSENSUS: | ||
2081 | case ASB_DISCARD_SECONDARY: | ||
2082 | case ASB_CALL_HELPER: | ||
2083 | dev_err(DEV, "Configuration error.\n"); | ||
2084 | break; | ||
2085 | case ASB_DISCONNECT: | ||
2086 | break; | ||
2087 | case ASB_DISCARD_YOUNGER_PRI: | ||
2088 | if (self == 0 && peer == 1) { | ||
2089 | rv = -1; | ||
2090 | break; | ||
2091 | } | ||
2092 | if (self == 1 && peer == 0) { | ||
2093 | rv = 1; | ||
2094 | break; | ||
2095 | } | ||
2096 | /* Else fall through to one of the other strategies... */ | ||
2097 | case ASB_DISCARD_OLDER_PRI: | ||
2098 | if (self == 0 && peer == 1) { | ||
2099 | rv = 1; | ||
2100 | break; | ||
2101 | } | ||
2102 | if (self == 1 && peer == 0) { | ||
2103 | rv = -1; | ||
2104 | break; | ||
2105 | } | ||
2106 | /* Else fall through to one of the other strategies... */ | ||
2107 | dev_warn(DEV, "Discard younger/older primary did not find a decision\n" | ||
2108 | "Using discard-least-changes instead\n"); | ||
2109 | case ASB_DISCARD_ZERO_CHG: | ||
2110 | if (ch_peer == 0 && ch_self == 0) { | ||
2111 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | ||
2112 | ? -1 : 1; | ||
2113 | break; | ||
2114 | } else { | ||
2115 | if (ch_peer == 0) { rv = 1; break; } | ||
2116 | if (ch_self == 0) { rv = -1; break; } | ||
2117 | } | ||
2118 | if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) | ||
2119 | break; | ||
2120 | case ASB_DISCARD_LEAST_CHG: | ||
2121 | if (ch_self < ch_peer) | ||
2122 | rv = -1; | ||
2123 | else if (ch_self > ch_peer) | ||
2124 | rv = 1; | ||
2125 | else /* ( ch_self == ch_peer ) */ | ||
2126 | /* Well, then use something else. */ | ||
2127 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | ||
2128 | ? -1 : 1; | ||
2129 | break; | ||
2130 | case ASB_DISCARD_LOCAL: | ||
2131 | rv = -1; | ||
2132 | break; | ||
2133 | case ASB_DISCARD_REMOTE: | ||
2134 | rv = 1; | ||
2135 | } | ||
2136 | |||
2137 | return rv; | ||
2138 | } | ||
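/*
 * Editor's note (interpretation, not part of the patch): the return value
 * of drbd_asb_recover_0p() follows the same convention as the legend of
 * drbd_uuid_compare() further below:
 *
 *	rv ==  1    keep the local data, discard the peer's
 *	            (this node becomes the sync source)
 *	rv == -1    discard the local data, take the peer's
 *	            (this node becomes the sync target)
 *	rv == -100  no automatic decision possible; handle the
 *	            split brain by other means (e.g. disconnect)
 */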
2139 | |||
2140 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | ||
2141 | { | ||
2142 | int self, peer, hg, rv = -100; | ||
2143 | |||
2144 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2145 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2146 | |||
2147 | switch (mdev->net_conf->after_sb_1p) { | ||
2148 | case ASB_DISCARD_YOUNGER_PRI: | ||
2149 | case ASB_DISCARD_OLDER_PRI: | ||
2150 | case ASB_DISCARD_LEAST_CHG: | ||
2151 | case ASB_DISCARD_LOCAL: | ||
2152 | case ASB_DISCARD_REMOTE: | ||
2153 | dev_err(DEV, "Configuration error.\n"); | ||
2154 | break; | ||
2155 | case ASB_DISCONNECT: | ||
2156 | break; | ||
2157 | case ASB_CONSENSUS: | ||
2158 | hg = drbd_asb_recover_0p(mdev); | ||
2159 | if (hg == -1 && mdev->state.role == R_SECONDARY) | ||
2160 | rv = hg; | ||
2161 | if (hg == 1 && mdev->state.role == R_PRIMARY) | ||
2162 | rv = hg; | ||
2163 | break; | ||
2164 | case ASB_VIOLENTLY: | ||
2165 | rv = drbd_asb_recover_0p(mdev); | ||
2166 | break; | ||
2167 | case ASB_DISCARD_SECONDARY: | ||
2168 | return mdev->state.role == R_PRIMARY ? 1 : -1; | ||
2169 | case ASB_CALL_HELPER: | ||
2170 | hg = drbd_asb_recover_0p(mdev); | ||
2171 | if (hg == -1 && mdev->state.role == R_PRIMARY) { | ||
2172 | self = drbd_set_role(mdev, R_SECONDARY, 0); | ||
2173 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | ||
2174 | * we might be here in C_WF_REPORT_PARAMS which is transient. | ||
2175 | * we do not need to wait for the after state change work either. */ | ||
2176 | self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); | ||
2177 | if (self != SS_SUCCESS) { | ||
2178 | drbd_khelper(mdev, "pri-lost-after-sb"); | ||
2179 | } else { | ||
2180 | dev_warn(DEV, "Successfully gave up primary role.\n"); | ||
2181 | rv = hg; | ||
2182 | } | ||
2183 | } else | ||
2184 | rv = hg; | ||
2185 | } | ||
2186 | |||
2187 | return rv; | ||
2188 | } | ||
2189 | |||
2190 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | ||
2191 | { | ||
2192 | int self, peer, hg, rv = -100; | ||
2193 | |||
2194 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2195 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2196 | |||
2197 | switch (mdev->net_conf->after_sb_2p) { | ||
2198 | case ASB_DISCARD_YOUNGER_PRI: | ||
2199 | case ASB_DISCARD_OLDER_PRI: | ||
2200 | case ASB_DISCARD_LEAST_CHG: | ||
2201 | case ASB_DISCARD_LOCAL: | ||
2202 | case ASB_DISCARD_REMOTE: | ||
2203 | case ASB_CONSENSUS: | ||
2204 | case ASB_DISCARD_SECONDARY: | ||
2205 | dev_err(DEV, "Configuration error.\n"); | ||
2206 | break; | ||
2207 | case ASB_VIOLENTLY: | ||
2208 | rv = drbd_asb_recover_0p(mdev); | ||
2209 | break; | ||
2210 | case ASB_DISCONNECT: | ||
2211 | break; | ||
2212 | case ASB_CALL_HELPER: | ||
2213 | hg = drbd_asb_recover_0p(mdev); | ||
2214 | if (hg == -1) { | ||
2215 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | ||
2216 | * we might be here in C_WF_REPORT_PARAMS which is transient. | ||
2217 | * we do not need to wait for the after state change work either. */ | ||
2218 | self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); | ||
2219 | if (self != SS_SUCCESS) { | ||
2220 | drbd_khelper(mdev, "pri-lost-after-sb"); | ||
2221 | } else { | ||
2222 | dev_warn(DEV, "Successfully gave up primary role.\n"); | ||
2223 | rv = hg; | ||
2224 | } | ||
2225 | } else | ||
2226 | rv = hg; | ||
2227 | } | ||
2228 | |||
2229 | return rv; | ||
2230 | } | ||
2231 | |||
2232 | static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid, | ||
2233 | u64 bits, u64 flags) | ||
2234 | { | ||
2235 | if (!uuid) { | ||
2236 | dev_info(DEV, "%s uuid info vanished while I was looking!\n", text); | ||
2237 | return; | ||
2238 | } | ||
2239 | dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", | ||
2240 | text, | ||
2241 | (unsigned long long)uuid[UI_CURRENT], | ||
2242 | (unsigned long long)uuid[UI_BITMAP], | ||
2243 | (unsigned long long)uuid[UI_HISTORY_START], | ||
2244 | (unsigned long long)uuid[UI_HISTORY_END], | ||
2245 | (unsigned long long)bits, | ||
2246 | (unsigned long long)flags); | ||
2247 | } | ||
2248 | |||
2249 | /* | ||
2250 | 100 after split brain try auto recover | ||
2251 | 2 C_SYNC_SOURCE set BitMap | ||
2252 | 1 C_SYNC_SOURCE use BitMap | ||
2253 | 0 no Sync | ||
2254 | -1 C_SYNC_TARGET use BitMap | ||
2255 | -2 C_SYNC_TARGET set BitMap | ||
2256 | -100 after split brain, disconnect | ||
2257 | -1000 unrelated data | ||
2258 | */ | ||
2259 | static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) | ||
2260 | { | ||
2261 | u64 self, peer; | ||
2262 | int i, j; | ||
2263 | |||
2264 | self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); | ||
2265 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2266 | |||
2267 | *rule_nr = 10; | ||
2268 | if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) | ||
2269 | return 0; | ||
2270 | |||
2271 | *rule_nr = 20; | ||
2272 | if ((self == UUID_JUST_CREATED || self == (u64)0) && | ||
2273 | peer != UUID_JUST_CREATED) | ||
2274 | return -2; | ||
2275 | |||
2276 | *rule_nr = 30; | ||
2277 | if (self != UUID_JUST_CREATED && | ||
2278 | (peer == UUID_JUST_CREATED || peer == (u64)0)) | ||
2279 | return 2; | ||
2280 | |||
2281 | if (self == peer) { | ||
2282 | int rct, dc; /* roles at crash time */ | ||
2283 | |||
2284 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { | ||
2285 | |||
2286 | if (mdev->agreed_pro_version < 91) | ||
2287 | return -1001; | ||
2288 | |||
2289 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && | ||
2290 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { | ||
2291 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); | ||
2292 | drbd_uuid_set_bm(mdev, 0UL); | ||
2293 | |||
2294 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | ||
2295 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | ||
2296 | *rule_nr = 34; | ||
2297 | } else { | ||
2298 | dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n"); | ||
2299 | *rule_nr = 36; | ||
2300 | } | ||
2301 | |||
2302 | return 1; | ||
2303 | } | ||
2304 | |||
2305 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { | ||
2306 | |||
2307 | if (mdev->agreed_pro_version < 91) | ||
2308 | return -1001; | ||
2309 | |||
2310 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && | ||
2311 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { | ||
2312 | dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); | ||
2313 | |||
2314 | mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START]; | ||
2315 | mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP]; | ||
2316 | mdev->p_uuid[UI_BITMAP] = 0UL; | ||
2317 | |||
2318 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | ||
2319 | *rule_nr = 35; | ||
2320 | } else { | ||
2321 | dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n"); | ||
2322 | *rule_nr = 37; | ||
2323 | } | ||
2324 | |||
2325 | return -1; | ||
2326 | } | ||
2327 | |||
2328 | /* Common power [off|failure] */ | ||
2329 | rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + | ||
2330 | (mdev->p_uuid[UI_FLAGS] & 2); | ||
2331 | /* lowest bit is set when we were primary, | ||
2332 | * next bit (weight 2) is set when peer was primary */ | ||
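2332 | 		/* e.g. rct == 3 means both nodes were primary at crash time; in that | ||
2332 | 		 * case, if DISCARD_CONCURRENT is set on this node we become sync | ||
2332 | 		 * target (-1), otherwise sync source (1), see case 3 below */ | ||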
2333 | *rule_nr = 40; | ||
2334 | |||
2335 | switch (rct) { | ||
2336 | case 0: /* !self_pri && !peer_pri */ return 0; | ||
2337 | case 1: /* self_pri && !peer_pri */ return 1; | ||
2338 | case 2: /* !self_pri && peer_pri */ return -1; | ||
2339 | case 3: /* self_pri && peer_pri */ | ||
2340 | dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
2341 | return dc ? -1 : 1; | ||
2342 | } | ||
2343 | } | ||
2344 | |||
2345 | *rule_nr = 50; | ||
2346 | peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); | ||
2347 | if (self == peer) | ||
2348 | return -1; | ||
2349 | |||
2350 | *rule_nr = 51; | ||
2351 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | ||
2352 | if (self == peer) { | ||
2353 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | ||
2354 | peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); | ||
2355 | if (self == peer) { | ||
2356 | 			/* The last P_SYNC_UUID did not get through. Undo the modifications | ||
2357 | 			   the peer made to its UUIDs when it last started a resync as sync source. */ | ||
2358 | |||
2359 | if (mdev->agreed_pro_version < 91) | ||
2360 | return -1001; | ||
2361 | |||
2362 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; | ||
2363 | mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; | ||
2364 | return -1; | ||
2365 | } | ||
2366 | } | ||
2367 | |||
2368 | *rule_nr = 60; | ||
2369 | self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); | ||
2370 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2371 | peer = mdev->p_uuid[i] & ~((u64)1); | ||
2372 | if (self == peer) | ||
2373 | return -2; | ||
2374 | } | ||
2375 | |||
2376 | *rule_nr = 70; | ||
2377 | self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); | ||
2378 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2379 | if (self == peer) | ||
2380 | return 1; | ||
2381 | |||
2382 | *rule_nr = 71; | ||
2383 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | ||
2384 | if (self == peer) { | ||
2385 | self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); | ||
2386 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | ||
2387 | if (self == peer) { | ||
2388 | 			/* The last P_SYNC_UUID did not get through. Undo the modifications | ||
2389 | 			   we made to our own UUIDs when we last started a resync as sync source. */ | ||
2390 | |||
2391 | if (mdev->agreed_pro_version < 91) | ||
2392 | return -1001; | ||
2393 | |||
2394 | _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); | ||
2395 | _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); | ||
2396 | |||
2397 | dev_info(DEV, "Undid last start of resync:\n"); | ||
2398 | |||
2399 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | ||
2400 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | ||
2401 | |||
2402 | return 1; | ||
2403 | } | ||
2404 | } | ||
2405 | |||
2406 | |||
2407 | *rule_nr = 80; | ||
2408 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2409 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2410 | self = mdev->ldev->md.uuid[i] & ~((u64)1); | ||
2411 | if (self == peer) | ||
2412 | return 2; | ||
2413 | } | ||
2414 | |||
2415 | *rule_nr = 90; | ||
2416 | self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); | ||
2417 | peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); | ||
2418 | if (self == peer && self != ((u64)0)) | ||
2419 | return 100; | ||
2420 | |||
2421 | *rule_nr = 100; | ||
2422 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2423 | self = mdev->ldev->md.uuid[i] & ~((u64)1); | ||
2424 | for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { | ||
2425 | peer = mdev->p_uuid[j] & ~((u64)1); | ||
2426 | if (self == peer) | ||
2427 | return -100; | ||
2428 | } | ||
2429 | } | ||
2430 | |||
2431 | return -1000; | ||
2432 | } | ||
2433 | |||
2434 | /* drbd_sync_handshake() returns the new conn state on success, or | ||
2435 | CONN_MASK (-1) on failure. | ||
2436 | */ | ||
2437 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, | ||
2438 | enum drbd_disk_state peer_disk) __must_hold(local) | ||
2439 | { | ||
2440 | int hg, rule_nr; | ||
2441 | enum drbd_conns rv = C_MASK; | ||
2442 | enum drbd_disk_state mydisk; | ||
2443 | |||
2444 | mydisk = mdev->state.disk; | ||
2445 | if (mydisk == D_NEGOTIATING) | ||
2446 | mydisk = mdev->new_state_tmp.disk; | ||
2447 | |||
2448 | dev_info(DEV, "drbd_sync_handshake:\n"); | ||
2449 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); | ||
2450 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, | ||
2451 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | ||
2452 | |||
2453 | hg = drbd_uuid_compare(mdev, &rule_nr); | ||
2454 | |||
2455 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); | ||
2456 | |||
2457 | if (hg == -1000) { | ||
2458 | dev_alert(DEV, "Unrelated data, aborting!\n"); | ||
2459 | return C_MASK; | ||
2460 | } | ||
2461 | if (hg == -1001) { | ||
2462 | 		dev_alert(DEV, "To resolve this both sides have to support at least protocol 91\n"); | ||
2463 | return C_MASK; | ||
2464 | } | ||
2465 | |||
2466 | if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || | ||
2467 | (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { | ||
2468 | int f = (hg == -100) || abs(hg) == 2; | ||
2469 | hg = mydisk > D_INCONSISTENT ? 1 : -1; | ||
2470 | if (f) | ||
2471 | hg = hg*2; | ||
2472 | dev_info(DEV, "Becoming sync %s due to disk states.\n", | ||
2473 | hg > 0 ? "source" : "target"); | ||
2474 | } | ||
2475 | |||
2476 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | ||
2477 | int pcount = (mdev->state.role == R_PRIMARY) | ||
2478 | + (peer_role == R_PRIMARY); | ||
2479 | int forced = (hg == -100); | ||
2480 | |||
2481 | switch (pcount) { | ||
2482 | case 0: | ||
2483 | hg = drbd_asb_recover_0p(mdev); | ||
2484 | break; | ||
2485 | case 1: | ||
2486 | hg = drbd_asb_recover_1p(mdev); | ||
2487 | break; | ||
2488 | case 2: | ||
2489 | hg = drbd_asb_recover_2p(mdev); | ||
2490 | break; | ||
2491 | } | ||
2492 | if (abs(hg) < 100) { | ||
2493 | dev_warn(DEV, "Split-Brain detected, %d primaries, " | ||
2494 | "automatically solved. Sync from %s node\n", | ||
2495 | pcount, (hg < 0) ? "peer" : "this"); | ||
2496 | if (forced) { | ||
2497 | dev_warn(DEV, "Doing a full sync, since" | ||
2498 | 					 " UUIDs were ambiguous.\n"); | ||
2499 | hg = hg*2; | ||
2500 | } | ||
2501 | } | ||
2502 | } | ||
2503 | |||
2504 | if (hg == -100) { | ||
2505 | if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) | ||
2506 | hg = -1; | ||
2507 | if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) | ||
2508 | hg = 1; | ||
2509 | |||
2510 | if (abs(hg) < 100) | ||
2511 | dev_warn(DEV, "Split-Brain detected, manually solved. " | ||
2512 | "Sync from %s node\n", | ||
2513 | (hg < 0) ? "peer" : "this"); | ||
2514 | } | ||
2515 | |||
2516 | if (hg == -100) { | ||
2517 | /* FIXME this log message is not correct if we end up here | ||
2518 | * after an attempted attach on a diskless node. | ||
2519 | * We just refuse to attach -- well, we drop the "connection" | ||
2520 | * to that disk, in a way... */ | ||
2521 | dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); | ||
2522 | drbd_khelper(mdev, "split-brain"); | ||
2523 | return C_MASK; | ||
2524 | } | ||
2525 | |||
2526 | if (hg > 0 && mydisk <= D_INCONSISTENT) { | ||
2527 | dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n"); | ||
2528 | return C_MASK; | ||
2529 | } | ||
2530 | |||
2531 | if (hg < 0 && /* by intention we do not use mydisk here. */ | ||
2532 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { | ||
2533 | switch (mdev->net_conf->rr_conflict) { | ||
2534 | case ASB_CALL_HELPER: | ||
2535 | drbd_khelper(mdev, "pri-lost"); | ||
2536 | /* fall through */ | ||
2537 | case ASB_DISCONNECT: | ||
2538 | dev_err(DEV, "I shall become SyncTarget, but I am primary!\n"); | ||
2539 | return C_MASK; | ||
2540 | case ASB_VIOLENTLY: | ||
2541 | dev_warn(DEV, "Becoming SyncTarget, violating the stable-data" | ||
2542 | 				 " assumption\n"); | ||
2543 | } | ||
2544 | } | ||
2545 | |||
2546 | if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) { | ||
2547 | if (hg == 0) | ||
2548 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); | ||
2549 | else | ||
2550 | 			dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.\n", | ||
2551 | drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), | ||
2552 | abs(hg) >= 2 ? "full" : "bit-map based"); | ||
2553 | return C_MASK; | ||
2554 | } | ||
2555 | |||
2556 | if (abs(hg) >= 2) { | ||
2557 | dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); | ||
2558 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) | ||
2559 | return C_MASK; | ||
2560 | } | ||
2561 | |||
2562 | if (hg > 0) { /* become sync source. */ | ||
2563 | rv = C_WF_BITMAP_S; | ||
2564 | } else if (hg < 0) { /* become sync target */ | ||
2565 | rv = C_WF_BITMAP_T; | ||
2566 | } else { | ||
2567 | rv = C_CONNECTED; | ||
2568 | if (drbd_bm_total_weight(mdev)) { | ||
2569 | dev_info(DEV, "No resync, but %lu bits in bitmap!\n", | ||
2570 | drbd_bm_total_weight(mdev)); | ||
2571 | } | ||
2572 | } | ||
2573 | |||
2574 | return rv; | ||
2575 | } | ||
2576 | |||
2577 | /* returns 1 if invalid */ | ||
2578 | static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) | ||
2579 | { | ||
2580 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ | ||
2581 | if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || | ||
2582 | (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) | ||
2583 | return 0; | ||
2584 | |||
2585 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ | ||
2586 | if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || | ||
2587 | self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) | ||
2588 | return 1; | ||
2589 | |||
2590 | /* everything else is valid if they are equal on both sides. */ | ||
2591 | if (peer == self) | ||
2592 | return 0; | ||
2593 | |||
2594 | 	/* everything else is invalid. */ | ||
2595 | return 1; | ||
2596 | } | ||
2597 | |||
2598 | static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) | ||
2599 | { | ||
2600 | struct p_protocol *p = (struct p_protocol *)h; | ||
2601 | int header_size, data_size; | ||
2602 | int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; | ||
2603 | int p_want_lose, p_two_primaries, cf; | ||
2604 | char p_integrity_alg[SHARED_SECRET_MAX] = ""; | ||
2605 | |||
2606 | header_size = sizeof(*p) - sizeof(*h); | ||
2607 | data_size = h->length - header_size; | ||
2608 | |||
2609 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
2610 | return FALSE; | ||
2611 | |||
2612 | p_proto = be32_to_cpu(p->protocol); | ||
2613 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); | ||
2614 | p_after_sb_1p = be32_to_cpu(p->after_sb_1p); | ||
2615 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); | ||
2616 | p_two_primaries = be32_to_cpu(p->two_primaries); | ||
2617 | cf = be32_to_cpu(p->conn_flags); | ||
2618 | p_want_lose = cf & CF_WANT_LOSE; | ||
2619 | |||
2620 | clear_bit(CONN_DRY_RUN, &mdev->flags); | ||
2621 | |||
2622 | if (cf & CF_DRY_RUN) | ||
2623 | set_bit(CONN_DRY_RUN, &mdev->flags); | ||
2624 | |||
2625 | if (p_proto != mdev->net_conf->wire_protocol) { | ||
2626 | dev_err(DEV, "incompatible communication protocols\n"); | ||
2627 | goto disconnect; | ||
2628 | } | ||
2629 | |||
2630 | if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { | ||
2631 | dev_err(DEV, "incompatible after-sb-0pri settings\n"); | ||
2632 | goto disconnect; | ||
2633 | } | ||
2634 | |||
2635 | if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { | ||
2636 | dev_err(DEV, "incompatible after-sb-1pri settings\n"); | ||
2637 | goto disconnect; | ||
2638 | } | ||
2639 | |||
2640 | if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { | ||
2641 | dev_err(DEV, "incompatible after-sb-2pri settings\n"); | ||
2642 | goto disconnect; | ||
2643 | } | ||
2644 | |||
2645 | if (p_want_lose && mdev->net_conf->want_lose) { | ||
2646 | dev_err(DEV, "both sides have the 'want_lose' flag set\n"); | ||
2647 | goto disconnect; | ||
2648 | } | ||
2649 | |||
2650 | if (p_two_primaries != mdev->net_conf->two_primaries) { | ||
2651 | dev_err(DEV, "incompatible setting of the two-primaries options\n"); | ||
2652 | goto disconnect; | ||
2653 | } | ||
2654 | |||
2655 | if (mdev->agreed_pro_version >= 87) { | ||
2656 | unsigned char *my_alg = mdev->net_conf->integrity_alg; | ||
2657 | |||
2658 | if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) | ||
2659 | return FALSE; | ||
2660 | |||
2661 | p_integrity_alg[SHARED_SECRET_MAX-1] = 0; | ||
2662 | if (strcmp(p_integrity_alg, my_alg)) { | ||
2663 | dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); | ||
2664 | goto disconnect; | ||
2665 | } | ||
2666 | dev_info(DEV, "data-integrity-alg: %s\n", | ||
2667 | my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); | ||
2668 | } | ||
2669 | |||
2670 | return TRUE; | ||
2671 | |||
2672 | disconnect: | ||
2673 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2674 | return FALSE; | ||
2675 | } | ||
2676 | |||
2677 | /* helper function | ||
2678 | * input: alg name, feature name | ||
2679 | * return: NULL (alg name was "") | ||
2680 | * ERR_PTR(error) if something goes wrong | ||
2681 | * or the crypto hash ptr, if it worked out ok. */ | ||
2682 | struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, | ||
2683 | const char *alg, const char *name) | ||
2684 | { | ||
2685 | struct crypto_hash *tfm; | ||
2686 | |||
2687 | if (!alg[0]) | ||
2688 | return NULL; | ||
2689 | |||
2690 | tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); | ||
2691 | if (IS_ERR(tfm)) { | ||
2692 | dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n", | ||
2693 | alg, name, PTR_ERR(tfm)); | ||
2694 | return tfm; | ||
2695 | } | ||
2696 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
2697 | crypto_free_hash(tfm); | ||
2698 | dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); | ||
2699 | return ERR_PTR(-EINVAL); | ||
2700 | } | ||
2701 | return tfm; | ||
2702 | } | ||
2703 | |||
2704 | static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) | ||
2705 | { | ||
2706 | int ok = TRUE; | ||
2707 | struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; | ||
2708 | unsigned int header_size, data_size, exp_max_sz; | ||
2709 | struct crypto_hash *verify_tfm = NULL; | ||
2710 | struct crypto_hash *csums_tfm = NULL; | ||
2711 | const int apv = mdev->agreed_pro_version; | ||
2712 | |||
2713 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) | ||
2714 | : apv == 88 ? sizeof(struct p_rs_param) | ||
2715 | + SHARED_SECRET_MAX | ||
2716 | : /* 89 */ sizeof(struct p_rs_param_89); | ||
2717 | |||
2718 | if (h->length > exp_max_sz) { | ||
2719 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", | ||
2720 | h->length, exp_max_sz); | ||
2721 | return FALSE; | ||
2722 | } | ||
2723 | |||
2724 | if (apv <= 88) { | ||
2725 | header_size = sizeof(struct p_rs_param) - sizeof(*h); | ||
2726 | data_size = h->length - header_size; | ||
2727 | } else /* apv >= 89 */ { | ||
2728 | header_size = sizeof(struct p_rs_param_89) - sizeof(*h); | ||
2729 | data_size = h->length - header_size; | ||
2730 | D_ASSERT(data_size == 0); | ||
2731 | } | ||
2732 | |||
2733 | /* initialize verify_alg and csums_alg */ | ||
2734 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
2735 | |||
2736 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
2737 | return FALSE; | ||
2738 | |||
2739 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | ||
2740 | |||
2741 | if (apv >= 88) { | ||
2742 | if (apv == 88) { | ||
2743 | if (data_size > SHARED_SECRET_MAX) { | ||
2744 | dev_err(DEV, "verify-alg too long, " | ||
2745 | 				"peer wants %u, accepting only %u bytes\n", | ||
2746 | data_size, SHARED_SECRET_MAX); | ||
2747 | return FALSE; | ||
2748 | } | ||
2749 | |||
2750 | if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) | ||
2751 | return FALSE; | ||
2752 | |||
2753 | /* we expect NUL terminated string */ | ||
2754 | /* but just in case someone tries to be evil */ | ||
2755 | D_ASSERT(p->verify_alg[data_size-1] == 0); | ||
2756 | p->verify_alg[data_size-1] = 0; | ||
2757 | |||
2758 | } else /* apv >= 89 */ { | ||
2759 | /* we still expect NUL terminated strings */ | ||
2760 | /* but just in case someone tries to be evil */ | ||
2761 | D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0); | ||
2762 | D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0); | ||
2763 | p->verify_alg[SHARED_SECRET_MAX-1] = 0; | ||
2764 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; | ||
2765 | } | ||
2766 | |||
2767 | if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { | ||
2768 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
2769 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", | ||
2770 | mdev->sync_conf.verify_alg, p->verify_alg); | ||
2771 | goto disconnect; | ||
2772 | } | ||
2773 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, | ||
2774 | p->verify_alg, "verify-alg"); | ||
2775 | if (IS_ERR(verify_tfm)) { | ||
2776 | verify_tfm = NULL; | ||
2777 | goto disconnect; | ||
2778 | } | ||
2779 | } | ||
2780 | |||
2781 | if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { | ||
2782 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
2783 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", | ||
2784 | mdev->sync_conf.csums_alg, p->csums_alg); | ||
2785 | goto disconnect; | ||
2786 | } | ||
2787 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, | ||
2788 | p->csums_alg, "csums-alg"); | ||
2789 | if (IS_ERR(csums_tfm)) { | ||
2790 | csums_tfm = NULL; | ||
2791 | goto disconnect; | ||
2792 | } | ||
2793 | } | ||
2794 | |||
2795 | |||
2796 | spin_lock(&mdev->peer_seq_lock); | ||
2797 | /* lock against drbd_nl_syncer_conf() */ | ||
2798 | if (verify_tfm) { | ||
2799 | strcpy(mdev->sync_conf.verify_alg, p->verify_alg); | ||
2800 | mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; | ||
2801 | crypto_free_hash(mdev->verify_tfm); | ||
2802 | mdev->verify_tfm = verify_tfm; | ||
2803 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); | ||
2804 | } | ||
2805 | if (csums_tfm) { | ||
2806 | strcpy(mdev->sync_conf.csums_alg, p->csums_alg); | ||
2807 | mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; | ||
2808 | crypto_free_hash(mdev->csums_tfm); | ||
2809 | mdev->csums_tfm = csums_tfm; | ||
2810 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | ||
2811 | } | ||
2812 | spin_unlock(&mdev->peer_seq_lock); | ||
2813 | } | ||
2814 | |||
2815 | return ok; | ||
2816 | disconnect: | ||
2817 | /* just for completeness: actually not needed, | ||
2818 | * as this is not reached if csums_tfm was ok. */ | ||
2819 | crypto_free_hash(csums_tfm); | ||
2820 | /* but free the verify_tfm again, if csums_tfm did not work out */ | ||
2821 | crypto_free_hash(verify_tfm); | ||
2822 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2823 | return FALSE; | ||
2824 | } | ||
2825 | |||
2826 | static void drbd_setup_order_type(struct drbd_conf *mdev, int peer) | ||
2827 | { | ||
2828 | /* sorry, we currently have no working implementation | ||
2829 | * of distributed TCQ */ | ||
2830 | } | ||
2831 | |||
2832 | /* warn if the arguments differ by more than 12.5% */ | ||
2833 | static void warn_if_differ_considerably(struct drbd_conf *mdev, | ||
2834 | const char *s, sector_t a, sector_t b) | ||
2835 | { | ||
2836 | sector_t d; | ||
2837 | if (a == 0 || b == 0) | ||
2838 | return; | ||
2839 | d = (a > b) ? (a - b) : (b - a); | ||
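2839 | 	/* a>>3 is a/8, i.e. 12.5%: warn if the delta exceeds 12.5% of either value */ | ||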
2840 | if (d > (a>>3) || d > (b>>3)) | ||
2841 | dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s, | ||
2842 | (unsigned long long)a, (unsigned long long)b); | ||
2843 | } | ||
2844 | |||
2845 | static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | ||
2846 | { | ||
2847 | struct p_sizes *p = (struct p_sizes *)h; | ||
2848 | enum determine_dev_size dd = unchanged; | ||
2849 | unsigned int max_seg_s; | ||
2850 | sector_t p_size, p_usize, my_usize; | ||
2851 | int ldsc = 0; /* local disk size changed */ | ||
2852 | enum drbd_conns nconn; | ||
2853 | |||
2854 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
2855 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
2856 | return FALSE; | ||
2857 | |||
2858 | p_size = be64_to_cpu(p->d_size); | ||
2859 | p_usize = be64_to_cpu(p->u_size); | ||
2860 | |||
2861 | if (p_size == 0 && mdev->state.disk == D_DISKLESS) { | ||
2862 | dev_err(DEV, "some backing storage is needed\n"); | ||
2863 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2864 | return FALSE; | ||
2865 | } | ||
2866 | |||
2867 | /* just store the peer's disk size for now. | ||
2868 | * we still need to figure out whether we accept that. */ | ||
2869 | mdev->p_size = p_size; | ||
2870 | |||
2871 | #define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r))) | ||
2872 | if (get_ldev(mdev)) { | ||
2873 | warn_if_differ_considerably(mdev, "lower level device sizes", | ||
2874 | p_size, drbd_get_max_capacity(mdev->ldev)); | ||
2875 | warn_if_differ_considerably(mdev, "user requested size", | ||
2876 | p_usize, mdev->ldev->dc.disk_size); | ||
2877 | |||
2878 | /* if this is the first connect, or an otherwise expected | ||
2879 | * param exchange, choose the minimum */ | ||
2880 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | ||
2881 | p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, | ||
2882 | p_usize); | ||
2883 | |||
2884 | my_usize = mdev->ldev->dc.disk_size; | ||
2885 | |||
2886 | if (mdev->ldev->dc.disk_size != p_usize) { | ||
2887 | mdev->ldev->dc.disk_size = p_usize; | ||
2888 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
2889 | (unsigned long)mdev->ldev->dc.disk_size); | ||
2890 | } | ||
2891 | |||
2892 | /* Never shrink a device with usable data during connect. | ||
2893 | But allow online shrinking if we are connected. */ | ||
2894 | if (drbd_new_dev_size(mdev, mdev->ldev, 0) < | ||
2895 | drbd_get_capacity(mdev->this_bdev) && | ||
2896 | mdev->state.disk >= D_OUTDATED && | ||
2897 | mdev->state.conn < C_CONNECTED) { | ||
2898 | dev_err(DEV, "The peer's disk size is too small!\n"); | ||
2899 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2900 | mdev->ldev->dc.disk_size = my_usize; | ||
2901 | put_ldev(mdev); | ||
2902 | return FALSE; | ||
2903 | } | ||
2904 | put_ldev(mdev); | ||
2905 | } | ||
2906 | #undef min_not_zero | ||
2907 | |||
2908 | if (get_ldev(mdev)) { | ||
2909 | dd = drbd_determin_dev_size(mdev, 0); | ||
2910 | put_ldev(mdev); | ||
2911 | if (dd == dev_size_error) | ||
2912 | return FALSE; | ||
2913 | drbd_md_sync(mdev); | ||
2914 | } else { | ||
2915 | /* I am diskless, need to accept the peer's size. */ | ||
2916 | drbd_set_my_capacity(mdev, p_size); | ||
2917 | } | ||
2918 | |||
2919 | if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
2920 | nconn = drbd_sync_handshake(mdev, | ||
2921 | mdev->state.peer, mdev->state.pdsk); | ||
2922 | put_ldev(mdev); | ||
2923 | |||
2924 | if (nconn == C_MASK) { | ||
2925 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2926 | return FALSE; | ||
2927 | } | ||
2928 | |||
2929 | if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { | ||
2930 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2931 | return FALSE; | ||
2932 | } | ||
2933 | } | ||
2934 | |||
2935 | if (get_ldev(mdev)) { | ||
2936 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | ||
2937 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
2938 | ldsc = 1; | ||
2939 | } | ||
2940 | |||
2941 | max_seg_s = be32_to_cpu(p->max_segment_size); | ||
2942 | if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) | ||
2943 | drbd_setup_queue_param(mdev, max_seg_s); | ||
2944 | |||
2945 | drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); | ||
2946 | put_ldev(mdev); | ||
2947 | } | ||
2948 | |||
2949 | if (mdev->state.conn > C_WF_REPORT_PARAMS) { | ||
2950 | if (be64_to_cpu(p->c_size) != | ||
2951 | drbd_get_capacity(mdev->this_bdev) || ldsc) { | ||
2952 | /* we have different sizes, probably peer | ||
2953 | * needs to know my new size... */ | ||
2954 | drbd_send_sizes(mdev, 0); | ||
2955 | } | ||
2956 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || | ||
2957 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | ||
2958 | if (mdev->state.pdsk >= D_INCONSISTENT && | ||
2959 | mdev->state.disk >= D_INCONSISTENT) | ||
2960 | resync_after_online_grow(mdev); | ||
2961 | else | ||
2962 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | ||
2963 | } | ||
2964 | } | ||
2965 | |||
2966 | return TRUE; | ||
2967 | } | ||
2968 | |||
2969 | static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) | ||
2970 | { | ||
2971 | struct p_uuids *p = (struct p_uuids *)h; | ||
2972 | u64 *p_uuid; | ||
2973 | int i; | ||
2974 | |||
2975 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
2976 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
2977 | return FALSE; | ||
2978 | |||
2979 | 	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); | ||
2979 | 	if (!p_uuid) | ||
2979 | 		return FALSE; | ||
2980 | |||
2981 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) | ||
2982 | p_uuid[i] = be64_to_cpu(p->uuid[i]); | ||
2983 | |||
2984 | kfree(mdev->p_uuid); | ||
2985 | mdev->p_uuid = p_uuid; | ||
2986 | |||
2987 | if (mdev->state.conn < C_CONNECTED && | ||
2988 | mdev->state.disk < D_INCONSISTENT && | ||
2989 | mdev->state.role == R_PRIMARY && | ||
2990 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { | ||
2991 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", | ||
2992 | (unsigned long long)mdev->ed_uuid); | ||
2993 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2994 | return FALSE; | ||
2995 | } | ||
2996 | |||
2997 | if (get_ldev(mdev)) { | ||
2998 | int skip_initial_sync = | ||
2999 | mdev->state.conn == C_CONNECTED && | ||
3000 | mdev->agreed_pro_version >= 90 && | ||
3001 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && | ||
3002 | (p_uuid[UI_FLAGS] & 8); | ||
3003 | if (skip_initial_sync) { | ||
3004 | dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); | ||
3005 | drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, | ||
3006 | "clear_n_write from receive_uuids"); | ||
3007 | _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); | ||
3008 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | ||
3009 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | ||
3010 | CS_VERBOSE, NULL); | ||
3011 | drbd_md_sync(mdev); | ||
3012 | } | ||
3013 | put_ldev(mdev); | ||
3014 | } | ||
3015 | |||
3016 | 	/* Before we test the disk state, we should wait until a possibly still | ||
3017 | 	   ongoing cluster-wide state change has finished. That is important if | ||
3018 | 	   we are primary and are detaching from our disk. We need to see the | ||
3019 | 	   new disk state... */ | ||
3020 | wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
3021 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) | ||
3022 | drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); | ||
3023 | |||
3024 | return TRUE; | ||
3025 | } | ||
3026 | |||
3027 | /** | ||
3028 | * convert_state() - Converts the peer's view of the cluster state to our point of view | ||
3029 | * @ps: The state as seen by the peer. | ||
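3029 |  * | ||
3029 |  * Roles and disk states are simply mirrored (the peer's pdsk is our disk, | ||
3029 |  * its role is our peer role), and asymmetric connection states are flipped, | ||
3029 |  * e.g. C_STARTING_SYNC_S as seen by the peer is C_STARTING_SYNC_T for us. | ||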
3030 | */ | ||
3031 | static union drbd_state convert_state(union drbd_state ps) | ||
3032 | { | ||
3033 | union drbd_state ms; | ||
3034 | |||
3035 | static enum drbd_conns c_tab[] = { | ||
3036 | [C_CONNECTED] = C_CONNECTED, | ||
3037 | |||
3038 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, | ||
3039 | [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, | ||
3040 | [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ | ||
3041 | [C_VERIFY_S] = C_VERIFY_T, | ||
3042 | [C_MASK] = C_MASK, | ||
3043 | }; | ||
3044 | |||
3045 | ms.i = ps.i; | ||
3046 | |||
3047 | ms.conn = c_tab[ps.conn]; | ||
3048 | ms.peer = ps.role; | ||
3049 | ms.role = ps.peer; | ||
3050 | ms.pdsk = ps.disk; | ||
3051 | ms.disk = ps.pdsk; | ||
3052 | ms.peer_isp = (ps.aftr_isp | ps.user_isp); | ||
3053 | |||
3054 | return ms; | ||
3055 | } | ||
3056 | |||
3057 | static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) | ||
3058 | { | ||
3059 | struct p_req_state *p = (struct p_req_state *)h; | ||
3060 | union drbd_state mask, val; | ||
3061 | int rv; | ||
3062 | |||
3063 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3064 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3065 | return FALSE; | ||
3066 | |||
3067 | mask.i = be32_to_cpu(p->mask); | ||
3068 | val.i = be32_to_cpu(p->val); | ||
3069 | |||
3070 | if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && | ||
3071 | test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { | ||
3072 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); | ||
3073 | return TRUE; | ||
3074 | } | ||
3075 | |||
3076 | mask = convert_state(mask); | ||
3077 | val = convert_state(val); | ||
3078 | |||
3079 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); | ||
3080 | |||
3081 | drbd_send_sr_reply(mdev, rv); | ||
3082 | drbd_md_sync(mdev); | ||
3083 | |||
3084 | return TRUE; | ||
3085 | } | ||
3086 | |||
3087 | static int receive_state(struct drbd_conf *mdev, struct p_header *h) | ||
3088 | { | ||
3089 | struct p_state *p = (struct p_state *)h; | ||
3090 | enum drbd_conns nconn, oconn; | ||
3091 | union drbd_state ns, peer_state; | ||
3092 | enum drbd_disk_state real_peer_disk; | ||
3093 | int rv; | ||
3094 | |||
3095 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) | ||
3096 | return FALSE; | ||
3097 | |||
3098 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3099 | return FALSE; | ||
3100 | |||
3101 | peer_state.i = be32_to_cpu(p->state); | ||
3102 | |||
3103 | real_peer_disk = peer_state.disk; | ||
3104 | if (peer_state.disk == D_NEGOTIATING) { | ||
3105 | real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT; | ||
3106 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); | ||
3107 | } | ||
3108 | |||
3109 | spin_lock_irq(&mdev->req_lock); | ||
3110 | retry: | ||
3111 | oconn = nconn = mdev->state.conn; | ||
3112 | spin_unlock_irq(&mdev->req_lock); | ||
3113 | |||
3114 | if (nconn == C_WF_REPORT_PARAMS) | ||
3115 | nconn = C_CONNECTED; | ||
3116 | |||
3117 | if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && | ||
3118 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
3119 | int cr; /* consider resync */ | ||
3120 | |||
3121 | /* if we established a new connection */ | ||
3122 | cr = (oconn < C_CONNECTED); | ||
3123 | /* if we had an established connection | ||
3124 | * and one of the nodes newly attaches a disk */ | ||
3125 | cr |= (oconn == C_CONNECTED && | ||
3126 | (peer_state.disk == D_NEGOTIATING || | ||
3127 | mdev->state.disk == D_NEGOTIATING)); | ||
3128 | /* if we have both been inconsistent, and the peer has been | ||
3129 | * forced to be UpToDate with --overwrite-data */ | ||
3130 | cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); | ||
3131 | /* if we had been plain connected, and the admin requested to | ||
3132 | * start a sync by "invalidate" or "invalidate-remote" */ | ||
3133 | cr |= (oconn == C_CONNECTED && | ||
3134 | (peer_state.conn >= C_STARTING_SYNC_S && | ||
3135 | peer_state.conn <= C_WF_BITMAP_T)); | ||
3136 | |||
3137 | if (cr) | ||
3138 | nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); | ||
3139 | |||
3140 | put_ldev(mdev); | ||
3141 | if (nconn == C_MASK) { | ||
3142 | nconn = C_CONNECTED; | ||
3143 | if (mdev->state.disk == D_NEGOTIATING) { | ||
3144 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | ||
3145 | } else if (peer_state.disk == D_NEGOTIATING) { | ||
3146 | dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); | ||
3147 | peer_state.disk = D_DISKLESS; | ||
3148 | real_peer_disk = D_DISKLESS; | ||
3149 | } else { | ||
3150 | if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) | ||
3151 | return FALSE; | ||
3152 | D_ASSERT(oconn == C_WF_REPORT_PARAMS); | ||
3153 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
3154 | return FALSE; | ||
3155 | } | ||
3156 | } | ||
3157 | } | ||
3158 | |||
3159 | spin_lock_irq(&mdev->req_lock); | ||
3160 | if (mdev->state.conn != oconn) | ||
3161 | goto retry; | ||
3162 | clear_bit(CONSIDER_RESYNC, &mdev->flags); | ||
3163 | ns.i = mdev->state.i; | ||
3164 | ns.conn = nconn; | ||
3165 | ns.peer = peer_state.role; | ||
3166 | ns.pdsk = real_peer_disk; | ||
3167 | ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); | ||
3168 | if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) | ||
3169 | ns.disk = mdev->new_state_tmp.disk; | ||
3170 | |||
3171 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); | ||
3172 | ns = mdev->state; | ||
3173 | spin_unlock_irq(&mdev->req_lock); | ||
3174 | |||
3175 | if (rv < SS_SUCCESS) { | ||
3176 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
3177 | return FALSE; | ||
3178 | } | ||
3179 | |||
3180 | if (oconn > C_WF_REPORT_PARAMS) { | ||
3181 | if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && | ||
3182 | peer_state.disk != D_NEGOTIATING ) { | ||
3183 | /* we want resync, peer has not yet decided to sync... */ | ||
3184 | /* Nowadays only used when forcing a node into primary role and | ||
3185 | setting its disk to UpToDate with that */ | ||
3186 | drbd_send_uuids(mdev); | ||
3187 | drbd_send_state(mdev); | ||
3188 | } | ||
3189 | } | ||
3190 | |||
3191 | mdev->net_conf->want_lose = 0; | ||
3192 | |||
3193 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | ||
3194 | |||
3195 | return TRUE; | ||
3196 | } | ||
3197 | |||
3198 | static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) | ||
3199 | { | ||
3200 | struct p_rs_uuid *p = (struct p_rs_uuid *)h; | ||
3201 | |||
3202 | wait_event(mdev->misc_wait, | ||
3203 | mdev->state.conn == C_WF_SYNC_UUID || | ||
3204 | mdev->state.conn < C_CONNECTED || | ||
3205 | mdev->state.disk < D_NEGOTIATING); | ||
3206 | |||
3207 | /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ | ||
3208 | |||
3209 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3210 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3211 | return FALSE; | ||
3212 | |||
3213 | /* Here the _drbd_uuid_ functions are right, current should | ||
3214 | _not_ be rotated into the history */ | ||
3215 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
3216 | _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); | ||
3217 | _drbd_uuid_set(mdev, UI_BITMAP, 0UL); | ||
3218 | |||
3219 | drbd_start_resync(mdev, C_SYNC_TARGET); | ||
3220 | |||
3221 | put_ldev(mdev); | ||
3222 | } else | ||
3223 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); | ||
3224 | |||
3225 | return TRUE; | ||
3226 | } | ||
3227 | |||
3228 | enum receive_bitmap_ret { OK, DONE, FAILED }; | ||
3229 | |||
3230 | static enum receive_bitmap_ret | ||
3231 | receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, | ||
3232 | unsigned long *buffer, struct bm_xfer_ctx *c) | ||
3233 | { | ||
3234 | unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | ||
3235 | unsigned want = num_words * sizeof(long); | ||
3236 | |||
3237 | if (want != h->length) { | ||
3238 | dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); | ||
3239 | return FAILED; | ||
3240 | } | ||
3241 | if (want == 0) | ||
3242 | return DONE; | ||
3243 | if (drbd_recv(mdev, buffer, want) != want) | ||
3244 | return FAILED; | ||
3245 | |||
3246 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); | ||
3247 | |||
3248 | c->word_offset += num_words; | ||
3249 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
3250 | if (c->bit_offset > c->bm_bits) | ||
3251 | c->bit_offset = c->bm_bits; | ||
3252 | |||
3253 | return OK; | ||
3254 | } | ||
3255 | |||
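3255 | /* Decode one run-length encoded (VLI) bitmap packet: the bit stream describes | ||
3255 |  * alternating runs of clear and set bits, starting with DCBP_get_start(p); | ||
3255 |  * only the runs of set bits are merged into our bitmap via _drbd_bm_set_bits(). */ | ||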
3256 | static enum receive_bitmap_ret | ||
3257 | recv_bm_rle_bits(struct drbd_conf *mdev, | ||
3258 | struct p_compressed_bm *p, | ||
3259 | struct bm_xfer_ctx *c) | ||
3260 | { | ||
3261 | struct bitstream bs; | ||
3262 | u64 look_ahead; | ||
3263 | u64 rl; | ||
3264 | u64 tmp; | ||
3265 | unsigned long s = c->bit_offset; | ||
3266 | unsigned long e; | ||
3267 | int len = p->head.length - (sizeof(*p) - sizeof(p->head)); | ||
3268 | int toggle = DCBP_get_start(p); | ||
3269 | int have; | ||
3270 | int bits; | ||
3271 | |||
3272 | bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); | ||
3273 | |||
3274 | bits = bitstream_get_bits(&bs, &look_ahead, 64); | ||
3275 | if (bits < 0) | ||
3276 | return FAILED; | ||
3277 | |||
3278 | for (have = bits; have > 0; s += rl, toggle = !toggle) { | ||
3279 | bits = vli_decode_bits(&rl, look_ahead); | ||
3280 | if (bits <= 0) | ||
3281 | return FAILED; | ||
3282 | |||
3283 | if (toggle) { | ||
3284 | e = s + rl -1; | ||
3285 | if (e >= c->bm_bits) { | ||
3286 | dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); | ||
3287 | return FAILED; | ||
3288 | } | ||
3289 | _drbd_bm_set_bits(mdev, s, e); | ||
3290 | } | ||
3291 | |||
3292 | if (have < bits) { | ||
3293 | dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", | ||
3294 | have, bits, look_ahead, | ||
3295 | (unsigned int)(bs.cur.b - p->code), | ||
3296 | (unsigned int)bs.buf_len); | ||
3297 | return FAILED; | ||
3298 | } | ||
3299 | look_ahead >>= bits; | ||
3300 | have -= bits; | ||
3301 | |||
3302 | bits = bitstream_get_bits(&bs, &tmp, 64 - have); | ||
3303 | if (bits < 0) | ||
3304 | return FAILED; | ||
3305 | look_ahead |= tmp << have; | ||
3306 | have += bits; | ||
3307 | } | ||
3308 | |||
3309 | c->bit_offset = s; | ||
3310 | bm_xfer_ctx_bit_to_word_offset(c); | ||
3311 | |||
3312 | return (s == c->bm_bits) ? DONE : OK; | ||
3313 | } | ||
3314 | |||
3315 | static enum receive_bitmap_ret | ||
3316 | decode_bitmap_c(struct drbd_conf *mdev, | ||
3317 | struct p_compressed_bm *p, | ||
3318 | struct bm_xfer_ctx *c) | ||
3319 | { | ||
3320 | if (DCBP_get_code(p) == RLE_VLI_Bits) | ||
3321 | return recv_bm_rle_bits(mdev, p, c); | ||
3322 | |||
3323 | /* other variants had been implemented for evaluation, | ||
3324 | * but have been dropped as this one turned out to be "best" | ||
3325 | * during all our tests. */ | ||
3326 | |||
3327 | 	dev_err(DEV, "decode_bitmap_c: unknown encoding %u\n", p->encoding); | ||
3328 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3329 | return FAILED; | ||
3330 | } | ||
3331 | |||
3332 | void INFO_bm_xfer_stats(struct drbd_conf *mdev, | ||
3333 | const char *direction, struct bm_xfer_ctx *c) | ||
3334 | { | ||
3335 | /* what would it take to transfer it "plaintext" */ | ||
3336 | unsigned plain = sizeof(struct p_header) * | ||
3337 | ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) | ||
3338 | + c->bm_words * sizeof(long); | ||
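3338 | 	/* i.e. one p_header per full plain-text packet, plus (presumably) the | ||
3338 | 	 * terminating zero-sized packet, plus the raw bitmap words themselves */ | ||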
3339 | unsigned total = c->bytes[0] + c->bytes[1]; | ||
3340 | unsigned r; | ||
3341 | |||
3342 | /* total can not be zero. but just in case: */ | ||
3343 | if (total == 0) | ||
3344 | return; | ||
3345 | |||
3346 | /* don't report if not compressed */ | ||
3347 | if (total >= plain) | ||
3348 | return; | ||
3349 | |||
3350 | /* total < plain. check for overflow, still */ | ||
3351 | r = (total > UINT_MAX/1000) ? (total / (plain/1000)) | ||
3352 | : (1000 * total / plain); | ||
3353 | |||
3354 | if (r > 1000) | ||
3355 | r = 1000; | ||
3356 | |||
3357 | r = 1000 - r; | ||
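3357 | 	/* r is now the saving in per-mille; e.g. plain = 4096 and total = 1024 | ||
3357 | 	 * gives r = 1000 - 250 = 750, reported below as "compression: 75.0%" */ | ||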
3358 | dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " | ||
3359 | "total %u; compression: %u.%u%%\n", | ||
3360 | direction, | ||
3361 | c->bytes[1], c->packets[1], | ||
3362 | c->bytes[0], c->packets[0], | ||
3363 | total, r/10, r % 10); | ||
3364 | } | ||
3365 | |||
3366 | /* Since we are processing the bitfield from lower addresses to higher, | ||
3367 |    it does not matter whether we process it in 32 bit chunks or 64 bit | ||
3368 |    chunks as long as it is little endian. (Understand it as a byte stream, | ||
3369 |    beginning with the lowest byte...) If we used big endian, | ||
3370 |    we would need to process it from the highest address to the lowest, | ||
3371 |    in order to be agnostic to the 32 vs 64 bits issue. | ||
3372 | |||
3373 |    returns 0 on failure, 1 if we successfully received it. */ | ||
3374 | static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) | ||
3375 | { | ||
3376 | struct bm_xfer_ctx c; | ||
3377 | void *buffer; | ||
3378 | enum receive_bitmap_ret ret; | ||
3379 | int ok = FALSE; | ||
3380 | |||
3381 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | ||
3382 | |||
3383 | drbd_bm_lock(mdev, "receive bitmap"); | ||
3384 | |||
3385 | /* maybe we should use some per thread scratch page, | ||
3386 | * and allocate that during initial device creation? */ | ||
3387 | buffer = (unsigned long *) __get_free_page(GFP_NOIO); | ||
3388 | if (!buffer) { | ||
3389 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
3390 | goto out; | ||
3391 | } | ||
3392 | |||
3393 | c = (struct bm_xfer_ctx) { | ||
3394 | .bm_bits = drbd_bm_bits(mdev), | ||
3395 | .bm_words = drbd_bm_words(mdev), | ||
3396 | }; | ||
3397 | |||
3398 | do { | ||
3399 | if (h->command == P_BITMAP) { | ||
3400 | ret = receive_bitmap_plain(mdev, h, buffer, &c); | ||
3401 | } else if (h->command == P_COMPRESSED_BITMAP) { | ||
3402 | /* MAYBE: sanity check that we speak proto >= 90, | ||
3403 | * and the feature is enabled! */ | ||
3404 | struct p_compressed_bm *p; | ||
3405 | |||
3406 | if (h->length > BM_PACKET_PAYLOAD_BYTES) { | ||
3407 | dev_err(DEV, "ReportCBitmap packet too large\n"); | ||
3408 | goto out; | ||
3409 | } | ||
3410 | /* use the page buff */ | ||
3411 | p = buffer; | ||
3412 | memcpy(p, h, sizeof(*h)); | ||
3413 | if (drbd_recv(mdev, p->head.payload, h->length) != h->length) | ||
3414 | goto out; | ||
3415 | if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { | ||
3416 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); | ||
3417 | 				goto out; /* not "return FAILED": this function returns an int ok-flag */ | ||
3418 | } | ||
3419 | ret = decode_bitmap_c(mdev, p, &c); | ||
3420 | } else { | ||
3421 | 			dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", h->command); | ||
3422 | goto out; | ||
3423 | } | ||
3424 | |||
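3424 | 		/* index 1 counts plain P_BITMAP packets, index 0 the compressed ones, | ||
3424 | 		 * matching the plain/RLE split reported by INFO_bm_xfer_stats() */ | ||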
3425 | c.packets[h->command == P_BITMAP]++; | ||
3426 | c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; | ||
3427 | |||
3428 | if (ret != OK) | ||
3429 | break; | ||
3430 | |||
3431 | if (!drbd_recv_header(mdev, h)) | ||
3432 | goto out; | ||
3433 | } while (ret == OK); | ||
3434 | if (ret == FAILED) | ||
3435 | goto out; | ||
3436 | |||
3437 | INFO_bm_xfer_stats(mdev, "receive", &c); | ||
3438 | |||
3439 | if (mdev->state.conn == C_WF_BITMAP_T) { | ||
3440 | ok = !drbd_send_bitmap(mdev); | ||
3441 | if (!ok) | ||
3442 | goto out; | ||
3443 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ | ||
3444 | ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
3445 | D_ASSERT(ok == SS_SUCCESS); | ||
3446 | } else if (mdev->state.conn != C_WF_BITMAP_S) { | ||
3447 | /* admin may have requested C_DISCONNECTING, | ||
3448 | * other threads may have noticed network errors */ | ||
3449 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", | ||
3450 | drbd_conn_str(mdev->state.conn)); | ||
3451 | } | ||
3452 | |||
3453 | ok = TRUE; | ||
3454 | out: | ||
3455 | drbd_bm_unlock(mdev); | ||
3456 | if (ok && mdev->state.conn == C_WF_BITMAP_S) | ||
3457 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
3458 | free_page((unsigned long) buffer); | ||
3459 | return ok; | ||
3460 | } | ||
3461 | |||
3462 | static int receive_skip(struct drbd_conf *mdev, struct p_header *h) | ||
3463 | { | ||
3464 | /* TODO zero copy sink :) */ | ||
3465 | static char sink[128]; | ||
3466 | int size, want, r; | ||
3467 | |||
3468 | dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", | ||
3469 | h->command, h->length); | ||
3470 | |||
3471 | size = h->length; | ||
3472 | while (size > 0) { | ||
3473 | want = min_t(int, size, sizeof(sink)); | ||
3474 | r = drbd_recv(mdev, sink, want); | ||
3475 | ERR_IF(r <= 0) break; | ||
3476 | size -= r; | ||
3477 | } | ||
3478 | return size == 0; | ||
3479 | } | ||
3480 | |||
3481 | static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) | ||
3482 | { | ||
3483 | if (mdev->state.disk >= D_INCONSISTENT) | ||
3484 | drbd_kick_lo(mdev); | ||
3485 | |||
3486 | /* Make sure we've acked all the TCP data associated | ||
3487 | * with the data requests being unplugged */ | ||
3488 | drbd_tcp_quickack(mdev->data.socket); | ||
3489 | |||
3490 | return TRUE; | ||
3491 | } | ||
3492 | |||
3493 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); | ||
3494 | |||
3495 | static drbd_cmd_handler_f drbd_default_handler[] = { | ||
3496 | [P_DATA] = receive_Data, | ||
3497 | [P_DATA_REPLY] = receive_DataReply, | ||
3498 | [P_RS_DATA_REPLY] = receive_RSDataReply, | ||
3499 | [P_BARRIER] = receive_Barrier, | ||
3500 | [P_BITMAP] = receive_bitmap, | ||
3501 | [P_COMPRESSED_BITMAP] = receive_bitmap, | ||
3502 | [P_UNPLUG_REMOTE] = receive_UnplugRemote, | ||
3503 | [P_DATA_REQUEST] = receive_DataRequest, | ||
3504 | [P_RS_DATA_REQUEST] = receive_DataRequest, | ||
3505 | [P_SYNC_PARAM] = receive_SyncParam, | ||
3506 | [P_SYNC_PARAM89] = receive_SyncParam, | ||
3507 | [P_PROTOCOL] = receive_protocol, | ||
3508 | [P_UUIDS] = receive_uuids, | ||
3509 | [P_SIZES] = receive_sizes, | ||
3510 | [P_STATE] = receive_state, | ||
3511 | [P_STATE_CHG_REQ] = receive_req_state, | ||
3512 | [P_SYNC_UUID] = receive_sync_uuid, | ||
3513 | [P_OV_REQUEST] = receive_DataRequest, | ||
3514 | [P_OV_REPLY] = receive_DataRequest, | ||
3515 | [P_CSUM_RS_REQUEST] = receive_DataRequest, | ||
3516 | /* anything missing from this table is in | ||
3517 | * the asender_tbl, see get_asender_cmd */ | ||
3518 | [P_MAX_CMD] = NULL, | ||
3519 | }; | ||
3520 | |||
3521 | static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; | ||
3522 | static drbd_cmd_handler_f *drbd_opt_cmd_handler; | ||
3523 | |||
3524 | static void drbdd(struct drbd_conf *mdev) | ||
3525 | { | ||
3526 | drbd_cmd_handler_f handler; | ||
3527 | struct p_header *header = &mdev->data.rbuf.header; | ||
3528 | |||
3529 | while (get_t_state(&mdev->receiver) == Running) { | ||
3530 | drbd_thread_current_set_cpu(mdev); | ||
3531 | if (!drbd_recv_header(mdev, header)) { | ||
3532 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3533 | break; | ||
3534 | } | ||
3535 | |||
3536 | if (header->command < P_MAX_CMD) | ||
3537 | handler = drbd_cmd_handler[header->command]; | ||
3538 | else if (P_MAY_IGNORE < header->command | ||
3539 | && header->command < P_MAX_OPT_CMD) | ||
3540 | handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; | ||
3541 | else if (header->command > P_MAX_OPT_CMD) | ||
3542 | handler = receive_skip; | ||
3543 | else | ||
3544 | handler = NULL; | ||
3545 | |||
3546 | if (unlikely(!handler)) { | ||
3547 | dev_err(DEV, "unknown packet type %d, l: %d!\n", | ||
3548 | header->command, header->length); | ||
3549 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3550 | break; | ||
3551 | } | ||
3552 | if (unlikely(!handler(mdev, header))) { | ||
3553 | dev_err(DEV, "error receiving %s, l: %d!\n", | ||
3554 | cmdname(header->command), header->length); | ||
3555 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3556 | break; | ||
3557 | } | ||
3558 | } | ||
3559 | } | ||
3560 | |||
3561 | static void drbd_fail_pending_reads(struct drbd_conf *mdev) | ||
3562 | { | ||
3563 | struct hlist_head *slot; | ||
3564 | struct hlist_node *pos; | ||
3565 | struct hlist_node *tmp; | ||
3566 | struct drbd_request *req; | ||
3567 | int i; | ||
3568 | |||
3569 | /* | ||
3570 | * Application READ requests | ||
3571 | */ | ||
3572 | spin_lock_irq(&mdev->req_lock); | ||
3573 | for (i = 0; i < APP_R_HSIZE; i++) { | ||
3574 | slot = mdev->app_reads_hash+i; | ||
3575 | hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { | ||
3576 | /* it may (but should not any longer!) | ||
3577 | * be on the work queue; if that assert triggers, | ||
3578 | * we need to also grab the | ||
3579 | * spin_lock_irq(&mdev->data.work.q_lock); | ||
3580 | * and list_del_init here. */ | ||
3581 | D_ASSERT(list_empty(&req->w.list)); | ||
3582 | /* It would be nice to complete outside of spinlock. | ||
3583 | * But this is easier for now. */ | ||
3584 | _req_mod(req, connection_lost_while_pending); | ||
3585 | } | ||
3586 | } | ||
3587 | for (i = 0; i < APP_R_HSIZE; i++) | ||
3588 | if (!hlist_empty(mdev->app_reads_hash+i)) | ||
3589 | dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " | ||
3590 | "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); | ||
3591 | |||
3592 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | ||
3593 | spin_unlock_irq(&mdev->req_lock); | ||
3594 | } | ||
3595 | |||
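3595 | /* Queue a barrier work item and wait for its completion: when this returns, | ||
3595 |  * everything that was on the data.work queue before the call has been | ||
3595 |  * processed by the worker. */ | ||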
3596 | void drbd_flush_workqueue(struct drbd_conf *mdev) | ||
3597 | { | ||
3598 | struct drbd_wq_barrier barr; | ||
3599 | |||
3600 | barr.w.cb = w_prev_work_done; | ||
3601 | init_completion(&barr.done); | ||
3602 | drbd_queue_work(&mdev->data.work, &barr.w); | ||
3603 | wait_for_completion(&barr.done); | ||
3604 | } | ||
3605 | |||
3606 | static void drbd_disconnect(struct drbd_conf *mdev) | ||
3607 | { | ||
3608 | enum drbd_fencing_p fp; | ||
3609 | union drbd_state os, ns; | ||
3610 | int rv = SS_UNKNOWN_ERROR; | ||
3611 | unsigned int i; | ||
3612 | |||
3613 | if (mdev->state.conn == C_STANDALONE) | ||
3614 | return; | ||
3615 | if (mdev->state.conn >= C_WF_CONNECTION) | ||
3616 | dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n", | ||
3617 | drbd_conn_str(mdev->state.conn)); | ||
3618 | |||
3619 | /* asender does not clean up anything. it must not interfere, either */ | ||
3620 | drbd_thread_stop(&mdev->asender); | ||
3621 | drbd_free_sock(mdev); | ||
3622 | |||
3623 | spin_lock_irq(&mdev->req_lock); | ||
3624 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
3625 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); | ||
3626 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); | ||
3627 | spin_unlock_irq(&mdev->req_lock); | ||
3628 | |||
3629 | /* We do not have data structures that would allow us to | ||
3630 | * get the rs_pending_cnt down to 0 again. | ||
3631 | * * On C_SYNC_TARGET we do not have any data structures describing | ||
3632 | * the pending RSDataRequest's we have sent. | ||
3633 | * * On C_SYNC_SOURCE there is no data structure that tracks | ||
3634 | * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. | ||
3635 | * And no, it is not the sum of the reference counts in the | ||
3636 | * resync_LRU. The resync_LRU tracks the whole operation including | ||
3637 | * the disk-IO, while the rs_pending_cnt only tracks the blocks | ||
3638 | * on the fly. */ | ||
3639 | drbd_rs_cancel_all(mdev); | ||
3640 | mdev->rs_total = 0; | ||
3641 | mdev->rs_failed = 0; | ||
3642 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
3643 | wake_up(&mdev->misc_wait); | ||
3644 | |||
3645 | /* make sure syncer is stopped and w_resume_next_sg queued */ | ||
3646 | del_timer_sync(&mdev->resync_timer); | ||
3647 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
3648 | resync_timer_fn((unsigned long)mdev); | ||
3649 | |||
3650 | /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, | ||
3651 | * w_make_resync_request etc. which may still be on the worker queue | ||
3652 | * to be "canceled" */ | ||
3653 | drbd_flush_workqueue(mdev); | ||
3654 | |||
3655 | /* This also does reclaim_net_ee(). If we do this too early, we might | ||
3656 | * miss some resync ee and pages.*/ | ||
3657 | drbd_process_done_ee(mdev); | ||
3658 | |||
3659 | kfree(mdev->p_uuid); | ||
3660 | mdev->p_uuid = NULL; | ||
3661 | |||
3662 | if (!mdev->state.susp) | ||
3663 | tl_clear(mdev); | ||
3664 | |||
3665 | drbd_fail_pending_reads(mdev); | ||
3666 | |||
3667 | dev_info(DEV, "Connection closed\n"); | ||
3668 | |||
3669 | drbd_md_sync(mdev); | ||
3670 | |||
3671 | fp = FP_DONT_CARE; | ||
3672 | if (get_ldev(mdev)) { | ||
3673 | fp = mdev->ldev->dc.fencing; | ||
3674 | put_ldev(mdev); | ||
3675 | } | ||
3676 | |||
3677 | if (mdev->state.role == R_PRIMARY) { | ||
3678 | if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { | ||
3679 | enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); | ||
3680 | drbd_request_state(mdev, NS(pdsk, nps)); | ||
3681 | } | ||
3682 | } | ||
3683 | |||
3684 | spin_lock_irq(&mdev->req_lock); | ||
3685 | os = mdev->state; | ||
3686 | if (os.conn >= C_UNCONNECTED) { | ||
3687 | /* Do not restart in case we are C_DISCONNECTING */ | ||
3688 | ns = os; | ||
3689 | ns.conn = C_UNCONNECTED; | ||
3690 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
3691 | } | ||
3692 | spin_unlock_irq(&mdev->req_lock); | ||
3693 | |||
3694 | if (os.conn == C_DISCONNECTING) { | ||
3695 | struct hlist_head *h; | ||
3696 | wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); | ||
3697 | |||
3698 | /* we must not free the tl_hash | ||
3699 | * while application io is still on the fly */ | ||
3700 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); | ||
3701 | |||
3702 | spin_lock_irq(&mdev->req_lock); | ||
3703 | /* paranoia code */ | ||
3704 | for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) | ||
3705 | if (h->first) | ||
3706 | dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", | ||
3707 | (int)(h - mdev->ee_hash), h->first); | ||
3708 | kfree(mdev->ee_hash); | ||
3709 | mdev->ee_hash = NULL; | ||
3710 | mdev->ee_hash_s = 0; | ||
3711 | |||
3712 | /* paranoia code */ | ||
3713 | for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) | ||
3714 | if (h->first) | ||
3715 | dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", | ||
3716 | (int)(h - mdev->tl_hash), h->first); | ||
3717 | kfree(mdev->tl_hash); | ||
3718 | mdev->tl_hash = NULL; | ||
3719 | mdev->tl_hash_s = 0; | ||
3720 | spin_unlock_irq(&mdev->req_lock); | ||
3721 | |||
3722 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3723 | mdev->cram_hmac_tfm = NULL; | ||
3724 | |||
3725 | kfree(mdev->net_conf); | ||
3726 | mdev->net_conf = NULL; | ||
3727 | drbd_request_state(mdev, NS(conn, C_STANDALONE)); | ||
3728 | } | ||
3729 | |||
3730 | /* tcp_close and release of sendpage pages can be deferred. I don't | ||
3731 | * want to use SO_LINGER, because apparently it can be deferred for | ||
3732 | * more than 20 seconds (longest time I checked). | ||
3733 | * | ||
3734 | * Actually we don't care for exactly when the network stack does its | ||
3735 | * put_page(), but release our reference on these pages right here. | ||
3736 | */ | ||
3737 | i = drbd_release_ee(mdev, &mdev->net_ee); | ||
3738 | if (i) | ||
3739 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); | ||
3740 | i = atomic_read(&mdev->pp_in_use); | ||
3741 | if (i) | ||
3742 | dev_info(DEV, "pp_in_use = %u, expected 0\n", i); | ||
3743 | |||
3744 | D_ASSERT(list_empty(&mdev->read_ee)); | ||
3745 | D_ASSERT(list_empty(&mdev->active_ee)); | ||
3746 | D_ASSERT(list_empty(&mdev->sync_ee)); | ||
3747 | D_ASSERT(list_empty(&mdev->done_ee)); | ||
3748 | |||
3749 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | ||
3750 | atomic_set(&mdev->current_epoch->epoch_size, 0); | ||
3751 | D_ASSERT(list_empty(&mdev->current_epoch->list)); | ||
3752 | } | ||
3753 | |||
3754 | /* | ||
3755 | * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version | ||
3756 | * we can agree on is stored in agreed_pro_version. | ||
3757 | * | ||
3758 | * feature flags and the reserved array should be enough room for future | ||
3759 | * enhancements of the handshake protocol, and possible plugins... | ||
3760 | * | ||
3761 | * for now, they are expected to be zero, but are ignored either way. | ||
3762 | */ | ||
3763 | static int drbd_send_handshake(struct drbd_conf *mdev) | ||
3764 | { | ||
3765 | /* ASSERT current == mdev->receiver ... */ | ||
3766 | struct p_handshake *p = &mdev->data.sbuf.handshake; | ||
3767 | int ok; | ||
3768 | |||
3769 | if (mutex_lock_interruptible(&mdev->data.mutex)) { | ||
3770 | dev_err(DEV, "interrupted during initial handshake\n"); | ||
3771 | return 0; /* interrupted. not ok. */ | ||
3772 | } | ||
3773 | |||
3774 | if (mdev->data.socket == NULL) { | ||
3775 | mutex_unlock(&mdev->data.mutex); | ||
3776 | return 0; | ||
3777 | } | ||
3778 | |||
3779 | memset(p, 0, sizeof(*p)); | ||
3780 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); | ||
3781 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); | ||
3782 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_HAND_SHAKE, | ||
3783 | (struct p_header *)p, sizeof(*p), 0); | ||
3784 | mutex_unlock(&mdev->data.mutex); | ||
3785 | return ok; | ||
3786 | } | ||
3787 | |||
3788 | /* | ||
3789 | * return values: | ||
3790 | * 1 yes, we have a valid connection | ||
3791 | * 0 oops, did not work out, please try again | ||
3792 | * -1 peer talks different language, | ||
3793 | * no point in trying again, please go standalone. | ||
3794 | */ | ||
3795 | static int drbd_do_handshake(struct drbd_conf *mdev) | ||
3796 | { | ||
3797 | /* ASSERT current == mdev->receiver ... */ | ||
3798 | struct p_handshake *p = &mdev->data.rbuf.handshake; | ||
3799 | const int expect = sizeof(struct p_handshake) | ||
3800 | -sizeof(struct p_header); | ||
3801 | int rv; | ||
3802 | |||
3803 | rv = drbd_send_handshake(mdev); | ||
3804 | if (!rv) | ||
3805 | return 0; | ||
3806 | |||
3807 | rv = drbd_recv_header(mdev, &p->head); | ||
3808 | if (!rv) | ||
3809 | return 0; | ||
3810 | |||
3811 | if (p->head.command != P_HAND_SHAKE) { | ||
3812 | dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", | ||
3813 | cmdname(p->head.command), p->head.command); | ||
3814 | return -1; | ||
3815 | } | ||
3816 | |||
3817 | if (p->head.length != expect) { | ||
3818 | dev_err(DEV, "expected HandShake length: %u, received: %u\n", | ||
3819 | expect, p->head.length); | ||
3820 | return -1; | ||
3821 | } | ||
3822 | |||
3823 | rv = drbd_recv(mdev, &p->head.payload, expect); | ||
3824 | |||
3825 | if (rv != expect) { | ||
3826 | dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); | ||
3827 | return 0; | ||
3828 | } | ||
3829 | |||
3830 | p->protocol_min = be32_to_cpu(p->protocol_min); | ||
3831 | p->protocol_max = be32_to_cpu(p->protocol_max); | ||
3832 | if (p->protocol_max == 0) | ||
3833 | p->protocol_max = p->protocol_min; | ||
3834 | |||
3835 | if (PRO_VERSION_MAX < p->protocol_min || | ||
3836 | PRO_VERSION_MIN > p->protocol_max) | ||
3837 | goto incompat; | ||
3838 | |||
3839 | mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); | ||
3840 | |||
3841 | dev_info(DEV, "Handshake successful: " | ||
3842 | "Agreed network protocol version %d\n", mdev->agreed_pro_version); | ||
3843 | |||
3844 | return 1; | ||
3845 | |||
3846 | incompat: | ||
3847 | dev_err(DEV, "incompatible DRBD dialects: " | ||
3848 | "I support %d-%d, peer supports %d-%d\n", | ||
3849 | PRO_VERSION_MIN, PRO_VERSION_MAX, | ||
3850 | p->protocol_min, p->protocol_max); | ||
3851 | return -1; | ||
3852 | } | ||
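The negotiation in drbd_do_handshake() boils down to an interval-overlap test between the local [PRO_VERSION_MIN, PRO_VERSION_MAX] range and the range advertised by the peer, agreeing on the highest version both sides support. The standalone C sketch below illustrates only that arithmetic; the constants, the function name, and the example version numbers are assumptions for illustration, not the driver's values.

#include <stdio.h>

/* illustrative version range; not the real PRO_VERSION_MIN/MAX values */
#define MY_MIN 86
#define MY_MAX 91

/* returns the agreed protocol version, or -1 if the ranges do not overlap */
static int agree_version(int peer_min, int peer_max)
{
	if (peer_max == 0)		/* some peers advertise only one version */
		peer_max = peer_min;
	if (MY_MAX < peer_min || MY_MIN > peer_max)
		return -1;		/* incompatible dialects */
	return MY_MAX < peer_max ? MY_MAX : peer_max;
}

int main(void)
{
	printf("%d\n", agree_version(88, 94));	/* prints 91 */
	printf("%d\n", agree_version(80, 84));	/* prints -1 */
	return 0;
}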
3853 | |||
3854 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | ||
3855 | static int drbd_do_auth(struct drbd_conf *mdev) | ||
3856 | { | ||
3857 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); | ||
3858 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | ||
3859 | return -1; | ||
3860 | } | ||
3861 | #else | ||
3862 | #define CHALLENGE_LEN 64 | ||
3863 | |||
3864 | /* Return value: | ||
3865 | 1 - auth succeeded, | ||
3866 | 0 - failed, try again (network error), | ||
3867 | -1 - auth failed, don't try again. | ||
3868 | */ | ||
3869 | |||
3870 | static int drbd_do_auth(struct drbd_conf *mdev) | ||
3871 | { | ||
3872 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ | ||
3873 | struct scatterlist sg; | ||
3874 | char *response = NULL; | ||
3875 | char *right_response = NULL; | ||
3876 | char *peers_ch = NULL; | ||
3877 | struct p_header p; | ||
3878 | unsigned int key_len = strlen(mdev->net_conf->shared_secret); | ||
3879 | unsigned int resp_size; | ||
3880 | struct hash_desc desc; | ||
3881 | int rv; | ||
3882 | |||
3883 | desc.tfm = mdev->cram_hmac_tfm; | ||
3884 | desc.flags = 0; | ||
3885 | |||
3886 | rv = crypto_hash_setkey(mdev->cram_hmac_tfm, | ||
3887 | (u8 *)mdev->net_conf->shared_secret, key_len); | ||
3888 | if (rv) { | ||
3889 | dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); | ||
3890 | rv = -1; | ||
3891 | goto fail; | ||
3892 | } | ||
3893 | |||
3894 | get_random_bytes(my_challenge, CHALLENGE_LEN); | ||
3895 | |||
3896 | rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); | ||
3897 | if (!rv) | ||
3898 | goto fail; | ||
3899 | |||
3900 | rv = drbd_recv_header(mdev, &p); | ||
3901 | if (!rv) | ||
3902 | goto fail; | ||
3903 | |||
3904 | if (p.command != P_AUTH_CHALLENGE) { | ||
3905 | dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", | ||
3906 | cmdname(p.command), p.command); | ||
3907 | rv = 0; | ||
3908 | goto fail; | ||
3909 | } | ||
3910 | |||
3911 | if (p.length > CHALLENGE_LEN*2) { | ||
3912 | dev_err(DEV, "AuthChallenge payload too big.\n"); | ||
3913 | rv = -1; | ||
3914 | goto fail; | ||
3915 | } | ||
3916 | |||
3917 | peers_ch = kmalloc(p.length, GFP_NOIO); | ||
3918 | if (peers_ch == NULL) { | ||
3919 | dev_err(DEV, "kmalloc of peers_ch failed\n"); | ||
3920 | rv = -1; | ||
3921 | goto fail; | ||
3922 | } | ||
3923 | |||
3924 | rv = drbd_recv(mdev, peers_ch, p.length); | ||
3925 | |||
3926 | if (rv != p.length) { | ||
3927 | dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); | ||
3928 | rv = 0; | ||
3929 | goto fail; | ||
3930 | } | ||
3931 | |||
3932 | resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); | ||
3933 | response = kmalloc(resp_size, GFP_NOIO); | ||
3934 | if (response == NULL) { | ||
3935 | dev_err(DEV, "kmalloc of response failed\n"); | ||
3936 | rv = -1; | ||
3937 | goto fail; | ||
3938 | } | ||
3939 | |||
3940 | sg_init_table(&sg, 1); | ||
3941 | sg_set_buf(&sg, peers_ch, p.length); | ||
3942 | |||
3943 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); | ||
3944 | if (rv) { | ||
3945 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | ||
3946 | rv = -1; | ||
3947 | goto fail; | ||
3948 | } | ||
3949 | |||
3950 | rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); | ||
3951 | if (!rv) | ||
3952 | goto fail; | ||
3953 | |||
3954 | rv = drbd_recv_header(mdev, &p); | ||
3955 | if (!rv) | ||
3956 | goto fail; | ||
3957 | |||
3958 | if (p.command != P_AUTH_RESPONSE) { | ||
3959 | dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", | ||
3960 | cmdname(p.command), p.command); | ||
3961 | rv = 0; | ||
3962 | goto fail; | ||
3963 | } | ||
3964 | |||
3965 | if (p.length != resp_size) { | ||
3966 | dev_err(DEV, "AuthResponse payload of wrong size\n"); | ||
3967 | rv = 0; | ||
3968 | goto fail; | ||
3969 | } | ||
3970 | |||
3971 | rv = drbd_recv(mdev, response, resp_size); | ||
3972 | |||
3973 | if (rv != resp_size) { | ||
3974 | dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); | ||
3975 | rv = 0; | ||
3976 | goto fail; | ||
3977 | } | ||
3978 | |||
3979 | right_response = kmalloc(resp_size, GFP_NOIO); | ||
3980 | if (right_response == NULL) { | ||
3981 | dev_err(DEV, "kmalloc of right_response failed\n"); | ||
3982 | rv = -1; | ||
3983 | goto fail; | ||
3984 | } | ||
3985 | |||
3986 | sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); | ||
3987 | |||
3988 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); | ||
3989 | if (rv) { | ||
3990 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | ||
3991 | rv = -1; | ||
3992 | goto fail; | ||
3993 | } | ||
3994 | |||
3995 | rv = !memcmp(response, right_response, resp_size); | ||
3996 | |||
3997 | if (rv) | ||
3998 | dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", | ||
3999 | resp_size, mdev->net_conf->cram_hmac_alg); | ||
4000 | else | ||
4001 | rv = -1; | ||
4002 | |||
4003 | fail: | ||
4004 | kfree(peers_ch); | ||
4005 | kfree(response); | ||
4006 | kfree(right_response); | ||
4007 | |||
4008 | return rv; | ||
4009 | } | ||
4010 | #endif | ||
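drbd_do_auth() above implements a CRAM-style challenge/response: each side sends a random challenge, answers the peer's challenge with an HMAC keyed by the shared secret, and compares the peer's answer against its own local computation. The toy below sketches just that control flow in userspace; digest() is a stand-in keyed hash (not an HMAC, not cryptographically secure) and every name here is illustrative, not the driver's.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* toy keyed digest (FNV-1a over key then message); NOT an HMAC, NOT secure */
static uint64_t digest(const char *key, const uint8_t *msg, size_t len)
{
	uint64_t h = 1469598103934665603ull;
	for (; *key; key++)
		h = (h ^ (uint8_t)*key) * 1099511628211ull;
	while (len--)
		h = (h ^ *msg++) * 1099511628211ull;
	return h;
}

int main(void)
{
	const char *secret = "shared-secret";		/* configured on both nodes */
	uint8_t my_challenge[8]   = {1, 2, 3, 4, 5, 6, 7, 8};
	uint8_t peer_challenge[8] = {9, 8, 7, 6, 5, 4, 3, 2};

	/* we answer the peer's challenge with a keyed digest ... */
	uint64_t response = digest(secret, peer_challenge, sizeof(peer_challenge));

	/* ... and verify the peer's answer to our challenge against our own
	 * local computation (here both are computed locally for the demo) */
	uint64_t peers_answer   = digest(secret, my_challenge, sizeof(my_challenge));
	uint64_t right_response = digest(secret, my_challenge, sizeof(my_challenge));

	printf("our response: %016llx\n", (unsigned long long)response);
	printf("peer %s\n", peers_answer == right_response ? "authenticated" : "rejected");
	return 0;
}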
4011 | |||
4012 | int drbdd_init(struct drbd_thread *thi) | ||
4013 | { | ||
4014 | struct drbd_conf *mdev = thi->mdev; | ||
4015 | unsigned int minor = mdev_to_minor(mdev); | ||
4016 | int h; | ||
4017 | |||
4018 | sprintf(current->comm, "drbd%d_receiver", minor); | ||
4019 | |||
4020 | dev_info(DEV, "receiver (re)started\n"); | ||
4021 | |||
4022 | do { | ||
4023 | h = drbd_connect(mdev); | ||
4024 | if (h == 0) { | ||
4025 | drbd_disconnect(mdev); | ||
4026 | __set_current_state(TASK_INTERRUPTIBLE); | ||
4027 | schedule_timeout(HZ); | ||
4028 | } | ||
4029 | if (h == -1) { | ||
4030 | dev_warn(DEV, "Discarding network configuration.\n"); | ||
4031 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
4032 | } | ||
4033 | } while (h == 0); | ||
4034 | |||
4035 | if (h > 0) { | ||
4036 | if (get_net_conf(mdev)) { | ||
4037 | drbdd(mdev); | ||
4038 | put_net_conf(mdev); | ||
4039 | } | ||
4040 | } | ||
4041 | |||
4042 | drbd_disconnect(mdev); | ||
4043 | |||
4044 | dev_info(DEV, "receiver terminated\n"); | ||
4045 | return 0; | ||
4046 | } | ||
4047 | |||
4048 | /* ********* acknowledge sender ******** */ | ||
4049 | |||
4050 | static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) | ||
4051 | { | ||
4052 | struct p_req_state_reply *p = (struct p_req_state_reply *)h; | ||
4053 | |||
4054 | int retcode = be32_to_cpu(p->retcode); | ||
4055 | |||
4056 | if (retcode >= SS_SUCCESS) { | ||
4057 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); | ||
4058 | } else { | ||
4059 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); | ||
4060 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", | ||
4061 | drbd_set_st_err_str(retcode), retcode); | ||
4062 | } | ||
4063 | wake_up(&mdev->state_wait); | ||
4064 | |||
4065 | return TRUE; | ||
4066 | } | ||
4067 | |||
4068 | static int got_Ping(struct drbd_conf *mdev, struct p_header *h) | ||
4069 | { | ||
4070 | return drbd_send_ping_ack(mdev); | ||
4071 | |||
4072 | } | ||
4073 | |||
4074 | static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) | ||
4075 | { | ||
4076 | /* restore idle timeout */ | ||
4077 | mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
4078 | if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) | ||
4079 | wake_up(&mdev->misc_wait); | ||
4080 | |||
4081 | return TRUE; | ||
4082 | } | ||
4083 | |||
4084 | static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) | ||
4085 | { | ||
4086 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4087 | sector_t sector = be64_to_cpu(p->sector); | ||
4088 | int blksize = be32_to_cpu(p->blksize); | ||
4089 | |||
4090 | D_ASSERT(mdev->agreed_pro_version >= 89); | ||
4091 | |||
4092 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4093 | |||
4094 | drbd_rs_complete_io(mdev, sector); | ||
4095 | drbd_set_in_sync(mdev, sector, blksize); | ||
4096 | /* rs_same_csum is supposed to count in units of BM_BLOCK_SIZE */ | ||
4097 | mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); | ||
4098 | dec_rs_pending(mdev); | ||
4099 | |||
4100 | return TRUE; | ||
4101 | } | ||
4102 | |||
4103 | /* when we receive the ACK for a write request, | ||
4104 | * verify that we actually know about it */ | ||
4105 | static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, | ||
4106 | u64 id, sector_t sector) | ||
4107 | { | ||
4108 | struct hlist_head *slot = tl_hash_slot(mdev, sector); | ||
4109 | struct hlist_node *n; | ||
4110 | struct drbd_request *req; | ||
4111 | |||
4112 | hlist_for_each_entry(req, n, slot, colision) { | ||
4113 | if ((unsigned long)req == (unsigned long)id) { | ||
4114 | if (req->sector != sector) { | ||
4115 | dev_err(DEV, "_ack_id_to_req: found req %p but it has " | ||
4116 | "wrong sector (%llus versus %llus)\n", req, | ||
4117 | (unsigned long long)req->sector, | ||
4118 | (unsigned long long)sector); | ||
4119 | break; | ||
4120 | } | ||
4121 | return req; | ||
4122 | } | ||
4123 | } | ||
4124 | dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n", | ||
4125 | (void *)(unsigned long)id, (unsigned long long)sector); | ||
4126 | return NULL; | ||
4127 | } | ||
4128 | |||
4129 | typedef struct drbd_request *(req_validator_fn) | ||
4130 | (struct drbd_conf *mdev, u64 id, sector_t sector); | ||
4131 | |||
4132 | static int validate_req_change_req_state(struct drbd_conf *mdev, | ||
4133 | u64 id, sector_t sector, req_validator_fn validator, | ||
4134 | const char *func, enum drbd_req_event what) | ||
4135 | { | ||
4136 | struct drbd_request *req; | ||
4137 | struct bio_and_error m; | ||
4138 | |||
4139 | spin_lock_irq(&mdev->req_lock); | ||
4140 | req = validator(mdev, id, sector); | ||
4141 | if (unlikely(!req)) { | ||
4142 | spin_unlock_irq(&mdev->req_lock); | ||
4143 | dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); | ||
4144 | return FALSE; | ||
4145 | } | ||
4146 | __req_mod(req, what, &m); | ||
4147 | spin_unlock_irq(&mdev->req_lock); | ||
4148 | |||
4149 | if (m.bio) | ||
4150 | complete_master_bio(mdev, &m); | ||
4151 | return TRUE; | ||
4152 | } | ||
4153 | |||
4154 | static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) | ||
4155 | { | ||
4156 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4157 | sector_t sector = be64_to_cpu(p->sector); | ||
4158 | int blksize = be32_to_cpu(p->blksize); | ||
4159 | enum drbd_req_event what; | ||
4160 | |||
4161 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4162 | |||
4163 | if (is_syncer_block_id(p->block_id)) { | ||
4164 | drbd_set_in_sync(mdev, sector, blksize); | ||
4165 | dec_rs_pending(mdev); | ||
4166 | return TRUE; | ||
4167 | } | ||
4168 | switch (be16_to_cpu(h->command)) { | ||
4169 | case P_RS_WRITE_ACK: | ||
4170 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4171 | what = write_acked_by_peer_and_sis; | ||
4172 | break; | ||
4173 | case P_WRITE_ACK: | ||
4174 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4175 | what = write_acked_by_peer; | ||
4176 | break; | ||
4177 | case P_RECV_ACK: | ||
4178 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); | ||
4179 | what = recv_acked_by_peer; | ||
4180 | break; | ||
4181 | case P_DISCARD_ACK: | ||
4182 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4183 | what = conflict_discarded_by_peer; | ||
4184 | break; | ||
4185 | default: | ||
4186 | D_ASSERT(0); | ||
4187 | return FALSE; | ||
4188 | } | ||
4189 | |||
4190 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4191 | _ack_id_to_req, __func__ , what); | ||
4192 | } | ||
4193 | |||
4194 | static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) | ||
4195 | { | ||
4196 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4197 | sector_t sector = be64_to_cpu(p->sector); | ||
4198 | |||
4199 | if (__ratelimit(&drbd_ratelimit_state)) | ||
4200 | dev_warn(DEV, "Got NegAck packet. Peer is in trouble?\n"); | ||
4201 | |||
4202 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4203 | |||
4204 | if (is_syncer_block_id(p->block_id)) { | ||
4205 | int size = be32_to_cpu(p->blksize); | ||
4206 | dec_rs_pending(mdev); | ||
4207 | drbd_rs_failed_io(mdev, sector, size); | ||
4208 | return TRUE; | ||
4209 | } | ||
4210 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4211 | _ack_id_to_req, __func__ , neg_acked); | ||
4212 | } | ||
4213 | |||
4214 | static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) | ||
4215 | { | ||
4216 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4217 | sector_t sector = be64_to_cpu(p->sector); | ||
4218 | |||
4219 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4220 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", | ||
4221 | (unsigned long long)sector, be32_to_cpu(p->blksize)); | ||
4222 | |||
4223 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4224 | _ar_id_to_req, __func__ , neg_acked); | ||
4225 | } | ||
4226 | |||
4227 | static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) | ||
4228 | { | ||
4229 | sector_t sector; | ||
4230 | int size; | ||
4231 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4232 | |||
4233 | sector = be64_to_cpu(p->sector); | ||
4234 | size = be32_to_cpu(p->blksize); | ||
4235 | D_ASSERT(p->block_id == ID_SYNCER); | ||
4236 | |||
4237 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4238 | |||
4239 | dec_rs_pending(mdev); | ||
4240 | |||
4241 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
4242 | drbd_rs_complete_io(mdev, sector); | ||
4243 | drbd_rs_failed_io(mdev, sector, size); | ||
4244 | put_ldev(mdev); | ||
4245 | } | ||
4246 | |||
4247 | return TRUE; | ||
4248 | } | ||
4249 | |||
4250 | static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) | ||
4251 | { | ||
4252 | struct p_barrier_ack *p = (struct p_barrier_ack *)h; | ||
4253 | |||
4254 | tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); | ||
4255 | |||
4256 | return TRUE; | ||
4257 | } | ||
4258 | |||
4259 | static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) | ||
4260 | { | ||
4261 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4262 | struct drbd_work *w; | ||
4263 | sector_t sector; | ||
4264 | int size; | ||
4265 | |||
4266 | sector = be64_to_cpu(p->sector); | ||
4267 | size = be32_to_cpu(p->blksize); | ||
4268 | |||
4269 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4270 | |||
4271 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) | ||
4272 | drbd_ov_oos_found(mdev, sector, size); | ||
4273 | else | ||
4274 | ov_oos_print(mdev); | ||
4275 | |||
4276 | drbd_rs_complete_io(mdev, sector); | ||
4277 | dec_rs_pending(mdev); | ||
4278 | |||
4279 | if (--mdev->ov_left == 0) { | ||
4280 | w = kmalloc(sizeof(*w), GFP_NOIO); | ||
4281 | if (w) { | ||
4282 | w->cb = w_ov_finished; | ||
4283 | drbd_queue_work_front(&mdev->data.work, w); | ||
4284 | } else { | ||
4285 | dev_err(DEV, "kmalloc(w) failed.\n"); | ||
4286 | ov_oos_print(mdev); | ||
4287 | drbd_resync_finished(mdev); | ||
4288 | } | ||
4289 | } | ||
4290 | return TRUE; | ||
4291 | } | ||
4292 | |||
4293 | struct asender_cmd { | ||
4294 | size_t pkt_size; | ||
4295 | int (*process)(struct drbd_conf *mdev, struct p_header *h); | ||
4296 | }; | ||
4297 | |||
4298 | static struct asender_cmd *get_asender_cmd(int cmd) | ||
4299 | { | ||
4300 | static struct asender_cmd asender_tbl[] = { | ||
4301 | /* anything missing from this table is in | ||
4302 | * the drbd_cmd_handler (drbd_default_handler) table, | ||
4303 | * see the beginning of drbdd() */ | ||
4304 | [P_PING] = { sizeof(struct p_header), got_Ping }, | ||
4305 | [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, | ||
4306 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4307 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4308 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4309 | [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4310 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, | ||
4311 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, | ||
4312 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, | ||
4313 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, | ||
4314 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | ||
4315 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | ||
4316 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | ||
4317 | [P_MAX_CMD] = { 0, NULL }, | ||
4318 | }; | ||
4319 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | ||
4320 | return NULL; | ||
4321 | return &asender_tbl[cmd]; | ||
4322 | } | ||
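get_asender_cmd() uses a common kernel pattern: a table of handlers indexed by the on-wire command number, with a per-command expected packet size and a bounds/NULL check on lookup. The sketch below shows the pattern with made-up commands and handlers; it is not the driver's table.

#include <stdio.h>

enum cmd { CMD_PING, CMD_PING_ACK, CMD_MAX };

struct handler {
	int payload_size;		/* expected payload, analogous to pkt_size */
	int (*process)(void);
};

static int got_ping(void)     { puts("ping");     return 1; }
static int got_ping_ack(void) { puts("ping ack"); return 1; }

/* table indexed by command number; unknown/out-of-range commands map to NULL */
static const struct handler table[] = {
	[CMD_PING]     = { 0, got_ping },
	[CMD_PING_ACK] = { 0, got_ping_ack },
	[CMD_MAX]      = { 0, NULL },
};

static const struct handler *lookup(int cmd)
{
	if (cmd < 0 || cmd > CMD_MAX || table[cmd].process == NULL)
		return NULL;
	return &table[cmd];
}

int main(void)
{
	const struct handler *h = lookup(CMD_PING);
	return h ? !h->process() : 1;
}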
4323 | |||
4324 | int drbd_asender(struct drbd_thread *thi) | ||
4325 | { | ||
4326 | struct drbd_conf *mdev = thi->mdev; | ||
4327 | struct p_header *h = &mdev->meta.rbuf.header; | ||
4328 | struct asender_cmd *cmd = NULL; | ||
4329 | |||
4330 | int rv, len; | ||
4331 | void *buf = h; | ||
4332 | int received = 0; | ||
4333 | int expect = sizeof(struct p_header); | ||
4334 | int empty; | ||
4335 | |||
4336 | sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); | ||
4337 | |||
4338 | current->policy = SCHED_RR; /* Make this a realtime task! */ | ||
4339 | current->rt_priority = 2; /* more important than all other tasks */ | ||
4340 | |||
4341 | while (get_t_state(thi) == Running) { | ||
4342 | drbd_thread_current_set_cpu(mdev); | ||
4343 | if (test_and_clear_bit(SEND_PING, &mdev->flags)) { | ||
4344 | ERR_IF(!drbd_send_ping(mdev)) goto reconnect; | ||
4345 | mdev->meta.socket->sk->sk_rcvtimeo = | ||
4346 | mdev->net_conf->ping_timeo*HZ/10; | ||
4347 | } | ||
4348 | |||
4349 | /* conditionally cork; | ||
4350 | * it may hurt latency if we cork without much to send */ | ||
4351 | if (!mdev->net_conf->no_cork && | ||
4352 | 3 < atomic_read(&mdev->unacked_cnt)) | ||
4353 | drbd_tcp_cork(mdev->meta.socket); | ||
4354 | while (1) { | ||
4355 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4356 | flush_signals(current); | ||
4357 | if (!drbd_process_done_ee(mdev)) { | ||
4358 | dev_err(DEV, "process_done_ee() = NOT_OK\n"); | ||
4359 | goto reconnect; | ||
4360 | } | ||
4361 | /* to avoid race with newly queued ACKs */ | ||
4362 | set_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4363 | spin_lock_irq(&mdev->req_lock); | ||
4364 | empty = list_empty(&mdev->done_ee); | ||
4365 | spin_unlock_irq(&mdev->req_lock); | ||
4366 | /* new ack may have been queued right here, | ||
4367 | * but then there is also a signal pending, | ||
4368 | * and we start over... */ | ||
4369 | if (empty) | ||
4370 | break; | ||
4371 | } | ||
4372 | /* but unconditionally uncork unless disabled */ | ||
4373 | if (!mdev->net_conf->no_cork) | ||
4374 | drbd_tcp_uncork(mdev->meta.socket); | ||
4375 | |||
4376 | /* short circuit, recv_msg would return EINTR anyways. */ | ||
4377 | if (signal_pending(current)) | ||
4378 | continue; | ||
4379 | |||
4380 | rv = drbd_recv_short(mdev, mdev->meta.socket, | ||
4381 | buf, expect-received, 0); | ||
4382 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4383 | |||
4384 | flush_signals(current); | ||
4385 | |||
4386 | /* Note: | ||
4387 | * -EINTR (on meta) we got a signal | ||
4388 | * -EAGAIN (on meta) rcvtimeo expired | ||
4389 | * -ECONNRESET other side closed the connection | ||
4390 | * -ERESTARTSYS (on data) we got a signal | ||
4391 | * rv < 0 other than above: unexpected error! | ||
4392 | * rv == expected: full header or command | ||
4393 | * rv < expected: "woken" by signal during receive | ||
4394 | * rv == 0 : "connection shut down by peer" | ||
4395 | */ | ||
4396 | if (likely(rv > 0)) { | ||
4397 | received += rv; | ||
4398 | buf += rv; | ||
4399 | } else if (rv == 0) { | ||
4400 | dev_err(DEV, "meta connection shut down by peer.\n"); | ||
4401 | goto reconnect; | ||
4402 | } else if (rv == -EAGAIN) { | ||
4403 | if (mdev->meta.socket->sk->sk_rcvtimeo == | ||
4404 | mdev->net_conf->ping_timeo*HZ/10) { | ||
4405 | dev_err(DEV, "PingAck did not arrive in time.\n"); | ||
4406 | goto reconnect; | ||
4407 | } | ||
4408 | set_bit(SEND_PING, &mdev->flags); | ||
4409 | continue; | ||
4410 | } else if (rv == -EINTR) { | ||
4411 | continue; | ||
4412 | } else { | ||
4413 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
4414 | goto reconnect; | ||
4415 | } | ||
4416 | |||
4417 | if (received == expect && cmd == NULL) { | ||
4418 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | ||
4419 | dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", | ||
4420 | (long)be32_to_cpu(h->magic), | ||
4421 | h->command, h->length); | ||
4422 | goto reconnect; | ||
4423 | } | ||
4424 | cmd = get_asender_cmd(be16_to_cpu(h->command)); | ||
4425 | len = be16_to_cpu(h->length); | ||
4426 | if (unlikely(cmd == NULL)) { | ||
4427 | dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", | ||
4428 | (long)be32_to_cpu(h->magic), | ||
4429 | h->command, h->length); | ||
4430 | goto disconnect; | ||
4431 | } | ||
4432 | expect = cmd->pkt_size; | ||
4433 | ERR_IF(len != expect-sizeof(struct p_header)) | ||
4434 | goto reconnect; | ||
4435 | } | ||
4436 | if (received == expect) { | ||
4437 | D_ASSERT(cmd != NULL); | ||
4438 | if (!cmd->process(mdev, h)) | ||
4439 | goto reconnect; | ||
4440 | |||
4441 | buf = h; | ||
4442 | received = 0; | ||
4443 | expect = sizeof(struct p_header); | ||
4444 | cmd = NULL; | ||
4445 | } | ||
4446 | } | ||
4447 | |||
4448 | if (0) { | ||
4449 | reconnect: | ||
4450 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
4451 | } | ||
4452 | if (0) { | ||
4453 | disconnect: | ||
4454 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
4455 | } | ||
4456 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4457 | |||
4458 | D_ASSERT(mdev->state.conn < C_CONNECTED); | ||
4459 | dev_info(DEV, "asender terminated\n"); | ||
4460 | |||
4461 | return 0; | ||
4462 | } | ||
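The receive path in drbd_asender() keeps a received/expect pair and accumulates bytes across short reads until a full fixed-size header (and then the full packet) has arrived. The standalone sketch below shows that accumulation loop with a fake transport standing in for drbd_recv_short(); the header layout, the magic value, and all names are assumptions for the example only.

#include <stdio.h>
#include <string.h>

struct hdr { unsigned int magic, command, length; };

/* fake transport standing in for drbd_recv_short(): returns at most 3 bytes
 * per call to simulate short reads on a stream socket */
static int recv_some(void *dst, int want)
{
	static const struct hdr wire = { 0x83740267u, 2, 0 };	/* made-up packet */
	static int off;
	int n = want < 3 ? want : 3;

	if (off + n > (int)sizeof(wire))
		n = (int)sizeof(wire) - off;
	memcpy(dst, (const char *)&wire + off, n);
	off += n;
	return n;
}

int main(void)
{
	struct hdr h;
	char *buf = (char *)&h;
	int received = 0, expect = (int)sizeof(h);

	/* accumulate until the full header has arrived, as drbd_asender() does */
	while (received < expect) {
		int rv = recv_some(buf + received, expect - received);
		if (rv <= 0)
			return 1;	/* error or connection closed */
		received += rv;
	}
	printf("header complete, command %u\n", h.command);
	return 0;
}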
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 000000000000..de81ab7b4627 --- /dev/null +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -0,0 +1,1125 @@ | |||
1 | /* | ||
2 | drbd_req.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <linux/slab.h> | ||
29 | #include <linux/drbd.h> | ||
30 | #include "drbd_int.h" | ||
31 | #include "drbd_req.h" | ||
32 | |||
33 | |||
34 | /* Update disk stats at start of I/O request */ | ||
35 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | ||
36 | { | ||
37 | const int rw = bio_data_dir(bio); | ||
38 | int cpu; | ||
39 | cpu = part_stat_lock(); | ||
40 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | ||
41 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | ||
42 | part_inc_in_flight(&mdev->vdisk->part0, rw); | ||
43 | part_stat_unlock(); | ||
44 | } | ||
45 | |||
46 | /* Update disk stats when completing request upwards */ | ||
47 | static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) | ||
48 | { | ||
49 | int rw = bio_data_dir(req->master_bio); | ||
50 | unsigned long duration = jiffies - req->start_time; | ||
51 | int cpu; | ||
52 | cpu = part_stat_lock(); | ||
53 | part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); | ||
54 | part_round_stats(cpu, &mdev->vdisk->part0); | ||
55 | part_dec_in_flight(&mdev->vdisk->part0, rw); | ||
56 | part_stat_unlock(); | ||
57 | } | ||
58 | |||
59 | static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) | ||
60 | { | ||
61 | const unsigned long s = req->rq_state; | ||
62 | /* if it was a write, we may have to set the corresponding | ||
63 | * bit(s) out-of-sync first. If it had a local part, we need to | ||
64 | * release the reference to the activity log. */ | ||
65 | if (rw == WRITE) { | ||
66 | /* remove it from the transfer log. | ||
67 | * well, only if it had been there in the first | ||
68 | * place... if it had not (local only or conflicting | ||
69 | * and never sent), it should still be "empty" as | ||
70 | * initialized in drbd_req_new(), so we can list_del() it | ||
71 | * here unconditionally */ | ||
72 | list_del(&req->tl_requests); | ||
73 | /* Set out-of-sync unless both OK flags are set | ||
74 | * (local only or remote failed). | ||
75 | * Other places where we set out-of-sync: | ||
76 | * READ with local io-error */ | ||
77 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
78 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
79 | |||
80 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | ||
81 | drbd_set_in_sync(mdev, req->sector, req->size); | ||
82 | |||
83 | /* one might be tempted to move the drbd_al_complete_io | ||
84 | * to the local io completion callback drbd_endio_pri. | ||
85 | * but, if this was a mirror write, we may only | ||
86 | * drbd_al_complete_io after this is RQ_NET_DONE, | ||
87 | * otherwise the extent could be dropped from the al | ||
88 | * before it has actually been written on the peer. | ||
89 | * if we crash before our peer knows about the request, | ||
90 | * but after the extent has been dropped from the al, | ||
91 | * we would forget to resync the corresponding extent. | ||
92 | */ | ||
93 | if (s & RQ_LOCAL_MASK) { | ||
94 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
95 | drbd_al_complete_io(mdev, req->sector); | ||
96 | put_ldev(mdev); | ||
97 | } else if (__ratelimit(&drbd_ratelimit_state)) { | ||
98 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " | ||
99 | "but my Disk seems to have failed :(\n", | ||
100 | (unsigned long long) req->sector); | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | |||
105 | /* if it was a local io error, we want to notify our | ||
106 | * peer about that, and see if we need to | ||
107 | * detach the disk and stuff. | ||
108 | * to avoid allocating some special work | ||
109 | * struct, reuse the request. */ | ||
110 | |||
111 | /* THINK | ||
112 | * why do we not do this when we detect the error, | ||
113 | * but delay it until it is "done", i.e. possibly | ||
114 | * until the next barrier ack? */ | ||
115 | |||
116 | if (rw == WRITE && | ||
117 | ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { | ||
118 | if (!(req->w.list.next == LIST_POISON1 || | ||
119 | list_empty(&req->w.list))) { | ||
120 | /* DEBUG ASSERT only; if this triggers, we | ||
121 | * probably corrupt the worker list here */ | ||
122 | dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); | ||
123 | dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); | ||
124 | } | ||
125 | req->w.cb = w_io_error; | ||
126 | drbd_queue_work(&mdev->data.work, &req->w); | ||
127 | /* drbd_req_free() is done in w_io_error */ | ||
128 | } else { | ||
129 | drbd_req_free(req); | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void queue_barrier(struct drbd_conf *mdev) | ||
134 | { | ||
135 | struct drbd_tl_epoch *b; | ||
136 | |||
137 | /* We are within the req_lock. Once we queued the barrier for sending, | ||
138 | * we set the CREATE_BARRIER bit. It is cleared as soon as a new | ||
139 | * barrier/epoch object is added. This is the only place this bit is | ||
140 | * set. It indicates that the barrier for this epoch is already queued, | ||
141 | * and no new epoch has been created yet. */ | ||
142 | if (test_bit(CREATE_BARRIER, &mdev->flags)) | ||
143 | return; | ||
144 | |||
145 | b = mdev->newest_tle; | ||
146 | b->w.cb = w_send_barrier; | ||
147 | /* inc_ap_pending done here, so we won't | ||
148 | * get imbalanced on connection loss. | ||
149 | * dec_ap_pending will be done in got_BarrierAck | ||
150 | * or (on connection loss) in tl_clear. */ | ||
151 | inc_ap_pending(mdev); | ||
152 | drbd_queue_work(&mdev->data.work, &b->w); | ||
153 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
154 | } | ||
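As the comment in queue_barrier() explains, the barrier for the current epoch must be queued at most once; a flag set on queueing and cleared when a new epoch begins guarantees that. A minimal sketch of that idea follows (names are illustrative, and the real code does all of this under req_lock):

#include <stdbool.h>
#include <stdio.h>

static bool barrier_queued;	/* plays the role of the CREATE_BARRIER bit */

static void maybe_queue_barrier(void)
{
	if (barrier_queued)
		return;			/* already queued for the current epoch */
	puts("barrier queued");
	barrier_queued = true;
}

static void start_new_epoch(void)
{
	barrier_queued = false;		/* the next epoch may queue its own barrier */
}

int main(void)
{
	maybe_queue_barrier();		/* queues */
	maybe_queue_barrier();		/* no-op: same epoch */
	start_new_epoch();
	maybe_queue_barrier();		/* queues again */
	return 0;
}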
155 | |||
156 | static void _about_to_complete_local_write(struct drbd_conf *mdev, | ||
157 | struct drbd_request *req) | ||
158 | { | ||
159 | const unsigned long s = req->rq_state; | ||
160 | struct drbd_request *i; | ||
161 | struct drbd_epoch_entry *e; | ||
162 | struct hlist_node *n; | ||
163 | struct hlist_head *slot; | ||
164 | |||
165 | /* before we can signal completion to the upper layers, | ||
166 | * we may need to close the current epoch */ | ||
167 | if (mdev->state.conn >= C_CONNECTED && | ||
168 | req->epoch == mdev->newest_tle->br_number) | ||
169 | queue_barrier(mdev); | ||
170 | |||
171 | /* we need to do the conflict detection stuff, | ||
172 | * if we have the ee_hash (two_primaries) and | ||
173 | * this has been on the network */ | ||
174 | if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { | ||
175 | const sector_t sector = req->sector; | ||
176 | const int size = req->size; | ||
177 | |||
178 | /* ASSERT: | ||
179 | * there must be no conflicting requests, since | ||
180 | * they must have been failed on the spot */ | ||
181 | #define OVERLAPS overlaps(sector, size, i->sector, i->size) | ||
182 | slot = tl_hash_slot(mdev, sector); | ||
183 | hlist_for_each_entry(i, n, slot, colision) { | ||
184 | if (OVERLAPS) { | ||
185 | dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " | ||
186 | "other: %p %llus +%u\n", | ||
187 | req, (unsigned long long)sector, size, | ||
188 | i, (unsigned long long)i->sector, i->size); | ||
189 | } | ||
190 | } | ||
191 | |||
192 | /* maybe "wake" those conflicting epoch entries | ||
193 | * that wait for this request to finish. | ||
194 | * | ||
195 | * currently, there can be only _one_ such ee | ||
196 | * (well, or some more, which would be pending | ||
197 | * P_DISCARD_ACK not yet sent by the asender...), | ||
198 | * since we block the receiver thread upon the | ||
199 | * first conflict detection, which will wait on | ||
200 | * misc_wait. maybe we want to assert that? | ||
201 | * | ||
202 | * anyways, if we found one, | ||
203 | * we just have to do a wake_up. */ | ||
204 | #undef OVERLAPS | ||
205 | #define OVERLAPS overlaps(sector, size, e->sector, e->size) | ||
206 | slot = ee_hash_slot(mdev, req->sector); | ||
207 | hlist_for_each_entry(e, n, slot, colision) { | ||
208 | if (OVERLAPS) { | ||
209 | wake_up(&mdev->misc_wait); | ||
210 | break; | ||
211 | } | ||
212 | } | ||
213 | } | ||
214 | #undef OVERLAPS | ||
215 | } | ||
216 | |||
217 | void complete_master_bio(struct drbd_conf *mdev, | ||
218 | struct bio_and_error *m) | ||
219 | { | ||
220 | bio_endio(m->bio, m->error); | ||
221 | dec_ap_bio(mdev); | ||
222 | } | ||
223 | |||
224 | /* Helper for __req_mod(). | ||
225 | * Set m->bio to the master bio, if it is fit to be completed, | ||
226 | * or leave it alone (it is initialized to NULL in __req_mod), | ||
227 | * if it has already been completed, or cannot be completed yet. | ||
228 | * If m->bio is set, the error status to be returned is placed in m->error. | ||
229 | */ | ||
230 | void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | ||
231 | { | ||
232 | const unsigned long s = req->rq_state; | ||
233 | struct drbd_conf *mdev = req->mdev; | ||
234 | /* only WRITES may end up here without a master bio (on barrier ack) */ | ||
235 | int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; | ||
236 | |||
237 | /* we must not complete the master bio, while it is | ||
238 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) | ||
239 | * not yet acknowledged by the peer | ||
240 | * not yet completed by the local io subsystem | ||
241 | * these flags may get cleared in any order by | ||
242 | * the worker, | ||
243 | * the receiver, | ||
244 | * the bio_endio completion callbacks. | ||
245 | */ | ||
246 | if (s & RQ_NET_QUEUED) | ||
247 | return; | ||
248 | if (s & RQ_NET_PENDING) | ||
249 | return; | ||
250 | if (s & RQ_LOCAL_PENDING) | ||
251 | return; | ||
252 | |||
253 | if (req->master_bio) { | ||
254 | /* this is data_received (remote read) | ||
255 | * or protocol C P_WRITE_ACK | ||
256 | * or protocol B P_RECV_ACK | ||
257 | * or protocol A "handed_over_to_network" (SendAck) | ||
258 | * or canceled or failed, | ||
259 | * or killed from the transfer log due to connection loss. | ||
260 | */ | ||
261 | |||
262 | /* | ||
263 | * figure out whether to report success or failure. | ||
264 | * | ||
265 | * report success when at least one of the operations succeeded. | ||
266 | * or, to put the other way, | ||
267 | * only report failure, when both operations failed. | ||
268 | * | ||
269 | * what to do about the failures is handled elsewhere. | ||
270 | * what we need to do here is just: complete the master_bio. | ||
271 | * | ||
272 | * local completion error, if any, has been stored as ERR_PTR | ||
273 | * in private_bio within drbd_endio_pri. | ||
274 | */ | ||
275 | int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); | ||
276 | int error = PTR_ERR(req->private_bio); | ||
277 | |||
278 | /* remove the request from the conflict detection | ||
279 | * respective block_id verification hash */ | ||
280 | if (!hlist_unhashed(&req->colision)) | ||
281 | hlist_del(&req->colision); | ||
282 | else | ||
283 | D_ASSERT((s & RQ_NET_MASK) == 0); | ||
284 | |||
285 | /* for writes we need to do some extra housekeeping */ | ||
286 | if (rw == WRITE) | ||
287 | _about_to_complete_local_write(mdev, req); | ||
288 | |||
289 | /* Update disk stats */ | ||
290 | _drbd_end_io_acct(mdev, req); | ||
291 | |||
292 | m->error = ok ? 0 : (error ?: -EIO); | ||
293 | m->bio = req->master_bio; | ||
294 | req->master_bio = NULL; | ||
295 | } | ||
296 | |||
297 | if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { | ||
298 | /* this is disconnected (local only) operation, | ||
299 | * or protocol C P_WRITE_ACK, | ||
300 | * or protocol A or B P_BARRIER_ACK, | ||
301 | * or killed from the transfer log due to connection loss. */ | ||
302 | _req_is_done(mdev, req, rw); | ||
303 | } | ||
304 | /* else: network part and not DONE yet. that is | ||
305 | * protocol A or B, barrier ack still pending... */ | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * checks whether there was an overlapping request | ||
310 | * or ee already registered. | ||
311 | * | ||
312 | * if so, return 1, in which case this request is completed on the spot, | ||
313 | * without ever being submitted or sent. | ||
314 | * | ||
315 | * return 0 if it is ok to submit this request. | ||
316 | * | ||
317 | * NOTE: | ||
318 | * paranoia: assume something above us is broken, and issues different write | ||
319 | * requests for the same block simultaneously... | ||
320 | * | ||
321 | * To ensure these won't be reordered differently on both nodes, resulting in | ||
322 | * diverging data sets, we discard the later one(s). Not that this is supposed | ||
323 | * to happen, but this is the rationale why we also have to check for | ||
324 | * conflicting requests with local origin, and why we have to do so regardless | ||
325 | * of whether we allowed multiple primaries. | ||
326 | * | ||
327 | * BTW, in case we only have one primary, the ee_hash is empty anyways, and the | ||
328 | * second hlist_for_each_entry becomes a noop. This is even simpler than | ||
329 | * grabbing a reference on the net_conf and checking for the two_primaries flag... | ||
330 | */ | ||
331 | static int _req_conflicts(struct drbd_request *req) | ||
332 | { | ||
333 | struct drbd_conf *mdev = req->mdev; | ||
334 | const sector_t sector = req->sector; | ||
335 | const int size = req->size; | ||
336 | struct drbd_request *i; | ||
337 | struct drbd_epoch_entry *e; | ||
338 | struct hlist_node *n; | ||
339 | struct hlist_head *slot; | ||
340 | |||
341 | D_ASSERT(hlist_unhashed(&req->colision)); | ||
342 | |||
343 | if (!get_net_conf(mdev)) | ||
344 | return 0; | ||
345 | |||
346 | /* BUG_ON */ | ||
347 | ERR_IF (mdev->tl_hash_s == 0) | ||
348 | goto out_no_conflict; | ||
349 | BUG_ON(mdev->tl_hash == NULL); | ||
350 | |||
351 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
352 | slot = tl_hash_slot(mdev, sector); | ||
353 | hlist_for_each_entry(i, n, slot, colision) { | ||
354 | if (OVERLAPS) { | ||
355 | dev_alert(DEV, "%s[%u] Concurrent local write detected! " | ||
356 | "[DISCARD L] new: %llus +%u; " | ||
357 | "pending: %llus +%u\n", | ||
358 | current->comm, current->pid, | ||
359 | (unsigned long long)sector, size, | ||
360 | (unsigned long long)i->sector, i->size); | ||
361 | goto out_conflict; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | if (mdev->ee_hash_s) { | ||
366 | /* now, check for overlapping requests with remote origin */ | ||
367 | BUG_ON(mdev->ee_hash == NULL); | ||
368 | #undef OVERLAPS | ||
369 | #define OVERLAPS overlaps(e->sector, e->size, sector, size) | ||
370 | slot = ee_hash_slot(mdev, sector); | ||
371 | hlist_for_each_entry(e, n, slot, colision) { | ||
372 | if (OVERLAPS) { | ||
373 | dev_alert(DEV, "%s[%u] Concurrent remote write detected!" | ||
374 | " [DISCARD L] new: %llus +%u; " | ||
375 | "pending: %llus +%u\n", | ||
376 | current->comm, current->pid, | ||
377 | (unsigned long long)sector, size, | ||
378 | (unsigned long long)e->sector, e->size); | ||
379 | goto out_conflict; | ||
380 | } | ||
381 | } | ||
382 | } | ||
383 | #undef OVERLAPS | ||
384 | |||
385 | out_no_conflict: | ||
386 | /* this is like it should be, and what we expected. | ||
387 | * our users do behave after all... */ | ||
388 | put_net_conf(mdev); | ||
389 | return 0; | ||
390 | |||
391 | out_conflict: | ||
392 | put_net_conf(mdev); | ||
393 | return 1; | ||
394 | } | ||
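The conflict detection above hinges on an interval-overlap test between a request's sector range and that of any already-registered request or epoch entry. The sketch below shows the assumed semantics (512-byte sectors, sizes in bytes); it mirrors what the driver's overlaps() helper is expected to compute, but is not the driver's definition.

#include <stdio.h>

typedef unsigned long long sector_t;

/* two requests conflict iff their [sector, sector + size/512) ranges overlap */
static int overlaps(sector_t s1, int size1, sector_t s2, int size2)
{
	sector_t e1 = s1 + (sector_t)(size1 >> 9);	/* one past the last sector */
	sector_t e2 = s2 + (sector_t)(size2 >> 9);

	return s1 < e2 && s2 < e1;
}

int main(void)
{
	printf("%d\n", overlaps(0, 4096, 8, 4096));	/* 0: [0,8) and [8,16) are disjoint */
	printf("%d\n", overlaps(0, 4096, 4, 4096));	/* 1: they share sectors 4..7 */
	return 0;
}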
395 | |||
396 | /* obviously this could be coded as many single functions | ||
397 | * instead of one huge switch, | ||
398 | * or by putting the code directly in the respective locations | ||
399 | * (as it has been before). | ||
400 | * | ||
401 | * but having it this way | ||
402 | * enforces that it is all in this one place, where it is easier to audit, | ||
403 | * it makes it obvious that whatever "event" "happens" to a request should | ||
404 | * happen "atomically" within the req_lock, | ||
405 | * and it enforces that we have to think in a very structured manner | ||
406 | * about the "events" that may happen to a request during its life time ... | ||
407 | */ | ||
408 | void __req_mod(struct drbd_request *req, enum drbd_req_event what, | ||
409 | struct bio_and_error *m) | ||
410 | { | ||
411 | struct drbd_conf *mdev = req->mdev; | ||
412 | m->bio = NULL; | ||
413 | |||
414 | switch (what) { | ||
415 | default: | ||
416 | dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); | ||
417 | break; | ||
418 | |||
419 | /* does not happen... | ||
420 | * initialization done in drbd_req_new | ||
421 | case created: | ||
422 | break; | ||
423 | */ | ||
424 | |||
425 | case to_be_send: /* via network */ | ||
426 | /* reached via drbd_make_request_common | ||
427 | * and from w_read_retry_remote */ | ||
428 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
429 | req->rq_state |= RQ_NET_PENDING; | ||
430 | inc_ap_pending(mdev); | ||
431 | break; | ||
432 | |||
433 | case to_be_submitted: /* locally */ | ||
434 | /* reached via drbd_make_request_common */ | ||
435 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); | ||
436 | req->rq_state |= RQ_LOCAL_PENDING; | ||
437 | break; | ||
438 | |||
439 | case completed_ok: | ||
440 | if (bio_data_dir(req->master_bio) == WRITE) | ||
441 | mdev->writ_cnt += req->size>>9; | ||
442 | else | ||
443 | mdev->read_cnt += req->size>>9; | ||
444 | |||
445 | req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); | ||
446 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
447 | |||
448 | _req_may_be_done(req, m); | ||
449 | put_ldev(mdev); | ||
450 | break; | ||
451 | |||
452 | case write_completed_with_error: | ||
453 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
454 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
455 | |||
456 | dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", | ||
457 | (unsigned long long)req->sector, req->size); | ||
458 | /* and now: check how to handle local io error. */ | ||
459 | __drbd_chk_io_error(mdev, FALSE); | ||
460 | _req_may_be_done(req, m); | ||
461 | put_ldev(mdev); | ||
462 | break; | ||
463 | |||
464 | case read_ahead_completed_with_error: | ||
465 | /* it is legal to fail READA */ | ||
466 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
467 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
468 | _req_may_be_done(req, m); | ||
469 | put_ldev(mdev); | ||
470 | break; | ||
471 | |||
472 | case read_completed_with_error: | ||
473 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
474 | |||
475 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
476 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
477 | |||
478 | dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", | ||
479 | (unsigned long long)req->sector, req->size); | ||
480 | /* _req_mod(req,to_be_send); oops, recursion... */ | ||
481 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
482 | req->rq_state |= RQ_NET_PENDING; | ||
483 | inc_ap_pending(mdev); | ||
484 | |||
485 | __drbd_chk_io_error(mdev, FALSE); | ||
486 | put_ldev(mdev); | ||
487 | /* NOTE: if we have no connection, | ||
488 | * or know the peer has no good data either, | ||
489 | * then we don't actually need to "queue_for_net_read", | ||
490 | * but we do so anyway, since the drbd_io_error() | ||
491 | * and the potential state change to "Diskless" | ||
492 | * need to be done from process context */ | ||
493 | |||
494 | /* fall through: _req_mod(req,queue_for_net_read); */ | ||
495 | |||
496 | case queue_for_net_read: | ||
497 | /* READ or READA, and | ||
498 | * no local disk, | ||
499 | * or target area marked as invalid, | ||
500 | * or just got an io-error. */ | ||
501 | /* from drbd_make_request_common | ||
502 | * or from bio_endio during read io-error recovery */ | ||
503 | |||
504 | /* so we can verify the handle in the answer packet | ||
505 | * corresponding hlist_del is in _req_may_be_done() */ | ||
506 | hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); | ||
507 | |||
508 | set_bit(UNPLUG_REMOTE, &mdev->flags); | ||
509 | |||
510 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
511 | req->rq_state |= RQ_NET_QUEUED; | ||
512 | req->w.cb = (req->rq_state & RQ_LOCAL_MASK) | ||
513 | ? w_read_retry_remote | ||
514 | : w_send_read_req; | ||
515 | drbd_queue_work(&mdev->data.work, &req->w); | ||
516 | break; | ||
517 | |||
518 | case queue_for_net_write: | ||
519 | /* assert something? */ | ||
520 | /* from drbd_make_request_common only */ | ||
521 | |||
522 | hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); | ||
523 | /* corresponding hlist_del is in _req_may_be_done() */ | ||
524 | |||
525 | /* NOTE | ||
526 | * In case the req ended up on the transfer log before being | ||
527 | * queued on the worker, it could lead to this request being | ||
528 | * missed during cleanup after connection loss. | ||
529 | * So we have to do both operations here, | ||
530 | * within the same lock that protects the transfer log. | ||
531 | * | ||
532 | * _req_add_to_epoch(req); this has to be after the | ||
533 | * _maybe_start_new_epoch(req); which happened in | ||
534 | * drbd_make_request_common, because we now may set the bit | ||
535 | * again ourselves to close the current epoch. | ||
536 | * | ||
537 | * Add req to the (now) current epoch (barrier). */ | ||
538 | |||
539 | /* otherwise we may lose an unplug, which may cause some remote | ||
540 | * io-scheduler timeout to expire, increasing maximum latency, | ||
541 | * hurting performance. */ | ||
542 | set_bit(UNPLUG_REMOTE, &mdev->flags); | ||
543 | |||
544 | /* see drbd_make_request_common, | ||
545 | * just after it grabs the req_lock */ | ||
546 | D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); | ||
547 | |||
548 | req->epoch = mdev->newest_tle->br_number; | ||
549 | list_add_tail(&req->tl_requests, | ||
550 | &mdev->newest_tle->requests); | ||
551 | |||
552 | /* increment size of current epoch */ | ||
553 | mdev->newest_tle->n_req++; | ||
554 | |||
555 | /* queue work item to send data */ | ||
556 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
557 | req->rq_state |= RQ_NET_QUEUED; | ||
558 | req->w.cb = w_send_dblock; | ||
559 | drbd_queue_work(&mdev->data.work, &req->w); | ||
560 | |||
561 | /* close the epoch, in case it outgrew the limit */ | ||
562 | if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) | ||
563 | queue_barrier(mdev); | ||
564 | |||
565 | break; | ||
566 | |||
567 | case send_canceled: | ||
568 | /* treat it the same */ | ||
569 | case send_failed: | ||
570 | /* real cleanup will be done from tl_clear. just update flags | ||
571 | * so it is no longer marked as on the worker queue */ | ||
572 | req->rq_state &= ~RQ_NET_QUEUED; | ||
573 | /* if we did it right, tl_clear should be scheduled only after | ||
574 | * this, so this should not be necessary! */ | ||
575 | _req_may_be_done(req, m); | ||
576 | break; | ||
577 | |||
578 | case handed_over_to_network: | ||
579 | /* assert something? */ | ||
580 | if (bio_data_dir(req->master_bio) == WRITE && | ||
581 | mdev->net_conf->wire_protocol == DRBD_PROT_A) { | ||
582 | /* this is what is dangerous about protocol A: | ||
583 | * pretend it was successfully written on the peer. */ | ||
584 | if (req->rq_state & RQ_NET_PENDING) { | ||
585 | dec_ap_pending(mdev); | ||
586 | req->rq_state &= ~RQ_NET_PENDING; | ||
587 | req->rq_state |= RQ_NET_OK; | ||
588 | } /* else: neg-ack was faster... */ | ||
589 | /* it is still not yet RQ_NET_DONE until the | ||
590 | * corresponding epoch barrier got acked as well, | ||
591 | * so we know what to dirty on connection loss */ | ||
592 | } | ||
593 | req->rq_state &= ~RQ_NET_QUEUED; | ||
594 | req->rq_state |= RQ_NET_SENT; | ||
595 | /* because _drbd_send_zc_bio could sleep, and may want to | ||
596 | * dereference the bio even after the "write_acked_by_peer" and | ||
597 | * "completed_ok" events came in, once we return from | ||
598 | * _drbd_send_zc_bio (drbd_send_dblock), we have to check | ||
599 | * whether it is done already, and end it. */ | ||
600 | _req_may_be_done(req, m); | ||
601 | break; | ||
602 | |||
603 | case connection_lost_while_pending: | ||
604 | /* transfer log cleanup after connection loss */ | ||
605 | /* assert something? */ | ||
606 | if (req->rq_state & RQ_NET_PENDING) | ||
607 | dec_ap_pending(mdev); | ||
608 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
609 | req->rq_state |= RQ_NET_DONE; | ||
610 | /* if it is still queued, we may not complete it here. | ||
611 | * it will be canceled soon. */ | ||
612 | if (!(req->rq_state & RQ_NET_QUEUED)) | ||
613 | _req_may_be_done(req, m); | ||
614 | break; | ||
615 | |||
616 | case write_acked_by_peer_and_sis: | ||
617 | req->rq_state |= RQ_NET_SIS; | ||
618 | case conflict_discarded_by_peer: | ||
619 | /* for discarded conflicting writes of multiple primaries, | ||
620 | * there is no need to keep anything in the tl, potential | ||
621 | * node crashes are covered by the activity log. */ | ||
622 | if (what == conflict_discarded_by_peer) | ||
623 | dev_alert(DEV, "Got DiscardAck packet %llus +%u!" | ||
624 | " DRBD is not a random data generator!\n", | ||
625 | (unsigned long long)req->sector, req->size); | ||
626 | req->rq_state |= RQ_NET_DONE; | ||
627 | /* fall through */ | ||
628 | case write_acked_by_peer: | ||
629 | /* protocol C; successfully written on peer. | ||
630 | * Nothing to do here. | ||
631 | * We want to keep the tl in place for all protocols, to cater | ||
632 | * for volatile write-back caches on lower level devices. | ||
633 | * | ||
634 | * A barrier request is expected to have forced all prior | ||
635 | * requests onto stable storage, so completion of a barrier | ||
636 | * request could set NET_DONE right here, and not wait for the | ||
637 | * P_BARRIER_ACK, but that is an unnecessary optimization. */ | ||
638 | |||
639 | /* this makes it effectively the same as for: */ | ||
640 | case recv_acked_by_peer: | ||
641 | /* protocol B; pretends to be successfully written on peer. | ||
642 | * see also notes above in handed_over_to_network about | ||
643 | * protocol != C */ | ||
644 | req->rq_state |= RQ_NET_OK; | ||
645 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
646 | dec_ap_pending(mdev); | ||
647 | req->rq_state &= ~RQ_NET_PENDING; | ||
648 | _req_may_be_done(req, m); | ||
649 | break; | ||
650 | |||
651 | case neg_acked: | ||
652 | /* assert something? */ | ||
653 | if (req->rq_state & RQ_NET_PENDING) | ||
654 | dec_ap_pending(mdev); | ||
655 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
656 | |||
657 | req->rq_state |= RQ_NET_DONE; | ||
658 | _req_may_be_done(req, m); | ||
659 | /* else: done by handed_over_to_network */ | ||
660 | break; | ||
661 | |||
662 | case barrier_acked: | ||
663 | if (req->rq_state & RQ_NET_PENDING) { | ||
664 | /* barrier came in before all requests have been acked. | ||
665 | * this is bad, because if the connection is lost now, | ||
666 | * we won't be able to clean them up... */ | ||
667 | dev_err(DEV, "FIXME (barrier_acked but pending)\n"); | ||
668 | list_move(&req->tl_requests, &mdev->out_of_sequence_requests); | ||
669 | } | ||
670 | D_ASSERT(req->rq_state & RQ_NET_SENT); | ||
671 | req->rq_state |= RQ_NET_DONE; | ||
672 | _req_may_be_done(req, m); | ||
673 | break; | ||
674 | |||
675 | case data_received: | ||
676 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
677 | dec_ap_pending(mdev); | ||
678 | req->rq_state &= ~RQ_NET_PENDING; | ||
679 | req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); | ||
680 | _req_may_be_done(req, m); | ||
681 | break; | ||
682 | } | ||
683 | } | ||
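__req_mod() drives each request through its life cycle by setting and clearing bits in rq_state under req_lock, and completion is only considered once no "pending" bit remains. The sketch below reduces that idea to a few illustrative flag bits; it is not the driver's state machine, just the pattern it follows.

#include <stdio.h>

#define RQ_LOCAL_PENDING (1u << 0)
#define RQ_LOCAL_OK      (1u << 1)
#define RQ_NET_PENDING   (1u << 2)
#define RQ_NET_OK        (1u << 3)

/* completion may only be reported once nothing is pending; it is a success
 * if at least one of the local or network halves succeeded */
static void maybe_complete(unsigned int s)
{
	if (s & (RQ_LOCAL_PENDING | RQ_NET_PENDING))
		return;
	printf("complete: %s\n",
	       (s & (RQ_LOCAL_OK | RQ_NET_OK)) ? "success" : "failure");
}

int main(void)
{
	unsigned int s = RQ_LOCAL_PENDING | RQ_NET_PENDING;

	s = (s & ~RQ_LOCAL_PENDING) | RQ_LOCAL_OK;	/* local write completed ok */
	maybe_complete(s);				/* nothing printed: net still pending */

	s = (s & ~RQ_NET_PENDING) | RQ_NET_OK;		/* peer acknowledged the write */
	maybe_complete(s);				/* prints "complete: success" */
	return 0;
}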
684 | |||
685 | /* we may do a local read if: | ||
686 | * - we are consistent (of course), | ||
687 | * - or we are generally inconsistent, | ||
688 | * BUT we are still/already IN SYNC for this area. | ||
689 | * since size may be bigger than BM_BLOCK_SIZE, | ||
690 | * we may need to check several bits. | ||
691 | */ | ||
692 | static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) | ||
693 | { | ||
694 | unsigned long sbnr, ebnr; | ||
695 | sector_t esector, nr_sectors; | ||
696 | |||
697 | if (mdev->state.disk == D_UP_TO_DATE) | ||
698 | return 1; | ||
699 | if (mdev->state.disk >= D_OUTDATED) | ||
700 | return 0; | ||
701 | if (mdev->state.disk < D_INCONSISTENT) | ||
702 | return 0; | ||
703 | /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ | ||
704 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
705 | esector = sector + (size >> 9) - 1; | ||
706 | |||
707 | D_ASSERT(sector < nr_sectors); | ||
708 | D_ASSERT(esector < nr_sectors); | ||
709 | |||
710 | sbnr = BM_SECT_TO_BIT(sector); | ||
711 | ebnr = BM_SECT_TO_BIT(esector); | ||
712 | |||
713 | return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); | ||
714 | } | ||
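drbd_may_do_local_read() converts the request's first and last sector into bitmap bit numbers and allows the local read only if none of the covered bits is dirty. The sketch below shows that sector-to-bit mapping under the assumption of one bit per 4 KiB block (8 sectors of 512 bytes), which is DRBD's usual bitmap granularity; the names and numbers are illustrative.

#include <stdio.h>

#define SECTORS_PER_BIT 8	/* 4096-byte bitmap block / 512-byte sector, illustrative */

static unsigned long sect_to_bit(unsigned long long sector)
{
	return (unsigned long)(sector / SECTORS_PER_BIT);
}

int main(void)
{
	unsigned long long sector = 1000;
	int size = 32 * 1024;				/* a 32 KiB read */
	unsigned long long esector = sector + (size >> 9) - 1;

	/* every bit from the first to the last covered block must be clean
	 * (in sync) for the read to be served from the local disk */
	printf("bits %lu..%lu must be clear\n",
	       sect_to_bit(sector), sect_to_bit(esector));
	return 0;
}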
715 | |||
716 | static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) | ||
717 | { | ||
718 | const int rw = bio_rw(bio); | ||
719 | const int size = bio->bi_size; | ||
720 | const sector_t sector = bio->bi_sector; | ||
721 | struct drbd_tl_epoch *b = NULL; | ||
722 | struct drbd_request *req; | ||
723 | int local, remote; | ||
724 | int err = -EIO; | ||
725 | |||
726 | /* allocate outside of all locks; */ | ||
727 | req = drbd_req_new(mdev, bio); | ||
728 | if (!req) { | ||
729 | dec_ap_bio(mdev); | ||
730 | /* only pass the error to the upper layers. | ||
731 | * if user cannot handle io errors, that's not our business. */ | ||
732 | dev_err(DEV, "could not kmalloc() req\n"); | ||
733 | bio_endio(bio, -ENOMEM); | ||
734 | return 0; | ||
735 | } | ||
736 | |||
737 | local = get_ldev(mdev); | ||
738 | if (!local) { | ||
739 | bio_put(req->private_bio); /* or we get a bio leak */ | ||
740 | req->private_bio = NULL; | ||
741 | } | ||
742 | if (rw == WRITE) { | ||
743 | remote = 1; | ||
744 | } else { | ||
745 | /* READ || READA */ | ||
746 | if (local) { | ||
747 | if (!drbd_may_do_local_read(mdev, sector, size)) { | ||
748 | /* we could kick the syncer to | ||
749 | * sync this extent asap, wait for | ||
750 | * it, then continue locally. | ||
751 | * Or just issue the request remotely. | ||
752 | */ | ||
753 | local = 0; | ||
754 | bio_put(req->private_bio); | ||
755 | req->private_bio = NULL; | ||
756 | put_ldev(mdev); | ||
757 | } | ||
758 | } | ||
759 | remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; | ||
760 | } | ||
761 | |||
762 | /* If we have a disk, but a READA request is mapped to remote, | ||
763 | * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. | ||
764 | * Just fail that READA request right here. | ||
765 | * | ||
766 | * THINK: maybe fail all READA when not local? | ||
767 | * or make this configurable... | ||
768 | * if network is slow, READA won't do any good. | ||
769 | */ | ||
770 | if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { | ||
771 | err = -EWOULDBLOCK; | ||
772 | goto fail_and_free_req; | ||
773 | } | ||
774 | |||
775 | /* For WRITES going to the local disk, grab a reference on the target | ||
776 | * extent. This waits for any resync activity in the corresponding | ||
777 | * resync extent to finish, and, if necessary, pulls in the target | ||
778 | * extent into the activity log, which involves further disk io because | ||
779 | * of transactional on-disk meta data updates. */ | ||
780 | if (rw == WRITE && local) | ||
781 | drbd_al_begin_io(mdev, sector); | ||
782 | |||
783 | remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || | ||
784 | (mdev->state.pdsk == D_INCONSISTENT && | ||
785 | mdev->state.conn >= C_CONNECTED)); | ||
786 | |||
787 | if (!(local || remote)) { | ||
788 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
789 | goto fail_free_complete; | ||
790 | } | ||
791 | |||
792 | /* For WRITE request, we have to make sure that we have an | ||
793 | * unused_spare_tle, in case we need to start a new epoch. | ||
794 | * I try to be smart and avoid always pre-allocating "just in case", | ||
795 | * but there is a race between testing the bit and pointer outside the | ||
796 | * spinlock, and grabbing the spinlock. | ||
797 | * if we lost that race, we retry. */ | ||
798 | if (rw == WRITE && remote && | ||
799 | mdev->unused_spare_tle == NULL && | ||
800 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
801 | allocate_barrier: | ||
802 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); | ||
803 | if (!b) { | ||
804 | dev_err(DEV, "Failed to alloc barrier.\n"); | ||
805 | err = -ENOMEM; | ||
806 | goto fail_free_complete; | ||
807 | } | ||
808 | } | ||
809 | |||
810 | /* GOOD, everything prepared, grab the spin_lock */ | ||
811 | spin_lock_irq(&mdev->req_lock); | ||
812 | |||
813 | if (remote) { | ||
814 | remote = (mdev->state.pdsk == D_UP_TO_DATE || | ||
815 | (mdev->state.pdsk == D_INCONSISTENT && | ||
816 | mdev->state.conn >= C_CONNECTED)); | ||
817 | if (!remote) | ||
818 | dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); | ||
819 | if (!(local || remote)) { | ||
820 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
821 | spin_unlock_irq(&mdev->req_lock); | ||
822 | goto fail_free_complete; | ||
823 | } | ||
824 | } | ||
825 | |||
826 | if (b && mdev->unused_spare_tle == NULL) { | ||
827 | mdev->unused_spare_tle = b; | ||
828 | b = NULL; | ||
829 | } | ||
830 | if (rw == WRITE && remote && | ||
831 | mdev->unused_spare_tle == NULL && | ||
832 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
833 | /* someone closed the current epoch | ||
834 | * while we were grabbing the spinlock */ | ||
835 | spin_unlock_irq(&mdev->req_lock); | ||
836 | goto allocate_barrier; | ||
837 | } | ||
838 | |||
839 | |||
840 | /* Update disk stats */ | ||
841 | _drbd_start_io_acct(mdev, req, bio); | ||
842 | |||
843 | /* _maybe_start_new_epoch(mdev); | ||
844 | * If we need to generate a write barrier packet, we have to add the | ||
845 | * new epoch (barrier) object, and queue the barrier packet for sending, | ||
846 | * and queue the req's data after it _within the same lock_, otherwise | ||
847 | * we have race conditions where the reorder domains could be mixed up. | ||
848 | * | ||
849 | * Even read requests may start a new epoch and queue the corresponding | ||
850 | * barrier packet. To get the write ordering right, we only have to | ||
851 | * make sure that, if this is a write request and it triggered a | ||
852 | * barrier packet, this request is queued within the same spinlock. */ | ||
853 | if (remote && mdev->unused_spare_tle && | ||
854 | test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
855 | _tl_add_barrier(mdev, mdev->unused_spare_tle); | ||
856 | mdev->unused_spare_tle = NULL; | ||
857 | } else { | ||
858 | D_ASSERT(!(remote && rw == WRITE && | ||
859 | test_bit(CREATE_BARRIER, &mdev->flags))); | ||
860 | } | ||
861 | |||
862 | /* NOTE | ||
863 | * Actually, 'local' may be wrong here already, since we may have failed | ||
864 | * to write to the meta data, and may become wrong anytime because of | ||
865 | * local io-error for some other request, which would lead to us | ||
866 | * "detaching" the local disk. | ||
867 | * | ||
868 | * 'remote' may become wrong any time because the network could fail. | ||
869 | * | ||
870 | * This is a harmless race condition, though, since it is handled | ||
871 | * correctly at the appropriate places; so it just defers the failure | ||
872 | * of the respective operation. | ||
873 | */ | ||
874 | |||
875 | /* mark them early for readability. | ||
876 | * this just sets some state flags. */ | ||
877 | if (remote) | ||
878 | _req_mod(req, to_be_send); | ||
879 | if (local) | ||
880 | _req_mod(req, to_be_submitted); | ||
881 | |||
882 | /* check this request on the collision detection hash tables. | ||
883 | * if we have a conflict, just complete it here. | ||
884 | * THINK do we want to check reads, too? (I don't think so...) */ | ||
885 | if (rw == WRITE && _req_conflicts(req)) { | ||
886 | /* this is a conflicting request. | ||
887 | * even though it may have been only _partially_ | ||
888 | * overlapping with one of the currently pending requests, | ||
889 | * without even submitting or sending it, we will | ||
890 | * pretend that it was successfully served right now. | ||
891 | */ | ||
892 | if (local) { | ||
893 | bio_put(req->private_bio); | ||
894 | req->private_bio = NULL; | ||
895 | drbd_al_complete_io(mdev, req->sector); | ||
896 | put_ldev(mdev); | ||
897 | local = 0; | ||
898 | } | ||
899 | if (remote) | ||
900 | dec_ap_pending(mdev); | ||
901 | _drbd_end_io_acct(mdev, req); | ||
902 | /* THINK: do we want to fail it (-EIO), or pretend success? */ | ||
903 | bio_endio(req->master_bio, 0); | ||
904 | req->master_bio = NULL; | ||
905 | dec_ap_bio(mdev); | ||
906 | drbd_req_free(req); | ||
907 | remote = 0; | ||
908 | } | ||
909 | |||
910 | /* NOTE remote first: to get the concurrent write detection right, | ||
911 | * we must register the request before start of local IO. */ | ||
912 | if (remote) { | ||
913 | /* either WRITE and C_CONNECTED, | ||
914 | * or READ, and no local disk, | ||
915 | * or READ, but not in sync. | ||
916 | */ | ||
917 | _req_mod(req, (rw == WRITE) | ||
918 | ? queue_for_net_write | ||
919 | : queue_for_net_read); | ||
920 | } | ||
921 | spin_unlock_irq(&mdev->req_lock); | ||
922 | kfree(b); /* if someone else has beaten us to it... */ | ||
923 | |||
924 | if (local) { | ||
925 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | ||
926 | |||
927 | if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR | ||
928 | : rw == READ ? DRBD_FAULT_DT_RD | ||
929 | : DRBD_FAULT_DT_RA)) | ||
930 | bio_endio(req->private_bio, -EIO); | ||
931 | else | ||
932 | generic_make_request(req->private_bio); | ||
933 | } | ||
934 | |||
935 | /* we need to plug ALWAYS since we possibly need to kick lo_dev. | ||
936 | * we plug after submit, so we won't miss an unplug event */ | ||
937 | drbd_plug_device(mdev); | ||
938 | |||
939 | return 0; | ||
940 | |||
941 | fail_free_complete: | ||
942 | if (rw == WRITE && local) | ||
943 | drbd_al_complete_io(mdev, sector); | ||
944 | fail_and_free_req: | ||
945 | if (local) { | ||
946 | bio_put(req->private_bio); | ||
947 | req->private_bio = NULL; | ||
948 | put_ldev(mdev); | ||
949 | } | ||
950 | bio_endio(bio, err); | ||
951 | drbd_req_free(req); | ||
952 | dec_ap_bio(mdev); | ||
953 | kfree(b); | ||
954 | |||
955 | return 0; | ||
956 | } | ||
957 | |||
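The allocate_barrier retry in drbd_make_request_common() above is an instance of a common pattern: allocate outside the spinlock, re-check the condition under the lock, and loop if another context changed it in between; if someone else installed a spare first, the surplus allocation is simply freed. A minimal userspace sketch of that pattern, with a pthread mutex standing in for mdev->req_lock and all names (ensure_spare, need_new_epoch) made up for illustration:

/* Illustrative sketch of "allocate outside the lock, re-check, retry".
 * Not DRBD code; a pthread mutex stands in for the request spinlock. */
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct epoch { int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct epoch *unused_spare;   /* like mdev->unused_spare_tle */
static int need_new_epoch = 1;       /* like the CREATE_BARRIER flag */

static int ensure_spare(void)
{
	struct epoch *b = NULL;

	for (;;) {
		/* allocate outside the lock if it looks like we need one */
		if (need_new_epoch && !b) {
			b = malloc(sizeof(*b));
			if (!b)
				return -1;
		}
		pthread_mutex_lock(&lock);
		if (b && !unused_spare) {
			unused_spare = b;	/* hand our allocation over */
			b = NULL;
		}
		if (need_new_epoch && !unused_spare) {
			/* lost the race: the spare was consumed while we were
			 * outside the lock -- drop the lock and retry */
			pthread_mutex_unlock(&lock);
			continue;
		}
		pthread_mutex_unlock(&lock);
		break;
	}
	free(b);	/* someone else beat us to it */
	return 0;
}

int main(void)
{
	if (ensure_spare() == 0)
		printf("spare epoch ready: %s\n", unused_spare ? "yes" : "no");
	return 0;
}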
958 | /* helper function for drbd_make_request | ||
959 | * if we can determine just by the mdev (state) that this request will fail, | ||
960 | * return 1 | ||
961 | * otherwise return 0 | ||
962 | */ | ||
963 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | ||
964 | { | ||
965 | /* Unconfigured */ | ||
966 | if (mdev->state.conn == C_DISCONNECTING && | ||
967 | mdev->state.disk == D_DISKLESS) | ||
968 | return 1; | ||
969 | |||
970 | if (mdev->state.role != R_PRIMARY && | ||
971 | (!allow_oos || is_write)) { | ||
972 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
973 | dev_err(DEV, "Process %s[%u] tried to %s; " | ||
974 | "since we are not in Primary state, " | ||
975 | "we cannot allow this\n", | ||
976 | current->comm, current->pid, | ||
977 | is_write ? "WRITE" : "READ"); | ||
978 | } | ||
979 | return 1; | ||
980 | } | ||
981 | |||
982 | /* | ||
983 | * Paranoia: we might have been primary, but sync target, or | ||
984 | * even diskless, then lost the connection. | ||
985 | * This should have been handled (panic? suspend?) somewhere | ||
986 | * else. But maybe it was not, so check again here. | ||
987 | * Caution: as long as we do not have a read/write lock on mdev, | ||
988 | * to serialize state changes, this is racy, since we may lose | ||
989 | * the connection *after* we test for the cstate. | ||
990 | */ | ||
991 | if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { | ||
992 | if (__ratelimit(&drbd_ratelimit_state)) | ||
993 | dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); | ||
994 | return 1; | ||
995 | } | ||
996 | |||
997 | return 0; | ||
998 | } | ||
999 | |||
1000 | int drbd_make_request_26(struct request_queue *q, struct bio *bio) | ||
1001 | { | ||
1002 | unsigned int s_enr, e_enr; | ||
1003 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | ||
1004 | |||
1005 | if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { | ||
1006 | bio_endio(bio, -EPERM); | ||
1007 | return 0; | ||
1008 | } | ||
1009 | |||
1010 | /* Reject barrier requests if we know the underlying device does | ||
1011 | * not support them. | ||
1012 | * XXX: Need to get this info from peer as well some how so we | ||
1013 | * XXX: reject if EITHER side/data/metadata area does not support them. | ||
1014 | * | ||
1015 | * because of those XXX, this is not yet enabled, | ||
1016 | * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. | ||
1017 | */ | ||
1018 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { | ||
1019 | /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ | ||
1020 | bio_endio(bio, -EOPNOTSUPP); | ||
1021 | return 0; | ||
1022 | } | ||
1023 | |||
1024 | /* | ||
1025 | * what we "blindly" assume: | ||
1026 | */ | ||
1027 | D_ASSERT(bio->bi_size > 0); | ||
1028 | D_ASSERT((bio->bi_size & 0x1ff) == 0); | ||
1029 | D_ASSERT(bio->bi_idx == 0); | ||
1030 | |||
1031 | /* to make some things easier, force alignment of requests within the | ||
1032 | * granularity of our hash tables */ | ||
1033 | s_enr = bio->bi_sector >> HT_SHIFT; | ||
1034 | e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; | ||
1035 | |||
1036 | if (likely(s_enr == e_enr)) { | ||
1037 | inc_ap_bio(mdev, 1); | ||
1038 | return drbd_make_request_common(mdev, bio); | ||
1039 | } | ||
1040 | |||
1041 | /* can this bio be split generically? | ||
1042 | * Maybe add our own split-arbitrary-bios function. */ | ||
1043 | if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { | ||
1044 | /* rather error out here than BUG in bio_split */ | ||
1045 | dev_err(DEV, "bio would need to, but cannot, be split: " | ||
1046 | "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", | ||
1047 | bio->bi_vcnt, bio->bi_idx, bio->bi_size, | ||
1048 | (unsigned long long)bio->bi_sector); | ||
1049 | bio_endio(bio, -EINVAL); | ||
1050 | } else { | ||
1051 | /* This bio crosses some boundary, so we have to split it. */ | ||
1052 | struct bio_pair *bp; | ||
1053 | /* works for the "do not cross hash slot boundaries" case | ||
1054 | * e.g. sector 262269, size 4096 | ||
1055 | * s_enr = 262269 >> 6 = 4097 | ||
1056 | * e_enr = (262269+8-1) >> 6 = 4098 | ||
1057 | * HT_SHIFT = 6 | ||
1058 | * sps = 64, mask = 63 | ||
1059 | * first_sectors = 64 - (262269 & 63) = 3 | ||
1060 | */ | ||
1061 | const sector_t sect = bio->bi_sector; | ||
1062 | const int sps = 1 << HT_SHIFT; /* sectors per slot */ | ||
1063 | const int mask = sps - 1; | ||
1064 | const sector_t first_sectors = sps - (sect & mask); | ||
1065 | bp = bio_split(bio, | ||
1066 | #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) | ||
1067 | bio_split_pool, | ||
1068 | #endif | ||
1069 | first_sectors); | ||
1070 | |||
1071 | /* we need to get a "reference count" (ap_bio_cnt) | ||
1072 | * to avoid races with the disconnect/reconnect/suspend code. | ||
1073 | * In case we need to split the bio here, we need to get two references | ||
1074 | * atomically, otherwise we might deadlock when trying to submit the | ||
1075 | * second one! */ | ||
1076 | inc_ap_bio(mdev, 2); | ||
1077 | |||
1078 | D_ASSERT(e_enr == s_enr + 1); | ||
1079 | |||
1080 | drbd_make_request_common(mdev, &bp->bio1); | ||
1081 | drbd_make_request_common(mdev, &bp->bio2); | ||
1082 | bio_pair_release(bp); | ||
1083 | } | ||
1084 | return 0; | ||
1085 | } | ||
1086 | |||
1087 | /* This is called by bio_add_page(). With this function we reduce | ||
1088 | * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZE | ||
1089 | * units (formerly AL_EXTENTs). | ||
1090 | * | ||
1091 | * we do the calculation within the lower 32bit of the byte offsets, | ||
1092 | * since we don't care about the actual offset, but only check whether it | ||
1093 | * would cross "activity log extent" boundaries. | ||
1094 | * | ||
1095 | * As long as the BIO is empty we have to allow at least one bvec, | ||
1096 | * regardless of size and offset. so the resulting bio may still | ||
1097 | * cross extent boundaries. those are dealt with (bio_split) in | ||
1098 | * drbd_make_request_26. | ||
1099 | */ | ||
1100 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) | ||
1101 | { | ||
1102 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | ||
1103 | unsigned int bio_offset = | ||
1104 | (unsigned int)bvm->bi_sector << 9; /* 32 bit */ | ||
1105 | unsigned int bio_size = bvm->bi_size; | ||
1106 | int limit, backing_limit; | ||
1107 | |||
1108 | limit = DRBD_MAX_SEGMENT_SIZE | ||
1109 | - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); | ||
1110 | if (limit < 0) | ||
1111 | limit = 0; | ||
1112 | if (bio_size == 0) { | ||
1113 | if (limit <= bvec->bv_len) | ||
1114 | limit = bvec->bv_len; | ||
1115 | } else if (limit && get_ldev(mdev)) { | ||
1116 | struct request_queue * const b = | ||
1117 | mdev->ldev->backing_bdev->bd_disk->queue; | ||
1118 | if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { | ||
1119 | backing_limit = b->merge_bvec_fn(b, bvm, bvec); | ||
1120 | limit = min(limit, backing_limit); | ||
1121 | } | ||
1122 | put_ldev(mdev); | ||
1123 | } | ||
1124 | return limit; | ||
1125 | } | ||
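A worked, standalone illustration of the limit arithmetic in drbd_merge_bvec() above. DRBD_MAX_SEGMENT_SIZE is replaced by an assumed 32 KiB MAX_SEG purely for illustration; the point is that a bvec gets trimmed so the bio does not cross an extent boundary, except that an empty bio must always accept its first bvec:

/* Standalone illustration of the drbd_merge_bvec() limit arithmetic.
 * MAX_SEG is an assumed stand-in for DRBD_MAX_SEGMENT_SIZE. */
#include <stdio.h>

#define MAX_SEG (32 * 1024u)	/* assumed extent size, illustration only */

static unsigned merge_limit(unsigned long long bi_sector, unsigned bio_size,
			    unsigned bv_len)
{
	unsigned bio_offset = (unsigned)bi_sector << 9;	/* byte offset, lower 32 bit */
	int limit = MAX_SEG - ((bio_offset & (MAX_SEG - 1)) + bio_size);

	if (limit < 0)
		limit = 0;
	if (bio_size == 0 && (unsigned)limit <= bv_len)
		limit = bv_len;	/* an empty bio must accept at least one bvec */
	return limit;
}

int main(void)
{
	/* bio starting 2 KiB before an extent boundary, still empty:
	 * only 2 KiB would fit, but the first 4 KiB bvec is accepted anyway */
	printf("%u\n", merge_limit(60, 0, 4096));
	/* same position, bio already 1 KiB long: only 1 KiB of the bvec fits */
	printf("%u\n", merge_limit(60, 1024, 4096));
	return 0;
}

The second call shows the normal trimming; the first shows the "at least one bvec" exception that drbd_make_request_26() later compensates for by splitting oversized bios.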
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 000000000000..16119d7056cc --- /dev/null +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -0,0 +1,326 @@ | |||
1 | /* | ||
2 | drbd_req.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
8 | Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
9 | |||
10 | DRBD is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | DRBD is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _DRBD_REQ_H | ||
26 | #define _DRBD_REQ_H | ||
27 | |||
28 | #include <linux/module.h> | ||
29 | |||
30 | #include <linux/slab.h> | ||
31 | #include <linux/drbd.h> | ||
32 | #include "drbd_int.h" | ||
33 | #include "drbd_wrappers.h" | ||
34 | |||
35 | /* The request callbacks will be called in irq context by the IDE drivers, | ||
36 | and in Softirqs/Tasklets/BH context by the SCSI drivers, | ||
37 | and by the receiver and worker in kernel-thread context. | ||
38 | Try to get the locking right :) */ | ||
39 | |||
40 | /* | ||
41 | * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are | ||
42 | * associated with IO requests originating from the block layer above us. | ||
43 | * | ||
44 | * There are quite a few things that may happen to a drbd request | ||
45 | * during its lifetime. | ||
46 | * | ||
47 | * It will be created. | ||
48 | * It will be marked with the intention to be | ||
49 | * submitted to local disk and/or | ||
50 | * send via the network. | ||
51 | * | ||
52 | * It has to be placed on the transfer log and other housekeeping lists, | ||
53 | * in case we have a network connection. | ||
54 | * | ||
55 | * It may be identified as a concurrent (write) request | ||
56 | * and be handled accordingly. | ||
57 | * | ||
58 | * It may be handed over to the local disk subsystem. | ||
59 | * It may be completed by the local disk subsystem, | ||
60 | * either successfully or with io-error. | ||
61 | * In case it is a READ request, and it failed locally, | ||
62 | * it may be retried remotely. | ||
63 | * | ||
64 | * It may be queued for sending. | ||
65 | * It may be handed over to the network stack, | ||
66 | * which may fail. | ||
67 | * It may be acknowledged by the "peer" according to the wire_protocol in use. | ||
68 | * this may be a negative ack. | ||
69 | * It may receive a faked ack when the network connection is lost and the | ||
70 | * transfer log is cleaned up. | ||
71 | * Sending may be canceled due to network connection loss. | ||
72 | * When it finally has outlived its time, | ||
73 | * corresponding dirty bits in the resync-bitmap may be cleared or set, | ||
74 | * it will be destroyed, | ||
75 | * and completion will be signalled to the originator, | ||
76 | * with or without "success". | ||
77 | */ | ||
78 | |||
79 | enum drbd_req_event { | ||
80 | created, | ||
81 | to_be_send, | ||
82 | to_be_submitted, | ||
83 | |||
84 | /* XXX yes, now I am inconsistent... | ||
85 | * these two are not "events" but "actions" | ||
86 | * oh, well... */ | ||
87 | queue_for_net_write, | ||
88 | queue_for_net_read, | ||
89 | |||
90 | send_canceled, | ||
91 | send_failed, | ||
92 | handed_over_to_network, | ||
93 | connection_lost_while_pending, | ||
94 | recv_acked_by_peer, | ||
95 | write_acked_by_peer, | ||
96 | write_acked_by_peer_and_sis, /* and set_in_sync */ | ||
97 | conflict_discarded_by_peer, | ||
98 | neg_acked, | ||
99 | barrier_acked, /* in protocol A and B */ | ||
100 | data_received, /* (remote read) */ | ||
101 | |||
102 | read_completed_with_error, | ||
103 | read_ahead_completed_with_error, | ||
104 | write_completed_with_error, | ||
105 | completed_ok, | ||
106 | nothing, /* for tracing only */ | ||
107 | }; | ||
108 | |||
109 | /* encoding of request states for now. we don't actually need that many bits. | ||
110 | * we don't need to do atomic bit operations either, since most of the time we | ||
111 | * need to look at the connection state and/or manipulate some lists at the | ||
112 | * same time, so we should hold the request lock anyways. | ||
113 | */ | ||
114 | enum drbd_req_state_bits { | ||
115 | /* 210 | ||
116 | * 000: no local possible | ||
117 | * 001: to be submitted | ||
118 | * UNUSED, we could map: 011: submitted, completion still pending | ||
119 | * 110: completed ok | ||
120 | * 010: completed with error | ||
121 | */ | ||
122 | __RQ_LOCAL_PENDING, | ||
123 | __RQ_LOCAL_COMPLETED, | ||
124 | __RQ_LOCAL_OK, | ||
125 | |||
126 | /* 76543 | ||
127 | * 00000: no network possible | ||
128 | * 00001: to be send | ||
129 | * 00011: to be send, on worker queue | ||
130 | * 00101: sent, expecting recv_ack (B) or write_ack (C) | ||
131 | * 11101: sent, | ||
132 | * recv_ack (B) or implicit "ack" (A), | ||
133 | * still waiting for the barrier ack. | ||
134 | * master_bio may already be completed and invalidated. | ||
135 | * 11100: write_acked (C), | ||
136 | * data_received (for remote read, any protocol) | ||
137 | * or finally the barrier ack has arrived (B,A)... | ||
138 | * request can be freed | ||
139 | * 01100: neg-acked (write, protocol C) | ||
140 | * or neg-d-acked (read, any protocol) | ||
141 | * or killed from the transfer log | ||
142 | * during cleanup after connection loss | ||
143 | * request can be freed | ||
144 | * 01000: canceled or send failed... | ||
145 | * request can be freed | ||
146 | */ | ||
147 | |||
148 | /* if "SENT" is not set, yet, this can still fail or be canceled. | ||
149 | * if "SENT" is set already, we still wait for an Ack packet. | ||
150 | * when cleared, the master_bio may be completed. | ||
151 | * in (B,A) the request object may still linger on the transaction log | ||
152 | * until the corresponding barrier ack comes in */ | ||
153 | __RQ_NET_PENDING, | ||
154 | |||
155 | /* If it is QUEUED, and it is a WRITE, it is also registered in the | ||
156 | * transfer log. Currently we need this flag to avoid conflicts between | ||
157 | * worker canceling the request and tl_clear_barrier killing it from | ||
158 | * transfer log. We should restructure the code so this conflict does | ||
159 | * no longer occur. */ | ||
160 | __RQ_NET_QUEUED, | ||
161 | |||
162 | /* well, actually only "handed over to the network stack". | ||
163 | * | ||
164 | * TODO can potentially be dropped because of the similar meaning | ||
165 | * of RQ_NET_SENT and ~RQ_NET_QUEUED. | ||
166 | * however it is not exactly the same. before we drop it | ||
167 | * we must ensure that we can tell a request with network part | ||
168 | * from a request without, regardless of what happens to it. */ | ||
169 | __RQ_NET_SENT, | ||
170 | |||
171 | /* when set, the request may be freed (if RQ_NET_QUEUED is clear). | ||
172 | * basically this means the corresponding P_BARRIER_ACK was received */ | ||
173 | __RQ_NET_DONE, | ||
174 | |||
175 | /* whether or not we know (C) or pretend (B,A) that the write | ||
176 | * was successfully written on the peer. | ||
177 | */ | ||
178 | __RQ_NET_OK, | ||
179 | |||
180 | /* peer called drbd_set_in_sync() for this write */ | ||
181 | __RQ_NET_SIS, | ||
182 | |||
183 | /* keep this last, its for the RQ_NET_MASK */ | ||
184 | __RQ_NET_MAX, | ||
185 | }; | ||
186 | |||
187 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) | ||
188 | #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) | ||
189 | #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) | ||
190 | |||
191 | #define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ | ||
192 | |||
193 | #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) | ||
194 | #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) | ||
195 | #define RQ_NET_SENT (1UL << __RQ_NET_SENT) | ||
196 | #define RQ_NET_DONE (1UL << __RQ_NET_DONE) | ||
197 | #define RQ_NET_OK (1UL << __RQ_NET_OK) | ||
198 | #define RQ_NET_SIS (1UL << __RQ_NET_SIS) | ||
199 | |||
200 | /* 0x1f8 */ | ||
201 | #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) | ||
202 | |||
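The mask values claimed in the comments above (0x07 and 0x1f8) follow directly from the bit positions in enum drbd_req_state_bits; a tiny standalone check, with the enum transcribed here for illustration:

/* Standalone check of the RQ_LOCAL_MASK / RQ_NET_MASK comments above. */
#include <stdio.h>

enum {	/* transcribed from enum drbd_req_state_bits */
	__RQ_LOCAL_PENDING,	/* 0 */
	__RQ_LOCAL_COMPLETED,	/* 1 */
	__RQ_LOCAL_OK,		/* 2 */
	__RQ_NET_PENDING,	/* 3 */
	__RQ_NET_QUEUED,	/* 4 */
	__RQ_NET_SENT,		/* 5 */
	__RQ_NET_DONE,		/* 6 */
	__RQ_NET_OK,		/* 7 */
	__RQ_NET_SIS,		/* 8 */
	__RQ_NET_MAX,		/* 9 */
};

int main(void)
{
	unsigned long rq_local_ok   = 1UL << __RQ_LOCAL_OK;
	unsigned long rq_local_mask = (rq_local_ok << 1) - 1;
	unsigned long rq_net_mask   = ((1UL << __RQ_NET_MAX) - 1) & ~rq_local_mask;

	printf("RQ_LOCAL_MASK = 0x%02lx\n", rq_local_mask);	/* 0x07 */
	printf("RQ_NET_MASK   = 0x%03lx\n", rq_net_mask);	/* 0x1f8 */
	return 0;
}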
203 | /* epoch entries */ | ||
204 | static inline | ||
205 | struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
206 | { | ||
207 | BUG_ON(mdev->ee_hash_s == 0); | ||
208 | return mdev->ee_hash + | ||
209 | ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); | ||
210 | } | ||
211 | |||
212 | /* transfer log (drbd_request objects) */ | ||
213 | static inline | ||
214 | struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
215 | { | ||
216 | BUG_ON(mdev->tl_hash_s == 0); | ||
217 | return mdev->tl_hash + | ||
218 | ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); | ||
219 | } | ||
220 | |||
221 | /* application reads (drbd_request objects) */ | ||
222 | static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
223 | { | ||
224 | return mdev->app_reads_hash | ||
225 | + ((unsigned int)(sector) % APP_R_HSIZE); | ||
226 | } | ||
227 | |||
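The transfer-log and epoch-entry helpers above key on sector >> HT_SHIFT modulo the table size, which is also why drbd_make_request_26() splits bios that straddle a (1 << HT_SHIFT)-sector boundary. A small standalone illustration, reusing HT_SHIFT = 6 from the worked example in drbd_req.c; the 61-slot table size is arbitrary:

/* Standalone illustration of the sector -> hash slot mapping.
 * HT_SHIFT = 6 matches the worked example in drbd_req.c;
 * the slot count of 61 is arbitrary, for illustration only. */
#include <stdio.h>

#define HT_SHIFT   6
#define TL_HASH_S  61u	/* arbitrary slot count */

static unsigned tl_slot(unsigned long long sector)
{
	return (unsigned)(sector >> HT_SHIFT) % TL_HASH_S;
}

int main(void)
{
	/* sectors 262269 and 262272 fall into adjacent 64-sector buckets
	 * (4097 and 4098), hence usually into different hash slots */
	printf("slot(262269) = %u\n", tl_slot(262269));
	printf("slot(262272) = %u\n", tl_slot(262272));
	return 0;
}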
228 | /* when we receive the answer for a read request, | ||
229 | * verify that we actually know about it */ | ||
230 | static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, | ||
231 | u64 id, sector_t sector) | ||
232 | { | ||
233 | struct hlist_head *slot = ar_hash_slot(mdev, sector); | ||
234 | struct hlist_node *n; | ||
235 | struct drbd_request *req; | ||
236 | |||
237 | hlist_for_each_entry(req, n, slot, colision) { | ||
238 | if ((unsigned long)req == (unsigned long)id) { | ||
239 | D_ASSERT(req->sector == sector); | ||
240 | return req; | ||
241 | } | ||
242 | } | ||
243 | return NULL; | ||
244 | } | ||
245 | |||
246 | static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, | ||
247 | struct bio *bio_src) | ||
248 | { | ||
249 | struct bio *bio; | ||
250 | struct drbd_request *req = | ||
251 | mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
252 | if (likely(req)) { | ||
253 | bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ | ||
254 | |||
255 | req->rq_state = 0; | ||
256 | req->mdev = mdev; | ||
257 | req->master_bio = bio_src; | ||
258 | req->private_bio = bio; | ||
259 | req->epoch = 0; | ||
260 | req->sector = bio->bi_sector; | ||
261 | req->size = bio->bi_size; | ||
262 | req->start_time = jiffies; | ||
263 | INIT_HLIST_NODE(&req->colision); | ||
264 | INIT_LIST_HEAD(&req->tl_requests); | ||
265 | INIT_LIST_HEAD(&req->w.list); | ||
266 | |||
267 | bio->bi_private = req; | ||
268 | bio->bi_end_io = drbd_endio_pri; | ||
269 | bio->bi_next = NULL; | ||
270 | } | ||
271 | return req; | ||
272 | } | ||
273 | |||
274 | static inline void drbd_req_free(struct drbd_request *req) | ||
275 | { | ||
276 | mempool_free(req, drbd_request_mempool); | ||
277 | } | ||
278 | |||
279 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
280 | { | ||
281 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
282 | } | ||
283 | |||
284 | /* Short lived temporary struct on the stack. | ||
285 | * We could squirrel the error to be returned into | ||
286 | * bio->bi_size, or similar. But that would be too ugly. */ | ||
287 | struct bio_and_error { | ||
288 | struct bio *bio; | ||
289 | int error; | ||
290 | }; | ||
291 | |||
292 | extern void _req_may_be_done(struct drbd_request *req, | ||
293 | struct bio_and_error *m); | ||
294 | extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, | ||
295 | struct bio_and_error *m); | ||
296 | extern void complete_master_bio(struct drbd_conf *mdev, | ||
297 | struct bio_and_error *m); | ||
298 | |||
299 | /* use this if you don't want to deal with calling complete_master_bio() | ||
300 | * outside the spinlock, e.g. when walking some list on cleanup. */ | ||
301 | static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) | ||
302 | { | ||
303 | struct drbd_conf *mdev = req->mdev; | ||
304 | struct bio_and_error m; | ||
305 | |||
306 | /* __req_mod possibly frees req, do not touch req after that! */ | ||
307 | __req_mod(req, what, &m); | ||
308 | if (m.bio) | ||
309 | complete_master_bio(mdev, &m); | ||
310 | } | ||
311 | |||
312 | /* completion of master bio is outside of spinlock. | ||
313 | * If you need it irqsave, do it yourself! */ | ||
314 | static inline void req_mod(struct drbd_request *req, | ||
315 | enum drbd_req_event what) | ||
316 | { | ||
317 | struct drbd_conf *mdev = req->mdev; | ||
318 | struct bio_and_error m; | ||
319 | spin_lock_irq(&mdev->req_lock); | ||
320 | __req_mod(req, what, &m); | ||
321 | spin_unlock_irq(&mdev->req_lock); | ||
322 | |||
323 | if (m.bio) | ||
324 | complete_master_bio(mdev, &m); | ||
325 | } | ||
326 | #endif | ||
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 000000000000..76863e3f05be --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -0,0 +1,113 @@ | |||
1 | /* | ||
2 | drbd_strings.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/drbd.h> | ||
27 | |||
28 | static const char *drbd_conn_s_names[] = { | ||
29 | [C_STANDALONE] = "StandAlone", | ||
30 | [C_DISCONNECTING] = "Disconnecting", | ||
31 | [C_UNCONNECTED] = "Unconnected", | ||
32 | [C_TIMEOUT] = "Timeout", | ||
33 | [C_BROKEN_PIPE] = "BrokenPipe", | ||
34 | [C_NETWORK_FAILURE] = "NetworkFailure", | ||
35 | [C_PROTOCOL_ERROR] = "ProtocolError", | ||
36 | [C_WF_CONNECTION] = "WFConnection", | ||
37 | [C_WF_REPORT_PARAMS] = "WFReportParams", | ||
38 | [C_TEAR_DOWN] = "TearDown", | ||
39 | [C_CONNECTED] = "Connected", | ||
40 | [C_STARTING_SYNC_S] = "StartingSyncS", | ||
41 | [C_STARTING_SYNC_T] = "StartingSyncT", | ||
42 | [C_WF_BITMAP_S] = "WFBitMapS", | ||
43 | [C_WF_BITMAP_T] = "WFBitMapT", | ||
44 | [C_WF_SYNC_UUID] = "WFSyncUUID", | ||
45 | [C_SYNC_SOURCE] = "SyncSource", | ||
46 | [C_SYNC_TARGET] = "SyncTarget", | ||
47 | [C_PAUSED_SYNC_S] = "PausedSyncS", | ||
48 | [C_PAUSED_SYNC_T] = "PausedSyncT", | ||
49 | [C_VERIFY_S] = "VerifyS", | ||
50 | [C_VERIFY_T] = "VerifyT", | ||
51 | }; | ||
52 | |||
53 | static const char *drbd_role_s_names[] = { | ||
54 | [R_PRIMARY] = "Primary", | ||
55 | [R_SECONDARY] = "Secondary", | ||
56 | [R_UNKNOWN] = "Unknown" | ||
57 | }; | ||
58 | |||
59 | static const char *drbd_disk_s_names[] = { | ||
60 | [D_DISKLESS] = "Diskless", | ||
61 | [D_ATTACHING] = "Attaching", | ||
62 | [D_FAILED] = "Failed", | ||
63 | [D_NEGOTIATING] = "Negotiating", | ||
64 | [D_INCONSISTENT] = "Inconsistent", | ||
65 | [D_OUTDATED] = "Outdated", | ||
66 | [D_UNKNOWN] = "DUnknown", | ||
67 | [D_CONSISTENT] = "Consistent", | ||
68 | [D_UP_TO_DATE] = "UpToDate", | ||
69 | }; | ||
70 | |||
71 | static const char *drbd_state_sw_errors[] = { | ||
72 | [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", | ||
73 | [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", | ||
74 | [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", | ||
75 | [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", | ||
76 | [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", | ||
77 | [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", | ||
78 | [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", | ||
79 | [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", | ||
80 | [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", | ||
81 | [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", | ||
82 | [-SS_DEVICE_IN_USE] = "Device is held open by someone", | ||
83 | [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", | ||
84 | [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", | ||
85 | [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", | ||
86 | [-SS_NOT_SUPPORTED] = "Peer does not support protocol", | ||
87 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | ||
88 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | ||
89 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | ||
90 | }; | ||
91 | |||
92 | const char *drbd_conn_str(enum drbd_conns s) | ||
93 | { | ||
94 | /* enums are unsigned... */ | ||
95 | return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; | ||
96 | } | ||
97 | |||
98 | const char *drbd_role_str(enum drbd_role s) | ||
99 | { | ||
100 | return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; | ||
101 | } | ||
102 | |||
103 | const char *drbd_disk_str(enum drbd_disk_state s) | ||
104 | { | ||
105 | return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; | ||
106 | } | ||
107 | |||
108 | const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) | ||
109 | { | ||
110 | return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : | ||
111 | err > SS_TWO_PRIMARIES ? "TOO_LARGE" | ||
112 | : drbd_state_sw_errors[-err]; | ||
113 | } | ||
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 000000000000..fc824006e721 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h | |||
@@ -0,0 +1,351 @@ | |||
1 | /* | ||
2 | -*- linux-c -*- | ||
3 | drbd_vli.h | ||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _DRBD_VLI_H | ||
26 | #define _DRBD_VLI_H | ||
27 | |||
28 | /* | ||
29 | * At a granularity of 4KiB storage represented per bit, | ||
30 | * and storage sizes of several TiB, | ||
31 | * and possibly small-bandwidth replication, | ||
32 | * the bitmap transfer time can take much too long, | ||
33 | * if transmitted in plain text. | ||
34 | * | ||
35 | * We try to reduce the transferred bitmap information | ||
36 | * by encoding runlengths of bit polarity. | ||
37 | * | ||
38 | * We never actually need to encode a "zero" (runlengths are positive). | ||
39 | * But then we have to store the value of the first bit. | ||
40 | * The first bit of information thus shall encode if the first runlength | ||
41 | * gives the number of set or unset bits. | ||
42 | * | ||
43 | * We assume that large areas are either completely set or unset, | ||
44 | * which gives good compression with any runlength method, | ||
45 | * even when encoding the runlength as fixed size 32bit/64bit integers. | ||
46 | * | ||
47 | * Still, there may be areas where the polarity flips every few bits, | ||
48 | * and encoding the runlength sequence of those areas with fixed size | ||
49 | * integers would be much worse than plaintext. | ||
50 | * | ||
51 | * We want to encode small runlength values with minimum code length, | ||
52 | * while still being able to encode a Huge run of all zeros. | ||
53 | * | ||
54 | * Thus we need a Variable Length Integer encoding, VLI. | ||
55 | * | ||
56 | * For some cases, we produce more code bits than plaintext input. | ||
57 | * We need to send incompressible chunks as plaintext, skip over them | ||
58 | * and then see if the next chunk compresses better. | ||
59 | * | ||
60 | * We don't care too much about "excellent" compression ratio for large | ||
61 | * runlengths (all set/all clear): whether we achieve a factor of 100 | ||
62 | * or 1000 is not that much of an issue. | ||
63 | * We do not want to waste too much on short runlengths in the "noisy" | ||
64 | * parts of the bitmap, though. | ||
65 | * | ||
66 | * There are endless variants of VLI, we experimented with: | ||
67 | * * simple byte-based | ||
68 | * * various bit-based with different code word lengths. | ||
69 | * | ||
70 | * To avoid yet another configuration parameter (choice of bitmap compression | ||
71 | * algorithm) which was difficult to explain and tune, we just chose the one | ||
72 | * variant that turned out best in all test cases. | ||
73 | * Based on real world usage patterns, with device sizes ranging from a few GiB | ||
74 | * to several TiB, file server/mailserver/webserver/mysql/postgres, | ||
75 | * mostly idle to really busy, the all time winner (though sometimes only | ||
76 | * marginally better) is: | ||
77 | */ | ||
78 | |||
79 | /* | ||
80 | * encoding is "visualised" as | ||
81 | * __little endian__ bitstream, least significant bit first (left most) | ||
82 | * | ||
83 | * this particular encoding is chosen so that the prefix code | ||
84 | * starts as a unary encoding of the level, and is then modified so that | ||
85 | * 10 levels can be described in 8bit, with minimal overhead | ||
86 | * for the smaller levels. | ||
87 | * | ||
88 | * The number of data bits follows the Fibonacci sequence, with the exception of | ||
89 | * the last level (+1 data bit, so it makes 64 bit total). The only worse code when | ||
90 | * encoding bit polarity runlength is 1 plain bit => 2 code bits. | ||
91 | prefix data bits max val Nº data bits | ||
92 | 0 x 0x2 1 | ||
93 | 10 x 0x4 1 | ||
94 | 110 xx 0x8 2 | ||
95 | 1110 xxx 0x10 3 | ||
96 | 11110 xxx xx 0x30 5 | ||
97 | 111110 xx xxxxxx 0x130 8 | ||
98 | 11111100 xxxxxxxx xxxxx 0x2130 13 | ||
99 | 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 | ||
100 | 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 | ||
101 | 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 | ||
102 | * maximum encodable value: 0x100000400202130 == 2**56 + some */ | ||
103 | |||
104 | /* compression "table": | ||
105 | transmitted x 0.29 | ||
106 | as plaintext x ........................ | ||
107 | x ........................ | ||
108 | x ........................ | ||
109 | x 0.59 0.21........................ | ||
110 | x ........................................................ | ||
111 | x .. c ................................................... | ||
112 | x 0.44.. o ................................................... | ||
113 | x .......... d ................................................... | ||
114 | x .......... e ................................................... | ||
115 | X............. ................................................... | ||
116 | x.............. b ................................................... | ||
117 | 2.0x............... i ................................................... | ||
118 | #X................ t ................................................... | ||
119 | #................. s ........................... plain bits .......... | ||
120 | -+----------------------------------------------------------------------- | ||
121 | 1 16 32 64 | ||
122 | */ | ||
123 | |||
124 | /* LEVEL: (total bits, prefix bits, prefix value), | ||
125 | * sorted ascending by number of total bits. | ||
126 | * The rest of the code table is calculated at compiletime from this. */ | ||
127 | |||
128 | /* fibonacci data 1, 1, ... */ | ||
129 | #define VLI_L_1_1() do { \ | ||
130 | LEVEL( 2, 1, 0x00); \ | ||
131 | LEVEL( 3, 2, 0x01); \ | ||
132 | LEVEL( 5, 3, 0x03); \ | ||
133 | LEVEL( 7, 4, 0x07); \ | ||
134 | LEVEL(10, 5, 0x0f); \ | ||
135 | LEVEL(14, 6, 0x1f); \ | ||
136 | LEVEL(21, 8, 0x3f); \ | ||
137 | LEVEL(29, 8, 0x7f); \ | ||
138 | LEVEL(42, 8, 0xbf); \ | ||
139 | LEVEL(64, 8, 0xff); \ | ||
140 | } while (0) | ||
141 | |||
142 | /* finds a suitable level to decode the least significant part of in. | ||
143 | * returns number of bits consumed. | ||
144 | * | ||
145 | * BUG() for bad input, as that would mean a buggy code table. */ | ||
146 | static inline int vli_decode_bits(u64 *out, const u64 in) | ||
147 | { | ||
148 | u64 adj = 1; | ||
149 | |||
150 | #define LEVEL(t,b,v) \ | ||
151 | do { \ | ||
152 | if ((in & ((1 << b) -1)) == v) { \ | ||
153 | *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ | ||
154 | return t; \ | ||
155 | } \ | ||
156 | adj += 1ULL << (t - b); \ | ||
157 | } while (0) | ||
158 | |||
159 | VLI_L_1_1(); | ||
160 | |||
161 | /* NOT REACHED, if VLI_LEVELS code table is defined properly */ | ||
162 | BUG(); | ||
163 | #undef LEVEL | ||
164 | } | ||
165 | |||
166 | /* return number of code bits needed, | ||
167 | * or negative error number */ | ||
168 | static inline int __vli_encode_bits(u64 *out, const u64 in) | ||
169 | { | ||
170 | u64 max = 0; | ||
171 | u64 adj = 1; | ||
172 | |||
173 | if (in == 0) | ||
174 | return -EINVAL; | ||
175 | |||
176 | #define LEVEL(t,b,v) do { \ | ||
177 | max += 1ULL << (t - b); \ | ||
178 | if (in <= max) { \ | ||
179 | if (out) \ | ||
180 | *out = ((in - adj) << b) | v; \ | ||
181 | return t; \ | ||
182 | } \ | ||
183 | adj = max + 1; \ | ||
184 | } while (0) | ||
185 | |||
186 | VLI_L_1_1(); | ||
187 | |||
188 | return -EOVERFLOW; | ||
189 | #undef LEVEL | ||
190 | } | ||
191 | |||
192 | #undef VLI_L_1_1 | ||
193 | |||
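To make the code table concrete, here is a userspace transcription of the level walk done by __vli_encode_bits()/vli_decode_bits(), using the same LEVEL(total, prefix, value) rows, so round trips can be spot-checked for a few run lengths; it is an illustration, not the in-kernel interface:

/* Userspace transcription of the VLI level walk above, for illustration. */
#include <stdio.h>
#include <stdint.h>

struct level { int total, prefix; uint64_t value; };

static const struct level levels[] = {	/* same rows as VLI_L_1_1() */
	{ 2, 1, 0x00}, { 3, 2, 0x01}, { 5, 3, 0x03}, { 7, 4, 0x07},
	{10, 5, 0x0f}, {14, 6, 0x1f}, {21, 8, 0x3f}, {29, 8, 0x7f},
	{42, 8, 0xbf}, {64, 8, 0xff},
};

static int encode(uint64_t *out, uint64_t in)
{
	uint64_t max = 0, adj = 1;
	unsigned i;

	for (i = 0; i < sizeof(levels)/sizeof(levels[0]); i++) {
		const struct level *l = &levels[i];
		max += (uint64_t)1 << (l->total - l->prefix);
		if (in <= max) {
			*out = ((in - adj) << l->prefix) | l->value;
			return l->total;	/* number of code bits */
		}
		adj = max + 1;
	}
	return -1;	/* would be -EOVERFLOW in the kernel code */
}

static int decode(uint64_t *out, uint64_t in)
{
	uint64_t adj = 1;
	unsigned i;

	for (i = 0; i < sizeof(levels)/sizeof(levels[0]); i++) {
		const struct level *l = &levels[i];
		if ((in & (((uint64_t)1 << l->prefix) - 1)) == l->value) {
			*out = ((in & (~0ULL >> (64 - l->total))) >> l->prefix) + adj;
			return l->total;
		}
		adj += (uint64_t)1 << (l->total - l->prefix);
	}
	return -1;
}

int main(void)
{
	uint64_t vals[] = {1, 2, 3, 17, 305, 1000000}, code, back;
	unsigned i;

	for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
		int bits = encode(&code, vals[i]);
		decode(&back, code);
		printf("%llu -> %d code bits -> %llu\n",
		       (unsigned long long)vals[i], bits,
		       (unsigned long long)back);
	}
	return 0;
}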
194 | /* code from here down is independent of the actually used bit code */ | ||
195 | |||
196 | /* | ||
197 | * Code length is determined by some unique (e.g. unary) prefix. | ||
198 | * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, | ||
199 | * not a byte stream. | ||
200 | */ | ||
201 | |||
202 | /* for the bitstream, we need a cursor */ | ||
203 | struct bitstream_cursor { | ||
204 | /* the current byte */ | ||
205 | u8 *b; | ||
206 | /* the current bit within *b, normalized: 0..7 */ | ||
207 | unsigned int bit; | ||
208 | }; | ||
209 | |||
210 | /* initialize cursor to point to first bit of stream */ | ||
211 | static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) | ||
212 | { | ||
213 | cur->b = s; | ||
214 | cur->bit = 0; | ||
215 | } | ||
216 | |||
217 | /* advance cursor by that many bits; maximum expected input value: 64, | ||
218 | * but depending on VLI implementation, it may be more. */ | ||
219 | static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) | ||
220 | { | ||
221 | bits += cur->bit; | ||
222 | cur->b = cur->b + (bits >> 3); | ||
223 | cur->bit = bits & 7; | ||
224 | } | ||
225 | |||
226 | /* the bitstream itself knows its length */ | ||
227 | struct bitstream { | ||
228 | struct bitstream_cursor cur; | ||
229 | unsigned char *buf; | ||
230 | size_t buf_len; /* in bytes */ | ||
231 | |||
232 | /* for input stream: | ||
233 | * number of trailing 0 bits for padding | ||
234 | * total number of valid bits in stream: buf_len * 8 - pad_bits */ | ||
235 | unsigned int pad_bits; | ||
236 | }; | ||
237 | |||
238 | static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) | ||
239 | { | ||
240 | bs->buf = s; | ||
241 | bs->buf_len = len; | ||
242 | bs->pad_bits = pad_bits; | ||
243 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
244 | } | ||
245 | |||
246 | static inline void bitstream_rewind(struct bitstream *bs) | ||
247 | { | ||
248 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
249 | memset(bs->buf, 0, bs->buf_len); | ||
250 | } | ||
251 | |||
252 | /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. | ||
253 | * Ignores "pad_bits". | ||
254 | * Returns zero if bits == 0 (nothing to do). | ||
255 | * Returns number of bits used if successful. | ||
256 | * | ||
257 | * If there is not enough room left in bitstream, | ||
258 | * leaves bitstream unchanged and returns -ENOBUFS. | ||
259 | */ | ||
260 | static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) | ||
261 | { | ||
262 | unsigned char *b = bs->cur.b; | ||
263 | unsigned int tmp; | ||
264 | |||
265 | if (bits == 0) | ||
266 | return 0; | ||
267 | |||
268 | if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) | ||
269 | return -ENOBUFS; | ||
270 | |||
271 | /* paranoia: strip off hi bits; they should not be set anyways. */ | ||
272 | if (bits < 64) | ||
273 | val &= ~0ULL >> (64 - bits); | ||
274 | |||
275 | *b++ |= (val & 0xff) << bs->cur.bit; | ||
276 | |||
277 | for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) | ||
278 | *b++ |= (val >> tmp) & 0xff; | ||
279 | |||
280 | bitstream_cursor_advance(&bs->cur, bits); | ||
281 | return bits; | ||
282 | } | ||
283 | |||
284 | /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. | ||
285 | * | ||
286 | * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged. | ||
287 | * | ||
288 | * If there are less than the requested number of valid bits left in the | ||
289 | * bitstream, still fetches all available bits. | ||
290 | * | ||
291 | * Returns number of actually fetched bits. | ||
292 | */ | ||
293 | static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) | ||
294 | { | ||
295 | u64 val; | ||
296 | unsigned int n; | ||
297 | |||
298 | if (bits > 64) | ||
299 | return -EINVAL; | ||
300 | |||
301 | if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) | ||
302 | bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) | ||
303 | - bs->cur.bit - bs->pad_bits; | ||
304 | |||
305 | if (bits == 0) { | ||
306 | *out = 0; | ||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | /* get the high bits */ | ||
311 | val = 0; | ||
312 | n = (bs->cur.bit + bits + 7) >> 3; | ||
313 | /* n may be at most 9, if cur.bit + bits > 64 */ | ||
314 | /* which means this copies at most 8 bytes */ | ||
315 | if (n) { | ||
316 | memcpy(&val, bs->cur.b+1, n - 1); | ||
317 | val = le64_to_cpu(val) << (8 - bs->cur.bit); | ||
318 | } | ||
319 | |||
320 | /* we still need the low bits */ | ||
321 | val |= bs->cur.b[0] >> bs->cur.bit; | ||
322 | |||
323 | /* and mask out bits we don't want */ | ||
324 | val &= ~0ULL >> (64 - bits); | ||
325 | |||
326 | bitstream_cursor_advance(&bs->cur, bits); | ||
327 | *out = val; | ||
328 | |||
329 | return bits; | ||
330 | } | ||
331 | |||
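The two helpers above pack and unpack code words least-significant-bit first across byte boundaries. A deliberately simpler, bit-at-a-time userspace sketch of the same stream layout (no -ENOBUFS/padding handling, fixed toy buffer):

/* Standalone sketch of the LSB-first bit packing used by the bitstream
 * helpers above (simplified: no overflow checks, small fixed buffer). */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static unsigned char buf[16];
static unsigned put_pos, get_pos;	/* cursor positions in bits */

static void put_bits(uint64_t val, unsigned bits)
{
	unsigned i;
	for (i = 0; i < bits; i++, put_pos++)
		if ((val >> i) & 1)
			buf[put_pos >> 3] |= 1u << (put_pos & 7);
}

static uint64_t get_bits(unsigned bits)
{
	uint64_t val = 0;
	unsigned i;
	for (i = 0; i < bits; i++, get_pos++)
		val |= (uint64_t)((buf[get_pos >> 3] >> (get_pos & 7)) & 1) << i;
	return val;
}

int main(void)
{
	memset(buf, 0, sizeof(buf));
	put_bits(0x15, 5);	/* a 5-bit code word */
	put_bits(0x5a, 7);	/* the next code word starts mid-byte */
	printf("buf[0]=0x%02x buf[1]=0x%02x\n", buf[0], buf[1]);
	printf("read back: 0x%llx 0x%llx\n",
	       (unsigned long long)get_bits(5), (unsigned long long)get_bits(7));
	return 0;
}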
332 | /* encodes @in as vli into @bs; | ||
333 | |||
334 | * return values | ||
335 | * > 0: number of bits successfully stored in bitstream | ||
336 | * -ENOBUFS @bs is full | ||
337 | * -EINVAL input zero (invalid) | ||
338 | * -EOVERFLOW input too large for this vli code (invalid) | ||
339 | */ | ||
340 | static inline int vli_encode_bits(struct bitstream *bs, u64 in) | ||
341 | { | ||
342 | u64 code = code; /* self-assignment silences "may be used uninitialized" warnings */ | ||
343 | int bits = __vli_encode_bits(&code, in); | ||
344 | |||
345 | if (bits <= 0) | ||
346 | return bits; | ||
347 | |||
348 | return bitstream_put_bits(bs, code, bits); | ||
349 | } | ||
350 | |||
351 | #endif | ||
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 000000000000..d48a1dfd7b24 --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -0,0 +1,1516 @@ | |||
1 | /* | ||
2 | drbd_worker.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | #include <linux/wait.h> | ||
31 | #include <linux/mm.h> | ||
32 | #include <linux/memcontrol.h> | ||
33 | #include <linux/mm_inline.h> | ||
34 | #include <linux/slab.h> | ||
35 | #include <linux/random.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/scatterlist.h> | ||
38 | |||
39 | #include "drbd_int.h" | ||
40 | #include "drbd_req.h" | ||
41 | |||
42 | #define SLEEP_TIME (HZ/10) | ||
43 | |||
44 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); | ||
45 | |||
46 | |||
47 | |||
48 | /* defined here: | ||
49 | drbd_md_io_complete | ||
50 | drbd_endio_write_sec | ||
51 | drbd_endio_read_sec | ||
52 | drbd_endio_pri | ||
53 | |||
54 | * more endio handlers: | ||
55 | atodb_endio in drbd_actlog.c | ||
56 | drbd_bm_async_io_complete in drbd_bitmap.c | ||
57 | |||
58 | * For all these callbacks, note the following: | ||
59 | * The callbacks will be called in irq context by the IDE drivers, | ||
60 | * and in Softirqs/Tasklets/BH context by the SCSI drivers. | ||
61 | * Try to get the locking right :) | ||
62 | * | ||
63 | */ | ||
64 | |||
65 | |||
66 | /* About the global_state_lock | ||
67 | Each state transition on a device holds a read lock. In case we have | ||
68 | to evaluate the sync after dependencies, we grab a write lock, because | ||
69 | we need stable states on all devices for that. */ | ||
70 | rwlock_t global_state_lock; | ||
71 | |||
72 | /* used for synchronous meta data and bitmap IO | ||
73 | * submitted by drbd_md_sync_page_io() | ||
74 | */ | ||
75 | void drbd_md_io_complete(struct bio *bio, int error) | ||
76 | { | ||
77 | struct drbd_md_io *md_io; | ||
78 | |||
79 | md_io = (struct drbd_md_io *)bio->bi_private; | ||
80 | md_io->error = error; | ||
81 | |||
82 | complete(&md_io->event); | ||
83 | } | ||
84 | |||
85 | /* reads on behalf of the partner, | ||
86 | * "submitted" by the receiver | ||
87 | */ | ||
88 | void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) | ||
89 | { | ||
90 | unsigned long flags = 0; | ||
91 | struct drbd_epoch_entry *e = NULL; | ||
92 | struct drbd_conf *mdev; | ||
93 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
94 | |||
95 | e = bio->bi_private; | ||
96 | mdev = e->mdev; | ||
97 | |||
98 | if (error) | ||
99 | dev_warn(DEV, "read: error=%d s=%llus\n", error, | ||
100 | (unsigned long long)e->sector); | ||
101 | if (!error && !uptodate) { | ||
102 | dev_warn(DEV, "read: setting error to -EIO s=%llus\n", | ||
103 | (unsigned long long)e->sector); | ||
104 | /* strange behavior of some lower level drivers... | ||
105 | * fail the request by clearing the uptodate flag, | ||
106 | * but do not return any error?! */ | ||
107 | error = -EIO; | ||
108 | } | ||
109 | |||
110 | D_ASSERT(e->block_id != ID_VACANT); | ||
111 | |||
112 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
113 | mdev->read_cnt += e->size >> 9; | ||
114 | list_del(&e->w.list); | ||
115 | if (list_empty(&mdev->read_ee)) | ||
116 | wake_up(&mdev->ee_wait); | ||
117 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
118 | |||
119 | drbd_chk_io_error(mdev, error, FALSE); | ||
120 | drbd_queue_work(&mdev->data.work, &e->w); | ||
121 | put_ldev(mdev); | ||
122 | } | ||
123 | |||
124 | /* writes on behalf of the partner, or resync writes, | ||
125 | * "submitted" by the receiver. | ||
126 | */ | ||
127 | void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | ||
128 | { | ||
129 | unsigned long flags = 0; | ||
130 | struct drbd_epoch_entry *e = NULL; | ||
131 | struct drbd_conf *mdev; | ||
132 | sector_t e_sector; | ||
133 | int do_wake; | ||
134 | int is_syncer_req; | ||
135 | int do_al_complete_io; | ||
136 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
137 | int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); | ||
138 | |||
139 | e = bio->bi_private; | ||
140 | mdev = e->mdev; | ||
141 | |||
142 | if (error) | ||
143 | dev_warn(DEV, "write: error=%d s=%llus\n", error, | ||
144 | (unsigned long long)e->sector); | ||
145 | if (!error && !uptodate) { | ||
146 | dev_warn(DEV, "write: setting error to -EIO s=%llus\n", | ||
147 | (unsigned long long)e->sector); | ||
148 | /* strange behavior of some lower level drivers... | ||
149 | * fail the request by clearing the uptodate flag, | ||
150 | * but do not return any error?! */ | ||
151 | error = -EIO; | ||
152 | } | ||
153 | |||
154 | /* error == -ENOTSUPP would be a better test, | ||
155 | * alas it is not reliable */ | ||
156 | if (error && is_barrier && e->flags & EE_IS_BARRIER) { | ||
157 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | ||
158 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
159 | list_del(&e->w.list); | ||
160 | e->w.cb = w_e_reissue; | ||
161 | /* put_ldev actually happens below, once we come here again. */ | ||
162 | __release(local); | ||
163 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
164 | drbd_queue_work(&mdev->data.work, &e->w); | ||
165 | return; | ||
166 | } | ||
167 | |||
168 | D_ASSERT(e->block_id != ID_VACANT); | ||
169 | |||
170 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
171 | mdev->writ_cnt += e->size >> 9; | ||
172 | is_syncer_req = is_syncer_block_id(e->block_id); | ||
173 | |||
174 | /* after we moved e to done_ee, | ||
175 | * we may no longer access it, | ||
176 | * it may be freed/reused already! | ||
177 | * (as soon as we release the req_lock) */ | ||
178 | e_sector = e->sector; | ||
179 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | ||
180 | |||
181 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | ||
182 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
183 | |||
184 | /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, | ||
185 | * neither did we wake possibly waiting conflicting requests. | ||
186 | * done from "drbd_process_done_ee" within the appropriate w.cb | ||
187 | * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ | ||
188 | |||
189 | do_wake = is_syncer_req | ||
190 | ? list_empty(&mdev->sync_ee) | ||
191 | : list_empty(&mdev->active_ee); | ||
192 | |||
193 | if (error) | ||
194 | __drbd_chk_io_error(mdev, FALSE); | ||
195 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
196 | |||
197 | if (is_syncer_req) | ||
198 | drbd_rs_complete_io(mdev, e_sector); | ||
199 | |||
200 | if (do_wake) | ||
201 | wake_up(&mdev->ee_wait); | ||
202 | |||
203 | if (do_al_complete_io) | ||
204 | drbd_al_complete_io(mdev, e_sector); | ||
205 | |||
206 | wake_asender(mdev); | ||
207 | put_ldev(mdev); | ||
208 | |||
209 | } | ||
210 | |||
211 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | ||
212 | */ | ||
213 | void drbd_endio_pri(struct bio *bio, int error) | ||
214 | { | ||
215 | unsigned long flags; | ||
216 | struct drbd_request *req = bio->bi_private; | ||
217 | struct drbd_conf *mdev = req->mdev; | ||
218 | struct bio_and_error m; | ||
219 | enum drbd_req_event what; | ||
220 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
221 | |||
222 | if (error) | ||
223 | dev_warn(DEV, "p %s: error=%d\n", | ||
224 | bio_data_dir(bio) == WRITE ? "write" : "read", error); | ||
225 | if (!error && !uptodate) { | ||
226 | dev_warn(DEV, "p %s: setting error to -EIO\n", | ||
227 | bio_data_dir(bio) == WRITE ? "write" : "read"); | ||
228 | /* strange behavior of some lower level drivers... | ||
229 | * fail the request by clearing the uptodate flag, | ||
230 | * but do not return any error?! */ | ||
231 | error = -EIO; | ||
232 | } | ||
233 | |||
234 | /* to avoid recursion in __req_mod */ | ||
235 | if (unlikely(error)) { | ||
236 | what = (bio_data_dir(bio) == WRITE) | ||
237 | ? write_completed_with_error | ||
238 | : (bio_rw(bio) == READ) | ||
239 | ? read_completed_with_error | ||
240 | : read_ahead_completed_with_error; | ||
241 | } else | ||
242 | what = completed_ok; | ||
243 | |||
244 | bio_put(req->private_bio); | ||
245 | req->private_bio = ERR_PTR(error); | ||
246 | |||
247 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
248 | __req_mod(req, what, &m); | ||
249 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
250 | |||
251 | if (m.bio) | ||
252 | complete_master_bio(mdev, &m); | ||
253 | } | ||
254 | |||
255 | int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
256 | { | ||
257 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
258 | |||
259 | /* NOTE: mdev->ldev can be NULL by the time we get here! */ | ||
260 | /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ | ||
261 | |||
262 | /* the only way this callback is scheduled is from _req_may_be_done, | ||
263 | * when it is done and had a local write error, see comments there */ | ||
264 | drbd_req_free(req); | ||
265 | |||
266 | return TRUE; | ||
267 | } | ||
268 | |||
269 | int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
270 | { | ||
271 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
272 | |||
273 | /* We should not detach for read io-error, | ||
274 | * but try to WRITE the P_DATA_REPLY to the failed location, | ||
275 | * to give the disk the chance to relocate that block */ | ||
276 | |||
277 | spin_lock_irq(&mdev->req_lock); | ||
278 | if (cancel || | ||
279 | mdev->state.conn < C_CONNECTED || | ||
280 | mdev->state.pdsk <= D_INCONSISTENT) { | ||
281 | _req_mod(req, send_canceled); | ||
282 | spin_unlock_irq(&mdev->req_lock); | ||
283 | dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); | ||
284 | return 1; | ||
285 | } | ||
286 | spin_unlock_irq(&mdev->req_lock); | ||
287 | |||
288 | return w_send_read_req(mdev, w, 0); | ||
289 | } | ||
290 | |||
291 | int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
292 | { | ||
293 | ERR_IF(cancel) return 1; | ||
294 | dev_err(DEV, "resync inactive, but callback triggered??\n"); | ||
295 | return 1; /* Simply ignore this! */ | ||
296 | } | ||
297 | |||
298 | void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) | ||
299 | { | ||
300 | struct hash_desc desc; | ||
301 | struct scatterlist sg; | ||
302 | struct bio_vec *bvec; | ||
303 | int i; | ||
304 | |||
305 | desc.tfm = tfm; | ||
306 | desc.flags = 0; | ||
307 | |||
308 | sg_init_table(&sg, 1); | ||
309 | crypto_hash_init(&desc); | ||
310 | |||
311 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
312 | sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); | ||
313 | crypto_hash_update(&desc, &sg, sg.length); | ||
314 | } | ||
315 | crypto_hash_final(&desc, digest); | ||
316 | } | ||
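/* Note: drbd_csum() expects "digest" to point to a buffer of at least
 * crypto_hash_digestsize(tfm) bytes; the callers in this file
 * (w_e_send_csum, w_e_end_csum_rs_req and the online-verify paths)
 * kmalloc() exactly that much. */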
317 | |||
318 | static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
319 | { | ||
320 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
321 | int digest_size; | ||
322 | void *digest; | ||
323 | int ok; | ||
324 | |||
325 | D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); | ||
326 | |||
327 | if (unlikely(cancel)) { | ||
328 | drbd_free_ee(mdev, e); | ||
329 | return 1; | ||
330 | } | ||
331 | |||
332 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
333 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | ||
334 | digest = kmalloc(digest_size, GFP_NOIO); | ||
335 | if (digest) { | ||
336 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | ||
337 | |||
338 | inc_rs_pending(mdev); | ||
339 | ok = drbd_send_drequest_csum(mdev, | ||
340 | e->sector, | ||
341 | e->size, | ||
342 | digest, | ||
343 | digest_size, | ||
344 | P_CSUM_RS_REQUEST); | ||
345 | kfree(digest); | ||
346 | } else { | ||
347 | dev_err(DEV, "kmalloc() of digest failed.\n"); | ||
348 | ok = 0; | ||
349 | } | ||
350 | } else | ||
351 | ok = 1; | ||
352 | |||
353 | drbd_free_ee(mdev, e); | ||
354 | |||
355 | if (unlikely(!ok)) | ||
356 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); | ||
357 | return ok; | ||
358 | } | ||
359 | |||
360 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | ||
361 | |||
362 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | ||
363 | { | ||
364 | struct drbd_epoch_entry *e; | ||
365 | |||
366 | if (!get_ldev(mdev)) | ||
367 | return 0; | ||
368 | |||
369 | /* GFP_TRY, because if there is no memory available right now, this may | ||
370 | * be rescheduled for later. It is "only" background resync, after all. */ | ||
371 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | ||
372 | if (!e) { | ||
373 | put_ldev(mdev); | ||
374 | return 2; | ||
375 | } | ||
376 | |||
377 | spin_lock_irq(&mdev->req_lock); | ||
378 | list_add(&e->w.list, &mdev->read_ee); | ||
379 | spin_unlock_irq(&mdev->req_lock); | ||
380 | |||
381 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
382 | e->private_bio->bi_rw = READ; | ||
383 | e->w.cb = w_e_send_csum; | ||
384 | |||
385 | mdev->read_cnt += size >> 9; | ||
386 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); | ||
387 | |||
388 | return 1; | ||
389 | } | ||
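/* Checksum-based resync as wired up here: read_for_csum() submits a local
 * read for the candidate block and queues w_e_send_csum(), which sends only
 * the digest (P_CSUM_RS_REQUEST).  The peer replies with the full block
 * (P_RS_DATA_REPLY) only if its own digest differs; see
 * w_e_end_csum_rs_req() below for the answering side. */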
390 | |||
391 | void resync_timer_fn(unsigned long data) | ||
392 | { | ||
393 | unsigned long flags; | ||
394 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
395 | int queue; | ||
396 | |||
397 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
398 | |||
399 | if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { | ||
400 | queue = 1; | ||
401 | if (mdev->state.conn == C_VERIFY_S) | ||
402 | mdev->resync_work.cb = w_make_ov_request; | ||
403 | else | ||
404 | mdev->resync_work.cb = w_make_resync_request; | ||
405 | } else { | ||
406 | queue = 0; | ||
407 | mdev->resync_work.cb = w_resync_inactive; | ||
408 | } | ||
409 | |||
410 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
411 | |||
412 | /* harmless race: list_empty outside data.work.q_lock */ | ||
413 | if (list_empty(&mdev->resync_work.list) && queue) | ||
414 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | ||
415 | } | ||
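/* The timer callback only decides which worker callback runs next
 * (w_make_ov_request for online verify, w_make_resync_request otherwise)
 * and re-queues mdev->resync_work; the actual requests are generated in
 * worker context, never from the timer itself. */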
416 | |||
417 | int w_make_resync_request(struct drbd_conf *mdev, | ||
418 | struct drbd_work *w, int cancel) | ||
419 | { | ||
420 | unsigned long bit; | ||
421 | sector_t sector; | ||
422 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
423 | int max_segment_size = queue_max_segment_size(mdev->rq_queue); | ||
424 | int number, i, size, pe, mx; | ||
425 | int align, queued, sndbuf; | ||
426 | |||
427 | if (unlikely(cancel)) | ||
428 | return 1; | ||
429 | |||
430 | if (unlikely(mdev->state.conn < C_CONNECTED)) { | ||
431 | dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); | ||
432 | return 0; | ||
433 | } | ||
434 | |||
435 | if (mdev->state.conn != C_SYNC_TARGET) | ||
436 | dev_err(DEV, "%s in w_make_resync_request\n", | ||
437 | drbd_conn_str(mdev->state.conn)); | ||
438 | |||
439 | if (!get_ldev(mdev)) { | ||
440 | /* Since we only need to access mdev->rsync, a | ||
441 | get_ldev_if_state(mdev, D_FAILED) would be sufficient, but | ||
442 | continuing a resync with a broken disk makes no sense at | ||
443 | all */ | ||
444 | dev_err(DEV, "Disk broke down during resync!\n"); | ||
445 | mdev->resync_work.cb = w_resync_inactive; | ||
446 | return 1; | ||
447 | } | ||
448 | |||
449 | number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | ||
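/* Assuming sync_conf.rate is configured in KiB/s and SLEEP_TIME is a
 * jiffies interval, this works out to
 *   (seconds per interval) * (KiB/s) / (KiB per bitmap bit),
 * i.e. how many BM_BLOCK_SIZE-sized resync requests fit into one
 * interval at the configured rate. */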
450 | pe = atomic_read(&mdev->rs_pending_cnt); | ||
451 | |||
452 | mutex_lock(&mdev->data.mutex); | ||
453 | if (mdev->data.socket) | ||
454 | mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); | ||
455 | else | ||
456 | mx = 1; | ||
457 | mutex_unlock(&mdev->data.mutex); | ||
458 | |||
459 | /* For resync rates >160MB/sec, allow more pending RS requests */ | ||
460 | if (number > mx) | ||
461 | mx = number; | ||
462 | |||
463 | /* Limit the number of pending RS requests to no more than the peer's receive buffer */ | ||
464 | if ((pe + number) > mx) { | ||
465 | number = mx - pe; | ||
466 | } | ||
467 | |||
468 | for (i = 0; i < number; i++) { | ||
469 | /* Stop generating RS requests, when half of the send buffer is filled */ | ||
470 | mutex_lock(&mdev->data.mutex); | ||
471 | if (mdev->data.socket) { | ||
472 | queued = mdev->data.socket->sk->sk_wmem_queued; | ||
473 | sndbuf = mdev->data.socket->sk->sk_sndbuf; | ||
474 | } else { | ||
475 | queued = 1; | ||
476 | sndbuf = 0; | ||
477 | } | ||
478 | mutex_unlock(&mdev->data.mutex); | ||
479 | if (queued > sndbuf / 2) | ||
480 | goto requeue; | ||
481 | |||
482 | next_sector: | ||
483 | size = BM_BLOCK_SIZE; | ||
484 | bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); | ||
485 | |||
486 | if (bit == -1UL) { | ||
487 | mdev->bm_resync_fo = drbd_bm_bits(mdev); | ||
488 | mdev->resync_work.cb = w_resync_inactive; | ||
489 | put_ldev(mdev); | ||
490 | return 1; | ||
491 | } | ||
492 | |||
493 | sector = BM_BIT_TO_SECT(bit); | ||
494 | |||
495 | if (drbd_try_rs_begin_io(mdev, sector)) { | ||
496 | mdev->bm_resync_fo = bit; | ||
497 | goto requeue; | ||
498 | } | ||
499 | mdev->bm_resync_fo = bit + 1; | ||
500 | |||
501 | if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) { | ||
502 | drbd_rs_complete_io(mdev, sector); | ||
503 | goto next_sector; | ||
504 | } | ||
505 | |||
506 | #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE | ||
507 | /* try to find some adjacent bits. | ||
508 | * we stop if we have already the maximum req size. | ||
509 | * | ||
510 | * Additionally always align bigger requests, in order to | ||
511 | * be prepared for all stripe sizes of software RAIDs. | ||
512 | * | ||
513 | * we _do_ care about the agreed-upon q->max_segment_size | ||
514 | * here, as splitting up the requests on the other side is more | ||
515 | * difficult. the consequence is that on lvm and md and other | ||
516 | * "indirect" devices, this is dead code, since | ||
517 | * q->max_segment_size will be PAGE_SIZE. | ||
518 | */ | ||
519 | align = 1; | ||
520 | for (;;) { | ||
521 | if (size + BM_BLOCK_SIZE > max_segment_size) | ||
522 | break; | ||
523 | |||
524 | /* Be always aligned */ | ||
525 | if (sector & ((1<<(align+3))-1)) | ||
526 | break; | ||
527 | |||
528 | /* do not cross extent boundaries */ | ||
529 | if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) | ||
530 | break; | ||
531 | /* now, is it actually dirty, after all? | ||
532 | * caution, drbd_bm_test_bit is tri-state for some | ||
533 | * obscure reason; ( b == 0 ) would get the out-of-band | ||
534 | * only accidentally right because of the "oddly sized" | ||
535 | * adjustment below */ | ||
536 | if (drbd_bm_test_bit(mdev, bit+1) != 1) | ||
537 | break; | ||
538 | bit++; | ||
539 | size += BM_BLOCK_SIZE; | ||
540 | if ((BM_BLOCK_SIZE << align) <= size) | ||
541 | align++; | ||
542 | i++; | ||
543 | } | ||
544 | /* if we merged some, | ||
545 | * reset the offset to start the next drbd_bm_find_next from */ | ||
546 | if (size > BM_BLOCK_SIZE) | ||
547 | mdev->bm_resync_fo = bit + 1; | ||
548 | #endif | ||
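/* A note on the merge loop above, assuming BM_BLOCK_SIZE is 4 KiB (8
 * sectors per bitmap bit): "align" grows with the merged size, and the
 * sector mask (1 << (align + 3)) - 1 insists that the start sector is
 * aligned to the size the request is growing towards, so larger merged
 * requests stay naturally aligned. */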
549 | |||
550 | /* adjust very last sectors, in case we are oddly sized */ | ||
551 | if (sector + (size>>9) > capacity) | ||
552 | size = (capacity-sector)<<9; | ||
553 | if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { | ||
554 | switch (read_for_csum(mdev, sector, size)) { | ||
555 | case 0: /* Disk failure*/ | ||
556 | put_ldev(mdev); | ||
557 | return 0; | ||
558 | case 2: /* Allocation failed */ | ||
559 | drbd_rs_complete_io(mdev, sector); | ||
560 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | ||
561 | goto requeue; | ||
562 | /* case 1: everything ok */ | ||
563 | } | ||
564 | } else { | ||
565 | inc_rs_pending(mdev); | ||
566 | if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, | ||
567 | sector, size, ID_SYNCER)) { | ||
568 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); | ||
569 | dec_rs_pending(mdev); | ||
570 | put_ldev(mdev); | ||
571 | return 0; | ||
572 | } | ||
573 | } | ||
574 | } | ||
575 | |||
576 | if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) { | ||
577 | /* last syncer _request_ was sent, | ||
578 | * but the P_RS_DATA_REPLY not yet received. sync will end (and | ||
579 | * next sync group will resume), as soon as we receive the last | ||
580 | * resync data block, and the last bit is cleared. | ||
581 | * until then resync "work" is "inactive" ... | ||
582 | */ | ||
583 | mdev->resync_work.cb = w_resync_inactive; | ||
584 | put_ldev(mdev); | ||
585 | return 1; | ||
586 | } | ||
587 | |||
588 | requeue: | ||
589 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | ||
590 | put_ldev(mdev); | ||
591 | return 1; | ||
592 | } | ||
593 | |||
594 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
595 | { | ||
596 | int number, i, size; | ||
597 | sector_t sector; | ||
598 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
599 | |||
600 | if (unlikely(cancel)) | ||
601 | return 1; | ||
602 | |||
603 | if (unlikely(mdev->state.conn < C_CONNECTED)) { | ||
604 | dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected"); | ||
605 | return 0; | ||
606 | } | ||
607 | |||
608 | number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | ||
609 | if (atomic_read(&mdev->rs_pending_cnt) > number) | ||
610 | goto requeue; | ||
611 | |||
612 | number -= atomic_read(&mdev->rs_pending_cnt); | ||
613 | |||
614 | sector = mdev->ov_position; | ||
615 | for (i = 0; i < number; i++) { | ||
616 | if (sector >= capacity) { | ||
617 | mdev->resync_work.cb = w_resync_inactive; | ||
618 | return 1; | ||
619 | } | ||
620 | |||
621 | size = BM_BLOCK_SIZE; | ||
622 | |||
623 | if (drbd_try_rs_begin_io(mdev, sector)) { | ||
624 | mdev->ov_position = sector; | ||
625 | goto requeue; | ||
626 | } | ||
627 | |||
628 | if (sector + (size>>9) > capacity) | ||
629 | size = (capacity-sector)<<9; | ||
630 | |||
631 | inc_rs_pending(mdev); | ||
632 | if (!drbd_send_ov_request(mdev, sector, size)) { | ||
633 | dec_rs_pending(mdev); | ||
634 | return 0; | ||
635 | } | ||
636 | sector += BM_SECT_PER_BIT; | ||
637 | } | ||
638 | mdev->ov_position = sector; | ||
639 | |||
640 | requeue: | ||
641 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | ||
642 | return 1; | ||
643 | } | ||
644 | |||
645 | |||
646 | int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
647 | { | ||
648 | kfree(w); | ||
649 | ov_oos_print(mdev); | ||
650 | drbd_resync_finished(mdev); | ||
651 | |||
652 | return 1; | ||
653 | } | ||
654 | |||
655 | static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
656 | { | ||
657 | kfree(w); | ||
658 | |||
659 | drbd_resync_finished(mdev); | ||
660 | |||
661 | return 1; | ||
662 | } | ||
663 | |||
664 | int drbd_resync_finished(struct drbd_conf *mdev) | ||
665 | { | ||
666 | unsigned long db, dt, dbdt; | ||
667 | unsigned long n_oos; | ||
668 | union drbd_state os, ns; | ||
669 | struct drbd_work *w; | ||
670 | char *khelper_cmd = NULL; | ||
671 | |||
672 | /* Remove all elements from the resync LRU. Since future actions | ||
673 | * might set bits in the (main) bitmap, then the entries in the | ||
674 | * resync LRU would be wrong. */ | ||
675 | if (drbd_rs_del_all(mdev)) { | ||
676 | /* In case this is not possible now, most probably because | ||
677 | * there are P_RS_DATA_REPLY packets lingering on the worker's | ||
678 | * queue (or the read operations for those packets have | ||
679 | * not finished yet). Retry in 100ms. */ | ||
680 | |||
681 | drbd_kick_lo(mdev); | ||
682 | __set_current_state(TASK_INTERRUPTIBLE); | ||
683 | schedule_timeout(HZ / 10); | ||
684 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); | ||
685 | if (w) { | ||
686 | w->cb = w_resync_finished; | ||
687 | drbd_queue_work(&mdev->data.work, w); | ||
688 | return 1; | ||
689 | } | ||
690 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); | ||
691 | } | ||
692 | |||
693 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | ||
694 | if (dt <= 0) | ||
695 | dt = 1; | ||
696 | db = mdev->rs_total; | ||
697 | dbdt = Bit2KB(db/dt); | ||
698 | mdev->rs_paused /= HZ; | ||
699 | |||
700 | if (!get_ldev(mdev)) | ||
701 | goto out; | ||
702 | |||
703 | spin_lock_irq(&mdev->req_lock); | ||
704 | os = mdev->state; | ||
705 | |||
706 | /* This protects us against multiple calls (that can happen in the presence | ||
707 | of application IO), and against connectivity loss just before we arrive here. */ | ||
708 | if (os.conn <= C_CONNECTED) | ||
709 | goto out_unlock; | ||
710 | |||
711 | ns = os; | ||
712 | ns.conn = C_CONNECTED; | ||
713 | |||
714 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", | ||
715 | (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? | ||
716 | "Online verify " : "Resync", | ||
717 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); | ||
718 | |||
719 | n_oos = drbd_bm_total_weight(mdev); | ||
720 | |||
721 | if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { | ||
722 | if (n_oos) { | ||
723 | dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n", | ||
724 | n_oos, Bit2KB(1)); | ||
725 | khelper_cmd = "out-of-sync"; | ||
726 | } | ||
727 | } else { | ||
728 | D_ASSERT((n_oos - mdev->rs_failed) == 0); | ||
729 | |||
730 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) | ||
731 | khelper_cmd = "after-resync-target"; | ||
732 | |||
733 | if (mdev->csums_tfm && mdev->rs_total) { | ||
734 | const unsigned long s = mdev->rs_same_csum; | ||
735 | const unsigned long t = mdev->rs_total; | ||
736 | const int ratio = | ||
737 | (t == 0) ? 0 : | ||
738 | (t < 100000) ? ((s*100)/t) : (s/(t/100)); | ||
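/* Two forms of the same percentage: for small totals (s*100)/t is
 * exact, while for large totals s/(t/100) avoids overflowing the
 * unsigned long multiplication. */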
739 | dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " | ||
740 | "transferred %luK total %luK\n", | ||
741 | ratio, | ||
742 | Bit2KB(mdev->rs_same_csum), | ||
743 | Bit2KB(mdev->rs_total - mdev->rs_same_csum), | ||
744 | Bit2KB(mdev->rs_total)); | ||
745 | } | ||
746 | } | ||
747 | |||
748 | if (mdev->rs_failed) { | ||
749 | dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed); | ||
750 | |||
751 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { | ||
752 | ns.disk = D_INCONSISTENT; | ||
753 | ns.pdsk = D_UP_TO_DATE; | ||
754 | } else { | ||
755 | ns.disk = D_UP_TO_DATE; | ||
756 | ns.pdsk = D_INCONSISTENT; | ||
757 | } | ||
758 | } else { | ||
759 | ns.disk = D_UP_TO_DATE; | ||
760 | ns.pdsk = D_UP_TO_DATE; | ||
761 | |||
762 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { | ||
763 | if (mdev->p_uuid) { | ||
764 | int i; | ||
765 | for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) | ||
766 | _drbd_uuid_set(mdev, i, mdev->p_uuid[i]); | ||
767 | drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]); | ||
768 | _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]); | ||
769 | } else { | ||
770 | dev_err(DEV, "mdev->p_uuid is NULL! BUG\n"); | ||
771 | } | ||
772 | } | ||
773 | |||
774 | drbd_uuid_set_bm(mdev, 0UL); | ||
775 | |||
776 | if (mdev->p_uuid) { | ||
777 | /* Now the two UUID sets are equal, update what we | ||
778 | * know of the peer. */ | ||
779 | int i; | ||
780 | for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) | ||
781 | mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; | ||
782 | } | ||
783 | } | ||
784 | |||
785 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
786 | out_unlock: | ||
787 | spin_unlock_irq(&mdev->req_lock); | ||
788 | put_ldev(mdev); | ||
789 | out: | ||
790 | mdev->rs_total = 0; | ||
791 | mdev->rs_failed = 0; | ||
792 | mdev->rs_paused = 0; | ||
793 | mdev->ov_start_sector = 0; | ||
794 | |||
795 | if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { | ||
796 | dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); | ||
797 | drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); | ||
798 | } | ||
799 | |||
800 | if (khelper_cmd) | ||
801 | drbd_khelper(mdev, khelper_cmd); | ||
802 | |||
803 | return 1; | ||
804 | } | ||
805 | |||
806 | /* helper */ | ||
807 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
808 | { | ||
809 | if (drbd_bio_has_active_page(e->private_bio)) { | ||
810 | /* This might happen if sendpage() has not finished */ | ||
811 | spin_lock_irq(&mdev->req_lock); | ||
812 | list_add_tail(&e->w.list, &mdev->net_ee); | ||
813 | spin_unlock_irq(&mdev->req_lock); | ||
814 | } else | ||
815 | drbd_free_ee(mdev, e); | ||
816 | } | ||
817 | |||
818 | /** | ||
819 | * w_e_end_data_req() - Worker callback to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST | ||
820 | * @mdev: DRBD device. | ||
821 | * @w: work object. | ||
822 | * @cancel: The connection will be closed anyway | ||
823 | */ | ||
824 | int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
825 | { | ||
826 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
827 | int ok; | ||
828 | |||
829 | if (unlikely(cancel)) { | ||
830 | drbd_free_ee(mdev, e); | ||
831 | dec_unacked(mdev); | ||
832 | return 1; | ||
833 | } | ||
834 | |||
835 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
836 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | ||
837 | } else { | ||
838 | if (__ratelimit(&drbd_ratelimit_state)) | ||
839 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", | ||
840 | (unsigned long long)e->sector); | ||
841 | |||
842 | ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); | ||
843 | } | ||
844 | |||
845 | dec_unacked(mdev); | ||
846 | |||
847 | move_to_net_ee_or_free(mdev, e); | ||
848 | |||
849 | if (unlikely(!ok)) | ||
850 | dev_err(DEV, "drbd_send_block() failed\n"); | ||
851 | return ok; | ||
852 | } | ||
853 | |||
854 | /** | ||
855 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST | ||
856 | * @mdev: DRBD device. | ||
857 | * @w: work object. | ||
858 | * @cancel: The connection will be closed anyway | ||
859 | */ | ||
860 | int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
861 | { | ||
862 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
863 | int ok; | ||
864 | |||
865 | if (unlikely(cancel)) { | ||
866 | drbd_free_ee(mdev, e); | ||
867 | dec_unacked(mdev); | ||
868 | return 1; | ||
869 | } | ||
870 | |||
871 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
872 | drbd_rs_complete_io(mdev, e->sector); | ||
873 | put_ldev(mdev); | ||
874 | } | ||
875 | |||
876 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
877 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | ||
878 | inc_rs_pending(mdev); | ||
879 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | ||
880 | } else { | ||
881 | if (__ratelimit(&drbd_ratelimit_state)) | ||
882 | dev_err(DEV, "Not sending RSDataReply, " | ||
883 | "partner DISKLESS!\n"); | ||
884 | ok = 1; | ||
885 | } | ||
886 | } else { | ||
887 | if (__ratelimit(&drbd_ratelimit_state)) | ||
888 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", | ||
889 | (unsigned long long)e->sector); | ||
890 | |||
891 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
892 | |||
893 | /* update resync data with failure */ | ||
894 | drbd_rs_failed_io(mdev, e->sector, e->size); | ||
895 | } | ||
896 | |||
897 | dec_unacked(mdev); | ||
898 | |||
899 | move_to_net_ee_or_free(mdev, e); | ||
900 | |||
901 | if (unlikely(!ok)) | ||
902 | dev_err(DEV, "drbd_send_block() failed\n"); | ||
903 | return ok; | ||
904 | } | ||
905 | |||
906 | int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
907 | { | ||
908 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
909 | struct digest_info *di; | ||
910 | int digest_size; | ||
911 | void *digest = NULL; | ||
912 | int ok, eq = 0; | ||
913 | |||
914 | if (unlikely(cancel)) { | ||
915 | drbd_free_ee(mdev, e); | ||
916 | dec_unacked(mdev); | ||
917 | return 1; | ||
918 | } | ||
919 | |||
920 | drbd_rs_complete_io(mdev, e->sector); | ||
921 | |||
922 | di = (struct digest_info *)(unsigned long)e->block_id; | ||
923 | |||
924 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
925 | /* quick hack to try to avoid a race against reconfiguration. | ||
926 | * a real fix would be much more involved, | ||
927 | * introducing more locking mechanisms */ | ||
928 | if (mdev->csums_tfm) { | ||
929 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | ||
930 | D_ASSERT(digest_size == di->digest_size); | ||
931 | digest = kmalloc(digest_size, GFP_NOIO); | ||
932 | } | ||
933 | if (digest) { | ||
934 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | ||
935 | eq = !memcmp(digest, di->digest, digest_size); | ||
936 | kfree(digest); | ||
937 | } | ||
938 | |||
939 | if (eq) { | ||
940 | drbd_set_in_sync(mdev, e->sector, e->size); | ||
941 | /* rs_same_csums unit is BM_BLOCK_SIZE */ | ||
942 | mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT; | ||
943 | ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); | ||
944 | } else { | ||
945 | inc_rs_pending(mdev); | ||
946 | e->block_id = ID_SYNCER; | ||
947 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | ||
948 | } | ||
949 | } else { | ||
950 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
951 | if (__ratelimit(&drbd_ratelimit_state)) | ||
952 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | ||
953 | } | ||
954 | |||
955 | dec_unacked(mdev); | ||
956 | |||
957 | kfree(di); | ||
958 | |||
959 | move_to_net_ee_or_free(mdev, e); | ||
960 | |||
961 | if (unlikely(!ok)) | ||
962 | dev_err(DEV, "drbd_send_block/ack() failed\n"); | ||
963 | return ok; | ||
964 | } | ||
965 | |||
966 | int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
967 | { | ||
968 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
969 | int digest_size; | ||
970 | void *digest; | ||
971 | int ok = 1; | ||
972 | |||
973 | if (unlikely(cancel)) | ||
974 | goto out; | ||
975 | |||
976 | if (unlikely(!drbd_bio_uptodate(e->private_bio))) | ||
977 | goto out; | ||
978 | |||
979 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | ||
980 | /* FIXME if this allocation fails, online verify will not terminate! */ | ||
981 | digest = kmalloc(digest_size, GFP_NOIO); | ||
982 | if (digest) { | ||
983 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | ||
984 | inc_rs_pending(mdev); | ||
985 | ok = drbd_send_drequest_csum(mdev, e->sector, e->size, | ||
986 | digest, digest_size, P_OV_REPLY); | ||
987 | if (!ok) | ||
988 | dec_rs_pending(mdev); | ||
989 | kfree(digest); | ||
990 | } | ||
991 | |||
992 | out: | ||
993 | drbd_free_ee(mdev, e); | ||
994 | |||
995 | dec_unacked(mdev); | ||
996 | |||
997 | return ok; | ||
998 | } | ||
999 | |||
1000 | void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | ||
1001 | { | ||
1002 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { | ||
1003 | mdev->ov_last_oos_size += size>>9; | ||
1004 | } else { | ||
1005 | mdev->ov_last_oos_start = sector; | ||
1006 | mdev->ov_last_oos_size = size>>9; | ||
1007 | } | ||
1008 | drbd_set_out_of_sync(mdev, sector, size); | ||
1009 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
1010 | } | ||
1011 | |||
1012 | int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1013 | { | ||
1014 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
1015 | struct digest_info *di; | ||
1016 | int digest_size; | ||
1017 | void *digest; | ||
1018 | int ok, eq = 0; | ||
1019 | |||
1020 | if (unlikely(cancel)) { | ||
1021 | drbd_free_ee(mdev, e); | ||
1022 | dec_unacked(mdev); | ||
1023 | return 1; | ||
1024 | } | ||
1025 | |||
1026 | /* only after the "cancel" check, because after drbd_disconnect/drbd_rs_cancel_all | ||
1027 | * the resync lru has already been cleaned up */ | ||
1028 | drbd_rs_complete_io(mdev, e->sector); | ||
1029 | |||
1030 | di = (struct digest_info *)(unsigned long)e->block_id; | ||
1031 | |||
1032 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1033 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | ||
1034 | digest = kmalloc(digest_size, GFP_NOIO); | ||
1035 | if (digest) { | ||
1036 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | ||
1037 | |||
1038 | D_ASSERT(digest_size == di->digest_size); | ||
1039 | eq = !memcmp(digest, di->digest, digest_size); | ||
1040 | kfree(digest); | ||
1041 | } | ||
1042 | } else { | ||
1043 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
1044 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1045 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | ||
1046 | } | ||
1047 | |||
1048 | dec_unacked(mdev); | ||
1049 | |||
1050 | kfree(di); | ||
1051 | |||
1052 | if (!eq) | ||
1053 | drbd_ov_oos_found(mdev, e->sector, e->size); | ||
1054 | else | ||
1055 | ov_oos_print(mdev); | ||
1056 | |||
1057 | ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, | ||
1058 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); | ||
1059 | |||
1060 | drbd_free_ee(mdev, e); | ||
1061 | |||
1062 | if (--mdev->ov_left == 0) { | ||
1063 | ov_oos_print(mdev); | ||
1064 | drbd_resync_finished(mdev); | ||
1065 | } | ||
1066 | |||
1067 | return ok; | ||
1068 | } | ||
1069 | |||
1070 | int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1071 | { | ||
1072 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); | ||
1073 | complete(&b->done); | ||
1074 | return 1; | ||
1075 | } | ||
1076 | |||
1077 | int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1078 | { | ||
1079 | struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); | ||
1080 | struct p_barrier *p = &mdev->data.sbuf.barrier; | ||
1081 | int ok = 1; | ||
1082 | |||
1083 | /* really avoid racing with tl_clear. w.cb may have been referenced | ||
1084 | * just before it was reassigned and re-queued, so double check that. | ||
1085 | * actually, this race was harmless, since we only try to send the | ||
1086 | * barrier packet here, and otherwise do nothing with the object. | ||
1087 | * but compare with the head of w_clear_epoch */ | ||
1088 | spin_lock_irq(&mdev->req_lock); | ||
1089 | if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) | ||
1090 | cancel = 1; | ||
1091 | spin_unlock_irq(&mdev->req_lock); | ||
1092 | if (cancel) | ||
1093 | return 1; | ||
1094 | |||
1095 | if (!drbd_get_data_sock(mdev)) | ||
1096 | return 0; | ||
1097 | p->barrier = b->br_number; | ||
1098 | /* inc_ap_pending was done where this was queued. | ||
1099 | * dec_ap_pending will be done in got_BarrierAck | ||
1100 | * or (on connection loss) in w_clear_epoch. */ | ||
1101 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, | ||
1102 | (struct p_header *)p, sizeof(*p), 0); | ||
1103 | drbd_put_data_sock(mdev); | ||
1104 | |||
1105 | return ok; | ||
1106 | } | ||
1107 | |||
1108 | int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1109 | { | ||
1110 | if (cancel) | ||
1111 | return 1; | ||
1112 | return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); | ||
1113 | } | ||
1114 | |||
1115 | /** | ||
1116 | * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request | ||
1117 | * @mdev: DRBD device. | ||
1118 | * @w: work object. | ||
1119 | * @cancel: The connection will be closed anyway | ||
1120 | */ | ||
1121 | int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1122 | { | ||
1123 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
1124 | int ok; | ||
1125 | |||
1126 | if (unlikely(cancel)) { | ||
1127 | req_mod(req, send_canceled); | ||
1128 | return 1; | ||
1129 | } | ||
1130 | |||
1131 | ok = drbd_send_dblock(mdev, req); | ||
1132 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1133 | |||
1134 | return ok; | ||
1135 | } | ||
1136 | |||
1137 | /** | ||
1138 | * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet | ||
1139 | * @mdev: DRBD device. | ||
1140 | * @w: work object. | ||
1141 | * @cancel: The connection will be closed anyway | ||
1142 | */ | ||
1143 | int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1144 | { | ||
1145 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
1146 | int ok; | ||
1147 | |||
1148 | if (unlikely(cancel)) { | ||
1149 | req_mod(req, send_canceled); | ||
1150 | return 1; | ||
1151 | } | ||
1152 | |||
1153 | ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, | ||
1154 | (unsigned long)req); | ||
1155 | |||
1156 | if (!ok) { | ||
1157 | /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); | ||
1158 | * so this is probably redundant */ | ||
1159 | if (mdev->state.conn >= C_CONNECTED) | ||
1160 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
1161 | } | ||
1162 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1163 | |||
1164 | return ok; | ||
1165 | } | ||
1166 | |||
1167 | static int _drbd_may_sync_now(struct drbd_conf *mdev) | ||
1168 | { | ||
1169 | struct drbd_conf *odev = mdev; | ||
1170 | |||
1171 | while (1) { | ||
1172 | if (odev->sync_conf.after == -1) | ||
1173 | return 1; | ||
1174 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1175 | ERR_IF(!odev) return 1; | ||
1176 | if ((odev->state.conn >= C_SYNC_SOURCE && | ||
1177 | odev->state.conn <= C_PAUSED_SYNC_T) || | ||
1178 | odev->state.aftr_isp || odev->state.peer_isp || | ||
1179 | odev->state.user_isp) | ||
1180 | return 0; | ||
1181 | } | ||
1182 | } | ||
1183 | |||
1184 | /** | ||
1185 | * _drbd_pause_after() - Pause resync on all devices that may not resync now | ||
1186 | * @mdev: DRBD device. | ||
1187 | * | ||
1188 | * Called from process context only (admin command and after_state_ch). | ||
1189 | */ | ||
1190 | static int _drbd_pause_after(struct drbd_conf *mdev) | ||
1191 | { | ||
1192 | struct drbd_conf *odev; | ||
1193 | int i, rv = 0; | ||
1194 | |||
1195 | for (i = 0; i < minor_count; i++) { | ||
1196 | odev = minor_to_mdev(i); | ||
1197 | if (!odev) | ||
1198 | continue; | ||
1199 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | ||
1200 | continue; | ||
1201 | if (!_drbd_may_sync_now(odev)) | ||
1202 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) | ||
1203 | != SS_NOTHING_TO_DO); | ||
1204 | } | ||
1205 | |||
1206 | return rv; | ||
1207 | } | ||
1208 | |||
1209 | /** | ||
1210 | * _drbd_resume_next() - Resume resync on all devices that may resync now | ||
1211 | * @mdev: DRBD device. | ||
1212 | * | ||
1213 | * Called from process context only (admin command and worker). | ||
1214 | */ | ||
1215 | static int _drbd_resume_next(struct drbd_conf *mdev) | ||
1216 | { | ||
1217 | struct drbd_conf *odev; | ||
1218 | int i, rv = 0; | ||
1219 | |||
1220 | for (i = 0; i < minor_count; i++) { | ||
1221 | odev = minor_to_mdev(i); | ||
1222 | if (!odev) | ||
1223 | continue; | ||
1224 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | ||
1225 | continue; | ||
1226 | if (odev->state.aftr_isp) { | ||
1227 | if (_drbd_may_sync_now(odev)) | ||
1228 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), | ||
1229 | CS_HARD, NULL) | ||
1230 | != SS_NOTHING_TO_DO) ; | ||
1231 | } | ||
1232 | } | ||
1233 | return rv; | ||
1234 | } | ||
1235 | |||
1236 | void resume_next_sg(struct drbd_conf *mdev) | ||
1237 | { | ||
1238 | write_lock_irq(&global_state_lock); | ||
1239 | _drbd_resume_next(mdev); | ||
1240 | write_unlock_irq(&global_state_lock); | ||
1241 | } | ||
1242 | |||
1243 | void suspend_other_sg(struct drbd_conf *mdev) | ||
1244 | { | ||
1245 | write_lock_irq(&global_state_lock); | ||
1246 | _drbd_pause_after(mdev); | ||
1247 | write_unlock_irq(&global_state_lock); | ||
1248 | } | ||
1249 | |||
1250 | static int sync_after_error(struct drbd_conf *mdev, int o_minor) | ||
1251 | { | ||
1252 | struct drbd_conf *odev; | ||
1253 | |||
1254 | if (o_minor == -1) | ||
1255 | return NO_ERROR; | ||
1256 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | ||
1257 | return ERR_SYNC_AFTER; | ||
1258 | |||
1259 | /* check for loops */ | ||
1260 | odev = minor_to_mdev(o_minor); | ||
1261 | while (1) { | ||
1262 | if (odev == mdev) | ||
1263 | return ERR_SYNC_AFTER_CYCLE; | ||
1264 | |||
1265 | /* dependency chain ends here, no cycles. */ | ||
1266 | if (odev->sync_conf.after == -1) | ||
1267 | return NO_ERROR; | ||
1268 | |||
1269 | /* follow the dependency chain */ | ||
1270 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1271 | } | ||
1272 | } | ||
1273 | |||
1274 | int drbd_alter_sa(struct drbd_conf *mdev, int na) | ||
1275 | { | ||
1276 | int changes; | ||
1277 | int retcode; | ||
1278 | |||
1279 | write_lock_irq(&global_state_lock); | ||
1280 | retcode = sync_after_error(mdev, na); | ||
1281 | if (retcode == NO_ERROR) { | ||
1282 | mdev->sync_conf.after = na; | ||
1283 | do { | ||
1284 | changes = _drbd_pause_after(mdev); | ||
1285 | changes |= _drbd_resume_next(mdev); | ||
1286 | } while (changes); | ||
1287 | } | ||
1288 | write_unlock_irq(&global_state_lock); | ||
1289 | return retcode; | ||
1290 | } | ||
1291 | |||
1292 | static void ping_peer(struct drbd_conf *mdev) | ||
1293 | { | ||
1294 | clear_bit(GOT_PING_ACK, &mdev->flags); | ||
1295 | request_ping(mdev); | ||
1296 | wait_event(mdev->misc_wait, | ||
1297 | test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); | ||
1298 | } | ||
1299 | |||
1300 | /** | ||
1301 | * drbd_start_resync() - Start the resync process | ||
1302 | * @mdev: DRBD device. | ||
1303 | * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET | ||
1304 | * | ||
1305 | * This function might bring you directly into one of the | ||
1306 | * C_PAUSED_SYNC_* states. | ||
1307 | */ | ||
1308 | void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | ||
1309 | { | ||
1310 | union drbd_state ns; | ||
1311 | int r; | ||
1312 | |||
1313 | if (mdev->state.conn >= C_SYNC_SOURCE) { | ||
1314 | dev_err(DEV, "Resync already running!\n"); | ||
1315 | return; | ||
1316 | } | ||
1317 | |||
1318 | /* In case a previous resync run was aborted by an IO error/detach on the peer. */ | ||
1319 | drbd_rs_cancel_all(mdev); | ||
1320 | |||
1321 | if (side == C_SYNC_TARGET) { | ||
1322 | /* Since application IO was locked out during C_WF_BITMAP_T and | ||
1323 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET | ||
1324 | we check whether we may make the data inconsistent. */ | ||
1325 | r = drbd_khelper(mdev, "before-resync-target"); | ||
1326 | r = (r >> 8) & 0xff; | ||
1327 | if (r > 0) { | ||
1328 | dev_info(DEV, "before-resync-target handler returned %d, " | ||
1329 | "dropping connection.\n", r); | ||
1330 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1331 | return; | ||
1332 | } | ||
1333 | } | ||
1334 | |||
1335 | drbd_state_lock(mdev); | ||
1336 | |||
1337 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1338 | drbd_state_unlock(mdev); | ||
1339 | return; | ||
1340 | } | ||
1341 | |||
1342 | if (side == C_SYNC_TARGET) { | ||
1343 | mdev->bm_resync_fo = 0; | ||
1344 | } else /* side == C_SYNC_SOURCE */ { | ||
1345 | u64 uuid; | ||
1346 | |||
1347 | get_random_bytes(&uuid, sizeof(u64)); | ||
1348 | drbd_uuid_set(mdev, UI_BITMAP, uuid); | ||
1349 | drbd_send_sync_uuid(mdev, uuid); | ||
1350 | |||
1351 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); | ||
1352 | } | ||
1353 | |||
1354 | write_lock_irq(&global_state_lock); | ||
1355 | ns = mdev->state; | ||
1356 | |||
1357 | ns.aftr_isp = !_drbd_may_sync_now(mdev); | ||
1358 | |||
1359 | ns.conn = side; | ||
1360 | |||
1361 | if (side == C_SYNC_TARGET) | ||
1362 | ns.disk = D_INCONSISTENT; | ||
1363 | else /* side == C_SYNC_SOURCE */ | ||
1364 | ns.pdsk = D_INCONSISTENT; | ||
1365 | |||
1366 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
1367 | ns = mdev->state; | ||
1368 | |||
1369 | if (ns.conn < C_CONNECTED) | ||
1370 | r = SS_UNKNOWN_ERROR; | ||
1371 | |||
1372 | if (r == SS_SUCCESS) { | ||
1373 | mdev->rs_total = | ||
1374 | mdev->rs_mark_left = drbd_bm_total_weight(mdev); | ||
1375 | mdev->rs_failed = 0; | ||
1376 | mdev->rs_paused = 0; | ||
1377 | mdev->rs_start = | ||
1378 | mdev->rs_mark_time = jiffies; | ||
1379 | mdev->rs_same_csum = 0; | ||
1380 | _drbd_pause_after(mdev); | ||
1381 | } | ||
1382 | write_unlock_irq(&global_state_lock); | ||
1383 | put_ldev(mdev); | ||
1384 | |||
1385 | if (r == SS_SUCCESS) { | ||
1386 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", | ||
1387 | drbd_conn_str(ns.conn), | ||
1388 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | ||
1389 | (unsigned long) mdev->rs_total); | ||
1390 | |||
1391 | if (mdev->rs_total == 0) { | ||
1392 | /* Peer still reachable? Beware of failing before-resync-target handlers! */ | ||
1393 | ping_peer(mdev); | ||
1394 | drbd_resync_finished(mdev); | ||
1395 | } | ||
1396 | |||
1397 | /* ns.conn may already be != mdev->state.conn, | ||
1398 | * we may have been paused in between, or become paused until | ||
1399 | * the timer triggers. | ||
1400 | * No matter, that is handled in resync_timer_fn() */ | ||
1401 | if (ns.conn == C_SYNC_TARGET) | ||
1402 | mod_timer(&mdev->resync_timer, jiffies); | ||
1403 | |||
1404 | drbd_md_sync(mdev); | ||
1405 | } | ||
1406 | drbd_state_unlock(mdev); | ||
1407 | } | ||
1408 | |||
1409 | int drbd_worker(struct drbd_thread *thi) | ||
1410 | { | ||
1411 | struct drbd_conf *mdev = thi->mdev; | ||
1412 | struct drbd_work *w = NULL; | ||
1413 | LIST_HEAD(work_list); | ||
1414 | int intr = 0, i; | ||
1415 | |||
1416 | sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); | ||
1417 | |||
1418 | while (get_t_state(thi) == Running) { | ||
1419 | drbd_thread_current_set_cpu(mdev); | ||
1420 | |||
1421 | if (down_trylock(&mdev->data.work.s)) { | ||
1422 | mutex_lock(&mdev->data.mutex); | ||
1423 | if (mdev->data.socket && !mdev->net_conf->no_cork) | ||
1424 | drbd_tcp_uncork(mdev->data.socket); | ||
1425 | mutex_unlock(&mdev->data.mutex); | ||
1426 | |||
1427 | intr = down_interruptible(&mdev->data.work.s); | ||
1428 | |||
1429 | mutex_lock(&mdev->data.mutex); | ||
1430 | if (mdev->data.socket && !mdev->net_conf->no_cork) | ||
1431 | drbd_tcp_cork(mdev->data.socket); | ||
1432 | mutex_unlock(&mdev->data.mutex); | ||
1433 | } | ||
1434 | |||
1435 | if (intr) { | ||
1436 | D_ASSERT(intr == -EINTR); | ||
1437 | flush_signals(current); | ||
1438 | ERR_IF (get_t_state(thi) == Running) | ||
1439 | continue; | ||
1440 | break; | ||
1441 | } | ||
1442 | |||
1443 | if (get_t_state(thi) != Running) | ||
1444 | break; | ||
1445 | /* With this break, we have done a down() but not consumed | ||
1446 | the entry from the list. The cleanup code takes care of | ||
1447 | this... */ | ||
1448 | |||
1449 | w = NULL; | ||
1450 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1451 | ERR_IF(list_empty(&mdev->data.work.q)) { | ||
1452 | /* something terribly wrong in our logic. | ||
1453 | * we were able to down() the semaphore, | ||
1454 | * but the list is empty... doh. | ||
1455 | * | ||
1456 | * what is the best thing to do now? | ||
1457 | * try again from scratch, restarting the receiver, | ||
1458 | * asender, whatnot? could break even more ugly, | ||
1459 | * e.g. when we are primary, but no good local data. | ||
1460 | * | ||
1461 | * I'll try to get away just starting over this loop. | ||
1462 | */ | ||
1463 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1464 | continue; | ||
1465 | } | ||
1466 | w = list_entry(mdev->data.work.q.next, struct drbd_work, list); | ||
1467 | list_del_init(&w->list); | ||
1468 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1469 | |||
1470 | if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { | ||
1471 | /* dev_warn(DEV, "worker: a callback failed! \n"); */ | ||
1472 | if (mdev->state.conn >= C_CONNECTED) | ||
1473 | drbd_force_state(mdev, | ||
1474 | NS(conn, C_NETWORK_FAILURE)); | ||
1475 | } | ||
1476 | } | ||
1477 | D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); | ||
1478 | D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); | ||
1479 | |||
1480 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1481 | i = 0; | ||
1482 | while (!list_empty(&mdev->data.work.q)) { | ||
1483 | list_splice_init(&mdev->data.work.q, &work_list); | ||
1484 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1485 | |||
1486 | while (!list_empty(&work_list)) { | ||
1487 | w = list_entry(work_list.next, struct drbd_work, list); | ||
1488 | list_del_init(&w->list); | ||
1489 | w->cb(mdev, w, 1); | ||
1490 | i++; /* dead debugging code */ | ||
1491 | } | ||
1492 | |||
1493 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1494 | } | ||
1495 | sema_init(&mdev->data.work.s, 0); | ||
1496 | /* DANGEROUS race: if someone queued work while holding the spinlock, | ||
1497 | * but called up() outside the spinlock, we could get an up() on the | ||
1498 | * semaphore without a corresponding list entry. | ||
1499 | * So don't do that. | ||
1500 | */ | ||
1501 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1502 | |||
1503 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1504 | /* _drbd_set_state only uses stop_nowait. | ||
1505 | * wait here for the Exiting receiver. */ | ||
1506 | drbd_thread_stop(&mdev->receiver); | ||
1507 | drbd_mdev_cleanup(mdev); | ||
1508 | |||
1509 | dev_info(DEV, "worker terminated\n"); | ||
1510 | |||
1511 | clear_bit(DEVICE_DYING, &mdev->flags); | ||
1512 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
1513 | wake_up(&mdev->state_wait); | ||
1514 | |||
1515 | return 0; | ||
1516 | } | ||
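/* Shutdown sequencing, as implemented above: drain the remaining work list
 * with cancel == 1, re-initialize the work semaphore while still holding
 * q_lock, stop the receiver thread, run drbd_mdev_cleanup(), and only then
 * clear DEVICE_DYING/CONFIG_PENDING and wake state_wait so that the thread
 * tearing the device down can proceed. */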
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h new file mode 100644 index 000000000000..f93fa111ce50 --- /dev/null +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -0,0 +1,91 @@ | |||
1 | #ifndef _DRBD_WRAPPERS_H | ||
2 | #define _DRBD_WRAPPERS_H | ||
3 | |||
4 | #include <linux/ctype.h> | ||
5 | #include <linux/mm.h> | ||
6 | |||
7 | /* see get_sb_bdev and bd_claim */ | ||
8 | extern char *drbd_sec_holder; | ||
9 | |||
10 | /* sets the number of 512 byte sectors of our virtual device */ | ||
11 | static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | ||
12 | sector_t size) | ||
13 | { | ||
14 | /* set_capacity(mdev->this_bdev->bd_disk, size); */ | ||
15 | set_capacity(mdev->vdisk, size); | ||
16 | mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; | ||
17 | } | ||
18 | |||
19 | #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) | ||
20 | |||
21 | static inline int drbd_bio_has_active_page(struct bio *bio) | ||
22 | { | ||
23 | struct bio_vec *bvec; | ||
24 | int i; | ||
25 | |||
26 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
27 | if (page_count(bvec->bv_page) > 1) | ||
28 | return 1; | ||
29 | } | ||
30 | |||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | /* bi_end_io handlers */ | ||
35 | extern void drbd_md_io_complete(struct bio *bio, int error); | ||
36 | extern void drbd_endio_read_sec(struct bio *bio, int error); | ||
37 | extern void drbd_endio_write_sec(struct bio *bio, int error); | ||
38 | extern void drbd_endio_pri(struct bio *bio, int error); | ||
39 | |||
40 | /* | ||
41 | * used to submit our private bio | ||
42 | */ | ||
43 | static inline void drbd_generic_make_request(struct drbd_conf *mdev, | ||
44 | int fault_type, struct bio *bio) | ||
45 | { | ||
46 | __release(local); | ||
47 | if (!bio->bi_bdev) { | ||
48 | printk(KERN_ERR "drbd%d: drbd_generic_make_request: " | ||
49 | "bio->bi_bdev == NULL\n", | ||
50 | mdev_to_minor(mdev)); | ||
51 | dump_stack(); | ||
52 | bio_endio(bio, -ENODEV); | ||
53 | return; | ||
54 | } | ||
55 | |||
56 | if (FAULT_ACTIVE(mdev, fault_type)) | ||
57 | bio_endio(bio, -EIO); | ||
58 | else | ||
59 | generic_make_request(bio); | ||
60 | } | ||
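/* FAULT_ACTIVE() is presumably DRBD's fault-injection hook: when the given
 * fault type is armed, the bio is completed with -EIO instead of being
 * submitted, so the error paths in the completion handlers can be exercised
 * without real hardware failures.  The __release(local) is a sparse
 * annotation for the ldev reference that travels with the bio. */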
61 | |||
62 | static inline void drbd_plug_device(struct drbd_conf *mdev) | ||
63 | { | ||
64 | struct request_queue *q; | ||
65 | q = bdev_get_queue(mdev->this_bdev); | ||
66 | |||
67 | spin_lock_irq(q->queue_lock); | ||
68 | |||
69 | /* XXX the check on !blk_queue_plugged is redundant, | ||
70 | * implicitly checked in blk_plug_device */ | ||
71 | |||
72 | if (!blk_queue_plugged(q)) { | ||
73 | blk_plug_device(q); | ||
74 | del_timer(&q->unplug_timer); | ||
75 | /* unplugging should not happen automatically... */ | ||
76 | } | ||
77 | spin_unlock_irq(q->queue_lock); | ||
78 | } | ||
79 | |||
80 | static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) | ||
81 | { | ||
82 | return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) | ||
83 | == CRYPTO_ALG_TYPE_HASH; | ||
84 | } | ||
85 | |||
86 | #ifndef __CHECKER__ | ||
87 | # undef __cond_lock | ||
88 | # define __cond_lock(x,c) (c) | ||
89 | #endif | ||
90 | |||
91 | #endif | ||
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 5c01f747571b..90c4038702da 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c | |||
@@ -144,13 +144,23 @@ | |||
144 | * Better audit of register_blkdev. | 144 | * Better audit of register_blkdev. |
145 | */ | 145 | */ |
146 | 146 | ||
147 | #define FLOPPY_SANITY_CHECK | ||
148 | #undef FLOPPY_SILENT_DCL_CLEAR | 147 | #undef FLOPPY_SILENT_DCL_CLEAR |
149 | 148 | ||
150 | #define REALLY_SLOW_IO | 149 | #define REALLY_SLOW_IO |
151 | 150 | ||
152 | #define DEBUGT 2 | 151 | #define DEBUGT 2 |
153 | #define DCL_DEBUG /* debug disk change line */ | 152 | |
153 | #define DPRINT(format, args...) \ | ||
154 | pr_info("floppy%d: " format, current_drive, ##args) | ||
155 | |||
156 | #define DCL_DEBUG /* debug disk change line */ | ||
157 | #ifdef DCL_DEBUG | ||
158 | #define debug_dcl(test, fmt, args...) \ | ||
159 | do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0) | ||
160 | #else | ||
161 | #define debug_dcl(test, fmt, args...) \ | ||
162 | do { if (0) DPRINT(fmt, ##args); } while (0) | ||
163 | #endif | ||
154 | 164 | ||
155 | /* do print messages for unexpected interrupts */ | 165 | /* do print messages for unexpected interrupts */ |
156 | static int print_unex = 1; | 166 | static int print_unex = 1; |
@@ -180,6 +190,8 @@ static int print_unex = 1; | |||
180 | #include <linux/mod_devicetable.h> | 190 | #include <linux/mod_devicetable.h> |
181 | #include <linux/buffer_head.h> /* for invalidate_buffers() */ | 191 | #include <linux/buffer_head.h> /* for invalidate_buffers() */ |
182 | #include <linux/mutex.h> | 192 | #include <linux/mutex.h> |
193 | #include <linux/io.h> | ||
194 | #include <linux/uaccess.h> | ||
183 | 195 | ||
184 | /* | 196 | /* |
185 | * PS/2 floppies have much slower step rates than regular floppies. | 197 | * PS/2 floppies have much slower step rates than regular floppies. |
@@ -191,8 +203,6 @@ static int slow_floppy; | |||
191 | #include <asm/dma.h> | 203 | #include <asm/dma.h> |
192 | #include <asm/irq.h> | 204 | #include <asm/irq.h> |
193 | #include <asm/system.h> | 205 | #include <asm/system.h> |
194 | #include <asm/io.h> | ||
195 | #include <asm/uaccess.h> | ||
196 | 206 | ||
197 | static int FLOPPY_IRQ = 6; | 207 | static int FLOPPY_IRQ = 6; |
198 | static int FLOPPY_DMA = 2; | 208 | static int FLOPPY_DMA = 2; |
@@ -241,8 +251,6 @@ static int allowed_drive_mask = 0x33; | |||
241 | 251 | ||
242 | static int irqdma_allocated; | 252 | static int irqdma_allocated; |
243 | 253 | ||
244 | #define DEVICE_NAME "floppy" | ||
245 | |||
246 | #include <linux/blkdev.h> | 254 | #include <linux/blkdev.h> |
247 | #include <linux/blkpg.h> | 255 | #include <linux/blkpg.h> |
248 | #include <linux/cdrom.h> /* for the compatibility eject ioctl */ | 256 | #include <linux/cdrom.h> /* for the compatibility eject ioctl */ |
@@ -250,7 +258,7 @@ static int irqdma_allocated; | |||
250 | 258 | ||
251 | static struct request *current_req; | 259 | static struct request *current_req; |
252 | static struct request_queue *floppy_queue; | 260 | static struct request_queue *floppy_queue; |
253 | static void do_fd_request(struct request_queue * q); | 261 | static void do_fd_request(struct request_queue *q); |
254 | 262 | ||
255 | #ifndef fd_get_dma_residue | 263 | #ifndef fd_get_dma_residue |
256 | #define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) | 264 | #define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) |
@@ -263,7 +271,7 @@ static void do_fd_request(struct request_queue * q); | |||
263 | #endif | 271 | #endif |
264 | 272 | ||
265 | #ifndef fd_dma_mem_alloc | 273 | #ifndef fd_dma_mem_alloc |
266 | #define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size)) | 274 | #define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL, get_order(size)) |
267 | #endif | 275 | #endif |
268 | 276 | ||
269 | static inline void fallback_on_nodma_alloc(char **addr, size_t l) | 277 | static inline void fallback_on_nodma_alloc(char **addr, size_t l) |
@@ -273,7 +281,7 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l) | |||
273 | return; /* we have the memory */ | 281 | return; /* we have the memory */ |
274 | if (can_use_virtual_dma != 2) | 282 | if (can_use_virtual_dma != 2) |
275 | return; /* no fallback allowed */ | 283 | return; /* no fallback allowed */ |
276 | printk("DMA memory shortage. Temporarily falling back on virtual DMA\n"); | 284 | pr_info("DMA memory shortage. Temporarily falling back on virtual DMA\n"); |
277 | *addr = (char *)nodma_mem_alloc(l); | 285 | *addr = (char *)nodma_mem_alloc(l); |
278 | #else | 286 | #else |
279 | return; | 287 | return; |
@@ -283,59 +291,50 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l) | |||
283 | /* End dma memory related stuff */ | 291 | /* End dma memory related stuff */ |
284 | 292 | ||
285 | static unsigned long fake_change; | 293 | static unsigned long fake_change; |
286 | static int initialising = 1; | 294 | static bool initialized; |
287 | 295 | ||
288 | #define ITYPE(x) (((x)>>2) & 0x1f) | 296 | #define ITYPE(x) (((x) >> 2) & 0x1f) |
289 | #define TOMINOR(x) ((x & 3) | ((x & 4) << 5)) | 297 | #define TOMINOR(x) ((x & 3) | ((x & 4) << 5)) |
290 | #define UNIT(x) ((x) & 0x03) /* drive on fdc */ | 298 | #define UNIT(x) ((x) & 0x03) /* drive on fdc */ |
291 | #define FDC(x) (((x) & 0x04) >> 2) /* fdc of drive */ | 299 | #define FDC(x) (((x) & 0x04) >> 2) /* fdc of drive */ |
292 | /* reverse mapping from unit and fdc to drive */ | 300 | /* reverse mapping from unit and fdc to drive */ |
293 | #define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2)) | 301 | #define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2)) |
294 | #define DP (&drive_params[current_drive]) | ||
295 | #define DRS (&drive_state[current_drive]) | ||
296 | #define DRWE (&write_errors[current_drive]) | ||
297 | #define FDCS (&fdc_state[fdc]) | ||
298 | #define CLEARF(x) clear_bit(x##_BIT, &DRS->flags) | ||
299 | #define SETF(x) set_bit(x##_BIT, &DRS->flags) | ||
300 | #define TESTF(x) test_bit(x##_BIT, &DRS->flags) | ||
301 | 302 | ||
302 | #define UDP (&drive_params[drive]) | 303 | #define DP (&drive_params[current_drive]) |
303 | #define UDRS (&drive_state[drive]) | 304 | #define DRS (&drive_state[current_drive]) |
304 | #define UDRWE (&write_errors[drive]) | 305 | #define DRWE (&write_errors[current_drive]) |
305 | #define UFDCS (&fdc_state[FDC(drive)]) | 306 | #define FDCS (&fdc_state[fdc]) |
306 | #define UCLEARF(x) clear_bit(x##_BIT, &UDRS->flags) | ||
307 | #define USETF(x) set_bit(x##_BIT, &UDRS->flags) | ||
308 | #define UTESTF(x) test_bit(x##_BIT, &UDRS->flags) | ||
309 | 307 | ||
310 | #define DPRINT(format, args...) printk(DEVICE_NAME "%d: " format, current_drive , ## args) | 308 | #define UDP (&drive_params[drive]) |
309 | #define UDRS (&drive_state[drive]) | ||
310 | #define UDRWE (&write_errors[drive]) | ||
311 | #define UFDCS (&fdc_state[FDC(drive)]) | ||
311 | 312 | ||
312 | #define PH_HEAD(floppy,head) (((((floppy)->stretch & 2) >>1) ^ head) << 2) | 313 | #define PH_HEAD(floppy, head) (((((floppy)->stretch & 2) >> 1) ^ head) << 2) |
313 | #define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH) | 314 | #define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH) |
314 | |||
315 | #define CLEARSTRUCT(x) memset((x), 0, sizeof(*(x))) | ||
316 | 315 | ||
317 | /* read/write */ | 316 | /* read/write */ |
318 | #define COMMAND raw_cmd->cmd[0] | 317 | #define COMMAND (raw_cmd->cmd[0]) |
319 | #define DR_SELECT raw_cmd->cmd[1] | 318 | #define DR_SELECT (raw_cmd->cmd[1]) |
320 | #define TRACK raw_cmd->cmd[2] | 319 | #define TRACK (raw_cmd->cmd[2]) |
321 | #define HEAD raw_cmd->cmd[3] | 320 | #define HEAD (raw_cmd->cmd[3]) |
322 | #define SECTOR raw_cmd->cmd[4] | 321 | #define SECTOR (raw_cmd->cmd[4]) |
323 | #define SIZECODE raw_cmd->cmd[5] | 322 | #define SIZECODE (raw_cmd->cmd[5]) |
324 | #define SECT_PER_TRACK raw_cmd->cmd[6] | 323 | #define SECT_PER_TRACK (raw_cmd->cmd[6]) |
325 | #define GAP raw_cmd->cmd[7] | 324 | #define GAP (raw_cmd->cmd[7]) |
326 | #define SIZECODE2 raw_cmd->cmd[8] | 325 | #define SIZECODE2 (raw_cmd->cmd[8]) |
327 | #define NR_RW 9 | 326 | #define NR_RW 9 |
328 | 327 | ||
329 | /* format */ | 328 | /* format */ |
330 | #define F_SIZECODE raw_cmd->cmd[2] | 329 | #define F_SIZECODE (raw_cmd->cmd[2]) |
331 | #define F_SECT_PER_TRACK raw_cmd->cmd[3] | 330 | #define F_SECT_PER_TRACK (raw_cmd->cmd[3]) |
332 | #define F_GAP raw_cmd->cmd[4] | 331 | #define F_GAP (raw_cmd->cmd[4]) |
333 | #define F_FILL raw_cmd->cmd[5] | 332 | #define F_FILL (raw_cmd->cmd[5]) |
334 | #define NR_F 6 | 333 | #define NR_F 6 |
335 | 334 | ||
336 | /* | 335 | /* |
337 | * Maximum disk size (in kilobytes). This default is used whenever the | 336 | * Maximum disk size (in kilobytes). |
338 | * current disk size is unknown. | 337 | * This default is used whenever the current disk size is unknown. |
339 | * [Now it is rather a minimum] | 338 | * [Now it is rather a minimum] |
340 | */ | 339 | */ |
341 | #define MAX_DISK_SIZE 4 /* 3984 */ | 340 | #define MAX_DISK_SIZE 4 /* 3984 */ |
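Note on the hunk above: the one-letter flag helpers (CLEARF/SETF/TESTF and their U* counterparts) are dropped outright rather than reindented; later hunks open-code the bit operations on the per-drive flags word. A minimal before/after sketch, mirroring the disk_change() hunk further down:

	USETF(FD_VERIFY);				/* old */
	set_bit(FD_VERIFY_BIT, &UDRS->flags);		/* new */

	if (UTESTF(FD_DISK_CHANGED))			/* old */
	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))	/* new */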
@@ -345,16 +344,17 @@ static int initialising = 1; | |||
345 | */ | 344 | */ |
346 | #define MAX_REPLIES 16 | 345 | #define MAX_REPLIES 16 |
347 | static unsigned char reply_buffer[MAX_REPLIES]; | 346 | static unsigned char reply_buffer[MAX_REPLIES]; |
348 | static int inr; /* size of reply buffer, when called from interrupt */ | 347 | static int inr; /* size of reply buffer, when called from interrupt */ |
349 | #define ST0 (reply_buffer[0]) | 348 | #define ST0 (reply_buffer[0]) |
350 | #define ST1 (reply_buffer[1]) | 349 | #define ST1 (reply_buffer[1]) |
351 | #define ST2 (reply_buffer[2]) | 350 | #define ST2 (reply_buffer[2]) |
352 | #define ST3 (reply_buffer[0]) /* result of GETSTATUS */ | 351 | #define ST3 (reply_buffer[0]) /* result of GETSTATUS */ |
353 | #define R_TRACK (reply_buffer[3]) | 352 | #define R_TRACK (reply_buffer[3]) |
354 | #define R_HEAD (reply_buffer[4]) | 353 | #define R_HEAD (reply_buffer[4]) |
355 | #define R_SECTOR (reply_buffer[5]) | 354 | #define R_SECTOR (reply_buffer[5]) |
356 | #define R_SIZECODE (reply_buffer[6]) | 355 | #define R_SIZECODE (reply_buffer[6]) |
357 | #define SEL_DLY (2*HZ/100) | 356 | |
357 | #define SEL_DLY (2 * HZ / 100) | ||
358 | 358 | ||
359 | /* | 359 | /* |
360 | * this struct defines the different floppy drive types. | 360 | * this struct defines the different floppy drive types. |
@@ -505,9 +505,9 @@ static char floppy_device_name[] = "floppy"; | |||
505 | static int probing; | 505 | static int probing; |
506 | 506 | ||
507 | /* Synchronization of FDC access. */ | 507 | /* Synchronization of FDC access. */ |
508 | #define FD_COMMAND_NONE -1 | 508 | #define FD_COMMAND_NONE -1 |
509 | #define FD_COMMAND_ERROR 2 | 509 | #define FD_COMMAND_ERROR 2 |
510 | #define FD_COMMAND_OKAY 3 | 510 | #define FD_COMMAND_OKAY 3 |
511 | 511 | ||
512 | static volatile int command_status = FD_COMMAND_NONE; | 512 | static volatile int command_status = FD_COMMAND_NONE; |
513 | static unsigned long fdc_busy; | 513 | static unsigned long fdc_busy; |
@@ -515,11 +515,6 @@ static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); | |||
515 | static DECLARE_WAIT_QUEUE_HEAD(command_done); | 515 | static DECLARE_WAIT_QUEUE_HEAD(command_done); |
516 | 516 | ||
517 | #define NO_SIGNAL (!interruptible || !signal_pending(current)) | 517 | #define NO_SIGNAL (!interruptible || !signal_pending(current)) |
518 | #define CALL(x) if ((x) == -EINTR) return -EINTR | ||
519 | #define ECALL(x) if ((ret = (x))) return ret; | ||
520 | #define _WAIT(x,i) CALL(ret=wait_til_done((x),i)) | ||
521 | #define WAIT(x) _WAIT((x),interruptible) | ||
522 | #define IWAIT(x) _WAIT((x),1) | ||
523 | 518 | ||
524 | /* Errors during formatting are counted here. */ | 519 | /* Errors during formatting are counted here. */ |
525 | static int format_errors; | 520 | static int format_errors; |
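The CALL/ECALL/_WAIT/WAIT/IWAIT wrappers removed here hid early returns inside macro expansions; their users are converted to explicit calls with visible error handling. The pattern, as the do_format() hunk near the end of this section shows:

	ret = wait_til_done(redo_format, true);
	if (ret == -EINTR)
		return -EINTR;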
@@ -545,8 +540,9 @@ static int max_buffer_sectors; | |||
545 | static int *errors; | 540 | static int *errors; |
546 | typedef void (*done_f)(int); | 541 | typedef void (*done_f)(int); |
547 | static struct cont_t { | 542 | static struct cont_t { |
548 | void (*interrupt)(void); /* this is called after the interrupt of the | 543 | void (*interrupt)(void); |
549 | * main command */ | 544 | /* this is called after the interrupt of the |
545 | * main command */ | ||
550 | void (*redo)(void); /* this is called to retry the operation */ | 546 | void (*redo)(void); /* this is called to retry the operation */ |
551 | void (*error)(void); /* this is called to tally an error */ | 547 | void (*error)(void); /* this is called to tally an error */ |
552 | done_f done; /* this is called to say if the operation has | 548 | done_f done; /* this is called to say if the operation has |
@@ -571,7 +567,6 @@ static void floppy_release_irq_and_dma(void); | |||
571 | * reset doesn't need to be tested before sending commands, because | 567 | * reset doesn't need to be tested before sending commands, because |
572 | * output_byte is automatically disabled when reset is set. | 568 | * output_byte is automatically disabled when reset is set. |
573 | */ | 569 | */ |
574 | #define CHECK_RESET { if (FDCS->reset){ reset_fdc(); return; } } | ||
575 | static void reset_fdc(void); | 570 | static void reset_fdc(void); |
576 | 571 | ||
577 | /* | 572 | /* |
@@ -579,9 +574,9 @@ static void reset_fdc(void); | |||
579 | * information to interrupts. They are the data used for the current | 574 | * information to interrupts. They are the data used for the current |
580 | * request. | 575 | * request. |
581 | */ | 576 | */ |
582 | #define NO_TRACK -1 | 577 | #define NO_TRACK -1 |
583 | #define NEED_1_RECAL -2 | 578 | #define NEED_1_RECAL -2 |
584 | #define NEED_2_RECAL -3 | 579 | #define NEED_2_RECAL -3 |
585 | 580 | ||
586 | static int usage_count; | 581 | static int usage_count; |
587 | 582 | ||
@@ -621,39 +616,35 @@ static inline void set_debugt(void) | |||
621 | debugtimer = jiffies; | 616 | debugtimer = jiffies; |
622 | } | 617 | } |
623 | 618 | ||
624 | static inline void debugt(const char *message) | 619 | static inline void debugt(const char *func, const char *msg) |
625 | { | 620 | { |
626 | if (DP->flags & DEBUGT) | 621 | if (DP->flags & DEBUGT) |
627 | printk("%s dtime=%lu\n", message, jiffies - debugtimer); | 622 | pr_info("%s:%s dtime=%lu\n", func, msg, jiffies - debugtimer); |
628 | } | 623 | } |
629 | #else | 624 | #else |
630 | static inline void set_debugt(void) { } | 625 | static inline void set_debugt(void) { } |
631 | static inline void debugt(const char *message) { } | 626 | static inline void debugt(const char *func, const char *msg) { } |
632 | #endif /* DEBUGT */ | 627 | #endif /* DEBUGT */ |
633 | 628 | ||
634 | typedef void (*timeout_fn) (unsigned long); | 629 | typedef void (*timeout_fn)(unsigned long); |
635 | static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); | 630 | static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); |
636 | 631 | ||
637 | static const char *timeout_message; | 632 | static const char *timeout_message; |
638 | 633 | ||
639 | #ifdef FLOPPY_SANITY_CHECK | 634 | static void is_alive(const char *func, const char *message) |
640 | static void is_alive(const char *message) | ||
641 | { | 635 | { |
642 | /* this routine checks whether the floppy driver is "alive" */ | 636 | /* this routine checks whether the floppy driver is "alive" */ |
643 | if (test_bit(0, &fdc_busy) && command_status < 2 | 637 | if (test_bit(0, &fdc_busy) && command_status < 2 && |
644 | && !timer_pending(&fd_timeout)) { | 638 | !timer_pending(&fd_timeout)) { |
645 | DPRINT("timeout handler died: %s\n", message); | 639 | DPRINT("%s: timeout handler died. %s\n", func, message); |
646 | } | 640 | } |
647 | } | 641 | } |
648 | #endif | ||
649 | 642 | ||
650 | static void (*do_floppy) (void) = NULL; | 643 | static void (*do_floppy)(void) = NULL; |
651 | |||
652 | #ifdef FLOPPY_SANITY_CHECK | ||
653 | 644 | ||
654 | #define OLOGSIZE 20 | 645 | #define OLOGSIZE 20 |
655 | 646 | ||
656 | static void (*lasthandler) (void); | 647 | static void (*lasthandler)(void); |
657 | static unsigned long interruptjiffies; | 648 | static unsigned long interruptjiffies; |
658 | static unsigned long resultjiffies; | 649 | static unsigned long resultjiffies; |
659 | static int resultsize; | 650 | static int resultsize; |
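In the hunk above, debugt() (and is_alive() just below it) gain the calling function's name as a parameter instead of embedding it in the message string, and the FLOPPY_SANITY_CHECK guards around the logging state disappear. Call sites simply pass __func__, as the later hunks show:

	debugt(__func__, "rw_command");
	debugt(__func__, "");	/* where the old string only named the caller */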
@@ -666,12 +657,11 @@ static struct output_log { | |||
666 | } output_log[OLOGSIZE]; | 657 | } output_log[OLOGSIZE]; |
667 | 658 | ||
668 | static int output_log_pos; | 659 | static int output_log_pos; |
669 | #endif | ||
670 | 660 | ||
671 | #define current_reqD -1 | 661 | #define current_reqD -1 |
672 | #define MAXTIMEOUT -2 | 662 | #define MAXTIMEOUT -2 |
673 | 663 | ||
674 | static void __reschedule_timeout(int drive, const char *message, int marg) | 664 | static void __reschedule_timeout(int drive, const char *message) |
675 | { | 665 | { |
676 | if (drive == current_reqD) | 666 | if (drive == current_reqD) |
677 | drive = current_drive; | 667 | drive = current_drive; |
@@ -682,25 +672,22 @@ static void __reschedule_timeout(int drive, const char *message, int marg) | |||
682 | } else | 672 | } else |
683 | fd_timeout.expires = jiffies + UDP->timeout; | 673 | fd_timeout.expires = jiffies + UDP->timeout; |
684 | add_timer(&fd_timeout); | 674 | add_timer(&fd_timeout); |
685 | if (UDP->flags & FD_DEBUG) { | 675 | if (UDP->flags & FD_DEBUG) |
686 | DPRINT("reschedule timeout "); | 676 | DPRINT("reschedule timeout %s\n", message); |
687 | printk(message, marg); | ||
688 | printk("\n"); | ||
689 | } | ||
690 | timeout_message = message; | 677 | timeout_message = message; |
691 | } | 678 | } |
692 | 679 | ||
693 | static void reschedule_timeout(int drive, const char *message, int marg) | 680 | static void reschedule_timeout(int drive, const char *message) |
694 | { | 681 | { |
695 | unsigned long flags; | 682 | unsigned long flags; |
696 | 683 | ||
697 | spin_lock_irqsave(&floppy_lock, flags); | 684 | spin_lock_irqsave(&floppy_lock, flags); |
698 | __reschedule_timeout(drive, message, marg); | 685 | __reschedule_timeout(drive, message); |
699 | spin_unlock_irqrestore(&floppy_lock, flags); | 686 | spin_unlock_irqrestore(&floppy_lock, flags); |
700 | } | 687 | } |
701 | 688 | ||
702 | #define INFBOUND(a,b) (a)=max_t(int, a, b) | 689 | #define INFBOUND(a, b) (a) = max_t(int, a, b) |
703 | #define SUPBOUND(a,b) (a)=min_t(int, a, b) | 690 | #define SUPBOUND(a, b) (a) = min_t(int, a, b) |
704 | 691 | ||
705 | /* | 692 | /* |
706 | * Bottom half floppy driver. | 693 | * Bottom half floppy driver. |
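With the printf-style marg argument gone, reschedule_timeout() now takes a plain string; the one caller that needed a formatted message builds it up front, as the request_done() hunk later in this section shows:

	char msg[sizeof("request done ") + sizeof(int) * 3];

	snprintf(msg, sizeof(msg), "request done %d", uptodate);
	reschedule_timeout(MAXTIMEOUT, msg);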
@@ -739,7 +726,6 @@ static int disk_change(int drive) | |||
739 | { | 726 | { |
740 | int fdc = FDC(drive); | 727 | int fdc = FDC(drive); |
741 | 728 | ||
742 | #ifdef FLOPPY_SANITY_CHECK | ||
743 | if (time_before(jiffies, UDRS->select_date + UDP->select_delay)) | 729 | if (time_before(jiffies, UDRS->select_date + UDP->select_delay)) |
744 | DPRINT("WARNING disk change called early\n"); | 730 | DPRINT("WARNING disk change called early\n"); |
745 | if (!(FDCS->dor & (0x10 << UNIT(drive))) || | 731 | if (!(FDCS->dor & (0x10 << UNIT(drive))) || |
@@ -748,31 +734,27 @@ static int disk_change(int drive) | |||
748 | DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive), | 734 | DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive), |
749 | (unsigned int)FDCS->dor); | 735 | (unsigned int)FDCS->dor); |
750 | } | 736 | } |
751 | #endif | ||
752 | 737 | ||
753 | #ifdef DCL_DEBUG | 738 | debug_dcl(UDP->flags, |
754 | if (UDP->flags & FD_DEBUG) { | 739 | "checking disk change line for drive %d\n", drive); |
755 | DPRINT("checking disk change line for drive %d\n", drive); | 740 | debug_dcl(UDP->flags, "jiffies=%lu\n", jiffies); |
756 | DPRINT("jiffies=%lu\n", jiffies); | 741 | debug_dcl(UDP->flags, "disk change line=%x\n", fd_inb(FD_DIR) & 0x80); |
757 | DPRINT("disk change line=%x\n", fd_inb(FD_DIR) & 0x80); | 742 | debug_dcl(UDP->flags, "flags=%lx\n", UDRS->flags); |
758 | DPRINT("flags=%lx\n", UDRS->flags); | 743 | |
759 | } | ||
760 | #endif | ||
761 | if (UDP->flags & FD_BROKEN_DCL) | 744 | if (UDP->flags & FD_BROKEN_DCL) |
762 | return UTESTF(FD_DISK_CHANGED); | 745 | return test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); |
763 | if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) { | 746 | if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) { |
764 | USETF(FD_VERIFY); /* verify write protection */ | 747 | set_bit(FD_VERIFY_BIT, &UDRS->flags); |
765 | if (UDRS->maxblock) { | 748 | /* verify write protection */ |
766 | /* mark it changed */ | 749 | |
767 | USETF(FD_DISK_CHANGED); | 750 | if (UDRS->maxblock) /* mark it changed */ |
768 | } | 751 | set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); |
769 | 752 | ||
770 | /* invalidate its geometry */ | 753 | /* invalidate its geometry */ |
771 | if (UDRS->keep_data >= 0) { | 754 | if (UDRS->keep_data >= 0) { |
772 | if ((UDP->flags & FTD_MSG) && | 755 | if ((UDP->flags & FTD_MSG) && |
773 | current_type[drive] != NULL) | 756 | current_type[drive] != NULL) |
774 | DPRINT("Disk type is undefined after " | 757 | DPRINT("Disk type is undefined after disk change\n"); |
775 | "disk change\n"); | ||
776 | current_type[drive] = NULL; | 758 | current_type[drive] = NULL; |
777 | floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1; | 759 | floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1; |
778 | } | 760 | } |
@@ -780,7 +762,7 @@ static int disk_change(int drive) | |||
780 | return 1; | 762 | return 1; |
781 | } else { | 763 | } else { |
782 | UDRS->last_checked = jiffies; | 764 | UDRS->last_checked = jiffies; |
783 | UCLEARF(FD_DISK_NEWCHANGE); | 765 | clear_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags); |
784 | } | 766 | } |
785 | return 0; | 767 | return 0; |
786 | } | 768 | } |
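The #ifdef DCL_DEBUG / FD_DEBUG blocks in disk_change() collapse into debug_dcl() calls. The macro's definition is not part of the hunks shown here; presumably it just folds the flag test into the print, along these lines (illustrative sketch only, not taken from the patch):

	/* assumed shape of debug_dcl(); the real definition lives elsewhere in the patch */
	#define debug_dcl(test, fmt, args...) \
		do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0)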
@@ -790,6 +772,12 @@ static inline int is_selected(int dor, int unit) | |||
790 | return ((dor & (0x10 << unit)) && (dor & 3) == unit); | 772 | return ((dor & (0x10 << unit)) && (dor & 3) == unit); |
791 | } | 773 | } |
792 | 774 | ||
775 | static bool is_ready_state(int status) | ||
776 | { | ||
777 | int state = status & (STATUS_READY | STATUS_DIR | STATUS_DMA); | ||
778 | return state == STATUS_READY; | ||
779 | } | ||
780 | |||
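The new is_ready_state() helper names the STATUS_READY/STATUS_DIR/STATUS_DMA mask check that was previously open-coded; output_byte() and need_more_output() below become its first users. A trimmed excerpt from the output_byte() hunk:

	status = wait_til_ready();
	if (status < 0)
		return -1;

	if (is_ready_state(status)) {
		fd_outb(byte, FD_DATA);
		return 0;
	}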
793 | static int set_dor(int fdc, char mask, char data) | 781 | static int set_dor(int fdc, char mask, char data) |
794 | { | 782 | { |
795 | unsigned char unit; | 783 | unsigned char unit; |
@@ -806,11 +794,8 @@ static int set_dor(int fdc, char mask, char data) | |||
806 | unit = olddor & 0x3; | 794 | unit = olddor & 0x3; |
807 | if (is_selected(olddor, unit) && !is_selected(newdor, unit)) { | 795 | if (is_selected(olddor, unit) && !is_selected(newdor, unit)) { |
808 | drive = REVDRIVE(fdc, unit); | 796 | drive = REVDRIVE(fdc, unit); |
809 | #ifdef DCL_DEBUG | 797 | debug_dcl(UDP->flags, |
810 | if (UDP->flags & FD_DEBUG) { | 798 | "calling disk change from set_dor\n"); |
811 | DPRINT("calling disk change from set_dor\n"); | ||
812 | } | ||
813 | #endif | ||
814 | disk_change(drive); | 799 | disk_change(drive); |
815 | } | 800 | } |
816 | FDCS->dor = newdor; | 801 | FDCS->dor = newdor; |
@@ -834,8 +819,10 @@ static void twaddle(void) | |||
834 | DRS->select_date = jiffies; | 819 | DRS->select_date = jiffies; |
835 | } | 820 | } |
836 | 821 | ||
837 | /* reset all driver information about the current fdc. This is needed after | 822 | /* |
838 | * a reset, and after a raw command. */ | 823 | * Reset all driver information about the current fdc. |
824 | * This is needed after a reset, and after a raw command. | ||
825 | */ | ||
839 | static void reset_fdc_info(int mode) | 826 | static void reset_fdc_info(int mode) |
840 | { | 827 | { |
841 | int drive; | 828 | int drive; |
@@ -857,7 +844,7 @@ static void set_fdc(int drive) | |||
857 | current_drive = drive; | 844 | current_drive = drive; |
858 | } | 845 | } |
859 | if (fdc != 1 && fdc != 0) { | 846 | if (fdc != 1 && fdc != 0) { |
860 | printk("bad fdc value\n"); | 847 | pr_info("bad fdc value\n"); |
861 | return; | 848 | return; |
862 | } | 849 | } |
863 | set_dor(fdc, ~0, 8); | 850 | set_dor(fdc, ~0, 8); |
@@ -871,11 +858,10 @@ static void set_fdc(int drive) | |||
871 | } | 858 | } |
872 | 859 | ||
873 | /* locks the driver */ | 860 | /* locks the driver */ |
874 | static int _lock_fdc(int drive, int interruptible, int line) | 861 | static int _lock_fdc(int drive, bool interruptible, int line) |
875 | { | 862 | { |
876 | if (!usage_count) { | 863 | if (!usage_count) { |
877 | printk(KERN_ERR | 864 | pr_err("Trying to lock fdc while usage count=0 at line %d\n", |
878 | "Trying to lock fdc while usage count=0 at line %d\n", | ||
879 | line); | 865 | line); |
880 | return -1; | 866 | return -1; |
881 | } | 867 | } |
@@ -904,15 +890,13 @@ static int _lock_fdc(int drive, int interruptible, int line) | |||
904 | } | 890 | } |
905 | command_status = FD_COMMAND_NONE; | 891 | command_status = FD_COMMAND_NONE; |
906 | 892 | ||
907 | __reschedule_timeout(drive, "lock fdc", 0); | 893 | __reschedule_timeout(drive, "lock fdc"); |
908 | set_fdc(drive); | 894 | set_fdc(drive); |
909 | return 0; | 895 | return 0; |
910 | } | 896 | } |
911 | 897 | ||
912 | #define lock_fdc(drive,interruptible) _lock_fdc(drive,interruptible, __LINE__) | 898 | #define lock_fdc(drive, interruptible) \ |
913 | 899 | _lock_fdc(drive, interruptible, __LINE__) | |
914 | #define LOCK_FDC(drive,interruptible) \ | ||
915 | if (lock_fdc(drive,interruptible)) return -EINTR; | ||
916 | 900 | ||
917 | /* unlocks the driver */ | 901 | /* unlocks the driver */ |
918 | static inline void unlock_fdc(void) | 902 | static inline void unlock_fdc(void) |
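LOCK_FDC, which hid a return -EINTR inside a macro, is removed above; callers now check lock_fdc() explicitly, as do_format() does later in this section:

	if (lock_fdc(drive, true))
		return -EINTR;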
@@ -924,7 +908,7 @@ static inline void unlock_fdc(void) | |||
924 | DPRINT("FDC access conflict!\n"); | 908 | DPRINT("FDC access conflict!\n"); |
925 | 909 | ||
926 | if (do_floppy) | 910 | if (do_floppy) |
927 | DPRINT("device interrupt still active at FDC release: %p!\n", | 911 | DPRINT("device interrupt still active at FDC release: %pf!\n", |
928 | do_floppy); | 912 | do_floppy); |
929 | command_status = FD_COMMAND_NONE; | 913 | command_status = FD_COMMAND_NONE; |
930 | spin_lock_irqsave(&floppy_lock, flags); | 914 | spin_lock_irqsave(&floppy_lock, flags); |
@@ -1003,7 +987,7 @@ static void empty(void) | |||
1003 | 987 | ||
1004 | static DECLARE_WORK(floppy_work, NULL); | 988 | static DECLARE_WORK(floppy_work, NULL); |
1005 | 989 | ||
1006 | static void schedule_bh(void (*handler) (void)) | 990 | static void schedule_bh(void (*handler)(void)) |
1007 | { | 991 | { |
1008 | PREPARE_WORK(&floppy_work, (work_func_t)handler); | 992 | PREPARE_WORK(&floppy_work, (work_func_t)handler); |
1009 | schedule_work(&floppy_work); | 993 | schedule_work(&floppy_work); |
@@ -1026,11 +1010,7 @@ static void cancel_activity(void) | |||
1026 | * transfer */ | 1010 | * transfer */ |
1027 | static void fd_watchdog(void) | 1011 | static void fd_watchdog(void) |
1028 | { | 1012 | { |
1029 | #ifdef DCL_DEBUG | 1013 | debug_dcl(DP->flags, "calling disk change from watchdog\n"); |
1030 | if (DP->flags & FD_DEBUG) { | ||
1031 | DPRINT("calling disk change from watchdog\n"); | ||
1032 | } | ||
1033 | #endif | ||
1034 | 1014 | ||
1035 | if (disk_change(current_drive)) { | 1015 | if (disk_change(current_drive)) { |
1036 | DPRINT("disk removed during i/o\n"); | 1016 | DPRINT("disk removed during i/o\n"); |
@@ -1039,7 +1019,7 @@ static void fd_watchdog(void) | |||
1039 | reset_fdc(); | 1019 | reset_fdc(); |
1040 | } else { | 1020 | } else { |
1041 | del_timer(&fd_timer); | 1021 | del_timer(&fd_timer); |
1042 | fd_timer.function = (timeout_fn) fd_watchdog; | 1022 | fd_timer.function = (timeout_fn)fd_watchdog; |
1043 | fd_timer.expires = jiffies + HZ / 10; | 1023 | fd_timer.expires = jiffies + HZ / 10; |
1044 | add_timer(&fd_timer); | 1024 | add_timer(&fd_timer); |
1045 | } | 1025 | } |
@@ -1105,25 +1085,23 @@ static void setup_DMA(void) | |||
1105 | { | 1085 | { |
1106 | unsigned long f; | 1086 | unsigned long f; |
1107 | 1087 | ||
1108 | #ifdef FLOPPY_SANITY_CHECK | ||
1109 | if (raw_cmd->length == 0) { | 1088 | if (raw_cmd->length == 0) { |
1110 | int i; | 1089 | int i; |
1111 | 1090 | ||
1112 | printk("zero dma transfer size:"); | 1091 | pr_info("zero dma transfer size:"); |
1113 | for (i = 0; i < raw_cmd->cmd_count; i++) | 1092 | for (i = 0; i < raw_cmd->cmd_count; i++) |
1114 | printk("%x,", raw_cmd->cmd[i]); | 1093 | pr_cont("%x,", raw_cmd->cmd[i]); |
1115 | printk("\n"); | 1094 | pr_cont("\n"); |
1116 | cont->done(0); | 1095 | cont->done(0); |
1117 | FDCS->reset = 1; | 1096 | FDCS->reset = 1; |
1118 | return; | 1097 | return; |
1119 | } | 1098 | } |
1120 | if (((unsigned long)raw_cmd->kernel_data) % 512) { | 1099 | if (((unsigned long)raw_cmd->kernel_data) % 512) { |
1121 | printk("non aligned address: %p\n", raw_cmd->kernel_data); | 1100 | pr_info("non aligned address: %p\n", raw_cmd->kernel_data); |
1122 | cont->done(0); | 1101 | cont->done(0); |
1123 | FDCS->reset = 1; | 1102 | FDCS->reset = 1; |
1124 | return; | 1103 | return; |
1125 | } | 1104 | } |
1126 | #endif | ||
1127 | f = claim_dma_lock(); | 1105 | f = claim_dma_lock(); |
1128 | fd_disable_dma(); | 1106 | fd_disable_dma(); |
1129 | #ifdef fd_dma_setup | 1107 | #ifdef fd_dma_setup |
@@ -1165,7 +1143,7 @@ static int wait_til_ready(void) | |||
1165 | if (status & STATUS_READY) | 1143 | if (status & STATUS_READY) |
1166 | return status; | 1144 | return status; |
1167 | } | 1145 | } |
1168 | if (!initialising) { | 1146 | if (initialized) { |
1169 | DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc); | 1147 | DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc); |
1170 | show_floppy(); | 1148 | show_floppy(); |
1171 | } | 1149 | } |
@@ -1176,22 +1154,21 @@ static int wait_til_ready(void) | |||
1176 | /* sends a command byte to the fdc */ | 1154 | /* sends a command byte to the fdc */ |
1177 | static int output_byte(char byte) | 1155 | static int output_byte(char byte) |
1178 | { | 1156 | { |
1179 | int status; | 1157 | int status = wait_til_ready(); |
1180 | 1158 | ||
1181 | if ((status = wait_til_ready()) < 0) | 1159 | if (status < 0) |
1182 | return -1; | 1160 | return -1; |
1183 | if ((status & (STATUS_READY | STATUS_DIR | STATUS_DMA)) == STATUS_READY) { | 1161 | |
1162 | if (is_ready_state(status)) { | ||
1184 | fd_outb(byte, FD_DATA); | 1163 | fd_outb(byte, FD_DATA); |
1185 | #ifdef FLOPPY_SANITY_CHECK | ||
1186 | output_log[output_log_pos].data = byte; | 1164 | output_log[output_log_pos].data = byte; |
1187 | output_log[output_log_pos].status = status; | 1165 | output_log[output_log_pos].status = status; |
1188 | output_log[output_log_pos].jiffies = jiffies; | 1166 | output_log[output_log_pos].jiffies = jiffies; |
1189 | output_log_pos = (output_log_pos + 1) % OLOGSIZE; | 1167 | output_log_pos = (output_log_pos + 1) % OLOGSIZE; |
1190 | #endif | ||
1191 | return 0; | 1168 | return 0; |
1192 | } | 1169 | } |
1193 | FDCS->reset = 1; | 1170 | FDCS->reset = 1; |
1194 | if (!initialising) { | 1171 | if (initialized) { |
1195 | DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n", | 1172 | DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n", |
1196 | byte, fdc, status); | 1173 | byte, fdc, status); |
1197 | show_floppy(); | 1174 | show_floppy(); |
@@ -1199,8 +1176,6 @@ static int output_byte(char byte) | |||
1199 | return -1; | 1176 | return -1; |
1200 | } | 1177 | } |
1201 | 1178 | ||
1202 | #define LAST_OUT(x) if (output_byte(x)<0){ reset_fdc();return;} | ||
1203 | |||
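LAST_OUT() likewise buried a reset_fdc()-and-return inside a macro; its users open-code the check, e.g. seek_floppy() further down:

	if (output_byte(track) < 0) {
		reset_fdc();
		return;
	}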
1204 | /* gets the response from the fdc */ | 1179 | /* gets the response from the fdc */ |
1205 | static int result(void) | 1180 | static int result(void) |
1206 | { | 1181 | { |
@@ -1208,14 +1183,13 @@ static int result(void) | |||
1208 | int status = 0; | 1183 | int status = 0; |
1209 | 1184 | ||
1210 | for (i = 0; i < MAX_REPLIES; i++) { | 1185 | for (i = 0; i < MAX_REPLIES; i++) { |
1211 | if ((status = wait_til_ready()) < 0) | 1186 | status = wait_til_ready(); |
1187 | if (status < 0) | ||
1212 | break; | 1188 | break; |
1213 | status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA; | 1189 | status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA; |
1214 | if ((status & ~STATUS_BUSY) == STATUS_READY) { | 1190 | if ((status & ~STATUS_BUSY) == STATUS_READY) { |
1215 | #ifdef FLOPPY_SANITY_CHECK | ||
1216 | resultjiffies = jiffies; | 1191 | resultjiffies = jiffies; |
1217 | resultsize = i; | 1192 | resultsize = i; |
1218 | #endif | ||
1219 | return i; | 1193 | return i; |
1220 | } | 1194 | } |
1221 | if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY)) | 1195 | if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY)) |
@@ -1223,10 +1197,9 @@ static int result(void) | |||
1223 | else | 1197 | else |
1224 | break; | 1198 | break; |
1225 | } | 1199 | } |
1226 | if (!initialising) { | 1200 | if (initialized) { |
1227 | DPRINT | 1201 | DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n", |
1228 | ("get result error. Fdc=%d Last status=%x Read bytes=%d\n", | 1202 | fdc, status, i); |
1229 | fdc, status, i); | ||
1230 | show_floppy(); | 1203 | show_floppy(); |
1231 | } | 1204 | } |
1232 | FDCS->reset = 1; | 1205 | FDCS->reset = 1; |
@@ -1237,12 +1210,14 @@ static int result(void) | |||
1237 | /* does the fdc need more output? */ | 1210 | /* does the fdc need more output? */ |
1238 | static int need_more_output(void) | 1211 | static int need_more_output(void) |
1239 | { | 1212 | { |
1240 | int status; | 1213 | int status = wait_til_ready(); |
1241 | 1214 | ||
1242 | if ((status = wait_til_ready()) < 0) | 1215 | if (status < 0) |
1243 | return -1; | 1216 | return -1; |
1244 | if ((status & (STATUS_READY | STATUS_DIR | STATUS_DMA)) == STATUS_READY) | 1217 | |
1218 | if (is_ready_state(status)) | ||
1245 | return MORE_OUTPUT; | 1219 | return MORE_OUTPUT; |
1220 | |||
1246 | return result(); | 1221 | return result(); |
1247 | } | 1222 | } |
1248 | 1223 | ||
@@ -1264,9 +1239,12 @@ static inline void perpendicular_mode(void) | |||
1264 | default: | 1239 | default: |
1265 | DPRINT("Invalid data rate for perpendicular mode!\n"); | 1240 | DPRINT("Invalid data rate for perpendicular mode!\n"); |
1266 | cont->done(0); | 1241 | cont->done(0); |
1267 | FDCS->reset = 1; /* convenient way to return to | 1242 | FDCS->reset = 1; |
1268 | * redo without to much hassle (deep | 1243 | /* |
1269 | * stack et al. */ | 1244 | * convenient way to return to |
1245 | * redo without too much hassle | ||
1246 | * (deep stack et al.) | ||
1247 | */ | ||
1270 | return; | 1248 | return; |
1271 | } | 1249 | } |
1272 | } else | 1250 | } else |
@@ -1366,9 +1344,9 @@ static void fdc_specify(void) | |||
1366 | 1344 | ||
1367 | /* Convert step rate from microseconds to milliseconds and 4 bits */ | 1345 | /* Convert step rate from microseconds to milliseconds and 4 bits */ |
1368 | srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR); | 1346 | srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR); |
1369 | if (slow_floppy) { | 1347 | if (slow_floppy) |
1370 | srt = srt / 4; | 1348 | srt = srt / 4; |
1371 | } | 1349 | |
1372 | SUPBOUND(srt, 0xf); | 1350 | SUPBOUND(srt, 0xf); |
1373 | INFBOUND(srt, 0); | 1351 | INFBOUND(srt, 0); |
1374 | 1352 | ||
@@ -1415,16 +1393,46 @@ static int fdc_dtr(void) | |||
1415 | * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies) | 1393 | * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies) |
1416 | */ | 1394 | */ |
1417 | FDCS->dtr = raw_cmd->rate & 3; | 1395 | FDCS->dtr = raw_cmd->rate & 3; |
1418 | return (fd_wait_for_completion(jiffies + 2UL * HZ / 100, | 1396 | return fd_wait_for_completion(jiffies + 2UL * HZ / 100, |
1419 | (timeout_fn) floppy_ready)); | 1397 | (timeout_fn)floppy_ready); |
1420 | } /* fdc_dtr */ | 1398 | } /* fdc_dtr */ |
1421 | 1399 | ||
1422 | static void tell_sector(void) | 1400 | static void tell_sector(void) |
1423 | { | 1401 | { |
1424 | printk(": track %d, head %d, sector %d, size %d", | 1402 | pr_cont(": track %d, head %d, sector %d, size %d", |
1425 | R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE); | 1403 | R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE); |
1426 | } /* tell_sector */ | 1404 | } /* tell_sector */ |
1427 | 1405 | ||
1406 | static void print_errors(void) | ||
1407 | { | ||
1408 | DPRINT(""); | ||
1409 | if (ST0 & ST0_ECE) { | ||
1410 | pr_cont("Recalibrate failed!"); | ||
1411 | } else if (ST2 & ST2_CRC) { | ||
1412 | pr_cont("data CRC error"); | ||
1413 | tell_sector(); | ||
1414 | } else if (ST1 & ST1_CRC) { | ||
1415 | pr_cont("CRC error"); | ||
1416 | tell_sector(); | ||
1417 | } else if ((ST1 & (ST1_MAM | ST1_ND)) || | ||
1418 | (ST2 & ST2_MAM)) { | ||
1419 | if (!probing) { | ||
1420 | pr_cont("sector not found"); | ||
1421 | tell_sector(); | ||
1422 | } else | ||
1423 | pr_cont("probe failed..."); | ||
1424 | } else if (ST2 & ST2_WC) { /* seek error */ | ||
1425 | pr_cont("wrong cylinder"); | ||
1426 | } else if (ST2 & ST2_BC) { /* cylinder marked as bad */ | ||
1427 | pr_cont("bad cylinder"); | ||
1428 | } else { | ||
1429 | pr_cont("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x", | ||
1430 | ST0, ST1, ST2); | ||
1431 | tell_sector(); | ||
1432 | } | ||
1433 | pr_cont("\n"); | ||
1434 | } | ||
1435 | |||
1428 | /* | 1436 | /* |
1429 | * OK, this error interpreting routine is called after a | 1437 | * OK, this error interpreting routine is called after a |
1430 | * DMA read/write has succeeded | 1438 | * DMA read/write has succeeded |
@@ -1437,7 +1445,7 @@ static int interpret_errors(void) | |||
1437 | char bad; | 1445 | char bad; |
1438 | 1446 | ||
1439 | if (inr != 7) { | 1447 | if (inr != 7) { |
1440 | DPRINT("-- FDC reply error"); | 1448 | DPRINT("-- FDC reply error\n"); |
1441 | FDCS->reset = 1; | 1449 | FDCS->reset = 1; |
1442 | return 1; | 1450 | return 1; |
1443 | } | 1451 | } |
@@ -1450,43 +1458,17 @@ static int interpret_errors(void) | |||
1450 | bad = 1; | 1458 | bad = 1; |
1451 | if (ST1 & ST1_WP) { | 1459 | if (ST1 & ST1_WP) { |
1452 | DPRINT("Drive is write protected\n"); | 1460 | DPRINT("Drive is write protected\n"); |
1453 | CLEARF(FD_DISK_WRITABLE); | 1461 | clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags); |
1454 | cont->done(0); | 1462 | cont->done(0); |
1455 | bad = 2; | 1463 | bad = 2; |
1456 | } else if (ST1 & ST1_ND) { | 1464 | } else if (ST1 & ST1_ND) { |
1457 | SETF(FD_NEED_TWADDLE); | 1465 | set_bit(FD_NEED_TWADDLE_BIT, &DRS->flags); |
1458 | } else if (ST1 & ST1_OR) { | 1466 | } else if (ST1 & ST1_OR) { |
1459 | if (DP->flags & FTD_MSG) | 1467 | if (DP->flags & FTD_MSG) |
1460 | DPRINT("Over/Underrun - retrying\n"); | 1468 | DPRINT("Over/Underrun - retrying\n"); |
1461 | bad = 0; | 1469 | bad = 0; |
1462 | } else if (*errors >= DP->max_errors.reporting) { | 1470 | } else if (*errors >= DP->max_errors.reporting) { |
1463 | DPRINT(""); | 1471 | print_errors(); |
1464 | if (ST0 & ST0_ECE) { | ||
1465 | printk("Recalibrate failed!"); | ||
1466 | } else if (ST2 & ST2_CRC) { | ||
1467 | printk("data CRC error"); | ||
1468 | tell_sector(); | ||
1469 | } else if (ST1 & ST1_CRC) { | ||
1470 | printk("CRC error"); | ||
1471 | tell_sector(); | ||
1472 | } else if ((ST1 & (ST1_MAM | ST1_ND)) | ||
1473 | || (ST2 & ST2_MAM)) { | ||
1474 | if (!probing) { | ||
1475 | printk("sector not found"); | ||
1476 | tell_sector(); | ||
1477 | } else | ||
1478 | printk("probe failed..."); | ||
1479 | } else if (ST2 & ST2_WC) { /* seek error */ | ||
1480 | printk("wrong cylinder"); | ||
1481 | } else if (ST2 & ST2_BC) { /* cylinder marked as bad */ | ||
1482 | printk("bad cylinder"); | ||
1483 | } else { | ||
1484 | printk | ||
1485 | ("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x", | ||
1486 | ST0, ST1, ST2); | ||
1487 | tell_sector(); | ||
1488 | } | ||
1489 | printk("\n"); | ||
1490 | } | 1472 | } |
1491 | if (ST2 & ST2_WC || ST2 & ST2_BC) | 1473 | if (ST2 & ST2_WC || ST2 & ST2_BC) |
1492 | /* wrong cylinder => recal */ | 1474 | /* wrong cylinder => recal */ |
@@ -1531,9 +1513,9 @@ static void setup_rw_floppy(void) | |||
1531 | */ | 1513 | */ |
1532 | if (time_after(ready_date, jiffies + DP->select_delay)) { | 1514 | if (time_after(ready_date, jiffies + DP->select_delay)) { |
1533 | ready_date -= DP->select_delay; | 1515 | ready_date -= DP->select_delay; |
1534 | function = (timeout_fn) floppy_start; | 1516 | function = (timeout_fn)floppy_start; |
1535 | } else | 1517 | } else |
1536 | function = (timeout_fn) setup_rw_floppy; | 1518 | function = (timeout_fn)setup_rw_floppy; |
1537 | 1519 | ||
1538 | /* wait until the floppy is spinning fast enough */ | 1520 | /* wait until the floppy is spinning fast enough */ |
1539 | if (fd_wait_for_completion(ready_date, function)) | 1521 | if (fd_wait_for_completion(ready_date, function)) |
@@ -1551,7 +1533,7 @@ static void setup_rw_floppy(void) | |||
1551 | for (i = 0; i < raw_cmd->cmd_count; i++) | 1533 | for (i = 0; i < raw_cmd->cmd_count; i++) |
1552 | r |= output_byte(raw_cmd->cmd[i]); | 1534 | r |= output_byte(raw_cmd->cmd[i]); |
1553 | 1535 | ||
1554 | debugt("rw_command: "); | 1536 | debugt(__func__, "rw_command"); |
1555 | 1537 | ||
1556 | if (r) { | 1538 | if (r) { |
1557 | cont->error(); | 1539 | cont->error(); |
@@ -1574,7 +1556,7 @@ static int blind_seek; | |||
1574 | */ | 1556 | */ |
1575 | static void seek_interrupt(void) | 1557 | static void seek_interrupt(void) |
1576 | { | 1558 | { |
1577 | debugt("seek interrupt:"); | 1559 | debugt(__func__, ""); |
1578 | if (inr != 2 || (ST0 & 0xF8) != 0x20) { | 1560 | if (inr != 2 || (ST0 & 0xF8) != 0x20) { |
1579 | DPRINT("seek failed\n"); | 1561 | DPRINT("seek failed\n"); |
1580 | DRS->track = NEED_2_RECAL; | 1562 | DRS->track = NEED_2_RECAL; |
@@ -1583,14 +1565,11 @@ static void seek_interrupt(void) | |||
1583 | return; | 1565 | return; |
1584 | } | 1566 | } |
1585 | if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) { | 1567 | if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) { |
1586 | #ifdef DCL_DEBUG | 1568 | debug_dcl(DP->flags, |
1587 | if (DP->flags & FD_DEBUG) { | 1569 | "clearing NEWCHANGE flag because of effective seek\n"); |
1588 | DPRINT | 1570 | debug_dcl(DP->flags, "jiffies=%lu\n", jiffies); |
1589 | ("clearing NEWCHANGE flag because of effective seek\n"); | 1571 | clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); |
1590 | DPRINT("jiffies=%lu\n", jiffies); | 1572 | /* effective seek */ |
1591 | } | ||
1592 | #endif | ||
1593 | CLEARF(FD_DISK_NEWCHANGE); /* effective seek */ | ||
1594 | DRS->select_date = jiffies; | 1573 | DRS->select_date = jiffies; |
1595 | } | 1574 | } |
1596 | DRS->track = ST1; | 1575 | DRS->track = ST1; |
@@ -1599,26 +1578,23 @@ static void seek_interrupt(void) | |||
1599 | 1578 | ||
1600 | static void check_wp(void) | 1579 | static void check_wp(void) |
1601 | { | 1580 | { |
1602 | if (TESTF(FD_VERIFY)) { | 1581 | if (test_bit(FD_VERIFY_BIT, &DRS->flags)) { |
1603 | /* check write protection */ | 1582 | /* check write protection */ |
1604 | output_byte(FD_GETSTATUS); | 1583 | output_byte(FD_GETSTATUS); |
1605 | output_byte(UNIT(current_drive)); | 1584 | output_byte(UNIT(current_drive)); |
1606 | if (result() != 1) { | 1585 | if (result() != 1) { |
1607 | FDCS->reset = 1; | 1586 | FDCS->reset = 1; |
1608 | return; | 1587 | return; |
1609 | } | 1588 | } |
1610 | CLEARF(FD_VERIFY); | 1589 | clear_bit(FD_VERIFY_BIT, &DRS->flags); |
1611 | CLEARF(FD_NEED_TWADDLE); | 1590 | clear_bit(FD_NEED_TWADDLE_BIT, &DRS->flags); |
1612 | #ifdef DCL_DEBUG | 1591 | debug_dcl(DP->flags, |
1613 | if (DP->flags & FD_DEBUG) { | 1592 | "checking whether disk is write protected\n"); |
1614 | DPRINT("checking whether disk is write protected\n"); | 1593 | debug_dcl(DP->flags, "wp=%x\n", ST3 & 0x40); |
1615 | DPRINT("wp=%x\n", ST3 & 0x40); | ||
1616 | } | ||
1617 | #endif | ||
1618 | if (!(ST3 & 0x40)) | 1594 | if (!(ST3 & 0x40)) |
1619 | SETF(FD_DISK_WRITABLE); | 1595 | set_bit(FD_DISK_WRITABLE_BIT, &DRS->flags); |
1620 | else | 1596 | else |
1621 | CLEARF(FD_DISK_WRITABLE); | 1597 | clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags); |
1622 | } | 1598 | } |
1623 | } | 1599 | } |
1624 | 1600 | ||
@@ -1628,19 +1604,15 @@ static void seek_floppy(void) | |||
1628 | 1604 | ||
1629 | blind_seek = 0; | 1605 | blind_seek = 0; |
1630 | 1606 | ||
1631 | #ifdef DCL_DEBUG | 1607 | debug_dcl(DP->flags, "calling disk change from %s\n", __func__); |
1632 | if (DP->flags & FD_DEBUG) { | ||
1633 | DPRINT("calling disk change from seek\n"); | ||
1634 | } | ||
1635 | #endif | ||
1636 | 1608 | ||
1637 | if (!TESTF(FD_DISK_NEWCHANGE) && | 1609 | if (!test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) && |
1638 | disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) { | 1610 | disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) { |
1639 | /* the media changed flag should be cleared after the seek. | 1611 | /* the media changed flag should be cleared after the seek. |
1640 | * If it isn't, this means that there is really no disk in | 1612 | * If it isn't, this means that there is really no disk in |
1641 | * the drive. | 1613 | * the drive. |
1642 | */ | 1614 | */ |
1643 | SETF(FD_DISK_CHANGED); | 1615 | set_bit(FD_DISK_CHANGED_BIT, &DRS->flags); |
1644 | cont->done(0); | 1616 | cont->done(0); |
1645 | cont->redo(); | 1617 | cont->redo(); |
1646 | return; | 1618 | return; |
@@ -1648,7 +1620,7 @@ static void seek_floppy(void) | |||
1648 | if (DRS->track <= NEED_1_RECAL) { | 1620 | if (DRS->track <= NEED_1_RECAL) { |
1649 | recalibrate_floppy(); | 1621 | recalibrate_floppy(); |
1650 | return; | 1622 | return; |
1651 | } else if (TESTF(FD_DISK_NEWCHANGE) && | 1623 | } else if (test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) && |
1652 | (raw_cmd->flags & FD_RAW_NEED_DISK) && | 1624 | (raw_cmd->flags & FD_RAW_NEED_DISK) && |
1653 | (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) { | 1625 | (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) { |
1654 | /* we seek to clear the media-changed condition. Does anybody | 1626 | /* we seek to clear the media-changed condition. Does anybody |
@@ -1677,19 +1649,22 @@ static void seek_floppy(void) | |||
1677 | do_floppy = seek_interrupt; | 1649 | do_floppy = seek_interrupt; |
1678 | output_byte(FD_SEEK); | 1650 | output_byte(FD_SEEK); |
1679 | output_byte(UNIT(current_drive)); | 1651 | output_byte(UNIT(current_drive)); |
1680 | LAST_OUT(track); | 1652 | if (output_byte(track) < 0) { |
1681 | debugt("seek command:"); | 1653 | reset_fdc(); |
1654 | return; | ||
1655 | } | ||
1656 | debugt(__func__, ""); | ||
1682 | } | 1657 | } |
1683 | 1658 | ||
1684 | static void recal_interrupt(void) | 1659 | static void recal_interrupt(void) |
1685 | { | 1660 | { |
1686 | debugt("recal interrupt:"); | 1661 | debugt(__func__, ""); |
1687 | if (inr != 2) | 1662 | if (inr != 2) |
1688 | FDCS->reset = 1; | 1663 | FDCS->reset = 1; |
1689 | else if (ST0 & ST0_ECE) { | 1664 | else if (ST0 & ST0_ECE) { |
1690 | switch (DRS->track) { | 1665 | switch (DRS->track) { |
1691 | case NEED_1_RECAL: | 1666 | case NEED_1_RECAL: |
1692 | debugt("recal interrupt need 1 recal:"); | 1667 | debugt(__func__, "need 1 recal"); |
1693 | /* after a second recalibrate, we still haven't | 1668 | /* after a second recalibrate, we still haven't |
1694 | * reached track 0. Probably no drive. Raise an | 1669 | * reached track 0. Probably no drive. Raise an |
1695 | * error, as failing immediately might upset | 1670 | * error, as failing immediately might upset |
@@ -1698,25 +1673,21 @@ static void recal_interrupt(void) | |||
1698 | cont->redo(); | 1673 | cont->redo(); |
1699 | return; | 1674 | return; |
1700 | case NEED_2_RECAL: | 1675 | case NEED_2_RECAL: |
1701 | debugt("recal interrupt need 2 recal:"); | 1676 | debugt(__func__, "need 2 recal"); |
1702 | /* If we already did a recalibrate, | 1677 | /* If we already did a recalibrate, |
1703 | * and we are not at track 0, this | 1678 | * and we are not at track 0, this |
1704 | * means we have moved. (The only way | 1679 | * means we have moved. (The only way |
1705 | * not to move at recalibration is to | 1680 | * not to move at recalibration is to |
1706 | * be already at track 0.) Clear the | 1681 | * be already at track 0.) Clear the |
1707 | * new change flag */ | 1682 | * new change flag */ |
1708 | #ifdef DCL_DEBUG | 1683 | debug_dcl(DP->flags, |
1709 | if (DP->flags & FD_DEBUG) { | 1684 | "clearing NEWCHANGE flag because of second recalibrate\n"); |
1710 | DPRINT | ||
1711 | ("clearing NEWCHANGE flag because of second recalibrate\n"); | ||
1712 | } | ||
1713 | #endif | ||
1714 | 1685 | ||
1715 | CLEARF(FD_DISK_NEWCHANGE); | 1686 | clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); |
1716 | DRS->select_date = jiffies; | 1687 | DRS->select_date = jiffies; |
1717 | /* fall through */ | 1688 | /* fall through */ |
1718 | default: | 1689 | default: |
1719 | debugt("recal interrupt default:"); | 1690 | debugt(__func__, "default"); |
1720 | /* Recalibrate moves the head by at | 1691 | /* Recalibrate moves the head by at |
1721 | * most 80 steps. If after one | 1692 | * most 80 steps. If after one |
1722 | * recalibrate we don't have reached | 1693 | * recalibrate we don't have reached |
@@ -1738,8 +1709,8 @@ static void print_result(char *message, int inr) | |||
1738 | DPRINT("%s ", message); | 1709 | DPRINT("%s ", message); |
1739 | if (inr >= 0) | 1710 | if (inr >= 0) |
1740 | for (i = 0; i < inr; i++) | 1711 | for (i = 0; i < inr; i++) |
1741 | printk("repl[%d]=%x ", i, reply_buffer[i]); | 1712 | pr_cont("repl[%d]=%x ", i, reply_buffer[i]); |
1742 | printk("\n"); | 1713 | pr_cont("\n"); |
1743 | } | 1714 | } |
1744 | 1715 | ||
1745 | /* interrupt handler. Note that this can be called externally on the Sparc */ | 1716 | /* interrupt handler. Note that this can be called externally on the Sparc */ |
@@ -1760,10 +1731,10 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) | |||
1760 | do_floppy = NULL; | 1731 | do_floppy = NULL; |
1761 | if (fdc >= N_FDC || FDCS->address == -1) { | 1732 | if (fdc >= N_FDC || FDCS->address == -1) { |
1762 | /* we don't even know which FDC is the culprit */ | 1733 | /* we don't even know which FDC is the culprit */ |
1763 | printk("DOR0=%x\n", fdc_state[0].dor); | 1734 | pr_info("DOR0=%x\n", fdc_state[0].dor); |
1764 | printk("floppy interrupt on bizarre fdc %d\n", fdc); | 1735 | pr_info("floppy interrupt on bizarre fdc %d\n", fdc); |
1765 | printk("handler=%p\n", handler); | 1736 | pr_info("handler=%pf\n", handler); |
1766 | is_alive("bizarre fdc"); | 1737 | is_alive(__func__, "bizarre fdc"); |
1767 | return IRQ_NONE; | 1738 | return IRQ_NONE; |
1768 | } | 1739 | } |
1769 | 1740 | ||
@@ -1777,7 +1748,7 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) | |||
1777 | * activity. | 1748 | * activity. |
1778 | */ | 1749 | */ |
1779 | 1750 | ||
1780 | do_print = !handler && print_unex && !initialising; | 1751 | do_print = !handler && print_unex && initialized; |
1781 | 1752 | ||
1782 | inr = result(); | 1753 | inr = result(); |
1783 | if (do_print) | 1754 | if (do_print) |
@@ -1790,15 +1761,15 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) | |||
1790 | if (do_print) | 1761 | if (do_print) |
1791 | print_result("sensei", inr); | 1762 | print_result("sensei", inr); |
1792 | max_sensei--; | 1763 | max_sensei--; |
1793 | } while ((ST0 & 0x83) != UNIT(current_drive) && inr == 2 | 1764 | } while ((ST0 & 0x83) != UNIT(current_drive) && |
1794 | && max_sensei); | 1765 | inr == 2 && max_sensei); |
1795 | } | 1766 | } |
1796 | if (!handler) { | 1767 | if (!handler) { |
1797 | FDCS->reset = 1; | 1768 | FDCS->reset = 1; |
1798 | return IRQ_NONE; | 1769 | return IRQ_NONE; |
1799 | } | 1770 | } |
1800 | schedule_bh(handler); | 1771 | schedule_bh(handler); |
1801 | is_alive("normal interrupt end"); | 1772 | is_alive(__func__, "normal interrupt end"); |
1802 | 1773 | ||
1803 | /* FIXME! Was it really for us? */ | 1774 | /* FIXME! Was it really for us? */ |
1804 | return IRQ_HANDLED; | 1775 | return IRQ_HANDLED; |
@@ -1806,10 +1777,11 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) | |||
1806 | 1777 | ||
1807 | static void recalibrate_floppy(void) | 1778 | static void recalibrate_floppy(void) |
1808 | { | 1779 | { |
1809 | debugt("recalibrate floppy:"); | 1780 | debugt(__func__, ""); |
1810 | do_floppy = recal_interrupt; | 1781 | do_floppy = recal_interrupt; |
1811 | output_byte(FD_RECALIBRATE); | 1782 | output_byte(FD_RECALIBRATE); |
1812 | LAST_OUT(UNIT(current_drive)); | 1783 | if (output_byte(UNIT(current_drive)) < 0) |
1784 | reset_fdc(); | ||
1813 | } | 1785 | } |
1814 | 1786 | ||
1815 | /* | 1787 | /* |
@@ -1817,10 +1789,10 @@ static void recalibrate_floppy(void) | |||
1817 | */ | 1789 | */ |
1818 | static void reset_interrupt(void) | 1790 | static void reset_interrupt(void) |
1819 | { | 1791 | { |
1820 | debugt("reset interrupt:"); | 1792 | debugt(__func__, ""); |
1821 | result(); /* get the status ready for set_fdc */ | 1793 | result(); /* get the status ready for set_fdc */ |
1822 | if (FDCS->reset) { | 1794 | if (FDCS->reset) { |
1823 | printk("reset set in interrupt, calling %p\n", cont->error); | 1795 | pr_info("reset set in interrupt, calling %pf\n", cont->error); |
1824 | cont->error(); /* a reset just after a reset. BAD! */ | 1796 | cont->error(); /* a reset just after a reset. BAD! */ |
1825 | } | 1797 | } |
1826 | cont->redo(); | 1798 | cont->redo(); |
@@ -1858,53 +1830,49 @@ static void show_floppy(void) | |||
1858 | { | 1830 | { |
1859 | int i; | 1831 | int i; |
1860 | 1832 | ||
1861 | printk("\n"); | 1833 | pr_info("\n"); |
1862 | printk("floppy driver state\n"); | 1834 | pr_info("floppy driver state\n"); |
1863 | printk("-------------------\n"); | 1835 | pr_info("-------------------\n"); |
1864 | printk("now=%lu last interrupt=%lu diff=%lu last called handler=%p\n", | 1836 | pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n", |
1865 | jiffies, interruptjiffies, jiffies - interruptjiffies, | 1837 | jiffies, interruptjiffies, jiffies - interruptjiffies, |
1866 | lasthandler); | 1838 | lasthandler); |
1867 | 1839 | ||
1868 | #ifdef FLOPPY_SANITY_CHECK | 1840 | pr_info("timeout_message=%s\n", timeout_message); |
1869 | printk("timeout_message=%s\n", timeout_message); | 1841 | pr_info("last output bytes:\n"); |
1870 | printk("last output bytes:\n"); | ||
1871 | for (i = 0; i < OLOGSIZE; i++) | 1842 | for (i = 0; i < OLOGSIZE; i++) |
1872 | printk("%2x %2x %lu\n", | 1843 | pr_info("%2x %2x %lu\n", |
1873 | output_log[(i + output_log_pos) % OLOGSIZE].data, | 1844 | output_log[(i + output_log_pos) % OLOGSIZE].data, |
1874 | output_log[(i + output_log_pos) % OLOGSIZE].status, | 1845 | output_log[(i + output_log_pos) % OLOGSIZE].status, |
1875 | output_log[(i + output_log_pos) % OLOGSIZE].jiffies); | 1846 | output_log[(i + output_log_pos) % OLOGSIZE].jiffies); |
1876 | printk("last result at %lu\n", resultjiffies); | 1847 | pr_info("last result at %lu\n", resultjiffies); |
1877 | printk("last redo_fd_request at %lu\n", lastredo); | 1848 | pr_info("last redo_fd_request at %lu\n", lastredo); |
1878 | for (i = 0; i < resultsize; i++) { | 1849 | print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, |
1879 | printk("%2x ", reply_buffer[i]); | 1850 | reply_buffer, resultsize, true); |
1880 | } | 1851 | |
1881 | printk("\n"); | 1852 | pr_info("status=%x\n", fd_inb(FD_STATUS)); |
1882 | #endif | 1853 | pr_info("fdc_busy=%lu\n", fdc_busy); |
1883 | |||
1884 | printk("status=%x\n", fd_inb(FD_STATUS)); | ||
1885 | printk("fdc_busy=%lu\n", fdc_busy); | ||
1886 | if (do_floppy) | 1854 | if (do_floppy) |
1887 | printk("do_floppy=%p\n", do_floppy); | 1855 | pr_info("do_floppy=%pf\n", do_floppy); |
1888 | if (work_pending(&floppy_work)) | 1856 | if (work_pending(&floppy_work)) |
1889 | printk("floppy_work.func=%p\n", floppy_work.func); | 1857 | pr_info("floppy_work.func=%pf\n", floppy_work.func); |
1890 | if (timer_pending(&fd_timer)) | 1858 | if (timer_pending(&fd_timer)) |
1891 | printk("fd_timer.function=%p\n", fd_timer.function); | 1859 | pr_info("fd_timer.function=%pf\n", fd_timer.function); |
1892 | if (timer_pending(&fd_timeout)) { | 1860 | if (timer_pending(&fd_timeout)) { |
1893 | printk("timer_function=%p\n", fd_timeout.function); | 1861 | pr_info("timer_function=%pf\n", fd_timeout.function); |
1894 | printk("expires=%lu\n", fd_timeout.expires - jiffies); | 1862 | pr_info("expires=%lu\n", fd_timeout.expires - jiffies); |
1895 | printk("now=%lu\n", jiffies); | 1863 | pr_info("now=%lu\n", jiffies); |
1896 | } | 1864 | } |
1897 | printk("cont=%p\n", cont); | 1865 | pr_info("cont=%p\n", cont); |
1898 | printk("current_req=%p\n", current_req); | 1866 | pr_info("current_req=%p\n", current_req); |
1899 | printk("command_status=%d\n", command_status); | 1867 | pr_info("command_status=%d\n", command_status); |
1900 | printk("\n"); | 1868 | pr_info("\n"); |
1901 | } | 1869 | } |
1902 | 1870 | ||
1903 | static void floppy_shutdown(unsigned long data) | 1871 | static void floppy_shutdown(unsigned long data) |
1904 | { | 1872 | { |
1905 | unsigned long flags; | 1873 | unsigned long flags; |
1906 | 1874 | ||
1907 | if (!initialising) | 1875 | if (initialized) |
1908 | show_floppy(); | 1876 | show_floppy(); |
1909 | cancel_activity(); | 1877 | cancel_activity(); |
1910 | 1878 | ||
@@ -1916,17 +1884,17 @@ static void floppy_shutdown(unsigned long data) | |||
1916 | 1884 | ||
1917 | /* avoid dma going to a random drive after shutdown */ | 1885 | /* avoid dma going to a random drive after shutdown */ |
1918 | 1886 | ||
1919 | if (!initialising) | 1887 | if (initialized) |
1920 | DPRINT("floppy timeout called\n"); | 1888 | DPRINT("floppy timeout called\n"); |
1921 | FDCS->reset = 1; | 1889 | FDCS->reset = 1; |
1922 | if (cont) { | 1890 | if (cont) { |
1923 | cont->done(0); | 1891 | cont->done(0); |
1924 | cont->redo(); /* this will recall reset when needed */ | 1892 | cont->redo(); /* this will recall reset when needed */ |
1925 | } else { | 1893 | } else { |
1926 | printk("no cont in shutdown!\n"); | 1894 | pr_info("no cont in shutdown!\n"); |
1927 | process_fd_request(); | 1895 | process_fd_request(); |
1928 | } | 1896 | } |
1929 | is_alive("floppy shutdown"); | 1897 | is_alive(__func__, ""); |
1930 | } | 1898 | } |
1931 | 1899 | ||
1932 | /* start motor, check media-changed condition and write protection */ | 1900 | /* start motor, check media-changed condition and write protection */ |
@@ -1954,27 +1922,26 @@ static int start_motor(void (*function)(void)) | |||
1954 | set_dor(fdc, mask, data); | 1922 | set_dor(fdc, mask, data); |
1955 | 1923 | ||
1956 | /* wait_for_completion also schedules reset if needed. */ | 1924 | /* wait_for_completion also schedules reset if needed. */ |
1957 | return (fd_wait_for_completion(DRS->select_date + DP->select_delay, | 1925 | return fd_wait_for_completion(DRS->select_date + DP->select_delay, |
1958 | (timeout_fn) function)); | 1926 | (timeout_fn)function); |
1959 | } | 1927 | } |
1960 | 1928 | ||
1961 | static void floppy_ready(void) | 1929 | static void floppy_ready(void) |
1962 | { | 1930 | { |
1963 | CHECK_RESET; | 1931 | if (FDCS->reset) { |
1932 | reset_fdc(); | ||
1933 | return; | ||
1934 | } | ||
1964 | if (start_motor(floppy_ready)) | 1935 | if (start_motor(floppy_ready)) |
1965 | return; | 1936 | return; |
1966 | if (fdc_dtr()) | 1937 | if (fdc_dtr()) |
1967 | return; | 1938 | return; |
1968 | 1939 | ||
1969 | #ifdef DCL_DEBUG | 1940 | debug_dcl(DP->flags, "calling disk change from floppy_ready\n"); |
1970 | if (DP->flags & FD_DEBUG) { | ||
1971 | DPRINT("calling disk change from floppy_ready\n"); | ||
1972 | } | ||
1973 | #endif | ||
1974 | if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) && | 1941 | if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) && |
1975 | disk_change(current_drive) && !DP->select_delay) | 1942 | disk_change(current_drive) && !DP->select_delay) |
1976 | twaddle(); /* this clears the dcl on certain drive/controller | 1943 | twaddle(); /* this clears the dcl on certain |
1977 | * combinations */ | 1944 | * drive/controller combinations */ |
1978 | 1945 | ||
1979 | #ifdef fd_chose_dma_mode | 1946 | #ifdef fd_chose_dma_mode |
1980 | if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) { | 1947 | if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) { |
@@ -1998,15 +1965,11 @@ static void floppy_ready(void) | |||
1998 | 1965 | ||
1999 | static void floppy_start(void) | 1966 | static void floppy_start(void) |
2000 | { | 1967 | { |
2001 | reschedule_timeout(current_reqD, "floppy start", 0); | 1968 | reschedule_timeout(current_reqD, "floppy start"); |
2002 | 1969 | ||
2003 | scandrives(); | 1970 | scandrives(); |
2004 | #ifdef DCL_DEBUG | 1971 | debug_dcl(DP->flags, "setting NEWCHANGE in floppy_start\n"); |
2005 | if (DP->flags & FD_DEBUG) { | 1972 | set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); |
2006 | DPRINT("setting NEWCHANGE in floppy_start\n"); | ||
2007 | } | ||
2008 | #endif | ||
2009 | SETF(FD_DISK_NEWCHANGE); | ||
2010 | floppy_ready(); | 1973 | floppy_ready(); |
2011 | } | 1974 | } |
2012 | 1975 | ||
@@ -2026,7 +1989,7 @@ static void floppy_start(void) | |||
2026 | 1989 | ||
2027 | static void do_wakeup(void) | 1990 | static void do_wakeup(void) |
2028 | { | 1991 | { |
2029 | reschedule_timeout(MAXTIMEOUT, "do wakeup", 0); | 1992 | reschedule_timeout(MAXTIMEOUT, "do wakeup"); |
2030 | cont = NULL; | 1993 | cont = NULL; |
2031 | command_status += 2; | 1994 | command_status += 2; |
2032 | wake_up(&command_done); | 1995 | wake_up(&command_done); |
@@ -2046,7 +2009,7 @@ static struct cont_t intr_cont = { | |||
2046 | .done = (done_f)empty | 2009 | .done = (done_f)empty |
2047 | }; | 2010 | }; |
2048 | 2011 | ||
2049 | static int wait_til_done(void (*handler)(void), int interruptible) | 2012 | static int wait_til_done(void (*handler)(void), bool interruptible) |
2050 | { | 2013 | { |
2051 | int ret; | 2014 | int ret; |
2052 | 2015 | ||
@@ -2064,7 +2027,7 @@ static int wait_til_done(void (*handler)(void), int interruptible) | |||
2064 | if (command_status >= 2 || !NO_SIGNAL) | 2027 | if (command_status >= 2 || !NO_SIGNAL) |
2065 | break; | 2028 | break; |
2066 | 2029 | ||
2067 | is_alive("wait_til_done"); | 2030 | is_alive(__func__, ""); |
2068 | schedule(); | 2031 | schedule(); |
2069 | } | 2032 | } |
2070 | 2033 | ||
@@ -2180,9 +2143,9 @@ static void format_interrupt(void) | |||
2180 | cont->redo(); | 2143 | cont->redo(); |
2181 | } | 2144 | } |
2182 | 2145 | ||
2183 | #define CODE2SIZE (ssize = ((1 << SIZECODE) + 3) >> 2) | 2146 | #define FM_MODE(x, y) ((y) & ~(((x)->rate & 0x80) >> 1)) |
2184 | #define FM_MODE(x,y) ((y) & ~(((x)->rate & 0x80) >>1)) | ||
2185 | #define CT(x) ((x) | 0xc0) | 2147 | #define CT(x) ((x) | 0xc0) |
2148 | |||
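CODE2SIZE is dropped above because it assigned ssize as a side effect of being "evaluated". The rw_interrupt() hunk at the end of this section open-codes it with the equivalent rounding helper, since ((1 << SIZECODE) + 3) >> 2 is exactly DIV_ROUND_UP(1 << SIZECODE, 4):

	ssize = DIV_ROUND_UP(1 << SIZECODE, 4);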
2186 | static void setup_format_params(int track) | 2149 | static void setup_format_params(int track) |
2187 | { | 2150 | { |
2188 | int n; | 2151 | int n; |
@@ -2197,8 +2160,8 @@ static void setup_format_params(int track) | |||
2197 | raw_cmd = &default_raw_cmd; | 2160 | raw_cmd = &default_raw_cmd; |
2198 | raw_cmd->track = track; | 2161 | raw_cmd->track = track; |
2199 | 2162 | ||
2200 | raw_cmd->flags = FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN | | 2163 | raw_cmd->flags = (FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN | |
2201 | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; | 2164 | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK); |
2202 | raw_cmd->rate = _floppy->rate & 0x43; | 2165 | raw_cmd->rate = _floppy->rate & 0x43; |
2203 | raw_cmd->cmd_count = NR_F; | 2166 | raw_cmd->cmd_count = NR_F; |
2204 | COMMAND = FM_MODE(_floppy, FD_FORMAT); | 2167 | COMMAND = FM_MODE(_floppy, FD_FORMAT); |
@@ -2257,7 +2220,7 @@ static void redo_format(void) | |||
2257 | buffer_track = -1; | 2220 | buffer_track = -1; |
2258 | setup_format_params(format_req.track << STRETCH(_floppy)); | 2221 | setup_format_params(format_req.track << STRETCH(_floppy)); |
2259 | floppy_start(); | 2222 | floppy_start(); |
2260 | debugt("queue format request"); | 2223 | debugt(__func__, "queue format request"); |
2261 | } | 2224 | } |
2262 | 2225 | ||
2263 | static struct cont_t format_cont = { | 2226 | static struct cont_t format_cont = { |
@@ -2271,7 +2234,9 @@ static int do_format(int drive, struct format_descr *tmp_format_req) | |||
2271 | { | 2234 | { |
2272 | int ret; | 2235 | int ret; |
2273 | 2236 | ||
2274 | LOCK_FDC(drive, 1); | 2237 | if (lock_fdc(drive, true)) |
2238 | return -EINTR; | ||
2239 | |||
2275 | set_floppy(drive); | 2240 | set_floppy(drive); |
2276 | if (!_floppy || | 2241 | if (!_floppy || |
2277 | _floppy->track > DP->tracks || | 2242 | _floppy->track > DP->tracks || |
@@ -2286,7 +2251,9 @@ static int do_format(int drive, struct format_descr *tmp_format_req) | |||
2286 | format_errors = 0; | 2251 | format_errors = 0; |
2287 | cont = &format_cont; | 2252 | cont = &format_cont; |
2288 | errors = &format_errors; | 2253 | errors = &format_errors; |
2289 | IWAIT(redo_format); | 2254 | ret = wait_til_done(redo_format, true); |
2255 | if (ret == -EINTR) | ||
2256 | return -EINTR; | ||
2290 | process_fd_request(); | 2257 | process_fd_request(); |
2291 | return ret; | 2258 | return ret; |
2292 | } | 2259 | } |
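The do_format() hunk above trades LOCK_FDC() and IWAIT() for explicit lock_fdc()/wait_til_done() calls with visible -EINTR returns, and the same substitution recurs in every later hunk. The removed macros presumably hid an early return inside their expansion, roughly like this reconstruction (an assumption based on the call sites, not the original header text):

    /* assumed pre-cleanup helpers -- note the return hidden in each macro */
    #define LOCK_FDC(drive, interruptible)          \
            if (lock_fdc(drive, interruptible))     \
                    return -EINTR;

    #define IWAIT(proc)                             \
            do {                                    \
                    ret = wait_til_done(proc, 1);   \
                    if (ret == -EINTR)              \
                            return -EINTR;          \
            } while (0)

Open-coding them makes the error path visible at each call site, which is why so many hunks below grow an explicit "if (lock_fdc(drive, true)) return -EINTR;" pair.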
@@ -2320,12 +2287,14 @@ static void request_done(int uptodate) | |||
2320 | struct request *req = current_req; | 2287 | struct request *req = current_req; |
2321 | unsigned long flags; | 2288 | unsigned long flags; |
2322 | int block; | 2289 | int block; |
2290 | char msg[sizeof("request done ") + sizeof(int) * 3]; | ||
2323 | 2291 | ||
2324 | probing = 0; | 2292 | probing = 0; |
2325 | reschedule_timeout(MAXTIMEOUT, "request done %d", uptodate); | 2293 | snprintf(msg, sizeof(msg), "request done %d", uptodate); |
2294 | reschedule_timeout(MAXTIMEOUT, msg); | ||
2326 | 2295 | ||
2327 | if (!req) { | 2296 | if (!req) { |
2328 | printk("floppy.c: no request in request_done\n"); | 2297 | pr_info("floppy.c: no request in request_done\n"); |
2329 | return; | 2298 | return; |
2330 | } | 2299 | } |
2331 | 2300 | ||
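request_done() now pre-formats its timeout message because reschedule_timeout() apparently takes a plain string rather than printf-style arguments. The buffer size sizeof("request done ") + sizeof(int) * 3 is sufficient: each byte of an int contributes at most three decimal digits (256 < 1000), and the NUL already counted inside the string literal leaves a spare byte for a minus sign. A quick userspace check of that bound, assuming a 32-bit int:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            /* worst case "request done -2147483648" needs 13 + 11 + 1 = 25 bytes;
             * the buffer is sizeof("request done ") + sizeof(int) * 3 = 14 + 12 = 26 */
            char msg[sizeof("request done ") + sizeof(int) * 3];

            snprintf(msg, sizeof(msg), "request done %d", INT_MIN);
            puts(msg);
            return 0;
    }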
@@ -2377,7 +2346,7 @@ static void rw_interrupt(void) | |||
2377 | DRS->first_read_date = jiffies; | 2346 | DRS->first_read_date = jiffies; |
2378 | 2347 | ||
2379 | nr_sectors = 0; | 2348 | nr_sectors = 0; |
2380 | CODE2SIZE; | 2349 | ssize = DIV_ROUND_UP(1 << SIZECODE, 4); |
2381 | 2350 | ||
2382 | if (ST1 & ST1_EOC) | 2351 | if (ST1 & ST1_EOC) |
2383 | eoc = 1; | 2352 | eoc = 1; |
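The CODE2SIZE macro deleted earlier computed ssize as ((1 << SIZECODE) + 3) >> 2; the open-coded DIV_ROUND_UP(1 << SIZECODE, 4) used here and again in make_raw_rw_request() is the same round-up division, since the kernel defines DIV_ROUND_UP(n, d) as ((n) + (d) - 1) / (d) and dividing by 4 is a right shift by 2. For example, SIZECODE == 3 (1024-byte sectors) yields two 512-byte units per hardware sector in both forms:

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))    /* as in linux/kernel.h */

    int sizecode = 3;
    int ssize_old = ((1 << sizecode) + 3) >> 2;           /* (8 + 3) >> 2 == 2 */
    int ssize_new = DIV_ROUND_UP(1 << sizecode, 4);       /* (8 + 3) / 4  == 2 */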
@@ -2393,20 +2362,18 @@ static void rw_interrupt(void) | |||
2393 | R_HEAD - HEAD) * SECT_PER_TRACK + | 2362 | R_HEAD - HEAD) * SECT_PER_TRACK + |
2394 | R_SECTOR - SECTOR + eoc) << SIZECODE >> 2; | 2363 | R_SECTOR - SECTOR + eoc) << SIZECODE >> 2; |
2395 | 2364 | ||
2396 | #ifdef FLOPPY_SANITY_CHECK | ||
2397 | if (nr_sectors / ssize > | 2365 | if (nr_sectors / ssize > |
2398 | DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) { | 2366 | DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) { |
2399 | DPRINT("long rw: %x instead of %lx\n", | 2367 | DPRINT("long rw: %x instead of %lx\n", |
2400 | nr_sectors, current_count_sectors); | 2368 | nr_sectors, current_count_sectors); |
2401 | printk("rs=%d s=%d\n", R_SECTOR, SECTOR); | 2369 | pr_info("rs=%d s=%d\n", R_SECTOR, SECTOR); |
2402 | printk("rh=%d h=%d\n", R_HEAD, HEAD); | 2370 | pr_info("rh=%d h=%d\n", R_HEAD, HEAD); |
2403 | printk("rt=%d t=%d\n", R_TRACK, TRACK); | 2371 | pr_info("rt=%d t=%d\n", R_TRACK, TRACK); |
2404 | printk("heads=%d eoc=%d\n", heads, eoc); | 2372 | pr_info("heads=%d eoc=%d\n", heads, eoc); |
2405 | printk("spt=%d st=%d ss=%d\n", SECT_PER_TRACK, | 2373 | pr_info("spt=%d st=%d ss=%d\n", |
2406 | fsector_t, ssize); | 2374 | SECT_PER_TRACK, fsector_t, ssize); |
2407 | printk("in_sector_offset=%d\n", in_sector_offset); | 2375 | pr_info("in_sector_offset=%d\n", in_sector_offset); |
2408 | } | 2376 | } |
2409 | #endif | ||
2410 | 2377 | ||
2411 | nr_sectors -= in_sector_offset; | 2378 | nr_sectors -= in_sector_offset; |
2412 | INFBOUND(nr_sectors, 0); | 2379 | INFBOUND(nr_sectors, 0); |
@@ -2511,19 +2478,17 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) | |||
2511 | blk_rq_sectors(current_req)); | 2478 | blk_rq_sectors(current_req)); |
2512 | 2479 | ||
2513 | remaining = current_count_sectors << 9; | 2480 | remaining = current_count_sectors << 9; |
2514 | #ifdef FLOPPY_SANITY_CHECK | ||
2515 | if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) { | 2481 | if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) { |
2516 | DPRINT("in copy buffer\n"); | 2482 | DPRINT("in copy buffer\n"); |
2517 | printk("current_count_sectors=%ld\n", current_count_sectors); | 2483 | pr_info("current_count_sectors=%ld\n", current_count_sectors); |
2518 | printk("remaining=%d\n", remaining >> 9); | 2484 | pr_info("remaining=%d\n", remaining >> 9); |
2519 | printk("current_req->nr_sectors=%u\n", | 2485 | pr_info("current_req->nr_sectors=%u\n", |
2520 | blk_rq_sectors(current_req)); | 2486 | blk_rq_sectors(current_req)); |
2521 | printk("current_req->current_nr_sectors=%u\n", | 2487 | pr_info("current_req->current_nr_sectors=%u\n", |
2522 | blk_rq_cur_sectors(current_req)); | 2488 | blk_rq_cur_sectors(current_req)); |
2523 | printk("max_sector=%d\n", max_sector); | 2489 | pr_info("max_sector=%d\n", max_sector); |
2524 | printk("ssize=%d\n", ssize); | 2490 | pr_info("ssize=%d\n", ssize); |
2525 | } | 2491 | } |
2526 | #endif | ||
2527 | 2492 | ||
2528 | buffer_max = max(max_sector, buffer_max); | 2493 | buffer_max = max(max_sector, buffer_max); |
2529 | 2494 | ||
@@ -2539,26 +2504,24 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) | |||
2539 | SUPBOUND(size, remaining); | 2504 | SUPBOUND(size, remaining); |
2540 | 2505 | ||
2541 | buffer = page_address(bv->bv_page) + bv->bv_offset; | 2506 | buffer = page_address(bv->bv_page) + bv->bv_offset; |
2542 | #ifdef FLOPPY_SANITY_CHECK | ||
2543 | if (dma_buffer + size > | 2507 | if (dma_buffer + size > |
2544 | floppy_track_buffer + (max_buffer_sectors << 10) || | 2508 | floppy_track_buffer + (max_buffer_sectors << 10) || |
2545 | dma_buffer < floppy_track_buffer) { | 2509 | dma_buffer < floppy_track_buffer) { |
2546 | DPRINT("buffer overrun in copy buffer %d\n", | 2510 | DPRINT("buffer overrun in copy buffer %d\n", |
2547 | (int)((floppy_track_buffer - | 2511 | (int)((floppy_track_buffer - dma_buffer) >> 9)); |
2548 | dma_buffer) >> 9)); | 2512 | pr_info("fsector_t=%d buffer_min=%d\n", |
2549 | printk("fsector_t=%d buffer_min=%d\n", | 2513 | fsector_t, buffer_min); |
2550 | fsector_t, buffer_min); | 2514 | pr_info("current_count_sectors=%ld\n", |
2551 | printk("current_count_sectors=%ld\n", | 2515 | current_count_sectors); |
2552 | current_count_sectors); | ||
2553 | if (CT(COMMAND) == FD_READ) | 2516 | if (CT(COMMAND) == FD_READ) |
2554 | printk("read\n"); | 2517 | pr_info("read\n"); |
2555 | if (CT(COMMAND) == FD_WRITE) | 2518 | if (CT(COMMAND) == FD_WRITE) |
2556 | printk("write\n"); | 2519 | pr_info("write\n"); |
2557 | break; | 2520 | break; |
2558 | } | 2521 | } |
2559 | if (((unsigned long)buffer) % 512) | 2522 | if (((unsigned long)buffer) % 512) |
2560 | DPRINT("%p buffer not aligned\n", buffer); | 2523 | DPRINT("%p buffer not aligned\n", buffer); |
2561 | #endif | 2524 | |
2562 | if (CT(COMMAND) == FD_READ) | 2525 | if (CT(COMMAND) == FD_READ) |
2563 | memcpy(buffer, dma_buffer, size); | 2526 | memcpy(buffer, dma_buffer, size); |
2564 | else | 2527 | else |
@@ -2567,13 +2530,11 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) | |||
2567 | remaining -= size; | 2530 | remaining -= size; |
2568 | dma_buffer += size; | 2531 | dma_buffer += size; |
2569 | } | 2532 | } |
2570 | #ifdef FLOPPY_SANITY_CHECK | ||
2571 | if (remaining) { | 2533 | if (remaining) { |
2572 | if (remaining > 0) | 2534 | if (remaining > 0) |
2573 | max_sector -= remaining >> 9; | 2535 | max_sector -= remaining >> 9; |
2574 | DPRINT("weirdness: remaining %d\n", remaining >> 9); | 2536 | DPRINT("weirdness: remaining %d\n", remaining >> 9); |
2575 | } | 2537 | } |
2576 | #endif | ||
2577 | } | 2538 | } |
2578 | 2539 | ||
2579 | /* work around a bug in pseudo DMA | 2540 | /* work around a bug in pseudo DMA |
@@ -2593,15 +2554,14 @@ static void virtualdmabug_workaround(void) | |||
2593 | 2554 | ||
2594 | hard_sectors = raw_cmd->length >> (7 + SIZECODE); | 2555 | hard_sectors = raw_cmd->length >> (7 + SIZECODE); |
2595 | end_sector = SECTOR + hard_sectors - 1; | 2556 | end_sector = SECTOR + hard_sectors - 1; |
2596 | #ifdef FLOPPY_SANITY_CHECK | ||
2597 | if (end_sector > SECT_PER_TRACK) { | 2557 | if (end_sector > SECT_PER_TRACK) { |
2598 | printk("too many sectors %d > %d\n", | 2558 | pr_info("too many sectors %d > %d\n", |
2599 | end_sector, SECT_PER_TRACK); | 2559 | end_sector, SECT_PER_TRACK); |
2600 | return; | 2560 | return; |
2601 | } | 2561 | } |
2602 | #endif | 2562 | SECT_PER_TRACK = end_sector; |
2603 | SECT_PER_TRACK = end_sector; /* make sure SECT_PER_TRACK points | 2563 | /* make sure SECT_PER_TRACK |
2604 | * to end of transfer */ | 2564 | * points to end of transfer */ |
2605 | } | 2565 | } |
2606 | } | 2566 | } |
2607 | 2567 | ||
@@ -2624,7 +2584,7 @@ static int make_raw_rw_request(void) | |||
2624 | int ssize; | 2584 | int ssize; |
2625 | 2585 | ||
2626 | if (max_buffer_sectors == 0) { | 2586 | if (max_buffer_sectors == 0) { |
2627 | printk("VFS: Block I/O scheduled on unopened device\n"); | 2587 | pr_info("VFS: Block I/O scheduled on unopened device\n"); |
2628 | return 0; | 2588 | return 0; |
2629 | } | 2589 | } |
2630 | 2590 | ||
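The printk() → pr_info() conversions that run through the rest of this file attach an explicit log level to messages that previously went out at the default level. pr_info() and pr_cont() are thin printk wrappers, approximately:

    /* paraphrased from the kernel's printk helpers */
    #define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_cont(fmt, ...) printk(KERN_CONT fmt, ##__VA_ARGS__)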
@@ -2641,7 +2601,7 @@ static int make_raw_rw_request(void) | |||
2641 | raw_cmd->flags |= FD_RAW_WRITE; | 2601 | raw_cmd->flags |= FD_RAW_WRITE; |
2642 | COMMAND = FM_MODE(_floppy, FD_WRITE); | 2602 | COMMAND = FM_MODE(_floppy, FD_WRITE); |
2643 | } else { | 2603 | } else { |
2644 | DPRINT("make_raw_rw_request: unknown command\n"); | 2604 | DPRINT("%s: unknown command\n", __func__); |
2645 | return 0; | 2605 | return 0; |
2646 | } | 2606 | } |
2647 | 2607 | ||
@@ -2659,7 +2619,8 @@ static int make_raw_rw_request(void) | |||
2659 | HEAD = fsector_t / _floppy->sect; | 2619 | HEAD = fsector_t / _floppy->sect; |
2660 | 2620 | ||
2661 | if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) || | 2621 | if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) || |
2662 | TESTF(FD_NEED_TWADDLE)) && fsector_t < _floppy->sect) | 2622 | test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) && |
2623 | fsector_t < _floppy->sect) | ||
2663 | max_sector = _floppy->sect; | 2624 | max_sector = _floppy->sect; |
2664 | 2625 | ||
2665 | /* 2M disks have phantom sectors on the first track */ | 2626 | /* 2M disks have phantom sectors on the first track */ |
@@ -2685,7 +2646,7 @@ static int make_raw_rw_request(void) | |||
2685 | raw_cmd->track = TRACK << STRETCH(_floppy); | 2646 | raw_cmd->track = TRACK << STRETCH(_floppy); |
2686 | DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD); | 2647 | DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD); |
2687 | GAP = _floppy->gap; | 2648 | GAP = _floppy->gap; |
2688 | CODE2SIZE; | 2649 | ssize = DIV_ROUND_UP(1 << SIZECODE, 4); |
2689 | SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE; | 2650 | SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE; |
2690 | SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) + | 2651 | SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) + |
2691 | FD_SECTBASE(_floppy); | 2652 | FD_SECTBASE(_floppy); |
@@ -2730,8 +2691,10 @@ static int make_raw_rw_request(void) | |||
2730 | } | 2691 | } |
2731 | } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) { | 2692 | } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) { |
2732 | if (CT(COMMAND) == FD_WRITE) { | 2693 | if (CT(COMMAND) == FD_WRITE) { |
2733 | if (fsector_t + blk_rq_sectors(current_req) > ssize && | 2694 | unsigned int sectors; |
2734 | fsector_t + blk_rq_sectors(current_req) < ssize + ssize) | 2695 | |
2696 | sectors = fsector_t + blk_rq_sectors(current_req); | ||
2697 | if (sectors > ssize && sectors < ssize + ssize) | ||
2735 | max_size = ssize + ssize; | 2698 | max_size = ssize + ssize; |
2736 | else | 2699 | else |
2737 | max_size = ssize; | 2700 | max_size = ssize; |
@@ -2752,12 +2715,10 @@ static int make_raw_rw_request(void) | |||
2752 | * on a 64 bit machine! | 2715 | * on a 64 bit machine! |
2753 | */ | 2716 | */ |
2754 | max_size = buffer_chain_size(); | 2717 | max_size = buffer_chain_size(); |
2755 | dma_limit = | 2718 | dma_limit = (MAX_DMA_ADDRESS - |
2756 | (MAX_DMA_ADDRESS - | 2719 | ((unsigned long)current_req->buffer)) >> 9; |
2757 | ((unsigned long)current_req->buffer)) >> 9; | 2720 | if ((unsigned long)max_size > dma_limit) |
2758 | if ((unsigned long)max_size > dma_limit) { | ||
2759 | max_size = dma_limit; | 2721 | max_size = dma_limit; |
2760 | } | ||
2761 | /* 64 kb boundaries */ | 2722 | /* 64 kb boundaries */ |
2762 | if (CROSS_64KB(current_req->buffer, max_size << 9)) | 2723 | if (CROSS_64KB(current_req->buffer, max_size << 9)) |
2763 | max_size = (K_64 - | 2724 | max_size = (K_64 - |
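The dma_limit and CROSS_64KB() handling above exists because ISA-style floppy DMA cannot cross a 64 KiB physical boundary, so a transfer that would straddle one has to be clamped to the sectors remaining before it. In outline (buf and nsect are illustrative names; K_64 is assumed to be 0x10000, as used elsewhere in floppy.c):

    unsigned long off = (unsigned long)buf % K_64;   /* offset inside the 64 KiB page */

    if (off + ((unsigned long)nsect << 9) > K_64)
            nsect = (K_64 - off) >> 9;               /* sectors left before the boundary */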
@@ -2773,16 +2734,16 @@ static int make_raw_rw_request(void) | |||
2773 | */ | 2734 | */ |
2774 | if (!direct || | 2735 | if (!direct || |
2775 | (indirect * 2 > direct * 3 && | 2736 | (indirect * 2 > direct * 3 && |
2776 | *errors < DP->max_errors.read_track && ((!probing | 2737 | *errors < DP->max_errors.read_track && |
2777 | || (DP->read_track & (1 << DRS->probed_format)))))) { | 2738 | ((!probing || |
2739 | (DP->read_track & (1 << DRS->probed_format)))))) { | ||
2778 | max_size = blk_rq_sectors(current_req); | 2740 | max_size = blk_rq_sectors(current_req); |
2779 | } else { | 2741 | } else { |
2780 | raw_cmd->kernel_data = current_req->buffer; | 2742 | raw_cmd->kernel_data = current_req->buffer; |
2781 | raw_cmd->length = current_count_sectors << 9; | 2743 | raw_cmd->length = current_count_sectors << 9; |
2782 | if (raw_cmd->length == 0) { | 2744 | if (raw_cmd->length == 0) { |
2783 | DPRINT | 2745 | DPRINT("%s: zero dma transfer attempted\n", __func__); |
2784 | ("zero dma transfer attempted from make_raw_request\n"); | 2746 | DPRINT("indirect=%d direct=%d fsector_t=%d\n", |
2785 | DPRINT("indirect=%d direct=%d fsector_t=%d", | ||
2786 | indirect, direct, fsector_t); | 2747 | indirect, direct, fsector_t); |
2787 | return 0; | 2748 | return 0; |
2788 | } | 2749 | } |
@@ -2802,25 +2763,22 @@ static int make_raw_rw_request(void) | |||
2802 | ((CT(COMMAND) == FD_READ || | 2763 | ((CT(COMMAND) == FD_READ || |
2803 | (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) && | 2764 | (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) && |
2804 | max_sector > 2 * max_buffer_sectors + buffer_min && | 2765 | max_sector > 2 * max_buffer_sectors + buffer_min && |
2805 | max_size + fsector_t > 2 * max_buffer_sectors + buffer_min) | 2766 | max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)) { |
2806 | /* not enough space */ | 2767 | /* not enough space */ |
2807 | ) { | ||
2808 | buffer_track = -1; | 2768 | buffer_track = -1; |
2809 | buffer_drive = current_drive; | 2769 | buffer_drive = current_drive; |
2810 | buffer_max = buffer_min = aligned_sector_t; | 2770 | buffer_max = buffer_min = aligned_sector_t; |
2811 | } | 2771 | } |
2812 | raw_cmd->kernel_data = floppy_track_buffer + | 2772 | raw_cmd->kernel_data = floppy_track_buffer + |
2813 | ((aligned_sector_t - buffer_min) << 9); | 2773 | ((aligned_sector_t - buffer_min) << 9); |
2814 | 2774 | ||
2815 | if (CT(COMMAND) == FD_WRITE) { | 2775 | if (CT(COMMAND) == FD_WRITE) { |
2816 | /* copy write buffer to track buffer. | 2776 | /* copy write buffer to track buffer. |
2817 | * if we get here, we know that the write | 2777 | * if we get here, we know that the write |
2818 | * is either aligned or the data already in the buffer | 2778 | * is either aligned or the data already in the buffer |
2819 | * (buffer will be overwritten) */ | 2779 | * (buffer will be overwritten) */ |
2820 | #ifdef FLOPPY_SANITY_CHECK | ||
2821 | if (in_sector_offset && buffer_track == -1) | 2780 | if (in_sector_offset && buffer_track == -1) |
2822 | DPRINT("internal error offset !=0 on write\n"); | 2781 | DPRINT("internal error offset !=0 on write\n"); |
2823 | #endif | ||
2824 | buffer_track = raw_cmd->track; | 2782 | buffer_track = raw_cmd->track; |
2825 | buffer_drive = current_drive; | 2783 | buffer_drive = current_drive; |
2826 | copy_buffer(ssize, max_sector, | 2784 | copy_buffer(ssize, max_sector, |
@@ -2834,7 +2792,6 @@ static int make_raw_rw_request(void) | |||
2834 | raw_cmd->length = in_sector_offset + current_count_sectors; | 2792 | raw_cmd->length = in_sector_offset + current_count_sectors; |
2835 | raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; | 2793 | raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; |
2836 | raw_cmd->length <<= 9; | 2794 | raw_cmd->length <<= 9; |
2837 | #ifdef FLOPPY_SANITY_CHECK | ||
2838 | if ((raw_cmd->length < current_count_sectors << 9) || | 2795 | if ((raw_cmd->length < current_count_sectors << 9) || |
2839 | (raw_cmd->kernel_data != current_req->buffer && | 2796 | (raw_cmd->kernel_data != current_req->buffer && |
2840 | CT(COMMAND) == FD_WRITE && | 2797 | CT(COMMAND) == FD_WRITE && |
@@ -2845,19 +2802,19 @@ static int make_raw_rw_request(void) | |||
2845 | DPRINT("fractionary current count b=%lx s=%lx\n", | 2802 | DPRINT("fractionary current count b=%lx s=%lx\n", |
2846 | raw_cmd->length, current_count_sectors); | 2803 | raw_cmd->length, current_count_sectors); |
2847 | if (raw_cmd->kernel_data != current_req->buffer) | 2804 | if (raw_cmd->kernel_data != current_req->buffer) |
2848 | printk("addr=%d, length=%ld\n", | 2805 | pr_info("addr=%d, length=%ld\n", |
2849 | (int)((raw_cmd->kernel_data - | 2806 | (int)((raw_cmd->kernel_data - |
2850 | floppy_track_buffer) >> 9), | 2807 | floppy_track_buffer) >> 9), |
2851 | current_count_sectors); | 2808 | current_count_sectors); |
2852 | printk("st=%d ast=%d mse=%d msi=%d\n", | 2809 | pr_info("st=%d ast=%d mse=%d msi=%d\n", |
2853 | fsector_t, aligned_sector_t, max_sector, max_size); | 2810 | fsector_t, aligned_sector_t, max_sector, max_size); |
2854 | printk("ssize=%x SIZECODE=%d\n", ssize, SIZECODE); | 2811 | pr_info("ssize=%x SIZECODE=%d\n", ssize, SIZECODE); |
2855 | printk("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n", | 2812 | pr_info("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n", |
2856 | COMMAND, SECTOR, HEAD, TRACK); | 2813 | COMMAND, SECTOR, HEAD, TRACK); |
2857 | printk("buffer drive=%d\n", buffer_drive); | 2814 | pr_info("buffer drive=%d\n", buffer_drive); |
2858 | printk("buffer track=%d\n", buffer_track); | 2815 | pr_info("buffer track=%d\n", buffer_track); |
2859 | printk("buffer_min=%d\n", buffer_min); | 2816 | pr_info("buffer_min=%d\n", buffer_min); |
2860 | printk("buffer_max=%d\n", buffer_max); | 2817 | pr_info("buffer_max=%d\n", buffer_max); |
2861 | return 0; | 2818 | return 0; |
2862 | } | 2819 | } |
2863 | 2820 | ||
@@ -2868,14 +2825,14 @@ static int make_raw_rw_request(void) | |||
2868 | raw_cmd->kernel_data + raw_cmd->length > | 2825 | raw_cmd->kernel_data + raw_cmd->length > |
2869 | floppy_track_buffer + (max_buffer_sectors << 10)) { | 2826 | floppy_track_buffer + (max_buffer_sectors << 10)) { |
2870 | DPRINT("buffer overrun in schedule dma\n"); | 2827 | DPRINT("buffer overrun in schedule dma\n"); |
2871 | printk("fsector_t=%d buffer_min=%d current_count=%ld\n", | 2828 | pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n", |
2872 | fsector_t, buffer_min, raw_cmd->length >> 9); | 2829 | fsector_t, buffer_min, raw_cmd->length >> 9); |
2873 | printk("current_count_sectors=%ld\n", | 2830 | pr_info("current_count_sectors=%ld\n", |
2874 | current_count_sectors); | 2831 | current_count_sectors); |
2875 | if (CT(COMMAND) == FD_READ) | 2832 | if (CT(COMMAND) == FD_READ) |
2876 | printk("read\n"); | 2833 | pr_info("read\n"); |
2877 | if (CT(COMMAND) == FD_WRITE) | 2834 | if (CT(COMMAND) == FD_WRITE) |
2878 | printk("write\n"); | 2835 | pr_info("write\n"); |
2879 | return 0; | 2836 | return 0; |
2880 | } | 2837 | } |
2881 | } else if (raw_cmd->length > blk_rq_bytes(current_req) || | 2838 | } else if (raw_cmd->length > blk_rq_bytes(current_req) || |
@@ -2884,14 +2841,13 @@ static int make_raw_rw_request(void) | |||
2884 | return 0; | 2841 | return 0; |
2885 | } else if (raw_cmd->length < current_count_sectors << 9) { | 2842 | } else if (raw_cmd->length < current_count_sectors << 9) { |
2886 | DPRINT("more sectors than bytes\n"); | 2843 | DPRINT("more sectors than bytes\n"); |
2887 | printk("bytes=%ld\n", raw_cmd->length >> 9); | 2844 | pr_info("bytes=%ld\n", raw_cmd->length >> 9); |
2888 | printk("sectors=%ld\n", current_count_sectors); | 2845 | pr_info("sectors=%ld\n", current_count_sectors); |
2889 | } | 2846 | } |
2890 | if (raw_cmd->length == 0) { | 2847 | if (raw_cmd->length == 0) { |
2891 | DPRINT("zero dma transfer attempted from make_raw_request\n"); | 2848 | DPRINT("zero dma transfer attempted from make_raw_request\n"); |
2892 | return 0; | 2849 | return 0; |
2893 | } | 2850 | } |
2894 | #endif | ||
2895 | 2851 | ||
2896 | virtualdmabug_workaround(); | 2852 | virtualdmabug_workaround(); |
2897 | return 2; | 2853 | return 2; |
@@ -2899,7 +2855,6 @@ static int make_raw_rw_request(void) | |||
2899 | 2855 | ||
2900 | static void redo_fd_request(void) | 2856 | static void redo_fd_request(void) |
2901 | { | 2857 | { |
2902 | #define REPEAT {request_done(0); continue; } | ||
2903 | int drive; | 2858 | int drive; |
2904 | int tmp; | 2859 | int tmp; |
2905 | 2860 | ||
@@ -2907,63 +2862,63 @@ static void redo_fd_request(void) | |||
2907 | if (current_drive < N_DRIVE) | 2862 | if (current_drive < N_DRIVE) |
2908 | floppy_off(current_drive); | 2863 | floppy_off(current_drive); |
2909 | 2864 | ||
2910 | for (;;) { | 2865 | do_request: |
2911 | if (!current_req) { | 2866 | if (!current_req) { |
2912 | struct request *req; | 2867 | struct request *req; |
2913 | |||
2914 | spin_lock_irq(floppy_queue->queue_lock); | ||
2915 | req = blk_fetch_request(floppy_queue); | ||
2916 | spin_unlock_irq(floppy_queue->queue_lock); | ||
2917 | if (!req) { | ||
2918 | do_floppy = NULL; | ||
2919 | unlock_fdc(); | ||
2920 | return; | ||
2921 | } | ||
2922 | current_req = req; | ||
2923 | } | ||
2924 | drive = (long)current_req->rq_disk->private_data; | ||
2925 | set_fdc(drive); | ||
2926 | reschedule_timeout(current_reqD, "redo fd request", 0); | ||
2927 | 2868 | ||
2928 | set_floppy(drive); | 2869 | spin_lock_irq(floppy_queue->queue_lock); |
2929 | raw_cmd = &default_raw_cmd; | 2870 | req = blk_fetch_request(floppy_queue); |
2930 | raw_cmd->flags = 0; | 2871 | spin_unlock_irq(floppy_queue->queue_lock); |
2931 | if (start_motor(redo_fd_request)) | 2872 | if (!req) { |
2873 | do_floppy = NULL; | ||
2874 | unlock_fdc(); | ||
2932 | return; | 2875 | return; |
2933 | disk_change(current_drive); | ||
2934 | if (test_bit(current_drive, &fake_change) || | ||
2935 | TESTF(FD_DISK_CHANGED)) { | ||
2936 | DPRINT("disk absent or changed during operation\n"); | ||
2937 | REPEAT; | ||
2938 | } | ||
2939 | if (!_floppy) { /* Autodetection */ | ||
2940 | if (!probing) { | ||
2941 | DRS->probed_format = 0; | ||
2942 | if (next_valid_format()) { | ||
2943 | DPRINT("no autodetectable formats\n"); | ||
2944 | _floppy = NULL; | ||
2945 | REPEAT; | ||
2946 | } | ||
2947 | } | ||
2948 | probing = 1; | ||
2949 | _floppy = | ||
2950 | floppy_type + DP->autodetect[DRS->probed_format]; | ||
2951 | } else | ||
2952 | probing = 0; | ||
2953 | errors = &(current_req->errors); | ||
2954 | tmp = make_raw_rw_request(); | ||
2955 | if (tmp < 2) { | ||
2956 | request_done(tmp); | ||
2957 | continue; | ||
2958 | } | 2876 | } |
2877 | current_req = req; | ||
2878 | } | ||
2879 | drive = (long)current_req->rq_disk->private_data; | ||
2880 | set_fdc(drive); | ||
2881 | reschedule_timeout(current_reqD, "redo fd request"); | ||
2959 | 2882 | ||
2960 | if (TESTF(FD_NEED_TWADDLE)) | 2883 | set_floppy(drive); |
2961 | twaddle(); | 2884 | raw_cmd = &default_raw_cmd; |
2962 | schedule_bh(floppy_start); | 2885 | raw_cmd->flags = 0; |
2963 | debugt("queue fd request"); | 2886 | if (start_motor(redo_fd_request)) |
2964 | return; | 2887 | return; |
2888 | |||
2889 | disk_change(current_drive); | ||
2890 | if (test_bit(current_drive, &fake_change) || | ||
2891 | test_bit(FD_DISK_CHANGED_BIT, &DRS->flags)) { | ||
2892 | DPRINT("disk absent or changed during operation\n"); | ||
2893 | request_done(0); | ||
2894 | goto do_request; | ||
2895 | } | ||
2896 | if (!_floppy) { /* Autodetection */ | ||
2897 | if (!probing) { | ||
2898 | DRS->probed_format = 0; | ||
2899 | if (next_valid_format()) { | ||
2900 | DPRINT("no autodetectable formats\n"); | ||
2901 | _floppy = NULL; | ||
2902 | request_done(0); | ||
2903 | goto do_request; | ||
2904 | } | ||
2905 | } | ||
2906 | probing = 1; | ||
2907 | _floppy = floppy_type + DP->autodetect[DRS->probed_format]; | ||
2908 | } else | ||
2909 | probing = 0; | ||
2910 | errors = &(current_req->errors); | ||
2911 | tmp = make_raw_rw_request(); | ||
2912 | if (tmp < 2) { | ||
2913 | request_done(tmp); | ||
2914 | goto do_request; | ||
2965 | } | 2915 | } |
2966 | #undef REPEAT | 2916 | |
2917 | if (test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) | ||
2918 | twaddle(); | ||
2919 | schedule_bh(floppy_start); | ||
2920 | debugt(__func__, "queue fd request"); | ||
2921 | return; | ||
2967 | } | 2922 | } |
2968 | 2923 | ||
2969 | static struct cont_t rw_cont = { | 2924 | static struct cont_t rw_cont = { |
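redo_fd_request() used to loop with for (;;) plus the REPEAT macro removed above, which expanded to { request_done(0); continue; }; the rewrite expresses the same retry with a do_request: label and an explicit request_done(0); goto do_request; at each failure point. A minimal sketch of the two equivalent shapes (disk_absent() and done() are hypothetical stand-ins for the real checks and for request_done()):

    static int disk_absent(void);       /* hypothetical stand-in check */
    static void done(int uptodate);     /* stand-in for request_done() */

    static void retry_old(void)
    {
    #define REPEAT { done(0); continue; }
            for (;;) {
                    if (disk_absent())
                            REPEAT;
                    return;             /* request handed to the bottom half */
            }
    #undef REPEAT
    }

    static void retry_new(void)
    {
    do_request:
            if (disk_absent()) {
                    done(0);
                    goto do_request;
            }
    }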
@@ -2979,30 +2934,30 @@ static void process_fd_request(void) | |||
2979 | schedule_bh(redo_fd_request); | 2934 | schedule_bh(redo_fd_request); |
2980 | } | 2935 | } |
2981 | 2936 | ||
2982 | static void do_fd_request(struct request_queue * q) | 2937 | static void do_fd_request(struct request_queue *q) |
2983 | { | 2938 | { |
2984 | if (max_buffer_sectors == 0) { | 2939 | if (max_buffer_sectors == 0) { |
2985 | printk("VFS: do_fd_request called on non-open device\n"); | 2940 | pr_info("VFS: %s called on non-open device\n", __func__); |
2986 | return; | 2941 | return; |
2987 | } | 2942 | } |
2988 | 2943 | ||
2989 | if (usage_count == 0) { | 2944 | if (usage_count == 0) { |
2990 | printk("warning: usage count=0, current_req=%p exiting\n", | 2945 | pr_info("warning: usage count=0, current_req=%p exiting\n", |
2991 | current_req); | 2946 | current_req); |
2992 | printk("sect=%ld type=%x flags=%x\n", | 2947 | pr_info("sect=%ld type=%x flags=%x\n", |
2993 | (long)blk_rq_pos(current_req), current_req->cmd_type, | 2948 | (long)blk_rq_pos(current_req), current_req->cmd_type, |
2994 | current_req->cmd_flags); | 2949 | current_req->cmd_flags); |
2995 | return; | 2950 | return; |
2996 | } | 2951 | } |
2997 | if (test_bit(0, &fdc_busy)) { | 2952 | if (test_bit(0, &fdc_busy)) { |
2998 | /* fdc busy, this new request will be treated when the | 2953 | /* fdc busy, this new request will be treated when the |
2999 | current one is done */ | 2954 | current one is done */ |
3000 | is_alive("do fd request, old request running"); | 2955 | is_alive(__func__, "old request running"); |
3001 | return; | 2956 | return; |
3002 | } | 2957 | } |
3003 | lock_fdc(MAXTIMEOUT, 0); | 2958 | lock_fdc(MAXTIMEOUT, false); |
3004 | process_fd_request(); | 2959 | process_fd_request(); |
3005 | is_alive("do fd request"); | 2960 | is_alive(__func__, ""); |
3006 | } | 2961 | } |
3007 | 2962 | ||
3008 | static struct cont_t poll_cont = { | 2963 | static struct cont_t poll_cont = { |
@@ -3012,24 +2967,18 @@ static struct cont_t poll_cont = { | |||
3012 | .done = generic_done | 2967 | .done = generic_done |
3013 | }; | 2968 | }; |
3014 | 2969 | ||
3015 | static int poll_drive(int interruptible, int flag) | 2970 | static int poll_drive(bool interruptible, int flag) |
3016 | { | 2971 | { |
3017 | int ret; | ||
3018 | |||
3019 | /* no auto-sense, just clear dcl */ | 2972 | /* no auto-sense, just clear dcl */ |
3020 | raw_cmd = &default_raw_cmd; | 2973 | raw_cmd = &default_raw_cmd; |
3021 | raw_cmd->flags = flag; | 2974 | raw_cmd->flags = flag; |
3022 | raw_cmd->track = 0; | 2975 | raw_cmd->track = 0; |
3023 | raw_cmd->cmd_count = 0; | 2976 | raw_cmd->cmd_count = 0; |
3024 | cont = &poll_cont; | 2977 | cont = &poll_cont; |
3025 | #ifdef DCL_DEBUG | 2978 | debug_dcl(DP->flags, "setting NEWCHANGE in poll_drive\n"); |
3026 | if (DP->flags & FD_DEBUG) { | 2979 | set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); |
3027 | DPRINT("setting NEWCHANGE in poll_drive\n"); | 2980 | |
3028 | } | 2981 | return wait_til_done(floppy_ready, interruptible); |
3029 | #endif | ||
3030 | SETF(FD_DISK_NEWCHANGE); | ||
3031 | WAIT(floppy_ready); | ||
3032 | return ret; | ||
3033 | } | 2982 | } |
3034 | 2983 | ||
3035 | /* | 2984 | /* |
@@ -3039,7 +2988,7 @@ static int poll_drive(int interruptible, int flag) | |||
3039 | 2988 | ||
3040 | static void reset_intr(void) | 2989 | static void reset_intr(void) |
3041 | { | 2990 | { |
3042 | printk("weird, reset interrupt called\n"); | 2991 | pr_info("weird, reset interrupt called\n"); |
3043 | } | 2992 | } |
3044 | 2993 | ||
3045 | static struct cont_t reset_cont = { | 2994 | static struct cont_t reset_cont = { |
@@ -3049,20 +2998,23 @@ static struct cont_t reset_cont = { | |||
3049 | .done = generic_done | 2998 | .done = generic_done |
3050 | }; | 2999 | }; |
3051 | 3000 | ||
3052 | static int user_reset_fdc(int drive, int arg, int interruptible) | 3001 | static int user_reset_fdc(int drive, int arg, bool interruptible) |
3053 | { | 3002 | { |
3054 | int ret; | 3003 | int ret; |
3055 | 3004 | ||
3056 | ret = 0; | 3005 | if (lock_fdc(drive, interruptible)) |
3057 | LOCK_FDC(drive, interruptible); | 3006 | return -EINTR; |
3007 | |||
3058 | if (arg == FD_RESET_ALWAYS) | 3008 | if (arg == FD_RESET_ALWAYS) |
3059 | FDCS->reset = 1; | 3009 | FDCS->reset = 1; |
3060 | if (FDCS->reset) { | 3010 | if (FDCS->reset) { |
3061 | cont = &reset_cont; | 3011 | cont = &reset_cont; |
3062 | WAIT(reset_fdc); | 3012 | ret = wait_til_done(reset_fdc, interruptible); |
3013 | if (ret == -EINTR) | ||
3014 | return -EINTR; | ||
3063 | } | 3015 | } |
3064 | process_fd_request(); | 3016 | process_fd_request(); |
3065 | return ret; | 3017 | return 0; |
3066 | } | 3018 | } |
3067 | 3019 | ||
3068 | /* | 3020 | /* |
@@ -3075,17 +3027,12 @@ static inline int fd_copyout(void __user *param, const void *address, | |||
3075 | return copy_to_user(param, address, size) ? -EFAULT : 0; | 3027 | return copy_to_user(param, address, size) ? -EFAULT : 0; |
3076 | } | 3028 | } |
3077 | 3029 | ||
3078 | static inline int fd_copyin(void __user *param, void *address, unsigned long size) | 3030 | static inline int fd_copyin(void __user *param, void *address, |
3031 | unsigned long size) | ||
3079 | { | 3032 | { |
3080 | return copy_from_user(address, param, size) ? -EFAULT : 0; | 3033 | return copy_from_user(address, param, size) ? -EFAULT : 0; |
3081 | } | 3034 | } |
3082 | 3035 | ||
3083 | #define _COPYOUT(x) (copy_to_user((void __user *)param, &(x), sizeof(x)) ? -EFAULT : 0) | ||
3084 | #define _COPYIN(x) (copy_from_user(&(x), (void __user *)param, sizeof(x)) ? -EFAULT : 0) | ||
3085 | |||
3086 | #define COPYOUT(x) ECALL(_COPYOUT(x)) | ||
3087 | #define COPYIN(x) ECALL(_COPYIN(x)) | ||
3088 | |||
3089 | static inline const char *drive_name(int type, int drive) | 3036 | static inline const char *drive_name(int type, int drive) |
3090 | { | 3037 | { |
3091 | struct floppy_struct *floppy; | 3038 | struct floppy_struct *floppy; |
@@ -3156,23 +3103,29 @@ static struct cont_t raw_cmd_cont = { | |||
3156 | .done = raw_cmd_done | 3103 | .done = raw_cmd_done |
3157 | }; | 3104 | }; |
3158 | 3105 | ||
3159 | static inline int raw_cmd_copyout(int cmd, char __user *param, | 3106 | static inline int raw_cmd_copyout(int cmd, void __user *param, |
3160 | struct floppy_raw_cmd *ptr) | 3107 | struct floppy_raw_cmd *ptr) |
3161 | { | 3108 | { |
3162 | int ret; | 3109 | int ret; |
3163 | 3110 | ||
3164 | while (ptr) { | 3111 | while (ptr) { |
3165 | COPYOUT(*ptr); | 3112 | ret = copy_to_user(param, ptr, sizeof(*ptr)); |
3113 | if (ret) | ||
3114 | return -EFAULT; | ||
3166 | param += sizeof(struct floppy_raw_cmd); | 3115 | param += sizeof(struct floppy_raw_cmd); |
3167 | if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) { | 3116 | if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) { |
3168 | if (ptr->length >= 0 | 3117 | if (ptr->length >= 0 && |
3169 | && ptr->length <= ptr->buffer_length) | 3118 | ptr->length <= ptr->buffer_length) { |
3170 | ECALL(fd_copyout | 3119 | long length = ptr->buffer_length - ptr->length; |
3171 | (ptr->data, ptr->kernel_data, | 3120 | ret = fd_copyout(ptr->data, ptr->kernel_data, |
3172 | ptr->buffer_length - ptr->length)); | 3121 | length); |
3122 | if (ret) | ||
3123 | return ret; | ||
3124 | } | ||
3173 | } | 3125 | } |
3174 | ptr = ptr->next; | 3126 | ptr = ptr->next; |
3175 | } | 3127 | } |
3128 | |||
3176 | return 0; | 3129 | return 0; |
3177 | } | 3130 | } |
3178 | 3131 | ||
@@ -3195,7 +3148,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr) | |||
3195 | } | 3148 | } |
3196 | } | 3149 | } |
3197 | 3150 | ||
3198 | static inline int raw_cmd_copyin(int cmd, char __user *param, | 3151 | static inline int raw_cmd_copyin(int cmd, void __user *param, |
3199 | struct floppy_raw_cmd **rcmd) | 3152 | struct floppy_raw_cmd **rcmd) |
3200 | { | 3153 | { |
3201 | struct floppy_raw_cmd *ptr; | 3154 | struct floppy_raw_cmd *ptr; |
@@ -3203,17 +3156,19 @@ static inline int raw_cmd_copyin(int cmd, char __user *param, | |||
3203 | int i; | 3156 | int i; |
3204 | 3157 | ||
3205 | *rcmd = NULL; | 3158 | *rcmd = NULL; |
3206 | while (1) { | 3159 | |
3207 | ptr = (struct floppy_raw_cmd *) | 3160 | loop: |
3208 | kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER); | 3161 | ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER); |
3209 | if (!ptr) | 3162 | if (!ptr) |
3210 | return -ENOMEM; | 3163 | return -ENOMEM; |
3211 | *rcmd = ptr; | 3164 | *rcmd = ptr; |
3212 | COPYIN(*ptr); | 3165 | ret = copy_from_user(ptr, param, sizeof(*ptr)); |
3213 | ptr->next = NULL; | 3166 | if (ret) |
3214 | ptr->buffer_length = 0; | 3167 | return -EFAULT; |
3215 | param += sizeof(struct floppy_raw_cmd); | 3168 | ptr->next = NULL; |
3216 | if (ptr->cmd_count > 33) | 3169 | ptr->buffer_length = 0; |
3170 | param += sizeof(struct floppy_raw_cmd); | ||
3171 | if (ptr->cmd_count > 33) | ||
3217 | /* the command may now also take up the space | 3172 | /* the command may now also take up the space |
3218 | * initially intended for the reply & the | 3173 | * initially intended for the reply & the |
3219 | * reply count. Needed for long 82078 commands | 3174 | * reply count. Needed for long 82078 commands |
@@ -3222,31 +3177,35 @@ static inline int raw_cmd_copyin(int cmd, char __user *param, | |||
3222 | * 16 bytes for a structure, you'll one day | 3177 | * 16 bytes for a structure, you'll one day |
3223 | * discover that you really need 17... | 3178 | * discover that you really need 17... |
3224 | */ | 3179 | */ |
3180 | return -EINVAL; | ||
3181 | |||
3182 | for (i = 0; i < 16; i++) | ||
3183 | ptr->reply[i] = 0; | ||
3184 | ptr->resultcode = 0; | ||
3185 | ptr->kernel_data = NULL; | ||
3186 | |||
3187 | if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { | ||
3188 | if (ptr->length <= 0) | ||
3225 | return -EINVAL; | 3189 | return -EINVAL; |
3190 | ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length); | ||
3191 | fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length); | ||
3192 | if (!ptr->kernel_data) | ||
3193 | return -ENOMEM; | ||
3194 | ptr->buffer_length = ptr->length; | ||
3195 | } | ||
3196 | if (ptr->flags & FD_RAW_WRITE) { | ||
3197 | ret = fd_copyin(ptr->data, ptr->kernel_data, ptr->length); | ||
3198 | if (ret) | ||
3199 | return ret; | ||
3200 | } | ||
3226 | 3201 | ||
3227 | for (i = 0; i < 16; i++) | 3202 | if (ptr->flags & FD_RAW_MORE) { |
3228 | ptr->reply[i] = 0; | ||
3229 | ptr->resultcode = 0; | ||
3230 | ptr->kernel_data = NULL; | ||
3231 | |||
3232 | if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { | ||
3233 | if (ptr->length <= 0) | ||
3234 | return -EINVAL; | ||
3235 | ptr->kernel_data = | ||
3236 | (char *)fd_dma_mem_alloc(ptr->length); | ||
3237 | fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length); | ||
3238 | if (!ptr->kernel_data) | ||
3239 | return -ENOMEM; | ||
3240 | ptr->buffer_length = ptr->length; | ||
3241 | } | ||
3242 | if (ptr->flags & FD_RAW_WRITE) | ||
3243 | ECALL(fd_copyin(ptr->data, ptr->kernel_data, | ||
3244 | ptr->length)); | ||
3245 | rcmd = &(ptr->next); | 3203 | rcmd = &(ptr->next); |
3246 | if (!(ptr->flags & FD_RAW_MORE)) | ||
3247 | return 0; | ||
3248 | ptr->rate &= 0x43; | 3204 | ptr->rate &= 0x43; |
3205 | goto loop; | ||
3249 | } | 3206 | } |
3207 | |||
3208 | return 0; | ||
3250 | } | 3209 | } |
3251 | 3210 | ||
3252 | static int raw_cmd_ioctl(int cmd, void __user *param) | 3211 | static int raw_cmd_ioctl(int cmd, void __user *param) |
@@ -3283,12 +3242,8 @@ static int raw_cmd_ioctl(int cmd, void __user *param) | |||
3283 | 3242 | ||
3284 | raw_cmd = my_raw_cmd; | 3243 | raw_cmd = my_raw_cmd; |
3285 | cont = &raw_cmd_cont; | 3244 | cont = &raw_cmd_cont; |
3286 | ret = wait_til_done(floppy_start, 1); | 3245 | ret = wait_til_done(floppy_start, true); |
3287 | #ifdef DCL_DEBUG | 3246 | debug_dcl(DP->flags, "calling disk change from raw_cmd ioctl\n"); |
3288 | if (DP->flags & FD_DEBUG) { | ||
3289 | DPRINT("calling disk change from raw_cmd ioctl\n"); | ||
3290 | } | ||
3291 | #endif | ||
3292 | 3247 | ||
3293 | if (ret != -EINTR && FDCS->reset) | 3248 | if (ret != -EINTR && FDCS->reset) |
3294 | ret = -EIO; | 3249 | ret = -EIO; |
@@ -3327,7 +3282,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g, | |||
3327 | if (!capable(CAP_SYS_ADMIN)) | 3282 | if (!capable(CAP_SYS_ADMIN)) |
3328 | return -EPERM; | 3283 | return -EPERM; |
3329 | mutex_lock(&open_lock); | 3284 | mutex_lock(&open_lock); |
3330 | if (lock_fdc(drive, 1)) { | 3285 | if (lock_fdc(drive, true)) { |
3331 | mutex_unlock(&open_lock); | 3286 | mutex_unlock(&open_lock); |
3332 | return -EINTR; | 3287 | return -EINTR; |
3333 | } | 3288 | } |
@@ -3346,11 +3301,15 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g, | |||
3346 | mutex_unlock(&open_lock); | 3301 | mutex_unlock(&open_lock); |
3347 | } else { | 3302 | } else { |
3348 | int oldStretch; | 3303 | int oldStretch; |
3349 | LOCK_FDC(drive, 1); | 3304 | |
3350 | if (cmd != FDDEFPRM) | 3305 | if (lock_fdc(drive, true)) |
3306 | return -EINTR; | ||
3307 | if (cmd != FDDEFPRM) { | ||
3351 | /* notice a disk change immediately, else | 3308 | /* notice a disk change immediately, else |
3352 | * we lose our settings immediately*/ | 3309 | * we lose our settings immediately*/ |
3353 | CALL(poll_drive(1, FD_RAW_NEED_DISK)); | 3310 | if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) |
3311 | return -EINTR; | ||
3312 | } | ||
3354 | oldStretch = g->stretch; | 3313 | oldStretch = g->stretch; |
3355 | user_params[drive] = *g; | 3314 | user_params[drive] = *g; |
3356 | if (buffer_drive == drive) | 3315 | if (buffer_drive == drive) |
@@ -3415,7 +3374,7 @@ static inline int normalize_ioctl(int *cmd, int *size) | |||
3415 | *size = _IOC_SIZE(*cmd); | 3374 | *size = _IOC_SIZE(*cmd); |
3416 | *cmd = ioctl_table[i]; | 3375 | *cmd = ioctl_table[i]; |
3417 | if (*size > _IOC_SIZE(*cmd)) { | 3376 | if (*size > _IOC_SIZE(*cmd)) { |
3418 | printk("ioctl not yet supported\n"); | 3377 | pr_info("ioctl not yet supported\n"); |
3419 | return -EFAULT; | 3378 | return -EFAULT; |
3420 | } | 3379 | } |
3421 | return 0; | 3380 | return 0; |
@@ -3429,8 +3388,10 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) | |||
3429 | if (type) | 3388 | if (type) |
3430 | *g = &floppy_type[type]; | 3389 | *g = &floppy_type[type]; |
3431 | else { | 3390 | else { |
3432 | LOCK_FDC(drive, 0); | 3391 | if (lock_fdc(drive, false)) |
3433 | CALL(poll_drive(0, 0)); | 3392 | return -EINTR; |
3393 | if (poll_drive(false, 0) == -EINTR) | ||
3394 | return -EINTR; | ||
3434 | process_fd_request(); | 3395 | process_fd_request(); |
3435 | *g = current_type[drive]; | 3396 | *g = current_type[drive]; |
3436 | } | 3397 | } |
@@ -3459,10 +3420,6 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |||
3459 | static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, | 3420 | static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, |
3460 | unsigned long param) | 3421 | unsigned long param) |
3461 | { | 3422 | { |
3462 | #define FD_IOCTL_ALLOWED (mode & (FMODE_WRITE|FMODE_WRITE_IOCTL)) | ||
3463 | #define OUT(c,x) case c: outparam = (const char *) (x); break | ||
3464 | #define IN(c,x,tag) case c: *(x) = inparam. tag ; return 0 | ||
3465 | |||
3466 | int drive = (long)bdev->bd_disk->private_data; | 3423 | int drive = (long)bdev->bd_disk->private_data; |
3467 | int type = ITYPE(UDRS->fd_device); | 3424 | int type = ITYPE(UDRS->fd_device); |
3468 | int i; | 3425 | int i; |
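The FD_IOCTL_ALLOWED, OUT() and IN() macros deleted here generated entire case labels, which is why the rewritten switch further down spells each one out. Their expansion, reconstructed from the definitions removed in this hunk:

    #define OUT(c, x)     case c: outparam = (const char *)(x); break
    #define IN(c, x, tag) case c: *(x) = inparam.tag; return 0

    OUT(FDGETMAXERRS, &UDP->max_errors);
    /* => case FDGETMAXERRS: outparam = (const char *)(&UDP->max_errors); break; */

    IN(FDSETMAXERRS, &UDP->max_errors, max_errors);
    /* => case FDSETMAXERRS: *(&UDP->max_errors) = inparam.max_errors; return 0; */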
@@ -3474,153 +3431,171 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, | |||
3474 | struct floppy_max_errors max_errors; | 3431 | struct floppy_max_errors max_errors; |
3475 | struct floppy_drive_params dp; | 3432 | struct floppy_drive_params dp; |
3476 | } inparam; /* parameters coming from user space */ | 3433 | } inparam; /* parameters coming from user space */ |
3477 | const char *outparam; /* parameters passed back to user space */ | 3434 | const void *outparam; /* parameters passed back to user space */ |
3478 | 3435 | ||
3479 | /* convert compatibility eject ioctls into floppy eject ioctl. | 3436 | /* convert compatibility eject ioctls into floppy eject ioctl. |
3480 | * We do this in order to provide a means to eject floppy disks before | 3437 | * We do this in order to provide a means to eject floppy disks before |
3481 | * installing the new fdutils package */ | 3438 | * installing the new fdutils package */ |
3482 | if (cmd == CDROMEJECT || /* CD-ROM eject */ | 3439 | if (cmd == CDROMEJECT || /* CD-ROM eject */ |
3483 | cmd == 0x6470 /* SunOS floppy eject */ ) { | 3440 | cmd == 0x6470) { /* SunOS floppy eject */ |
3484 | DPRINT("obsolete eject ioctl\n"); | 3441 | DPRINT("obsolete eject ioctl\n"); |
3485 | DPRINT("please use floppycontrol --eject\n"); | 3442 | DPRINT("please use floppycontrol --eject\n"); |
3486 | cmd = FDEJECT; | 3443 | cmd = FDEJECT; |
3487 | } | 3444 | } |
3488 | 3445 | ||
3489 | /* convert the old style command into a new style command */ | 3446 | if (!((cmd & 0xff00) == 0x0200)) |
3490 | if ((cmd & 0xff00) == 0x0200) { | ||
3491 | ECALL(normalize_ioctl(&cmd, &size)); | ||
3492 | } else | ||
3493 | return -EINVAL; | 3447 | return -EINVAL; |
3494 | 3448 | ||
3449 | /* convert the old style command into a new style command */ | ||
3450 | ret = normalize_ioctl(&cmd, &size); | ||
3451 | if (ret) | ||
3452 | return ret; | ||
3453 | |||
3495 | /* permission checks */ | 3454 | /* permission checks */ |
3496 | if (((cmd & 0x40) && !FD_IOCTL_ALLOWED) || | 3455 | if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) || |
3497 | ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))) | 3456 | ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))) |
3498 | return -EPERM; | 3457 | return -EPERM; |
3499 | 3458 | ||
3459 | if (WARN_ON(size < 0 || size > sizeof(inparam))) | ||
3460 | return -EINVAL; | ||
3461 | |||
3500 | /* copyin */ | 3462 | /* copyin */ |
3501 | CLEARSTRUCT(&inparam); | 3463 | memset(&inparam, 0, sizeof(inparam)); |
3502 | if (_IOC_DIR(cmd) & _IOC_WRITE) | 3464 | if (_IOC_DIR(cmd) & _IOC_WRITE) { |
3503 | ECALL(fd_copyin((void __user *)param, &inparam, size)) | 3465 | ret = fd_copyin((void __user *)param, &inparam, size); |
3504 | 3466 | if (ret) | |
3505 | switch (cmd) { | ||
3506 | case FDEJECT: | ||
3507 | if (UDRS->fd_ref != 1) | ||
3508 | /* somebody else has this drive open */ | ||
3509 | return -EBUSY; | ||
3510 | LOCK_FDC(drive, 1); | ||
3511 | |||
3512 | /* do the actual eject. Fails on | ||
3513 | * non-Sparc architectures */ | ||
3514 | ret = fd_eject(UNIT(drive)); | ||
3515 | |||
3516 | USETF(FD_DISK_CHANGED); | ||
3517 | USETF(FD_VERIFY); | ||
3518 | process_fd_request(); | ||
3519 | return ret; | 3467 | return ret; |
3520 | case FDCLRPRM: | 3468 | } |
3521 | LOCK_FDC(drive, 1); | ||
3522 | current_type[drive] = NULL; | ||
3523 | floppy_sizes[drive] = MAX_DISK_SIZE << 1; | ||
3524 | UDRS->keep_data = 0; | ||
3525 | return invalidate_drive(bdev); | ||
3526 | case FDSETPRM: | ||
3527 | case FDDEFPRM: | ||
3528 | return set_geometry(cmd, &inparam.g, | ||
3529 | drive, type, bdev); | ||
3530 | case FDGETPRM: | ||
3531 | ECALL(get_floppy_geometry(drive, type, | ||
3532 | (struct floppy_struct **) | ||
3533 | &outparam)); | ||
3534 | break; | ||
3535 | |||
3536 | case FDMSGON: | ||
3537 | UDP->flags |= FTD_MSG; | ||
3538 | return 0; | ||
3539 | case FDMSGOFF: | ||
3540 | UDP->flags &= ~FTD_MSG; | ||
3541 | return 0; | ||
3542 | |||
3543 | case FDFMTBEG: | ||
3544 | LOCK_FDC(drive, 1); | ||
3545 | CALL(poll_drive(1, FD_RAW_NEED_DISK)); | ||
3546 | ret = UDRS->flags; | ||
3547 | process_fd_request(); | ||
3548 | if (ret & FD_VERIFY) | ||
3549 | return -ENODEV; | ||
3550 | if (!(ret & FD_DISK_WRITABLE)) | ||
3551 | return -EROFS; | ||
3552 | return 0; | ||
3553 | case FDFMTTRK: | ||
3554 | if (UDRS->fd_ref != 1) | ||
3555 | return -EBUSY; | ||
3556 | return do_format(drive, &inparam.f); | ||
3557 | case FDFMTEND: | ||
3558 | case FDFLUSH: | ||
3559 | LOCK_FDC(drive, 1); | ||
3560 | return invalidate_drive(bdev); | ||
3561 | |||
3562 | case FDSETEMSGTRESH: | ||
3563 | UDP->max_errors.reporting = | ||
3564 | (unsigned short)(param & 0x0f); | ||
3565 | return 0; | ||
3566 | OUT(FDGETMAXERRS, &UDP->max_errors); | ||
3567 | IN(FDSETMAXERRS, &UDP->max_errors, max_errors); | ||
3568 | |||
3569 | case FDGETDRVTYP: | ||
3570 | outparam = drive_name(type, drive); | ||
3571 | SUPBOUND(size, strlen(outparam) + 1); | ||
3572 | break; | ||
3573 | |||
3574 | IN(FDSETDRVPRM, UDP, dp); | ||
3575 | OUT(FDGETDRVPRM, UDP); | ||
3576 | |||
3577 | case FDPOLLDRVSTAT: | ||
3578 | LOCK_FDC(drive, 1); | ||
3579 | CALL(poll_drive(1, FD_RAW_NEED_DISK)); | ||
3580 | process_fd_request(); | ||
3581 | /* fall through */ | ||
3582 | OUT(FDGETDRVSTAT, UDRS); | ||
3583 | |||
3584 | case FDRESET: | ||
3585 | return user_reset_fdc(drive, (int)param, 1); | ||
3586 | |||
3587 | OUT(FDGETFDCSTAT, UFDCS); | ||
3588 | 3469 | ||
3589 | case FDWERRORCLR: | 3470 | switch (cmd) { |
3590 | CLEARSTRUCT(UDRWE); | 3471 | case FDEJECT: |
3591 | return 0; | 3472 | if (UDRS->fd_ref != 1) |
3592 | OUT(FDWERRORGET, UDRWE); | 3473 | /* somebody else has this drive open */ |
3593 | 3474 | return -EBUSY; | |
3594 | case FDRAWCMD: | 3475 | if (lock_fdc(drive, true)) |
3595 | if (type) | 3476 | return -EINTR; |
3596 | return -EINVAL; | ||
3597 | LOCK_FDC(drive, 1); | ||
3598 | set_floppy(drive); | ||
3599 | CALL(i = raw_cmd_ioctl(cmd, (void __user *)param)); | ||
3600 | process_fd_request(); | ||
3601 | return i; | ||
3602 | 3477 | ||
3603 | case FDTWADDLE: | 3478 | /* do the actual eject. Fails on |
3604 | LOCK_FDC(drive, 1); | 3479 | * non-Sparc architectures */ |
3605 | twaddle(); | 3480 | ret = fd_eject(UNIT(drive)); |
3606 | process_fd_request(); | ||
3607 | return 0; | ||
3608 | 3481 | ||
3609 | default: | 3482 | set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); |
3483 | set_bit(FD_VERIFY_BIT, &UDRS->flags); | ||
3484 | process_fd_request(); | ||
3485 | return ret; | ||
3486 | case FDCLRPRM: | ||
3487 | if (lock_fdc(drive, true)) | ||
3488 | return -EINTR; | ||
3489 | current_type[drive] = NULL; | ||
3490 | floppy_sizes[drive] = MAX_DISK_SIZE << 1; | ||
3491 | UDRS->keep_data = 0; | ||
3492 | return invalidate_drive(bdev); | ||
3493 | case FDSETPRM: | ||
3494 | case FDDEFPRM: | ||
3495 | return set_geometry(cmd, &inparam.g, drive, type, bdev); | ||
3496 | case FDGETPRM: | ||
3497 | ret = get_floppy_geometry(drive, type, | ||
3498 | (struct floppy_struct **)&outparam); | ||
3499 | if (ret) | ||
3500 | return ret; | ||
3501 | break; | ||
3502 | case FDMSGON: | ||
3503 | UDP->flags |= FTD_MSG; | ||
3504 | return 0; | ||
3505 | case FDMSGOFF: | ||
3506 | UDP->flags &= ~FTD_MSG; | ||
3507 | return 0; | ||
3508 | case FDFMTBEG: | ||
3509 | if (lock_fdc(drive, true)) | ||
3510 | return -EINTR; | ||
3511 | if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) | ||
3512 | return -EINTR; | ||
3513 | ret = UDRS->flags; | ||
3514 | process_fd_request(); | ||
3515 | if (ret & FD_VERIFY) | ||
3516 | return -ENODEV; | ||
3517 | if (!(ret & FD_DISK_WRITABLE)) | ||
3518 | return -EROFS; | ||
3519 | return 0; | ||
3520 | case FDFMTTRK: | ||
3521 | if (UDRS->fd_ref != 1) | ||
3522 | return -EBUSY; | ||
3523 | return do_format(drive, &inparam.f); | ||
3524 | case FDFMTEND: | ||
3525 | case FDFLUSH: | ||
3526 | if (lock_fdc(drive, true)) | ||
3527 | return -EINTR; | ||
3528 | return invalidate_drive(bdev); | ||
3529 | case FDSETEMSGTRESH: | ||
3530 | UDP->max_errors.reporting = (unsigned short)(param & 0x0f); | ||
3531 | return 0; | ||
3532 | case FDGETMAXERRS: | ||
3533 | outparam = &UDP->max_errors; | ||
3534 | break; | ||
3535 | case FDSETMAXERRS: | ||
3536 | UDP->max_errors = inparam.max_errors; | ||
3537 | break; | ||
3538 | case FDGETDRVTYP: | ||
3539 | outparam = drive_name(type, drive); | ||
3540 | SUPBOUND(size, strlen((const char *)outparam) + 1); | ||
3541 | break; | ||
3542 | case FDSETDRVPRM: | ||
3543 | *UDP = inparam.dp; | ||
3544 | break; | ||
3545 | case FDGETDRVPRM: | ||
3546 | outparam = UDP; | ||
3547 | break; | ||
3548 | case FDPOLLDRVSTAT: | ||
3549 | if (lock_fdc(drive, true)) | ||
3550 | return -EINTR; | ||
3551 | if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) | ||
3552 | return -EINTR; | ||
3553 | process_fd_request(); | ||
3554 | /* fall through */ | ||
3555 | case FDGETDRVSTAT: | ||
3556 | outparam = UDRS; | ||
3557 | break; | ||
3558 | case FDRESET: | ||
3559 | return user_reset_fdc(drive, (int)param, true); | ||
3560 | case FDGETFDCSTAT: | ||
3561 | outparam = UFDCS; | ||
3562 | break; | ||
3563 | case FDWERRORCLR: | ||
3564 | memset(UDRWE, 0, sizeof(*UDRWE)); | ||
3565 | return 0; | ||
3566 | case FDWERRORGET: | ||
3567 | outparam = UDRWE; | ||
3568 | break; | ||
3569 | case FDRAWCMD: | ||
3570 | if (type) | ||
3610 | return -EINVAL; | 3571 | return -EINVAL; |
3611 | } | 3572 | if (lock_fdc(drive, true)) |
3573 | return -EINTR; | ||
3574 | set_floppy(drive); | ||
3575 | i = raw_cmd_ioctl(cmd, (void __user *)param); | ||
3576 | if (i == -EINTR) | ||
3577 | return -EINTR; | ||
3578 | process_fd_request(); | ||
3579 | return i; | ||
3580 | case FDTWADDLE: | ||
3581 | if (lock_fdc(drive, true)) | ||
3582 | return -EINTR; | ||
3583 | twaddle(); | ||
3584 | process_fd_request(); | ||
3585 | return 0; | ||
3586 | default: | ||
3587 | return -EINVAL; | ||
3588 | } | ||
3612 | 3589 | ||
3613 | if (_IOC_DIR(cmd) & _IOC_READ) | 3590 | if (_IOC_DIR(cmd) & _IOC_READ) |
3614 | return fd_copyout((void __user *)param, outparam, size); | 3591 | return fd_copyout((void __user *)param, outparam, size); |
3615 | else | 3592 | |
3616 | return 0; | 3593 | return 0; |
3617 | #undef OUT | ||
3618 | #undef IN | ||
3619 | } | 3594 | } |
3620 | 3595 | ||
3621 | static void __init config_types(void) | 3596 | static void __init config_types(void) |
3622 | { | 3597 | { |
3623 | int first = 1; | 3598 | bool has_drive = false; |
3624 | int drive; | 3599 | int drive; |
3625 | 3600 | ||
3626 | /* read drive info out of physical CMOS */ | 3601 | /* read drive info out of physical CMOS */ |
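The head and tail of fd_ioctl() above lean on the ioctl number encoding: every floppy command is 0x02xx, _IOC_SIZE(cmd) gives the payload size (now sanity-checked against sizeof(inparam) with WARN_ON), _IOC_DIR(cmd) & _IOC_WRITE triggers the copy into inparam before the switch, and _IOC_DIR(cmd) & _IOC_READ triggers the copy of outparam back afterwards. The same framing for a hypothetical read-only command (MYDEV_GETCFG and struct mydev_cfg are made-up names):

    #define MYDEV_GETCFG _IOR(0x02, 0x42, struct mydev_cfg)

    int size = _IOC_SIZE(cmd);                      /* == sizeof(struct mydev_cfg) */

    if (_IOC_DIR(cmd) & _IOC_WRITE)                 /* user -> kernel before the switch */
            if (copy_from_user(&inparam, argp, size))
                    return -EFAULT;
    /* ... switch (cmd) points outparam at the data to return ... */
    if (_IOC_DIR(cmd) & _IOC_READ)                  /* kernel -> user afterwards */
            return copy_to_user(argp, outparam, size) ? -EFAULT : 0;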
@@ -3652,17 +3627,22 @@ static void __init config_types(void) | |||
3652 | name = temparea; | 3627 | name = temparea; |
3653 | } | 3628 | } |
3654 | if (name) { | 3629 | if (name) { |
3655 | const char *prepend = ","; | 3630 | const char *prepend; |
3656 | if (first) { | 3631 | if (!has_drive) { |
3657 | prepend = KERN_INFO "Floppy drive(s):"; | 3632 | prepend = ""; |
3658 | first = 0; | 3633 | has_drive = true; |
3634 | pr_info("Floppy drive(s):"); | ||
3635 | } else { | ||
3636 | prepend = ","; | ||
3659 | } | 3637 | } |
3660 | printk("%s fd%d is %s", prepend, drive, name); | 3638 | |
3639 | pr_cont("%s fd%d is %s", prepend, drive, name); | ||
3661 | } | 3640 | } |
3662 | *UDP = *params; | 3641 | *UDP = *params; |
3663 | } | 3642 | } |
3664 | if (!first) | 3643 | |
3665 | printk("\n"); | 3644 | if (has_drive) |
3645 | pr_cont("\n"); | ||
3666 | } | 3646 | } |
3667 | 3647 | ||
3668 | static int floppy_release(struct gendisk *disk, fmode_t mode) | 3648 | static int floppy_release(struct gendisk *disk, fmode_t mode) |
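config_types() still prints a single "Floppy drive(s): fd0 is 1.44M, fd1 is ..." line; the difference is that the first fragment now goes through pr_info() (which sets the level) and each following fragment through pr_cont() (which appends to the open line), instead of smuggling KERN_INFO into the prepend string of a bare printk(). The pattern:

    pr_info("Floppy drive(s):");            /* opens the line at KERN_INFO */
    pr_cont("%s fd%d is %s", prepend, drive, name);  /* appends, once per drive */
    pr_cont("\n");                          /* terminates the line */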
@@ -3702,8 +3682,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) | |||
3702 | goto out2; | 3682 | goto out2; |
3703 | 3683 | ||
3704 | if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) { | 3684 | if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) { |
3705 | USETF(FD_DISK_CHANGED); | 3685 | set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); |
3706 | USETF(FD_VERIFY); | 3686 | set_bit(FD_VERIFY_BIT, &UDRS->flags); |
3707 | } | 3687 | } |
3708 | 3688 | ||
3709 | if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) | 3689 | if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) |
@@ -3732,9 +3712,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) | |||
3732 | INFBOUND(try, 16); | 3712 | INFBOUND(try, 16); |
3733 | tmp = (char *)fd_dma_mem_alloc(1024 * try); | 3713 | tmp = (char *)fd_dma_mem_alloc(1024 * try); |
3734 | } | 3714 | } |
3735 | if (!tmp && !floppy_track_buffer) { | 3715 | if (!tmp && !floppy_track_buffer) |
3736 | fallback_on_nodma_alloc(&tmp, 2048 * try); | 3716 | fallback_on_nodma_alloc(&tmp, 2048 * try); |
3737 | } | ||
3738 | if (!tmp && !floppy_track_buffer) { | 3717 | if (!tmp && !floppy_track_buffer) { |
3739 | DPRINT("Unable to allocate DMA memory\n"); | 3718 | DPRINT("Unable to allocate DMA memory\n"); |
3740 | goto out; | 3719 | goto out; |
@@ -3764,11 +3743,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) | |||
3764 | if (mode & (FMODE_READ|FMODE_WRITE)) { | 3743 | if (mode & (FMODE_READ|FMODE_WRITE)) { |
3765 | UDRS->last_checked = 0; | 3744 | UDRS->last_checked = 0; |
3766 | check_disk_change(bdev); | 3745 | check_disk_change(bdev); |
3767 | if (UTESTF(FD_DISK_CHANGED)) | 3746 | if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) |
3768 | goto out; | 3747 | goto out; |
3769 | } | 3748 | } |
3770 | res = -EROFS; | 3749 | res = -EROFS; |
3771 | if ((mode & FMODE_WRITE) && !(UTESTF(FD_DISK_WRITABLE))) | 3750 | if ((mode & FMODE_WRITE) && |
3751 | !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags)) | ||
3772 | goto out; | 3752 | goto out; |
3773 | } | 3753 | } |
3774 | mutex_unlock(&open_lock); | 3754 | mutex_unlock(&open_lock); |
@@ -3792,17 +3772,18 @@ static int check_floppy_change(struct gendisk *disk) | |||
3792 | { | 3772 | { |
3793 | int drive = (long)disk->private_data; | 3773 | int drive = (long)disk->private_data; |
3794 | 3774 | ||
3795 | if (UTESTF(FD_DISK_CHANGED) || UTESTF(FD_VERIFY)) | 3775 | if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || |
3776 | test_bit(FD_VERIFY_BIT, &UDRS->flags)) | ||
3796 | return 1; | 3777 | return 1; |
3797 | 3778 | ||
3798 | if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) { | 3779 | if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) { |
3799 | lock_fdc(drive, 0); | 3780 | lock_fdc(drive, false); |
3800 | poll_drive(0, 0); | 3781 | poll_drive(false, 0); |
3801 | process_fd_request(); | 3782 | process_fd_request(); |
3802 | } | 3783 | } |
3803 | 3784 | ||
3804 | if (UTESTF(FD_DISK_CHANGED) || | 3785 | if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || |
3805 | UTESTF(FD_VERIFY) || | 3786 | test_bit(FD_VERIFY_BIT, &UDRS->flags) || |
3806 | test_bit(drive, &fake_change) || | 3787 | test_bit(drive, &fake_change) || |
3807 | (!ITYPE(UDRS->fd_device) && !current_type[drive])) | 3788 | (!ITYPE(UDRS->fd_device) && !current_type[drive])) |
3808 | return 1; | 3789 | return 1; |
@@ -3815,8 +3796,7 @@ static int check_floppy_change(struct gendisk *disk) | |||
3815 | * a disk in the drive, and whether that disk is writable. | 3796 | * a disk in the drive, and whether that disk is writable. |
3816 | */ | 3797 | */ |
3817 | 3798 | ||
3818 | static void floppy_rb0_complete(struct bio *bio, | 3799 | static void floppy_rb0_complete(struct bio *bio, int err) |
3819 | int err) | ||
3820 | { | 3800 | { |
3821 | complete((struct completion *)bio->bi_private); | 3801 | complete((struct completion *)bio->bi_private); |
3822 | } | 3802 | } |
@@ -3874,14 +3854,16 @@ static int floppy_revalidate(struct gendisk *disk) | |||
3874 | int cf; | 3854 | int cf; |
3875 | int res = 0; | 3855 | int res = 0; |
3876 | 3856 | ||
3877 | if (UTESTF(FD_DISK_CHANGED) || | 3857 | if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || |
3878 | UTESTF(FD_VERIFY) || test_bit(drive, &fake_change) || NO_GEOM) { | 3858 | test_bit(FD_VERIFY_BIT, &UDRS->flags) || |
3859 | test_bit(drive, &fake_change) || NO_GEOM) { | ||
3879 | if (usage_count == 0) { | 3860 | if (usage_count == 0) { |
3880 | printk("VFS: revalidate called on non-open device.\n"); | 3861 | pr_info("VFS: revalidate called on non-open device.\n"); |
3881 | return -EFAULT; | 3862 | return -EFAULT; |
3882 | } | 3863 | } |
3883 | lock_fdc(drive, 0); | 3864 | lock_fdc(drive, false); |
3884 | cf = UTESTF(FD_DISK_CHANGED) || UTESTF(FD_VERIFY); | 3865 | cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || |
3866 | test_bit(FD_VERIFY_BIT, &UDRS->flags)); | ||
3885 | if (!(cf || test_bit(drive, &fake_change) || NO_GEOM)) { | 3867 | if (!(cf || test_bit(drive, &fake_change) || NO_GEOM)) { |
3886 | process_fd_request(); /*already done by another thread */ | 3868 | process_fd_request(); /*already done by another thread */ |
3887 | return 0; | 3869 | return 0; |
@@ -3891,7 +3873,7 @@ static int floppy_revalidate(struct gendisk *disk) | |||
3891 | if (buffer_drive == drive) | 3873 | if (buffer_drive == drive) |
3892 | buffer_track = -1; | 3874 | buffer_track = -1; |
3893 | clear_bit(drive, &fake_change); | 3875 | clear_bit(drive, &fake_change); |
3894 | UCLEARF(FD_DISK_CHANGED); | 3876 | clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); |
3895 | if (cf) | 3877 | if (cf) |
3896 | UDRS->generation++; | 3878 | UDRS->generation++; |
3897 | if (NO_GEOM) { | 3879 | if (NO_GEOM) { |
@@ -3899,7 +3881,7 @@ static int floppy_revalidate(struct gendisk *disk) | |||
3899 | res = __floppy_read_block_0(opened_bdev[drive]); | 3881 | res = __floppy_read_block_0(opened_bdev[drive]); |
3900 | } else { | 3882 | } else { |
3901 | if (cf) | 3883 | if (cf) |
3902 | poll_drive(0, FD_RAW_NEED_DISK); | 3884 | poll_drive(false, FD_RAW_NEED_DISK); |
3903 | process_fd_request(); | 3885 | process_fd_request(); |
3904 | } | 3886 | } |
3905 | } | 3887 | } |
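
Note: the floppy hunks above replace the driver's private flag helpers (UTESTF/USETF/UCLEARF and friends) with open-coded test_bit()/set_bit()/clear_bit() calls on the per-drive flags word. The user-space sketch below only models that bit-flag pattern; the real kernel primitives in <linux/bitops.h> operate atomically on unsigned long bitmaps, and the FD_DISK_CHANGED_BIT value used here is purely illustrative.

#include <stdio.h>

/* Illustrative, non-atomic stand-ins for the kernel bitops. */
#define FD_DISK_CHANGED_BIT 2   /* assumed value, for the example only */

static int my_test_bit(int nr, const unsigned long *addr)  { return (*addr >> nr) & 1UL; }
static void my_set_bit(int nr, unsigned long *addr)        { *addr |= 1UL << nr; }
static void my_clear_bit(int nr, unsigned long *addr)      { *addr &= ~(1UL << nr); }

int main(void)
{
	unsigned long flags = 0;	/* plays the role of UDRS->flags */

	my_set_bit(FD_DISK_CHANGED_BIT, &flags);
	printf("disk changed: %d\n", my_test_bit(FD_DISK_CHANGED_BIT, &flags));
	my_clear_bit(FD_DISK_CHANGED_BIT, &flags);
	printf("disk changed: %d\n", my_test_bit(FD_DISK_CHANGED_BIT, &flags));
	return 0;
}
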
@@ -3931,21 +3913,21 @@ static char __init get_fdc_version(void) | |||
3931 | output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */ | 3913 | output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */ |
3932 | if (FDCS->reset) | 3914 | if (FDCS->reset) |
3933 | return FDC_NONE; | 3915 | return FDC_NONE; |
3934 | if ((r = result()) <= 0x00) | 3916 | r = result(); |
3917 | if (r <= 0x00) | ||
3935 | return FDC_NONE; /* No FDC present ??? */ | 3918 | return FDC_NONE; /* No FDC present ??? */ |
3936 | if ((r == 1) && (reply_buffer[0] == 0x80)) { | 3919 | if ((r == 1) && (reply_buffer[0] == 0x80)) { |
3937 | printk(KERN_INFO "FDC %d is an 8272A\n", fdc); | 3920 | pr_info("FDC %d is an 8272A\n", fdc); |
3938 | return FDC_8272A; /* 8272a/765 don't know DUMPREGS */ | 3921 | return FDC_8272A; /* 8272a/765 don't know DUMPREGS */ |
3939 | } | 3922 | } |
3940 | if (r != 10) { | 3923 | if (r != 10) { |
3941 | printk | 3924 | pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n", |
3942 | ("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n", | 3925 | fdc, r); |
3943 | fdc, r); | ||
3944 | return FDC_UNKNOWN; | 3926 | return FDC_UNKNOWN; |
3945 | } | 3927 | } |
3946 | 3928 | ||
3947 | if (!fdc_configure()) { | 3929 | if (!fdc_configure()) { |
3948 | printk(KERN_INFO "FDC %d is an 82072\n", fdc); | 3930 | pr_info("FDC %d is an 82072\n", fdc); |
3949 | return FDC_82072; /* 82072 doesn't know CONFIGURE */ | 3931 | return FDC_82072; /* 82072 doesn't know CONFIGURE */ |
3950 | } | 3932 | } |
3951 | 3933 | ||
@@ -3953,52 +3935,50 @@ static char __init get_fdc_version(void) | |||
3953 | if (need_more_output() == MORE_OUTPUT) { | 3935 | if (need_more_output() == MORE_OUTPUT) { |
3954 | output_byte(0); | 3936 | output_byte(0); |
3955 | } else { | 3937 | } else { |
3956 | printk(KERN_INFO "FDC %d is an 82072A\n", fdc); | 3938 | pr_info("FDC %d is an 82072A\n", fdc); |
3957 | return FDC_82072A; /* 82072A as found on Sparcs. */ | 3939 | return FDC_82072A; /* 82072A as found on Sparcs. */ |
3958 | } | 3940 | } |
3959 | 3941 | ||
3960 | output_byte(FD_UNLOCK); | 3942 | output_byte(FD_UNLOCK); |
3961 | r = result(); | 3943 | r = result(); |
3962 | if ((r == 1) && (reply_buffer[0] == 0x80)) { | 3944 | if ((r == 1) && (reply_buffer[0] == 0x80)) { |
3963 | printk(KERN_INFO "FDC %d is a pre-1991 82077\n", fdc); | 3945 | pr_info("FDC %d is a pre-1991 82077\n", fdc); |
3964 | return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know | 3946 | return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know |
3965 | * LOCK/UNLOCK */ | 3947 | * LOCK/UNLOCK */ |
3966 | } | 3948 | } |
3967 | if ((r != 1) || (reply_buffer[0] != 0x00)) { | 3949 | if ((r != 1) || (reply_buffer[0] != 0x00)) { |
3968 | printk("FDC %d init: UNLOCK: unexpected return of %d bytes.\n", | 3950 | pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n", |
3969 | fdc, r); | 3951 | fdc, r); |
3970 | return FDC_UNKNOWN; | 3952 | return FDC_UNKNOWN; |
3971 | } | 3953 | } |
3972 | output_byte(FD_PARTID); | 3954 | output_byte(FD_PARTID); |
3973 | r = result(); | 3955 | r = result(); |
3974 | if (r != 1) { | 3956 | if (r != 1) { |
3975 | printk("FDC %d init: PARTID: unexpected return of %d bytes.\n", | 3957 | pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n", |
3976 | fdc, r); | 3958 | fdc, r); |
3977 | return FDC_UNKNOWN; | 3959 | return FDC_UNKNOWN; |
3978 | } | 3960 | } |
3979 | if (reply_buffer[0] == 0x80) { | 3961 | if (reply_buffer[0] == 0x80) { |
3980 | printk(KERN_INFO "FDC %d is a post-1991 82077\n", fdc); | 3962 | pr_info("FDC %d is a post-1991 82077\n", fdc); |
3981 | return FDC_82077; /* Revised 82077AA passes all the tests */ | 3963 | return FDC_82077; /* Revised 82077AA passes all the tests */ |
3982 | } | 3964 | } |
3983 | switch (reply_buffer[0] >> 5) { | 3965 | switch (reply_buffer[0] >> 5) { |
3984 | case 0x0: | 3966 | case 0x0: |
3985 | /* Either a 82078-1 or a 82078SL running at 5Volt */ | 3967 | /* Either a 82078-1 or a 82078SL running at 5Volt */ |
3986 | printk(KERN_INFO "FDC %d is an 82078.\n", fdc); | 3968 | pr_info("FDC %d is an 82078.\n", fdc); |
3987 | return FDC_82078; | 3969 | return FDC_82078; |
3988 | case 0x1: | 3970 | case 0x1: |
3989 | printk(KERN_INFO "FDC %d is a 44pin 82078\n", fdc); | 3971 | pr_info("FDC %d is a 44pin 82078\n", fdc); |
3990 | return FDC_82078; | 3972 | return FDC_82078; |
3991 | case 0x2: | 3973 | case 0x2: |
3992 | printk(KERN_INFO "FDC %d is a S82078B\n", fdc); | 3974 | pr_info("FDC %d is a S82078B\n", fdc); |
3993 | return FDC_S82078B; | 3975 | return FDC_S82078B; |
3994 | case 0x3: | 3976 | case 0x3: |
3995 | printk(KERN_INFO "FDC %d is a National Semiconductor PC87306\n", | 3977 | pr_info("FDC %d is a National Semiconductor PC87306\n", fdc); |
3996 | fdc); | ||
3997 | return FDC_87306; | 3978 | return FDC_87306; |
3998 | default: | 3979 | default: |
3999 | printk(KERN_INFO | 3980 | pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n", |
4000 | "FDC %d init: 82078 variant with unknown PARTID=%d.\n", | 3981 | fdc, reply_buffer[0] >> 5); |
4001 | fdc, reply_buffer[0] >> 5); | ||
4002 | return FDC_82078_UNKN; | 3982 | return FDC_82078_UNKN; |
4003 | } | 3983 | } |
4004 | } /* get_fdc_version */ | 3984 | } /* get_fdc_version */ |
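
Note: get_fdc_version() also swaps printk(KERN_INFO ...) and bare printk() calls for pr_info(), and the option-listing loop further down uses pr_cont() for continuation output. Roughly, pr_info(fmt, ...) expands to printk(KERN_INFO pr_fmt(fmt), ...); the user-space model below only captures the shape of the shorthand, not the log-level machinery.

#include <stdio.h>

/* User-space approximation; the kernel macros also run the format
 * string through pr_fmt(). */
#define pr_info(fmt, ...)  printf("INFO: " fmt, ##__VA_ARGS__)
#define pr_cont(fmt, ...)  printf(fmt, ##__VA_ARGS__)

int main(void)
{
	int fdc = 0;

	pr_info("FDC %d is an 82078.\n", fdc);
	pr_info("allowed options are:");
	pr_cont(" %s", "daring");
	pr_cont("\n");
	return 0;
}
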
@@ -4110,9 +4090,9 @@ static int __init floppy_setup(char *str) | |||
4110 | else | 4090 | else |
4111 | param = config_params[i].def_param; | 4091 | param = config_params[i].def_param; |
4112 | if (config_params[i].fn) | 4092 | if (config_params[i].fn) |
4113 | config_params[i]. | 4093 | config_params[i].fn(ints, param, |
4114 | fn(ints, param, | 4094 | config_params[i]. |
4115 | config_params[i].param2); | 4095 | param2); |
4116 | if (config_params[i].var) { | 4096 | if (config_params[i].var) { |
4117 | DPRINT("%s=%d\n", str, param); | 4097 | DPRINT("%s=%d\n", str, param); |
4118 | *config_params[i].var = param; | 4098 | *config_params[i].var = param; |
@@ -4126,8 +4106,8 @@ static int __init floppy_setup(char *str) | |||
4126 | 4106 | ||
4127 | DPRINT("allowed options are:"); | 4107 | DPRINT("allowed options are:"); |
4128 | for (i = 0; i < ARRAY_SIZE(config_params); i++) | 4108 | for (i = 0; i < ARRAY_SIZE(config_params); i++) |
4129 | printk(" %s", config_params[i].name); | 4109 | pr_cont(" %s", config_params[i].name); |
4130 | printk("\n"); | 4110 | pr_cont("\n"); |
4131 | } else | 4111 | } else |
4132 | DPRINT("botched floppy option\n"); | 4112 | DPRINT("botched floppy option\n"); |
4133 | DPRINT("Read Documentation/blockdev/floppy.txt\n"); | 4113 | DPRINT("Read Documentation/blockdev/floppy.txt\n"); |
@@ -4145,7 +4125,8 @@ static ssize_t floppy_cmos_show(struct device *dev, | |||
4145 | drive = p->id; | 4125 | drive = p->id; |
4146 | return sprintf(buf, "%X\n", UDP->cmos); | 4126 | return sprintf(buf, "%X\n", UDP->cmos); |
4147 | } | 4127 | } |
4148 | DEVICE_ATTR(cmos,S_IRUGO,floppy_cmos_show,NULL); | 4128 | |
4129 | DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); | ||
4149 | 4130 | ||
4150 | static void floppy_device_release(struct device *dev) | 4131 | static void floppy_device_release(struct device *dev) |
4151 | { | 4132 | { |
@@ -4157,20 +4138,20 @@ static int floppy_resume(struct device *dev) | |||
4157 | 4138 | ||
4158 | for (fdc = 0; fdc < N_FDC; fdc++) | 4139 | for (fdc = 0; fdc < N_FDC; fdc++) |
4159 | if (FDCS->address != -1) | 4140 | if (FDCS->address != -1) |
4160 | user_reset_fdc(-1, FD_RESET_ALWAYS, 0); | 4141 | user_reset_fdc(-1, FD_RESET_ALWAYS, false); |
4161 | 4142 | ||
4162 | return 0; | 4143 | return 0; |
4163 | } | 4144 | } |
4164 | 4145 | ||
4165 | static struct dev_pm_ops floppy_pm_ops = { | 4146 | static const struct dev_pm_ops floppy_pm_ops = { |
4166 | .resume = floppy_resume, | 4147 | .resume = floppy_resume, |
4167 | .restore = floppy_resume, | 4148 | .restore = floppy_resume, |
4168 | }; | 4149 | }; |
4169 | 4150 | ||
4170 | static struct platform_driver floppy_driver = { | 4151 | static struct platform_driver floppy_driver = { |
4171 | .driver = { | 4152 | .driver = { |
4172 | .name = "floppy", | 4153 | .name = "floppy", |
4173 | .pm = &floppy_pm_ops, | 4154 | .pm = &floppy_pm_ops, |
4174 | }, | 4155 | }, |
4175 | }; | 4156 | }; |
4176 | 4157 | ||
@@ -4231,7 +4212,7 @@ static int __init floppy_init(void) | |||
4231 | err = -ENOMEM; | 4212 | err = -ENOMEM; |
4232 | goto out_unreg_driver; | 4213 | goto out_unreg_driver; |
4233 | } | 4214 | } |
4234 | blk_queue_max_sectors(floppy_queue, 64); | 4215 | blk_queue_max_hw_sectors(floppy_queue, 64); |
4235 | 4216 | ||
4236 | blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, | 4217 | blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, |
4237 | floppy_find, NULL, NULL); | 4218 | floppy_find, NULL, NULL); |
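
Note: several drivers in this diff (floppy, hd, mg_disk, paride, pktcdvd, ps3disk, ps3vram, sunvdc) are switched from blk_queue_max_sectors() to blk_queue_max_hw_sectors(), and from the separate blk_queue_max_phys_segments()/blk_queue_max_hw_segments() pair to a single blk_queue_max_segments() call, following the block layer's consolidation of queue limits. A hedged sketch of the new-style setup; the helper name and numeric limits are illustrative and not taken from any one driver.

#include <linux/blkdev.h>

/* Illustrative queue-limit setup after the API consolidation. */
static void example_set_queue_limits(struct request_queue *q)
{
	blk_queue_logical_block_size(q, 512);
	blk_queue_max_hw_sectors(q, 64);	/* was blk_queue_max_sectors() */
	blk_queue_max_segments(q, 32);		/* was ..._max_{phys,hw}_segments() */
	blk_queue_max_segment_size(q, 65536);
}
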
@@ -4242,16 +4223,16 @@ static int __init floppy_init(void) | |||
4242 | else | 4223 | else |
4243 | floppy_sizes[i] = MAX_DISK_SIZE << 1; | 4224 | floppy_sizes[i] = MAX_DISK_SIZE << 1; |
4244 | 4225 | ||
4245 | reschedule_timeout(MAXTIMEOUT, "floppy init", MAXTIMEOUT); | 4226 | reschedule_timeout(MAXTIMEOUT, "floppy init"); |
4246 | config_types(); | 4227 | config_types(); |
4247 | 4228 | ||
4248 | for (i = 0; i < N_FDC; i++) { | 4229 | for (i = 0; i < N_FDC; i++) { |
4249 | fdc = i; | 4230 | fdc = i; |
4250 | CLEARSTRUCT(FDCS); | 4231 | memset(FDCS, 0, sizeof(*FDCS)); |
4251 | FDCS->dtr = -1; | 4232 | FDCS->dtr = -1; |
4252 | FDCS->dor = 0x4; | 4233 | FDCS->dor = 0x4; |
4253 | #if defined(__sparc__) || defined(__mc68000__) | 4234 | #if defined(__sparc__) || defined(__mc68000__) |
4254 | /*sparcs/sun3x don't have a DOR reset which we can fall back on to */ | 4235 | /*sparcs/sun3x don't have a DOR reset which we can fall back on to */ |
4255 | #ifdef __mc68000__ | 4236 | #ifdef __mc68000__ |
4256 | if (MACH_IS_SUN3X) | 4237 | if (MACH_IS_SUN3X) |
4257 | #endif | 4238 | #endif |
@@ -4280,11 +4261,11 @@ static int __init floppy_init(void) | |||
4280 | 4261 | ||
4281 | /* initialise drive state */ | 4262 | /* initialise drive state */ |
4282 | for (drive = 0; drive < N_DRIVE; drive++) { | 4263 | for (drive = 0; drive < N_DRIVE; drive++) { |
4283 | CLEARSTRUCT(UDRS); | 4264 | memset(UDRS, 0, sizeof(*UDRS)); |
4284 | CLEARSTRUCT(UDRWE); | 4265 | memset(UDRWE, 0, sizeof(*UDRWE)); |
4285 | USETF(FD_DISK_NEWCHANGE); | 4266 | set_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags); |
4286 | USETF(FD_DISK_CHANGED); | 4267 | set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); |
4287 | USETF(FD_VERIFY); | 4268 | set_bit(FD_VERIFY_BIT, &UDRS->flags); |
4288 | UDRS->fd_device = -1; | 4269 | UDRS->fd_device = -1; |
4289 | floppy_track_buffer = NULL; | 4270 | floppy_track_buffer = NULL; |
4290 | max_buffer_sectors = 0; | 4271 | max_buffer_sectors = 0; |
@@ -4304,7 +4285,7 @@ static int __init floppy_init(void) | |||
4304 | if (FDCS->address == -1) | 4285 | if (FDCS->address == -1) |
4305 | continue; | 4286 | continue; |
4306 | FDCS->rawcmd = 2; | 4287 | FDCS->rawcmd = 2; |
4307 | if (user_reset_fdc(-1, FD_RESET_ALWAYS, 0)) { | 4288 | if (user_reset_fdc(-1, FD_RESET_ALWAYS, false)) { |
4308 | /* free ioports reserved by floppy_grab_irq_and_dma() */ | 4289 | /* free ioports reserved by floppy_grab_irq_and_dma() */ |
4309 | floppy_release_regions(fdc); | 4290 | floppy_release_regions(fdc); |
4310 | FDCS->address = -1; | 4291 | FDCS->address = -1; |
@@ -4327,12 +4308,12 @@ static int __init floppy_init(void) | |||
4327 | * properly, so force a reset for the standard FDC clones, | 4308 | * properly, so force a reset for the standard FDC clones, |
4328 | * to avoid interrupt garbage. | 4309 | * to avoid interrupt garbage. |
4329 | */ | 4310 | */ |
4330 | user_reset_fdc(-1, FD_RESET_ALWAYS, 0); | 4311 | user_reset_fdc(-1, FD_RESET_ALWAYS, false); |
4331 | } | 4312 | } |
4332 | fdc = 0; | 4313 | fdc = 0; |
4333 | del_timer(&fd_timeout); | 4314 | del_timer(&fd_timeout); |
4334 | current_drive = 0; | 4315 | current_drive = 0; |
4335 | initialising = 0; | 4316 | initialized = true; |
4336 | if (have_no_fdc) { | 4317 | if (have_no_fdc) { |
4337 | DPRINT("no floppy controllers found\n"); | 4318 | DPRINT("no floppy controllers found\n"); |
4338 | err = have_no_fdc; | 4319 | err = have_no_fdc; |
@@ -4353,7 +4334,8 @@ static int __init floppy_init(void) | |||
4353 | if (err) | 4334 | if (err) |
4354 | goto out_flush_work; | 4335 | goto out_flush_work; |
4355 | 4336 | ||
4356 | err = device_create_file(&floppy_device[drive].dev,&dev_attr_cmos); | 4337 | err = device_create_file(&floppy_device[drive].dev, |
4338 | &dev_attr_cmos); | ||
4357 | if (err) | 4339 | if (err) |
4358 | goto out_unreg_platform_dev; | 4340 | goto out_unreg_platform_dev; |
4359 | 4341 | ||
@@ -4417,8 +4399,10 @@ static int floppy_request_regions(int fdc) | |||
4417 | const struct io_region *p; | 4399 | const struct io_region *p; |
4418 | 4400 | ||
4419 | for (p = io_regions; p < ARRAY_END(io_regions); p++) { | 4401 | for (p = io_regions; p < ARRAY_END(io_regions); p++) { |
4420 | if (!request_region(FDCS->address + p->offset, p->size, "floppy")) { | 4402 | if (!request_region(FDCS->address + p->offset, |
4421 | DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address + p->offset); | 4403 | p->size, "floppy")) { |
4404 | DPRINT("Floppy io-port 0x%04lx in use\n", | ||
4405 | FDCS->address + p->offset); | ||
4422 | floppy_release_allocated_regions(fdc, p); | 4406 | floppy_release_allocated_regions(fdc, p); |
4423 | return -EBUSY; | 4407 | return -EBUSY; |
4424 | } | 4408 | } |
@@ -4509,11 +4493,9 @@ cleanup: | |||
4509 | static void floppy_release_irq_and_dma(void) | 4493 | static void floppy_release_irq_and_dma(void) |
4510 | { | 4494 | { |
4511 | int old_fdc; | 4495 | int old_fdc; |
4512 | #ifdef FLOPPY_SANITY_CHECK | ||
4513 | #ifndef __sparc__ | 4496 | #ifndef __sparc__ |
4514 | int drive; | 4497 | int drive; |
4515 | #endif | 4498 | #endif |
4516 | #endif | ||
4517 | long tmpsize; | 4499 | long tmpsize; |
4518 | unsigned long tmpaddr; | 4500 | unsigned long tmpaddr; |
4519 | unsigned long flags; | 4501 | unsigned long flags; |
@@ -4544,20 +4526,18 @@ static void floppy_release_irq_and_dma(void) | |||
4544 | buffer_min = buffer_max = -1; | 4526 | buffer_min = buffer_max = -1; |
4545 | fd_dma_mem_free(tmpaddr, tmpsize); | 4527 | fd_dma_mem_free(tmpaddr, tmpsize); |
4546 | } | 4528 | } |
4547 | #ifdef FLOPPY_SANITY_CHECK | ||
4548 | #ifndef __sparc__ | 4529 | #ifndef __sparc__ |
4549 | for (drive = 0; drive < N_FDC * 4; drive++) | 4530 | for (drive = 0; drive < N_FDC * 4; drive++) |
4550 | if (timer_pending(motor_off_timer + drive)) | 4531 | if (timer_pending(motor_off_timer + drive)) |
4551 | printk("motor off timer %d still active\n", drive); | 4532 | pr_info("motor off timer %d still active\n", drive); |
4552 | #endif | 4533 | #endif |
4553 | 4534 | ||
4554 | if (timer_pending(&fd_timeout)) | 4535 | if (timer_pending(&fd_timeout)) |
4555 | printk("floppy timer still active:%s\n", timeout_message); | 4536 | pr_info("floppy timer still active:%s\n", timeout_message); |
4556 | if (timer_pending(&fd_timer)) | 4537 | if (timer_pending(&fd_timer)) |
4557 | printk("auxiliary floppy timer still active\n"); | 4538 | pr_info("auxiliary floppy timer still active\n"); |
4558 | if (work_pending(&floppy_work)) | 4539 | if (work_pending(&floppy_work)) |
4559 | printk("work still pending\n"); | 4540 | pr_info("work still pending\n"); |
4560 | #endif | ||
4561 | old_fdc = fdc; | 4541 | old_fdc = fdc; |
4562 | for (fdc = 0; fdc < N_FDC; fdc++) | 4542 | for (fdc = 0; fdc < N_FDC; fdc++) |
4563 | if (FDCS->address != -1) | 4543 | if (FDCS->address != -1) |
@@ -4574,7 +4554,9 @@ static void __init parse_floppy_cfg_string(char *cfg) | |||
4574 | char *ptr; | 4554 | char *ptr; |
4575 | 4555 | ||
4576 | while (*cfg) { | 4556 | while (*cfg) { |
4577 | for (ptr = cfg; *cfg && *cfg != ' ' && *cfg != '\t'; cfg++) ; | 4557 | ptr = cfg; |
4558 | while (*cfg && *cfg != ' ' && *cfg != '\t') | ||
4559 | cfg++; | ||
4578 | if (*cfg) { | 4560 | if (*cfg) { |
4579 | *cfg = '\0'; | 4561 | *cfg = '\0'; |
4580 | cfg++; | 4562 | cfg++; |
@@ -4622,6 +4604,7 @@ static void __exit floppy_module_exit(void) | |||
4622 | /* eject disk, if any */ | 4604 | /* eject disk, if any */ |
4623 | fd_eject(0); | 4605 | fd_eject(0); |
4624 | } | 4606 | } |
4607 | |||
4625 | module_exit(floppy_module_exit); | 4608 | module_exit(floppy_module_exit); |
4626 | 4609 | ||
4627 | module_param(floppy, charp, 0); | 4610 | module_param(floppy, charp, 0); |
@@ -4633,9 +4616,10 @@ MODULE_LICENSE("GPL"); | |||
4633 | 4616 | ||
4634 | /* This doesn't actually get used other than for module information */ | 4617 | /* This doesn't actually get used other than for module information */ |
4635 | static const struct pnp_device_id floppy_pnpids[] = { | 4618 | static const struct pnp_device_id floppy_pnpids[] = { |
4636 | { "PNP0700", 0 }, | 4619 | {"PNP0700", 0}, |
4637 | { } | 4620 | {} |
4638 | }; | 4621 | }; |
4622 | |||
4639 | MODULE_DEVICE_TABLE(pnp, floppy_pnpids); | 4623 | MODULE_DEVICE_TABLE(pnp, floppy_pnpids); |
4640 | 4624 | ||
4641 | #else | 4625 | #else |
diff --git a/drivers/block/hd.c b/drivers/block/hd.c index d5cdce08ffd2..034e6dfc878c 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include <linux/fs.h> | 34 | #include <linux/fs.h> |
35 | #include <linux/kernel.h> | 35 | #include <linux/kernel.h> |
36 | #include <linux/genhd.h> | 36 | #include <linux/genhd.h> |
37 | #include <linux/slab.h> | ||
38 | #include <linux/string.h> | 37 | #include <linux/string.h> |
39 | #include <linux/ioport.h> | 38 | #include <linux/ioport.h> |
40 | #include <linux/init.h> | 39 | #include <linux/init.h> |
@@ -719,7 +718,7 @@ static int __init hd_init(void) | |||
719 | return -ENOMEM; | 718 | return -ENOMEM; |
720 | } | 719 | } |
721 | 720 | ||
722 | blk_queue_max_sectors(hd_queue, 255); | 721 | blk_queue_max_hw_sectors(hd_queue, 255); |
723 | init_timer(&device_timer); | 722 | init_timer(&device_timer); |
724 | device_timer.function = hd_times_out; | 723 | device_timer.function = hd_times_out; |
725 | blk_queue_logical_block_size(hd_queue, 512); | 724 | blk_queue_logical_block_size(hd_queue, 512); |
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bd112c8c7bcd..8546d123b9a7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c | |||
@@ -71,7 +71,6 @@ | |||
71 | #include <linux/buffer_head.h> /* for invalidate_bdev() */ | 71 | #include <linux/buffer_head.h> /* for invalidate_bdev() */ |
72 | #include <linux/completion.h> | 72 | #include <linux/completion.h> |
73 | #include <linux/highmem.h> | 73 | #include <linux/highmem.h> |
74 | #include <linux/gfp.h> | ||
75 | #include <linux/kthread.h> | 74 | #include <linux/kthread.h> |
76 | #include <linux/splice.h> | 75 | #include <linux/splice.h> |
77 | 76 | ||
@@ -238,6 +237,8 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, | |||
238 | if (ret) | 237 | if (ret) |
239 | goto fail; | 238 | goto fail; |
240 | 239 | ||
240 | file_update_time(file); | ||
241 | |||
241 | transfer_result = lo_do_transfer(lo, WRITE, page, offset, | 242 | transfer_result = lo_do_transfer(lo, WRITE, page, offset, |
242 | bvec->bv_page, bv_offs, size, IV); | 243 | bvec->bv_page, bv_offs, size, IV); |
243 | copied = size; | 244 | copied = size; |
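
Note: the loop hunk adds a file_update_time() call before transferring data into the backing file's page cache, so writes through the loop device refresh the backing file's timestamps. A minimal sketch of where such a call sits in a write path; the surrounding helper is hypothetical.

#include <linux/fs.h>

/* Hypothetical write prologue: refresh mtime/ctime on the backing file
 * before dirtying its pages, as do_lo_send_aops() now does. */
static void example_loop_write_prologue(struct file *backing_file)
{
	file_update_time(backing_file);
}
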
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index e0339aaa1815..28db925dbdad 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/platform_device.h> | 23 | #include <linux/platform_device.h> |
24 | #include <linux/gpio.h> | 24 | #include <linux/gpio.h> |
25 | #include <linux/mg_disk.h> | 25 | #include <linux/mg_disk.h> |
26 | #include <linux/slab.h> | ||
26 | 27 | ||
27 | #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) | 28 | #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) |
28 | 29 | ||
@@ -860,7 +861,7 @@ static int mg_probe(struct platform_device *plat_dev) | |||
860 | err = -EINVAL; | 861 | err = -EINVAL; |
861 | goto probe_err_2; | 862 | goto probe_err_2; |
862 | } | 863 | } |
863 | host->dev_base = ioremap(rsc->start , rsc->end + 1); | 864 | host->dev_base = ioremap(rsc->start, resource_size(rsc)); |
864 | if (!host->dev_base) { | 865 | if (!host->dev_base) { |
865 | printk(KERN_ERR "%s:%d ioremap fail\n", | 866 | printk(KERN_ERR "%s:%d ioremap fail\n", |
866 | __func__, __LINE__); | 867 | __func__, __LINE__); |
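
Note: the mg_disk change fixes the ioremap() length argument: the old call passed rsc->end + 1, an absolute end address, where a size was expected. resource_size() yields the inclusive span, as sketched by this stand-in.

/* resource_size() semantics: a resource covering [start, end] inclusive
 * spans end - start + 1 bytes. Stand-in shown for illustration only. */
static inline unsigned long example_resource_size(unsigned long start,
						  unsigned long end)
{
	return end - start + 1;
}
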
@@ -980,7 +981,7 @@ static int mg_probe(struct platform_device *plat_dev) | |||
980 | __func__, __LINE__); | 981 | __func__, __LINE__); |
981 | goto probe_err_6; | 982 | goto probe_err_6; |
982 | } | 983 | } |
983 | blk_queue_max_sectors(host->breq, MG_MAX_SECTS); | 984 | blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS); |
984 | blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); | 985 | blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); |
985 | 986 | ||
986 | init_timer(&host->timer); | 987 | init_timer(&host->timer); |
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index cc923a5b430c..218d091f3c52 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/compiler.h> | 27 | #include <linux/compiler.h> |
28 | #include <linux/err.h> | 28 | #include <linux/err.h> |
29 | #include <linux/kernel.h> | 29 | #include <linux/kernel.h> |
30 | #include <linux/slab.h> | ||
30 | #include <net/sock.h> | 31 | #include <net/sock.h> |
31 | #include <linux/net.h> | 32 | #include <linux/net.h> |
32 | #include <linux/kthread.h> | 33 | #include <linux/kthread.h> |
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index a808b1530b3b..6cd8b705b11b 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c | |||
@@ -63,6 +63,7 @@ | |||
63 | #include <linux/device.h> | 63 | #include <linux/device.h> |
64 | #include <linux/module.h> | 64 | #include <linux/module.h> |
65 | #include <linux/fs.h> | 65 | #include <linux/fs.h> |
66 | #include <linux/slab.h> | ||
66 | #include <scsi/osd_initiator.h> | 67 | #include <scsi/osd_initiator.h> |
67 | #include <scsi/osd_attributes.h> | 68 | #include <scsi/osd_attributes.h> |
68 | #include <scsi/osd_sec.h> | 69 | #include <scsi/osd_sec.h> |
@@ -476,7 +477,9 @@ static void class_osdblk_release(struct class *cls) | |||
476 | kfree(cls); | 477 | kfree(cls); |
477 | } | 478 | } |
478 | 479 | ||
479 | static ssize_t class_osdblk_list(struct class *c, char *data) | 480 | static ssize_t class_osdblk_list(struct class *c, |
481 | struct class_attribute *attr, | ||
482 | char *data) | ||
480 | { | 483 | { |
481 | int n = 0; | 484 | int n = 0; |
482 | struct list_head *tmp; | 485 | struct list_head *tmp; |
@@ -500,7 +503,9 @@ static ssize_t class_osdblk_list(struct class *c, char *data) | |||
500 | return n; | 503 | return n; |
501 | } | 504 | } |
502 | 505 | ||
503 | static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count) | 506 | static ssize_t class_osdblk_add(struct class *c, |
507 | struct class_attribute *attr, | ||
508 | const char *buf, size_t count) | ||
504 | { | 509 | { |
505 | struct osdblk_device *osdev; | 510 | struct osdblk_device *osdev; |
506 | ssize_t rc; | 511 | ssize_t rc; |
@@ -592,7 +597,9 @@ err_out_mod: | |||
592 | return rc; | 597 | return rc; |
593 | } | 598 | } |
594 | 599 | ||
595 | static ssize_t class_osdblk_remove(struct class *c, const char *buf, | 600 | static ssize_t class_osdblk_remove(struct class *c, |
601 | struct class_attribute *attr, | ||
602 | const char *buf, | ||
596 | size_t count) | 603 | size_t count) |
597 | { | 604 | { |
598 | struct osdblk_device *osdev = NULL; | 605 | struct osdblk_device *osdev = NULL; |
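
Note: osdblk's class callbacks (and pktcdvd's further down) gain a struct class_attribute * parameter, matching the updated driver-core prototypes for class attribute show/store methods. The sketch below shows the new signatures with hypothetical names.

#include <linux/device.h>
#include <linux/kernel.h>

/* Hypothetical class attribute callbacks using the updated prototypes. */
static ssize_t example_class_show(struct class *c, struct class_attribute *attr,
				  char *data)
{
	return sprintf(data, "example\n");
}

static ssize_t example_class_store(struct class *c, struct class_attribute *attr,
				   const char *buf, size_t count)
{
	return count;
}
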
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 8866ca369d5e..71acf4e53356 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c | |||
@@ -341,11 +341,11 @@ static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg) | |||
341 | && (j++ < PCD_SPIN)) | 341 | && (j++ < PCD_SPIN)) |
342 | udelay(PCD_DELAY); | 342 | udelay(PCD_DELAY); |
343 | 343 | ||
344 | if ((r & (IDE_ERR & stop)) || (j >= PCD_SPIN)) { | 344 | if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) { |
345 | s = read_reg(cd, 7); | 345 | s = read_reg(cd, 7); |
346 | e = read_reg(cd, 1); | 346 | e = read_reg(cd, 1); |
347 | p = read_reg(cd, 2); | 347 | p = read_reg(cd, 2); |
348 | if (j >= PCD_SPIN) | 348 | if (j > PCD_SPIN) |
349 | e |= 0x100; | 349 | e |= 0x100; |
350 | if (fun) | 350 | if (fun) |
351 | printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" | 351 | printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" |
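
Note: the pcd, pf, and pt wait loops all get the same off-by-one fix. With the post-increment test "j++ < SPIN", a genuine timeout leaves j at SPIN + 1, while j == SPIN can still mean the status went ready on the final poll; testing "j > SPIN" therefore stops reporting a spurious timeout in that edge case. A small user-space model of the timeout path:

#include <stdio.h>

#define SPIN 3

static int polls;
/* Simulated status register that stays busy (returns 1) forever. */
static int still_busy(void) { polls++; return 1; }

int main(void)
{
	int j = 0;

	while (still_busy() && (j++ < SPIN))
		;			/* udelay() in the real drivers */

	/* After a real timeout j == SPIN + 1, so "j > SPIN" is the test
	 * that cannot misfire when the device became ready just in time. */
	printf("polls=%d j=%d timed out: %d\n", polls, j, j > SPIN);
	return 0;
}
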
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 569e39e8f114..c1e5cd029b23 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c | |||
@@ -145,6 +145,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV}; | |||
145 | 145 | ||
146 | #include <linux/init.h> | 146 | #include <linux/init.h> |
147 | #include <linux/module.h> | 147 | #include <linux/module.h> |
148 | #include <linux/gfp.h> | ||
148 | #include <linux/fs.h> | 149 | #include <linux/fs.h> |
149 | #include <linux/delay.h> | 150 | #include <linux/delay.h> |
150 | #include <linux/hdreg.h> | 151 | #include <linux/hdreg.h> |
@@ -906,7 +907,7 @@ static int __init pd_init(void) | |||
906 | if (!pd_queue) | 907 | if (!pd_queue) |
907 | goto out1; | 908 | goto out1; |
908 | 909 | ||
909 | blk_queue_max_sectors(pd_queue, cluster); | 910 | blk_queue_max_hw_sectors(pd_queue, cluster); |
910 | 911 | ||
911 | if (register_blkdev(major, name)) | 912 | if (register_blkdev(major, name)) |
912 | goto out2; | 913 | goto out2; |
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index ea54ea393553..c059aab3006b 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c | |||
@@ -391,11 +391,11 @@ static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg) | |||
391 | && (j++ < PF_SPIN)) | 391 | && (j++ < PF_SPIN)) |
392 | udelay(PF_SPIN_DEL); | 392 | udelay(PF_SPIN_DEL); |
393 | 393 | ||
394 | if ((r & (STAT_ERR & stop)) || (j >= PF_SPIN)) { | 394 | if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) { |
395 | s = read_reg(pf, 7); | 395 | s = read_reg(pf, 7); |
396 | e = read_reg(pf, 1); | 396 | e = read_reg(pf, 1); |
397 | p = read_reg(pf, 2); | 397 | p = read_reg(pf, 2); |
398 | if (j >= PF_SPIN) | 398 | if (j > PF_SPIN) |
399 | e |= 0x100; | 399 | e |= 0x100; |
400 | if (fun) | 400 | if (fun) |
401 | printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" | 401 | printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" |
@@ -956,8 +956,7 @@ static int __init pf_init(void) | |||
956 | return -ENOMEM; | 956 | return -ENOMEM; |
957 | } | 957 | } |
958 | 958 | ||
959 | blk_queue_max_phys_segments(pf_queue, cluster); | 959 | blk_queue_max_segments(pf_queue, cluster); |
960 | blk_queue_max_hw_segments(pf_queue, cluster); | ||
961 | 960 | ||
962 | for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { | 961 | for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { |
963 | struct gendisk *disk = pf->disk; | 962 | struct gendisk *disk = pf->disk; |
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 1e4006e18f03..bc5825fdeaab 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c | |||
@@ -274,11 +274,11 @@ static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg) | |||
274 | && (j++ < PT_SPIN)) | 274 | && (j++ < PT_SPIN)) |
275 | udelay(PT_SPIN_DEL); | 275 | udelay(PT_SPIN_DEL); |
276 | 276 | ||
277 | if ((r & (STAT_ERR & stop)) || (j >= PT_SPIN)) { | 277 | if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) { |
278 | s = read_reg(pi, 7); | 278 | s = read_reg(pi, 7); |
279 | e = read_reg(pi, 1); | 279 | e = read_reg(pi, 1); |
280 | p = read_reg(pi, 2); | 280 | p = read_reg(pi, 2); |
281 | if (j >= PT_SPIN) | 281 | if (j > PT_SPIN) |
282 | e |= 0x100; | 282 | e |= 0x100; |
283 | if (fun) | 283 | if (fun) |
284 | printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" | 284 | printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" |
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2ddf03ae034e..8a549db2aa78 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
49 | #include <linux/types.h> | 49 | #include <linux/types.h> |
50 | #include <linux/kernel.h> | 50 | #include <linux/kernel.h> |
51 | #include <linux/compat.h> | ||
51 | #include <linux/kthread.h> | 52 | #include <linux/kthread.h> |
52 | #include <linux/errno.h> | 53 | #include <linux/errno.h> |
53 | #include <linux/spinlock.h> | 54 | #include <linux/spinlock.h> |
@@ -57,6 +58,7 @@ | |||
57 | #include <linux/miscdevice.h> | 58 | #include <linux/miscdevice.h> |
58 | #include <linux/freezer.h> | 59 | #include <linux/freezer.h> |
59 | #include <linux/mutex.h> | 60 | #include <linux/mutex.h> |
61 | #include <linux/slab.h> | ||
60 | #include <scsi/scsi_cmnd.h> | 62 | #include <scsi/scsi_cmnd.h> |
61 | #include <scsi/scsi_ioctl.h> | 63 | #include <scsi/scsi_ioctl.h> |
62 | #include <scsi/scsi.h> | 64 | #include <scsi/scsi.h> |
@@ -284,7 +286,7 @@ static ssize_t kobj_pkt_store(struct kobject *kobj, | |||
284 | return len; | 286 | return len; |
285 | } | 287 | } |
286 | 288 | ||
287 | static struct sysfs_ops kobj_pkt_ops = { | 289 | static const struct sysfs_ops kobj_pkt_ops = { |
288 | .show = kobj_pkt_show, | 290 | .show = kobj_pkt_show, |
289 | .store = kobj_pkt_store | 291 | .store = kobj_pkt_store |
290 | }; | 292 | }; |
@@ -322,7 +324,7 @@ static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) | |||
322 | pkt_kobj_remove(pd->kobj_stat); | 324 | pkt_kobj_remove(pd->kobj_stat); |
323 | pkt_kobj_remove(pd->kobj_wqueue); | 325 | pkt_kobj_remove(pd->kobj_wqueue); |
324 | if (class_pktcdvd) | 326 | if (class_pktcdvd) |
325 | device_destroy(class_pktcdvd, pd->pkt_dev); | 327 | device_unregister(pd->dev); |
326 | } | 328 | } |
327 | 329 | ||
328 | 330 | ||
@@ -337,7 +339,9 @@ static void class_pktcdvd_release(struct class *cls) | |||
337 | { | 339 | { |
338 | kfree(cls); | 340 | kfree(cls); |
339 | } | 341 | } |
340 | static ssize_t class_pktcdvd_show_map(struct class *c, char *data) | 342 | static ssize_t class_pktcdvd_show_map(struct class *c, |
343 | struct class_attribute *attr, | ||
344 | char *data) | ||
341 | { | 345 | { |
342 | int n = 0; | 346 | int n = 0; |
343 | int idx; | 347 | int idx; |
@@ -356,7 +360,9 @@ static ssize_t class_pktcdvd_show_map(struct class *c, char *data) | |||
356 | return n; | 360 | return n; |
357 | } | 361 | } |
358 | 362 | ||
359 | static ssize_t class_pktcdvd_store_add(struct class *c, const char *buf, | 363 | static ssize_t class_pktcdvd_store_add(struct class *c, |
364 | struct class_attribute *attr, | ||
365 | const char *buf, | ||
360 | size_t count) | 366 | size_t count) |
361 | { | 367 | { |
362 | unsigned int major, minor; | 368 | unsigned int major, minor; |
@@ -376,7 +382,9 @@ static ssize_t class_pktcdvd_store_add(struct class *c, const char *buf, | |||
376 | return -EINVAL; | 382 | return -EINVAL; |
377 | } | 383 | } |
378 | 384 | ||
379 | static ssize_t class_pktcdvd_store_remove(struct class *c, const char *buf, | 385 | static ssize_t class_pktcdvd_store_remove(struct class *c, |
386 | struct class_attribute *attr, | ||
387 | const char *buf, | ||
380 | size_t count) | 388 | size_t count) |
381 | { | 389 | { |
382 | unsigned int major, minor; | 390 | unsigned int major, minor; |
@@ -569,6 +577,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames) | |||
569 | } | 577 | } |
570 | 578 | ||
571 | spin_lock_init(&pkt->lock); | 579 | spin_lock_init(&pkt->lock); |
580 | bio_list_init(&pkt->orig_bios); | ||
572 | 581 | ||
573 | for (i = 0; i < frames; i++) { | 582 | for (i = 0; i < frames; i++) { |
574 | struct bio *bio = pkt_bio_alloc(1); | 583 | struct bio *bio = pkt_bio_alloc(1); |
@@ -721,43 +730,6 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod | |||
721 | } | 730 | } |
722 | 731 | ||
723 | /* | 732 | /* |
724 | * Add a bio to a single linked list defined by its head and tail pointers. | ||
725 | */ | ||
726 | static void pkt_add_list_last(struct bio *bio, struct bio **list_head, struct bio **list_tail) | ||
727 | { | ||
728 | bio->bi_next = NULL; | ||
729 | if (*list_tail) { | ||
730 | BUG_ON((*list_head) == NULL); | ||
731 | (*list_tail)->bi_next = bio; | ||
732 | (*list_tail) = bio; | ||
733 | } else { | ||
734 | BUG_ON((*list_head) != NULL); | ||
735 | (*list_head) = bio; | ||
736 | (*list_tail) = bio; | ||
737 | } | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Remove and return the first bio from a single linked list defined by its | ||
742 | * head and tail pointers. | ||
743 | */ | ||
744 | static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio **list_tail) | ||
745 | { | ||
746 | struct bio *bio; | ||
747 | |||
748 | if (*list_head == NULL) | ||
749 | return NULL; | ||
750 | |||
751 | bio = *list_head; | ||
752 | *list_head = bio->bi_next; | ||
753 | if (*list_head == NULL) | ||
754 | *list_tail = NULL; | ||
755 | |||
756 | bio->bi_next = NULL; | ||
757 | return bio; | ||
758 | } | ||
759 | |||
760 | /* | ||
761 | * Send a packet_command to the underlying block device and | 733 | * Send a packet_command to the underlying block device and |
762 | * wait for completion. | 734 | * wait for completion. |
763 | */ | 735 | */ |
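
Note: pktcdvd drops its hand-rolled head/tail singly-linked bio lists (pkt_add_list_last()/pkt_get_list_first()) in favour of the generic struct bio_list helpers from <linux/bio.h>, which the following hunks use for the iosched queues and each packet's orig_bios list. A hedged sketch of the helper API as used here; the callers are hypothetical and the list must first be set up with bio_list_init().

#include <linux/bio.h>

static void example_queue_bio(struct bio_list *list, struct bio *bio)
{
	bio_list_add(list, bio);		/* append at the tail */
}

static struct bio *example_next_bio(struct bio_list *list)
{
	if (bio_list_empty(list))
		return NULL;
	return bio_list_pop(list);		/* detach from the head */
}
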
@@ -876,13 +848,10 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, | |||
876 | static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) | 848 | static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) |
877 | { | 849 | { |
878 | spin_lock(&pd->iosched.lock); | 850 | spin_lock(&pd->iosched.lock); |
879 | if (bio_data_dir(bio) == READ) { | 851 | if (bio_data_dir(bio) == READ) |
880 | pkt_add_list_last(bio, &pd->iosched.read_queue, | 852 | bio_list_add(&pd->iosched.read_queue, bio); |
881 | &pd->iosched.read_queue_tail); | 853 | else |
882 | } else { | 854 | bio_list_add(&pd->iosched.write_queue, bio); |
883 | pkt_add_list_last(bio, &pd->iosched.write_queue, | ||
884 | &pd->iosched.write_queue_tail); | ||
885 | } | ||
886 | spin_unlock(&pd->iosched.lock); | 855 | spin_unlock(&pd->iosched.lock); |
887 | 856 | ||
888 | atomic_set(&pd->iosched.attention, 1); | 857 | atomic_set(&pd->iosched.attention, 1); |
@@ -917,8 +886,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) | |||
917 | int reads_queued, writes_queued; | 886 | int reads_queued, writes_queued; |
918 | 887 | ||
919 | spin_lock(&pd->iosched.lock); | 888 | spin_lock(&pd->iosched.lock); |
920 | reads_queued = (pd->iosched.read_queue != NULL); | 889 | reads_queued = !bio_list_empty(&pd->iosched.read_queue); |
921 | writes_queued = (pd->iosched.write_queue != NULL); | 890 | writes_queued = !bio_list_empty(&pd->iosched.write_queue); |
922 | spin_unlock(&pd->iosched.lock); | 891 | spin_unlock(&pd->iosched.lock); |
923 | 892 | ||
924 | if (!reads_queued && !writes_queued) | 893 | if (!reads_queued && !writes_queued) |
@@ -927,7 +896,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) | |||
927 | if (pd->iosched.writing) { | 896 | if (pd->iosched.writing) { |
928 | int need_write_seek = 1; | 897 | int need_write_seek = 1; |
929 | spin_lock(&pd->iosched.lock); | 898 | spin_lock(&pd->iosched.lock); |
930 | bio = pd->iosched.write_queue; | 899 | bio = bio_list_peek(&pd->iosched.write_queue); |
931 | spin_unlock(&pd->iosched.lock); | 900 | spin_unlock(&pd->iosched.lock); |
932 | if (bio && (bio->bi_sector == pd->iosched.last_write)) | 901 | if (bio && (bio->bi_sector == pd->iosched.last_write)) |
933 | need_write_seek = 0; | 902 | need_write_seek = 0; |
@@ -950,13 +919,10 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) | |||
950 | } | 919 | } |
951 | 920 | ||
952 | spin_lock(&pd->iosched.lock); | 921 | spin_lock(&pd->iosched.lock); |
953 | if (pd->iosched.writing) { | 922 | if (pd->iosched.writing) |
954 | bio = pkt_get_list_first(&pd->iosched.write_queue, | 923 | bio = bio_list_pop(&pd->iosched.write_queue); |
955 | &pd->iosched.write_queue_tail); | 924 | else |
956 | } else { | 925 | bio = bio_list_pop(&pd->iosched.read_queue); |
957 | bio = pkt_get_list_first(&pd->iosched.read_queue, | ||
958 | &pd->iosched.read_queue_tail); | ||
959 | } | ||
960 | spin_unlock(&pd->iosched.lock); | 926 | spin_unlock(&pd->iosched.lock); |
961 | 927 | ||
962 | if (!bio) | 928 | if (!bio) |
@@ -992,14 +958,14 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) | |||
992 | static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) | 958 | static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) |
993 | { | 959 | { |
994 | if ((pd->settings.size << 9) / CD_FRAMESIZE | 960 | if ((pd->settings.size << 9) / CD_FRAMESIZE |
995 | <= queue_max_phys_segments(q)) { | 961 | <= queue_max_segments(q)) { |
996 | /* | 962 | /* |
997 | * The cdrom device can handle one segment/frame | 963 | * The cdrom device can handle one segment/frame |
998 | */ | 964 | */ |
999 | clear_bit(PACKET_MERGE_SEGS, &pd->flags); | 965 | clear_bit(PACKET_MERGE_SEGS, &pd->flags); |
1000 | return 0; | 966 | return 0; |
1001 | } else if ((pd->settings.size << 9) / PAGE_SIZE | 967 | } else if ((pd->settings.size << 9) / PAGE_SIZE |
1002 | <= queue_max_phys_segments(q)) { | 968 | <= queue_max_segments(q)) { |
1003 | /* | 969 | /* |
1004 | * We can handle this case at the expense of some extra memory | 970 | * We can handle this case at the expense of some extra memory |
1005 | * copies during write operations | 971 | * copies during write operations |
@@ -1114,7 +1080,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) | |||
1114 | int f; | 1080 | int f; |
1115 | char written[PACKET_MAX_SIZE]; | 1081 | char written[PACKET_MAX_SIZE]; |
1116 | 1082 | ||
1117 | BUG_ON(!pkt->orig_bios); | 1083 | BUG_ON(bio_list_empty(&pkt->orig_bios)); |
1118 | 1084 | ||
1119 | atomic_set(&pkt->io_wait, 0); | 1085 | atomic_set(&pkt->io_wait, 0); |
1120 | atomic_set(&pkt->io_errors, 0); | 1086 | atomic_set(&pkt->io_errors, 0); |
@@ -1124,7 +1090,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) | |||
1124 | */ | 1090 | */ |
1125 | memset(written, 0, sizeof(written)); | 1091 | memset(written, 0, sizeof(written)); |
1126 | spin_lock(&pkt->lock); | 1092 | spin_lock(&pkt->lock); |
1127 | for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { | 1093 | bio_list_for_each(bio, &pkt->orig_bios) { |
1128 | int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); | 1094 | int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); |
1129 | int num_frames = bio->bi_size / CD_FRAMESIZE; | 1095 | int num_frames = bio->bi_size / CD_FRAMESIZE; |
1130 | pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); | 1096 | pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); |
@@ -1363,7 +1329,7 @@ try_next_bio: | |||
1363 | break; | 1329 | break; |
1364 | pkt_rbtree_erase(pd, node); | 1330 | pkt_rbtree_erase(pd, node); |
1365 | spin_lock(&pkt->lock); | 1331 | spin_lock(&pkt->lock); |
1366 | pkt_add_list_last(bio, &pkt->orig_bios, &pkt->orig_bios_tail); | 1332 | bio_list_add(&pkt->orig_bios, bio); |
1367 | pkt->write_size += bio->bi_size / CD_FRAMESIZE; | 1333 | pkt->write_size += bio->bi_size / CD_FRAMESIZE; |
1368 | spin_unlock(&pkt->lock); | 1334 | spin_unlock(&pkt->lock); |
1369 | } | 1335 | } |
@@ -1409,7 +1375,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) | |||
1409 | */ | 1375 | */ |
1410 | frames_write = 0; | 1376 | frames_write = 0; |
1411 | spin_lock(&pkt->lock); | 1377 | spin_lock(&pkt->lock); |
1412 | for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { | 1378 | bio_list_for_each(bio, &pkt->orig_bios) { |
1413 | int segment = bio->bi_idx; | 1379 | int segment = bio->bi_idx; |
1414 | int src_offs = 0; | 1380 | int src_offs = 0; |
1415 | int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); | 1381 | int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); |
@@ -1472,20 +1438,14 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) | |||
1472 | 1438 | ||
1473 | static void pkt_finish_packet(struct packet_data *pkt, int uptodate) | 1439 | static void pkt_finish_packet(struct packet_data *pkt, int uptodate) |
1474 | { | 1440 | { |
1475 | struct bio *bio, *next; | 1441 | struct bio *bio; |
1476 | 1442 | ||
1477 | if (!uptodate) | 1443 | if (!uptodate) |
1478 | pkt->cache_valid = 0; | 1444 | pkt->cache_valid = 0; |
1479 | 1445 | ||
1480 | /* Finish all bios corresponding to this packet */ | 1446 | /* Finish all bios corresponding to this packet */ |
1481 | bio = pkt->orig_bios; | 1447 | while ((bio = bio_list_pop(&pkt->orig_bios))) |
1482 | while (bio) { | ||
1483 | next = bio->bi_next; | ||
1484 | bio->bi_next = NULL; | ||
1485 | bio_endio(bio, uptodate ? 0 : -EIO); | 1448 | bio_endio(bio, uptodate ? 0 : -EIO); |
1486 | bio = next; | ||
1487 | } | ||
1488 | pkt->orig_bios = pkt->orig_bios_tail = NULL; | ||
1489 | } | 1449 | } |
1490 | 1450 | ||
1491 | static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) | 1451 | static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) |
@@ -2360,7 +2320,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) | |||
2360 | * even if the size is a multiple of the packet size. | 2320 | * even if the size is a multiple of the packet size. |
2361 | */ | 2321 | */ |
2362 | spin_lock_irq(q->queue_lock); | 2322 | spin_lock_irq(q->queue_lock); |
2363 | blk_queue_max_sectors(q, pd->settings.size); | 2323 | blk_queue_max_hw_sectors(q, pd->settings.size); |
2364 | spin_unlock_irq(q->queue_lock); | 2324 | spin_unlock_irq(q->queue_lock); |
2365 | set_bit(PACKET_WRITABLE, &pd->flags); | 2325 | set_bit(PACKET_WRITABLE, &pd->flags); |
2366 | } else { | 2326 | } else { |
@@ -2567,8 +2527,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) | |||
2567 | spin_lock(&pkt->lock); | 2527 | spin_lock(&pkt->lock); |
2568 | if ((pkt->state == PACKET_WAITING_STATE) || | 2528 | if ((pkt->state == PACKET_WAITING_STATE) || |
2569 | (pkt->state == PACKET_READ_WAIT_STATE)) { | 2529 | (pkt->state == PACKET_READ_WAIT_STATE)) { |
2570 | pkt_add_list_last(bio, &pkt->orig_bios, | 2530 | bio_list_add(&pkt->orig_bios, bio); |
2571 | &pkt->orig_bios_tail); | ||
2572 | pkt->write_size += bio->bi_size / CD_FRAMESIZE; | 2531 | pkt->write_size += bio->bi_size / CD_FRAMESIZE; |
2573 | if ((pkt->write_size >= pkt->frames) && | 2532 | if ((pkt->write_size >= pkt->frames) && |
2574 | (pkt->state == PACKET_WAITING_STATE)) { | 2533 | (pkt->state == PACKET_WAITING_STATE)) { |
@@ -2662,7 +2621,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd) | |||
2662 | 2621 | ||
2663 | blk_queue_make_request(q, pkt_make_request); | 2622 | blk_queue_make_request(q, pkt_make_request); |
2664 | blk_queue_logical_block_size(q, CD_FRAMESIZE); | 2623 | blk_queue_logical_block_size(q, CD_FRAMESIZE); |
2665 | blk_queue_max_sectors(q, PACKET_MAX_SECTORS); | 2624 | blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); |
2666 | blk_queue_merge_bvec(q, pkt_merge_bvec); | 2625 | blk_queue_merge_bvec(q, pkt_merge_bvec); |
2667 | q->queuedata = pd; | 2626 | q->queuedata = pd; |
2668 | } | 2627 | } |
@@ -2898,6 +2857,8 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) | |||
2898 | 2857 | ||
2899 | spin_lock_init(&pd->lock); | 2858 | spin_lock_init(&pd->lock); |
2900 | spin_lock_init(&pd->iosched.lock); | 2859 | spin_lock_init(&pd->iosched.lock); |
2860 | bio_list_init(&pd->iosched.read_queue); | ||
2861 | bio_list_init(&pd->iosched.write_queue); | ||
2901 | sprintf(pd->name, DRIVER_NAME"%d", idx); | 2862 | sprintf(pd->name, DRIVER_NAME"%d", idx); |
2902 | init_waitqueue_head(&pd->wqueue); | 2863 | init_waitqueue_head(&pd->wqueue); |
2903 | pd->bio_queue = RB_ROOT; | 2864 | pd->bio_queue = RB_ROOT; |
@@ -3024,7 +2985,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) | |||
3024 | mutex_unlock(&ctl_mutex); | 2985 | mutex_unlock(&ctl_mutex); |
3025 | } | 2986 | } |
3026 | 2987 | ||
3027 | static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) | 2988 | static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
3028 | { | 2989 | { |
3029 | void __user *argp = (void __user *)arg; | 2990 | void __user *argp = (void __user *)arg; |
3030 | struct pkt_ctrl_command ctrl_cmd; | 2991 | struct pkt_ctrl_command ctrl_cmd; |
@@ -3061,10 +3022,20 @@ static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cm | |||
3061 | return ret; | 3022 | return ret; |
3062 | } | 3023 | } |
3063 | 3024 | ||
3025 | #ifdef CONFIG_COMPAT | ||
3026 | static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
3027 | { | ||
3028 | return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); | ||
3029 | } | ||
3030 | #endif | ||
3064 | 3031 | ||
3065 | static const struct file_operations pkt_ctl_fops = { | 3032 | static const struct file_operations pkt_ctl_fops = { |
3066 | .ioctl = pkt_ctl_ioctl, | 3033 | .open = nonseekable_open, |
3067 | .owner = THIS_MODULE, | 3034 | .unlocked_ioctl = pkt_ctl_ioctl, |
3035 | #ifdef CONFIG_COMPAT | ||
3036 | .compat_ioctl = pkt_ctl_compat_ioctl, | ||
3037 | #endif | ||
3038 | .owner = THIS_MODULE, | ||
3068 | }; | 3039 | }; |
3069 | 3040 | ||
3070 | static struct miscdevice pkt_misc = { | 3041 | static struct miscdevice pkt_misc = { |
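
Note: the pktcdvd control device moves off the legacy .ioctl hook (which took an inode argument and implied the big kernel lock) to .unlocked_ioctl, adds nonseekable_open(), and provides a CONFIG_COMPAT wrapper that widens the 32-bit user pointer with compat_ptr() before reusing the native handler. A sketch of the same wiring with hypothetical names:

#include <linux/fs.h>
#include <linux/compat.h>
#include <linux/module.h>

static long example_ctl_ioctl(struct file *file, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;

	(void)argp;			/* a real driver dispatches on cmd here */
	return 0;
}

#ifdef CONFIG_COMPAT
static long example_ctl_compat_ioctl(struct file *file, unsigned int cmd,
				     unsigned long arg)
{
	/* widen the 32-bit pointer, then reuse the native handler */
	return example_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations example_ctl_fops = {
	.owner		= THIS_MODULE,
	.open		= nonseekable_open,
	.unlocked_ioctl	= example_ctl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= example_ctl_compat_ioctl,
#endif
};
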
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 03a130dca8ab..3b419e3fffa1 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c | |||
@@ -20,6 +20,7 @@ | |||
20 | 20 | ||
21 | #include <linux/ata.h> | 21 | #include <linux/ata.h> |
22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/slab.h> | ||
23 | 24 | ||
24 | #include <asm/lv1call.h> | 25 | #include <asm/lv1call.h> |
25 | #include <asm/ps3stor.h> | 26 | #include <asm/ps3stor.h> |
@@ -474,7 +475,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) | |||
474 | 475 | ||
475 | blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH); | 476 | blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH); |
476 | 477 | ||
477 | blk_queue_max_sectors(queue, dev->bounce_size >> 9); | 478 | blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); |
478 | blk_queue_segment_boundary(queue, -1UL); | 479 | blk_queue_segment_boundary(queue, -1UL); |
479 | blk_queue_dma_alignment(queue, dev->blk_size-1); | 480 | blk_queue_dma_alignment(queue, dev->blk_size-1); |
480 | blk_queue_logical_block_size(queue, dev->blk_size); | 481 | blk_queue_logical_block_size(queue, dev->blk_size); |
@@ -482,8 +483,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) | |||
482 | blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, | 483 | blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, |
483 | ps3disk_prepare_flush); | 484 | ps3disk_prepare_flush); |
484 | 485 | ||
485 | blk_queue_max_phys_segments(queue, -1); | 486 | blk_queue_max_segments(queue, -1); |
486 | blk_queue_max_hw_segments(queue, -1); | ||
487 | blk_queue_max_segment_size(queue, dev->bounce_size); | 487 | blk_queue_max_segment_size(queue, dev->bounce_size); |
488 | 488 | ||
489 | gendisk = alloc_disk(PS3DISK_MINORS); | 489 | gendisk = alloc_disk(PS3DISK_MINORS); |
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 3bb7c47c869f..b3bdb8af89cf 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | #include <linux/proc_fs.h> | 13 | #include <linux/proc_fs.h> |
14 | #include <linux/seq_file.h> | 14 | #include <linux/seq_file.h> |
15 | #include <linux/slab.h> | ||
15 | 16 | ||
16 | #include <asm/cell-regs.h> | 17 | #include <asm/cell-regs.h> |
17 | #include <asm/firmware.h> | 18 | #include <asm/firmware.h> |
@@ -123,7 +124,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, | |||
123 | { | 124 | { |
124 | struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); | 125 | struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); |
125 | u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); | 126 | u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); |
126 | unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); | 127 | unsigned long timeout; |
128 | |||
129 | for (timeout = 20; timeout; timeout--) { | ||
130 | if (!notify[3]) | ||
131 | return 0; | ||
132 | udelay(10); | ||
133 | } | ||
134 | |||
135 | timeout = jiffies + msecs_to_jiffies(timeout_ms); | ||
127 | 136 | ||
128 | do { | 137 | do { |
129 | if (!notify[3]) | 138 | if (!notify[3]) |
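
Note: ps3vram_notifier_wait() gains a short busy-poll (up to 20 iterations of udelay(10)) before falling back to the original jiffies-based loop, presumably so that notifiers which complete almost immediately are caught without sleeping. A user-space model of the two-stage wait, with the completion source and delays faked:

#include <stdio.h>

static int completions;
/* Fake notifier that reports completion after a handful of polls. */
static int notifier_done(void) { return ++completions > 5; }

static int example_wait(int slow_iterations)
{
	int i;

	for (i = 20; i; i--)		/* fast path: brief busy poll */
		if (notifier_done())
			return 0;

	while (slow_iterations--)	/* slow path: coarser timed loop */
		if (notifier_done())
			return 0;

	return -1;			/* timed out */
}

int main(void)
{
	printf("wait returned %d after %d polls\n", example_wait(100), completions);
	return 0;
}
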
@@ -743,10 +752,9 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev) | |||
743 | priv->queue = queue; | 752 | priv->queue = queue; |
744 | queue->queuedata = dev; | 753 | queue->queuedata = dev; |
745 | blk_queue_make_request(queue, ps3vram_make_request); | 754 | blk_queue_make_request(queue, ps3vram_make_request); |
746 | blk_queue_max_phys_segments(queue, MAX_PHYS_SEGMENTS); | 755 | blk_queue_max_segments(queue, BLK_MAX_SEGMENTS); |
747 | blk_queue_max_hw_segments(queue, MAX_HW_SEGMENTS); | 756 | blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); |
748 | blk_queue_max_segment_size(queue, MAX_SEGMENT_SIZE); | 757 | blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); |
749 | blk_queue_max_sectors(queue, SAFE_MAX_SECTORS); | ||
750 | 758 | ||
751 | gendisk = alloc_disk(1); | 759 | gendisk = alloc_disk(1); |
752 | if (!gendisk) { | 760 | if (!gendisk) { |
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 411f064760b4..48e8fee9f2d4 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c | |||
@@ -691,9 +691,8 @@ static int probe_disk(struct vdc_port *port) | |||
691 | 691 | ||
692 | port->disk = g; | 692 | port->disk = g; |
693 | 693 | ||
694 | blk_queue_max_hw_segments(q, port->ring_cookies); | 694 | blk_queue_max_segments(q, port->ring_cookies); |
695 | blk_queue_max_phys_segments(q, port->ring_cookies); | 695 | blk_queue_max_hw_sectors(q, port->max_xfer_size); |
696 | blk_queue_max_sectors(q, port->max_xfer_size); | ||
697 | g->major = vdc_major; | 696 | g->major = vdc_major; |
698 | g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; | 697 | g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; |
699 | strcpy(g->disk_name, port->disk_name); | 698 | strcpy(g->disk_name, port->disk_name); |
diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 8f569e3df890..e463657569ff 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/fd.h> | 20 | #include <linux/fd.h> |
21 | #include <linux/slab.h> | ||
21 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
22 | #include <linux/hdreg.h> | 23 | #include <linux/hdreg.h> |
23 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
@@ -864,7 +865,7 @@ static int __devinit swim_probe(struct platform_device *dev) | |||
864 | struct swim_priv *swd; | 865 | struct swim_priv *swd; |
865 | int ret; | 866 | int ret; |
866 | 867 | ||
867 | res = platform_get_resource_byname(dev, IORESOURCE_MEM, "swim-regs"); | 868 | res = platform_get_resource(dev, IORESOURCE_MEM, 0); |
868 | if (!res) { | 869 | if (!res) { |
869 | ret = -ENODEV; | 870 | ret = -ENODEV; |
870 | goto out; | 871 | goto out; |
@@ -942,7 +943,7 @@ static int __devexit swim_remove(struct platform_device *dev) | |||
942 | 943 | ||
943 | iounmap(swd->base); | 944 | iounmap(swd->base); |
944 | 945 | ||
945 | res = platform_get_resource_byname(dev, IORESOURCE_MEM, "swim-regs"); | 946 | res = platform_get_resource(dev, IORESOURCE_MEM, 0); |
946 | if (res) | 947 | if (res) |
947 | release_mem_region(res->start, resource_size(res)); | 948 | release_mem_region(res->start, resource_size(res)); |
948 | 949 | ||
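
Note: swim.c stops looking its register window up by name ("swim-regs") and instead takes the first IORESOURCE_MEM entry by index. A hedged probe-path sketch; the function name and error handling are illustrative.

#include <linux/platform_device.h>
#include <linux/ioport.h>
#include <linux/io.h>
#include <linux/errno.h>

static int example_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);	/* first MEM region */
	if (!res)
		return -ENODEV;

	base = ioremap(res->start, resource_size(res));
	if (!base)
		return -ENOMEM;

	iounmap(base);		/* nothing else to do in this sketch */
	return 0;
}
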
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 6380ad8d91bd..59ca2b77b574 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c | |||
@@ -200,7 +200,7 @@ struct floppy_state { | |||
200 | int ejected; | 200 | int ejected; |
201 | wait_queue_head_t wait; | 201 | wait_queue_head_t wait; |
202 | int wanted; | 202 | int wanted; |
203 | struct device_node* media_bay; /* NULL when not in bay */ | 203 | struct macio_dev *mdev; |
204 | char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)]; | 204 | char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)]; |
205 | }; | 205 | }; |
206 | 206 | ||
@@ -303,14 +303,13 @@ static int swim3_readbit(struct floppy_state *fs, int bit) | |||
303 | static void do_fd_request(struct request_queue * q) | 303 | static void do_fd_request(struct request_queue * q) |
304 | { | 304 | { |
305 | int i; | 305 | int i; |
306 | for(i=0;i<floppy_count;i++) | 306 | |
307 | { | 307 | for(i=0; i<floppy_count; i++) { |
308 | #ifdef CONFIG_PMAC_MEDIABAY | 308 | struct floppy_state *fs = &floppy_states[i]; |
309 | if (floppy_states[i].media_bay && | 309 | if (fs->mdev->media_bay && |
310 | check_media_bay(floppy_states[i].media_bay, MB_FD)) | 310 | check_media_bay(fs->mdev->media_bay) != MB_FD) |
311 | continue; | 311 | continue; |
312 | #endif /* CONFIG_PMAC_MEDIABAY */ | 312 | start_request(fs); |
313 | start_request(&floppy_states[i]); | ||
314 | } | 313 | } |
315 | } | 314 | } |
316 | 315 | ||
@@ -849,10 +848,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode, | |||
849 | if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)) | 848 | if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)) |
850 | return -EPERM; | 849 | return -EPERM; |
851 | 850 | ||
852 | #ifdef CONFIG_PMAC_MEDIABAY | 851 | if (fs->mdev->media_bay && |
853 | if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) | 852 | check_media_bay(fs->mdev->media_bay) != MB_FD) |
854 | return -ENXIO; | 853 | return -ENXIO; |
855 | #endif | ||
856 | 854 | ||
857 | switch (cmd) { | 855 | switch (cmd) { |
858 | case FDEJECT: | 856 | case FDEJECT: |
@@ -876,10 +874,9 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) | |||
876 | int n, err = 0; | 874 | int n, err = 0; |
877 | 875 | ||
878 | if (fs->ref_count == 0) { | 876 | if (fs->ref_count == 0) { |
879 | #ifdef CONFIG_PMAC_MEDIABAY | 877 | if (fs->mdev->media_bay && |
880 | if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) | 878 | check_media_bay(fs->mdev->media_bay) != MB_FD) |
881 | return -ENXIO; | 879 | return -ENXIO; |
882 | #endif | ||
883 | out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); | 880 | out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); |
884 | out_8(&sw->control_bic, 0xff); | 881 | out_8(&sw->control_bic, 0xff); |
885 | out_8(&sw->mode, 0x95); | 882 | out_8(&sw->mode, 0x95); |
@@ -963,10 +960,9 @@ static int floppy_revalidate(struct gendisk *disk) | |||
963 | struct swim3 __iomem *sw; | 960 | struct swim3 __iomem *sw; |
964 | int ret, n; | 961 | int ret, n; |
965 | 962 | ||
966 | #ifdef CONFIG_PMAC_MEDIABAY | 963 | if (fs->mdev->media_bay && |
967 | if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) | 964 | check_media_bay(fs->mdev->media_bay) != MB_FD) |
968 | return -ENXIO; | 965 | return -ENXIO; |
969 | #endif | ||
970 | 966 | ||
971 | sw = fs->swim3; | 967 | sw = fs->swim3; |
972 | grab_drive(fs, revalidating, 0); | 968 | grab_drive(fs, revalidating, 0); |
@@ -1009,7 +1005,6 @@ static const struct block_device_operations floppy_fops = { | |||
1009 | static int swim3_add_device(struct macio_dev *mdev, int index) | 1005 | static int swim3_add_device(struct macio_dev *mdev, int index) |
1010 | { | 1006 | { |
1011 | struct device_node *swim = mdev->ofdev.node; | 1007 | struct device_node *swim = mdev->ofdev.node; |
1012 | struct device_node *mediabay; | ||
1013 | struct floppy_state *fs = &floppy_states[index]; | 1008 | struct floppy_state *fs = &floppy_states[index]; |
1014 | int rc = -EBUSY; | 1009 | int rc = -EBUSY; |
1015 | 1010 | ||
@@ -1036,9 +1031,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index) | |||
1036 | } | 1031 | } |
1037 | dev_set_drvdata(&mdev->ofdev.dev, fs); | 1032 | dev_set_drvdata(&mdev->ofdev.dev, fs); |
1038 | 1033 | ||
1039 | mediabay = (strcasecmp(swim->parent->type, "media-bay") == 0) ? | 1034 | if (mdev->media_bay == NULL) |
1040 | swim->parent : NULL; | ||
1041 | if (mediabay == NULL) | ||
1042 | pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1); | 1035 | pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1); |
1043 | 1036 | ||
1044 | memset(fs, 0, sizeof(*fs)); | 1037 | memset(fs, 0, sizeof(*fs)); |
@@ -1068,7 +1061,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index) | |||
1068 | fs->secpercyl = 36; | 1061 | fs->secpercyl = 36; |
1069 | fs->secpertrack = 18; | 1062 | fs->secpertrack = 18; |
1070 | fs->total_secs = 2880; | 1063 | fs->total_secs = 2880; |
1071 | fs->media_bay = mediabay; | 1064 | fs->mdev = mdev; |
1072 | init_waitqueue_head(&fs->wait); | 1065 | init_waitqueue_head(&fs->wait); |
1073 | 1066 | ||
1074 | fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space); | 1067 | fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space); |
@@ -1093,7 +1086,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index) | |||
1093 | init_timer(&fs->timeout); | 1086 | init_timer(&fs->timeout); |
1094 | 1087 | ||
1095 | printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count, | 1088 | printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count, |
1096 | mediabay ? "in media bay" : ""); | 1089 | mdev->media_bay ? "in media bay" : ""); |
1097 | 1090 | ||
1098 | return 0; | 1091 | return 0; |
1099 | 1092 | ||
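The swim3.c changes above replace the cached device_node pointer with the owning struct macio_dev, so the driver no longer needs its own media-bay bookkeeping or the CONFIG_PMAC_MEDIABAY ifdefs: check_media_bay() now takes only the bay and returns the detected media type, and every caller applies the same guard. A minimal sketch of that guard, factored into a helper purely for illustration (the driver open-codes it in do_fd_request(), floppy_ioctl(), floppy_open() and floppy_revalidate()):

	/* Hypothetical helper; fs->mdev, check_media_bay() and MB_FD are as
	 * used in the diff above. */
	static int swim3_media_bay_ok(struct floppy_state *fs)
	{
		if (!fs->mdev->media_bay)	/* not in a media bay at all */
			return 0;
		if (check_media_bay(fs->mdev->media_bay) != MB_FD)
			return -ENXIO;		/* bay holds something other than a floppy */
		return 0;
	}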
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index a7c4184f4a63..b70f0fca9a42 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c | |||
@@ -409,7 +409,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) | |||
409 | static void carm_remove_one (struct pci_dev *pdev); | 409 | static void carm_remove_one (struct pci_dev *pdev); |
410 | static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); | 410 | static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); |
411 | 411 | ||
412 | static struct pci_device_id carm_pci_tbl[] = { | 412 | static const struct pci_device_id carm_pci_tbl[] = { |
413 | { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, | 413 | { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, |
414 | { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, | 414 | { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, |
415 | { } /* terminate list */ | 415 | { } /* terminate list */ |
@@ -1518,8 +1518,7 @@ static int carm_init_disks(struct carm_host *host) | |||
1518 | break; | 1518 | break; |
1519 | } | 1519 | } |
1520 | disk->queue = q; | 1520 | disk->queue = q; |
1521 | blk_queue_max_hw_segments(q, CARM_MAX_REQ_SG); | 1521 | blk_queue_max_segments(q, CARM_MAX_REQ_SG); |
1522 | blk_queue_max_phys_segments(q, CARM_MAX_REQ_SG); | ||
1523 | blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); | 1522 | blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); |
1524 | 1523 | ||
1525 | q->queuedata = port; | 1524 | q->queuedata = port; |
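The sx8.c hunk shows the block-layer API migration that recurs through the rest of this series: the separate "hardware" and "physical" segment limits collapse into a single blk_queue_max_segments() call, and in the drivers below blk_queue_max_sectors() is renamed blk_queue_max_hw_sectors(). Before and after, using only the calls and constants that appear in the diff:

	/* Older block API: the same scatter/gather limit set twice. */
	blk_queue_max_hw_segments(q, CARM_MAX_REQ_SG);
	blk_queue_max_phys_segments(q, CARM_MAX_REQ_SG);

	/* Consolidated API: one segment limit, same boundary mask as before. */
	blk_queue_max_segments(q, CARM_MAX_REQ_SG);
	blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);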
diff --git a/drivers/block/ub.c b/drivers/block/ub.c index c739b203fe91..0536b5b29adc 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/blkdev.h> | 27 | #include <linux/blkdev.h> |
28 | #include <linux/timer.h> | 28 | #include <linux/timer.h> |
29 | #include <linux/scatterlist.h> | 29 | #include <linux/scatterlist.h> |
30 | #include <linux/slab.h> | ||
30 | #include <scsi/scsi.h> | 31 | #include <scsi/scsi.h> |
31 | 32 | ||
32 | #define DRV_NAME "ub" | 33 | #define DRV_NAME "ub" |
@@ -393,7 +394,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum); | |||
393 | #define ub_usb_ids usb_storage_usb_ids | 394 | #define ub_usb_ids usb_storage_usb_ids |
394 | #else | 395 | #else |
395 | 396 | ||
396 | static struct usb_device_id ub_usb_ids[] = { | 397 | static const struct usb_device_id ub_usb_ids[] = { |
397 | { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) }, | 398 | { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) }, |
398 | { } | 399 | { } |
399 | }; | 400 | }; |
@@ -2320,10 +2321,9 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum) | |||
2320 | disk->queue = q; | 2321 | disk->queue = q; |
2321 | 2322 | ||
2322 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); | 2323 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); |
2323 | blk_queue_max_hw_segments(q, UB_MAX_REQ_SG); | 2324 | blk_queue_max_segments(q, UB_MAX_REQ_SG); |
2324 | blk_queue_max_phys_segments(q, UB_MAX_REQ_SG); | ||
2325 | blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ | 2325 | blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ |
2326 | blk_queue_max_sectors(q, UB_MAX_SECTORS); | 2326 | blk_queue_max_hw_sectors(q, UB_MAX_SECTORS); |
2327 | blk_queue_logical_block_size(q, lun->capacity.bsize); | 2327 | blk_queue_logical_block_size(q, lun->capacity.bsize); |
2328 | 2328 | ||
2329 | lun->disk = disk; | 2329 | lun->disk = disk; |
diff --git a/drivers/block/umem.c b/drivers/block/umem.c index ad1ba393801a..2f9470ff8f7c 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c | |||
@@ -40,13 +40,13 @@ | |||
40 | #include <linux/kernel.h> | 40 | #include <linux/kernel.h> |
41 | #include <linux/mm.h> | 41 | #include <linux/mm.h> |
42 | #include <linux/mman.h> | 42 | #include <linux/mman.h> |
43 | #include <linux/gfp.h> | ||
43 | #include <linux/ioctl.h> | 44 | #include <linux/ioctl.h> |
44 | #include <linux/module.h> | 45 | #include <linux/module.h> |
45 | #include <linux/init.h> | 46 | #include <linux/init.h> |
46 | #include <linux/interrupt.h> | 47 | #include <linux/interrupt.h> |
47 | #include <linux/timer.h> | 48 | #include <linux/timer.h> |
48 | #include <linux/pci.h> | 49 | #include <linux/pci.h> |
49 | #include <linux/slab.h> | ||
50 | #include <linux/dma-mapping.h> | 50 | #include <linux/dma-mapping.h> |
51 | 51 | ||
52 | #include <linux/fcntl.h> /* O_ACCMODE */ | 52 | #include <linux/fcntl.h> /* O_ACCMODE */ |
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index a8c8b56b275e..788d93882ab9 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c | |||
@@ -28,6 +28,9 @@ | |||
28 | * All disk operations are performed by sending messages back and forth to | 28 | * All disk operations are performed by sending messages back and forth to |
29 | * the OS/400 partition. | 29 | * the OS/400 partition. |
30 | */ | 30 | */ |
31 | |||
32 | #define pr_fmt(fmt) "viod: " fmt | ||
33 | |||
31 | #include <linux/major.h> | 34 | #include <linux/major.h> |
32 | #include <linux/fs.h> | 35 | #include <linux/fs.h> |
33 | #include <linux/module.h> | 36 | #include <linux/module.h> |
@@ -63,9 +66,6 @@ MODULE_LICENSE("GPL"); | |||
63 | 66 | ||
64 | #define VIOD_VERS "1.64" | 67 | #define VIOD_VERS "1.64" |
65 | 68 | ||
66 | #define VIOD_KERN_WARNING KERN_WARNING "viod: " | ||
67 | #define VIOD_KERN_INFO KERN_INFO "viod: " | ||
68 | |||
69 | enum { | 69 | enum { |
70 | PARTITION_SHIFT = 3, | 70 | PARTITION_SHIFT = 3, |
71 | MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS, | 71 | MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS, |
@@ -156,7 +156,7 @@ static int viodasd_open(struct block_device *bdev, fmode_t mode) | |||
156 | ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32), | 156 | ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32), |
157 | 0, 0, 0); | 157 | 0, 0, 0); |
158 | if (hvrc != 0) { | 158 | if (hvrc != 0) { |
159 | printk(VIOD_KERN_WARNING "HV open failed %d\n", (int)hvrc); | 159 | pr_warning("HV open failed %d\n", (int)hvrc); |
160 | return -EIO; | 160 | return -EIO; |
161 | } | 161 | } |
162 | 162 | ||
@@ -167,9 +167,8 @@ static int viodasd_open(struct block_device *bdev, fmode_t mode) | |||
167 | const struct vio_error_entry *err = | 167 | const struct vio_error_entry *err = |
168 | vio_lookup_rc(viodasd_err_table, we.sub_result); | 168 | vio_lookup_rc(viodasd_err_table, we.sub_result); |
169 | 169 | ||
170 | printk(VIOD_KERN_WARNING | 170 | pr_warning("bad rc opening disk: %d:0x%04x (%s)\n", |
171 | "bad rc opening disk: %d:0x%04x (%s)\n", | 171 | (int)we.rc, we.sub_result, err->msg); |
172 | (int)we.rc, we.sub_result, err->msg); | ||
173 | return -EIO; | 172 | return -EIO; |
174 | } | 173 | } |
175 | 174 | ||
@@ -195,8 +194,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode) | |||
195 | ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */, | 194 | ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */, |
196 | 0, 0, 0); | 195 | 0, 0, 0); |
197 | if (hvrc != 0) | 196 | if (hvrc != 0) |
198 | printk(VIOD_KERN_WARNING "HV close call failed %d\n", | 197 | pr_warning("HV close call failed %d\n", (int)hvrc); |
199 | (int)hvrc); | ||
200 | return 0; | 198 | return 0; |
201 | } | 199 | } |
202 | 200 | ||
@@ -288,8 +286,7 @@ static int send_request(struct request *req) | |||
288 | bevent = (struct vioblocklpevent *) | 286 | bevent = (struct vioblocklpevent *) |
289 | vio_get_event_buffer(viomajorsubtype_blockio); | 287 | vio_get_event_buffer(viomajorsubtype_blockio); |
290 | if (bevent == NULL) { | 288 | if (bevent == NULL) { |
291 | printk(VIOD_KERN_WARNING | 289 | pr_warning("error allocating disk event buffer\n"); |
292 | "error allocating disk event buffer\n"); | ||
293 | goto error_ret; | 290 | goto error_ret; |
294 | } | 291 | } |
295 | 292 | ||
@@ -333,9 +330,8 @@ static int send_request(struct request *req) | |||
333 | } | 330 | } |
334 | 331 | ||
335 | if (hvrc != HvLpEvent_Rc_Good) { | 332 | if (hvrc != HvLpEvent_Rc_Good) { |
336 | printk(VIOD_KERN_WARNING | 333 | pr_warning("error sending disk event to OS/400 (rc %d)\n", |
337 | "error sending disk event to OS/400 (rc %d)\n", | 334 | (int)hvrc); |
338 | (int)hvrc); | ||
339 | goto error_ret; | 335 | goto error_ret; |
340 | } | 336 | } |
341 | spin_unlock_irqrestore(&viodasd_spinlock, flags); | 337 | spin_unlock_irqrestore(&viodasd_spinlock, flags); |
@@ -402,7 +398,7 @@ retry: | |||
402 | ((u64)dev_no << 48) | ((u64)flags<< 32), | 398 | ((u64)dev_no << 48) | ((u64)flags<< 32), |
403 | 0, 0, 0); | 399 | 0, 0, 0); |
404 | if (hvrc != 0) { | 400 | if (hvrc != 0) { |
405 | printk(VIOD_KERN_WARNING "bad rc on HV open %d\n", (int)hvrc); | 401 | pr_warning("bad rc on HV open %d\n", (int)hvrc); |
406 | return 0; | 402 | return 0; |
407 | } | 403 | } |
408 | 404 | ||
@@ -416,9 +412,8 @@ retry: | |||
416 | goto retry; | 412 | goto retry; |
417 | } | 413 | } |
418 | if (we.max_disk > (MAX_DISKNO - 1)) { | 414 | if (we.max_disk > (MAX_DISKNO - 1)) { |
419 | printk_once(VIOD_KERN_INFO | 415 | printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"), |
420 | "Only examining the first %d of %d disks connected\n", | 416 | MAX_DISKNO, we.max_disk + 1); |
421 | MAX_DISKNO, we.max_disk + 1); | ||
422 | } | 417 | } |
423 | 418 | ||
424 | /* Send the close event to OS/400. We DON'T expect a response */ | 419 | /* Send the close event to OS/400. We DON'T expect a response */ |
@@ -432,17 +427,15 @@ retry: | |||
432 | ((u64)dev_no << 48) | ((u64)flags << 32), | 427 | ((u64)dev_no << 48) | ((u64)flags << 32), |
433 | 0, 0, 0); | 428 | 0, 0, 0); |
434 | if (hvrc != 0) { | 429 | if (hvrc != 0) { |
435 | printk(VIOD_KERN_WARNING | 430 | pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc); |
436 | "bad rc sending event to OS/400 %d\n", (int)hvrc); | ||
437 | return 0; | 431 | return 0; |
438 | } | 432 | } |
439 | 433 | ||
440 | if (d->dev == NULL) { | 434 | if (d->dev == NULL) { |
441 | /* this is when we reprobe for new disks */ | 435 | /* this is when we reprobe for new disks */ |
442 | if (vio_create_viodasd(dev_no) == NULL) { | 436 | if (vio_create_viodasd(dev_no) == NULL) { |
443 | printk(VIOD_KERN_WARNING | 437 | pr_warning("cannot allocate virtual device for disk %d\n", |
444 | "cannot allocate virtual device for disk %d\n", | 438 | dev_no); |
445 | dev_no); | ||
446 | return 0; | 439 | return 0; |
447 | } | 440 | } |
448 | /* | 441 | /* |
@@ -457,23 +450,20 @@ retry: | |||
457 | spin_lock_init(&d->q_lock); | 450 | spin_lock_init(&d->q_lock); |
458 | q = blk_init_queue(do_viodasd_request, &d->q_lock); | 451 | q = blk_init_queue(do_viodasd_request, &d->q_lock); |
459 | if (q == NULL) { | 452 | if (q == NULL) { |
460 | printk(VIOD_KERN_WARNING "cannot allocate queue for disk %d\n", | 453 | pr_warning("cannot allocate queue for disk %d\n", dev_no); |
461 | dev_no); | ||
462 | return 0; | 454 | return 0; |
463 | } | 455 | } |
464 | g = alloc_disk(1 << PARTITION_SHIFT); | 456 | g = alloc_disk(1 << PARTITION_SHIFT); |
465 | if (g == NULL) { | 457 | if (g == NULL) { |
466 | printk(VIOD_KERN_WARNING | 458 | pr_warning("cannot allocate disk structure for disk %d\n", |
467 | "cannot allocate disk structure for disk %d\n", | 459 | dev_no); |
468 | dev_no); | ||
469 | blk_cleanup_queue(q); | 460 | blk_cleanup_queue(q); |
470 | return 0; | 461 | return 0; |
471 | } | 462 | } |
472 | 463 | ||
473 | d->disk = g; | 464 | d->disk = g; |
474 | blk_queue_max_hw_segments(q, VIOMAXBLOCKDMA); | 465 | blk_queue_max_segments(q, VIOMAXBLOCKDMA); |
475 | blk_queue_max_phys_segments(q, VIOMAXBLOCKDMA); | 466 | blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS); |
476 | blk_queue_max_sectors(q, VIODASD_MAXSECTORS); | ||
477 | g->major = VIODASD_MAJOR; | 467 | g->major = VIODASD_MAJOR; |
478 | g->first_minor = dev_no << PARTITION_SHIFT; | 468 | g->first_minor = dev_no << PARTITION_SHIFT; |
479 | if (dev_no >= 26) | 469 | if (dev_no >= 26) |
@@ -489,13 +479,12 @@ retry: | |||
489 | g->driverfs_dev = d->dev; | 479 | g->driverfs_dev = d->dev; |
490 | set_capacity(g, d->size >> 9); | 480 | set_capacity(g, d->size >> 9); |
491 | 481 | ||
492 | printk(VIOD_KERN_INFO "disk %d: %lu sectors (%lu MB) " | 482 | pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n", |
493 | "CHS=%d/%d/%d sector size %d%s\n", | 483 | dev_no, (unsigned long)(d->size >> 9), |
494 | dev_no, (unsigned long)(d->size >> 9), | 484 | (unsigned long)(d->size >> 20), |
495 | (unsigned long)(d->size >> 20), | 485 | (int)d->cylinders, (int)d->tracks, |
496 | (int)d->cylinders, (int)d->tracks, | 486 | (int)d->sectors, (int)d->bytes_per_sector, |
497 | (int)d->sectors, (int)d->bytes_per_sector, | 487 | d->read_only ? " (RO)" : ""); |
498 | d->read_only ? " (RO)" : ""); | ||
499 | 488 | ||
500 | /* register us in the global list */ | 489 | /* register us in the global list */ |
501 | add_disk(g); | 490 | add_disk(g); |
@@ -580,8 +569,8 @@ static int viodasd_handle_read_write(struct vioblocklpevent *bevent) | |||
580 | if (error) { | 569 | if (error) { |
581 | const struct vio_error_entry *err; | 570 | const struct vio_error_entry *err; |
582 | err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); | 571 | err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); |
583 | printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n", | 572 | pr_warning("read/write error %d:0x%04x (%s)\n", |
584 | event->xRc, bevent->sub_result, err->msg); | 573 | event->xRc, bevent->sub_result, err->msg); |
585 | num_sect = blk_rq_sectors(req); | 574 | num_sect = blk_rq_sectors(req); |
586 | } | 575 | } |
587 | qlock = req->q->queue_lock; | 576 | qlock = req->q->queue_lock; |
@@ -606,8 +595,7 @@ static void handle_block_event(struct HvLpEvent *event) | |||
606 | return; | 595 | return; |
607 | /* First, we should NEVER get an int here...only acks */ | 596 | /* First, we should NEVER get an int here...only acks */ |
608 | if (hvlpevent_is_int(event)) { | 597 | if (hvlpevent_is_int(event)) { |
609 | printk(VIOD_KERN_WARNING | 598 | pr_warning("Yikes! got an int in viodasd event handler!\n"); |
610 | "Yikes! got an int in viodasd event handler!\n"); | ||
611 | if (hvlpevent_need_ack(event)) { | 599 | if (hvlpevent_need_ack(event)) { |
612 | event->xRc = HvLpEvent_Rc_InvalidSubtype; | 600 | event->xRc = HvLpEvent_Rc_InvalidSubtype; |
613 | HvCallEvent_ackLpEvent(event); | 601 | HvCallEvent_ackLpEvent(event); |
@@ -650,7 +638,7 @@ static void handle_block_event(struct HvLpEvent *event) | |||
650 | break; | 638 | break; |
651 | 639 | ||
652 | default: | 640 | default: |
653 | printk(VIOD_KERN_WARNING "invalid subtype!"); | 641 | pr_warning("invalid subtype!"); |
654 | if (hvlpevent_need_ack(event)) { | 642 | if (hvlpevent_need_ack(event)) { |
655 | event->xRc = HvLpEvent_Rc_InvalidSubtype; | 643 | event->xRc = HvLpEvent_Rc_InvalidSubtype; |
656 | HvCallEvent_ackLpEvent(event); | 644 | HvCallEvent_ackLpEvent(event); |
@@ -739,29 +727,26 @@ static int __init viodasd_init(void) | |||
739 | vio_set_hostlp(); | 727 | vio_set_hostlp(); |
740 | 728 | ||
741 | if (viopath_hostLp == HvLpIndexInvalid) { | 729 | if (viopath_hostLp == HvLpIndexInvalid) { |
742 | printk(VIOD_KERN_WARNING "invalid hosting partition\n"); | 730 | pr_warning("invalid hosting partition\n"); |
743 | rc = -EIO; | 731 | rc = -EIO; |
744 | goto early_fail; | 732 | goto early_fail; |
745 | } | 733 | } |
746 | 734 | ||
747 | printk(VIOD_KERN_INFO "vers " VIOD_VERS ", hosting partition %d\n", | 735 | pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp); |
748 | viopath_hostLp); | ||
749 | 736 | ||
750 | /* register the block device */ | 737 | /* register the block device */ |
751 | rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); | 738 | rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); |
752 | if (rc) { | 739 | if (rc) { |
753 | printk(VIOD_KERN_WARNING | 740 | pr_warning("Unable to get major number %d for %s\n", |
754 | "Unable to get major number %d for %s\n", | 741 | VIODASD_MAJOR, VIOD_GENHD_NAME); |
755 | VIODASD_MAJOR, VIOD_GENHD_NAME); | ||
756 | goto early_fail; | 742 | goto early_fail; |
757 | } | 743 | } |
758 | /* Actually open the path to the hosting partition */ | 744 | /* Actually open the path to the hosting partition */ |
759 | rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio, | 745 | rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio, |
760 | VIOMAXREQ + 2); | 746 | VIOMAXREQ + 2); |
761 | if (rc) { | 747 | if (rc) { |
762 | printk(VIOD_KERN_WARNING | 748 | pr_warning("error opening path to host partition %d\n", |
763 | "error opening path to host partition %d\n", | 749 | viopath_hostLp); |
764 | viopath_hostLp); | ||
765 | goto unregister_blk; | 750 | goto unregister_blk; |
766 | } | 751 | } |
767 | 752 | ||
@@ -770,7 +755,7 @@ static int __init viodasd_init(void) | |||
770 | 755 | ||
771 | rc = vio_register_driver(&viodasd_driver); | 756 | rc = vio_register_driver(&viodasd_driver); |
772 | if (rc) { | 757 | if (rc) { |
773 | printk(VIOD_KERN_WARNING "vio_register_driver failed\n"); | 758 | pr_warning("vio_register_driver failed\n"); |
774 | goto unset_handler; | 759 | goto unset_handler; |
775 | } | 760 | } |
776 | 761 | ||
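The viodasd.c conversion drops the driver-private VIOD_KERN_WARNING/VIOD_KERN_INFO prefix macros in favour of pr_warning()/pr_info() with a pr_fmt() definition placed before the includes, so every pr_* call picks up the "viod: " prefix automatically; the lone printk_once() caller applies pr_fmt() by hand because only the plain pr_* wrappers expand it. A minimal sketch of the pattern (the function and message below are made up for illustration):

	/* pr_fmt() must be defined before any header that expands the pr_* macros. */
	#define pr_fmt(fmt) "viod: " fmt

	#include <linux/kernel.h>

	static void viod_report_sketch(int hvrc)
	{
		/* Becomes printk(KERN_WARNING "viod: HV open failed %d\n", hvrc). */
		pr_warning("HV open failed %d\n", hvrc);

		/* printk_once() is not a pr_* wrapper, so the prefix is added explicitly. */
		printk_once(KERN_INFO pr_fmt("only reporting this once\n"));
	}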
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 51042f0ba7e1..2138a7ae050c 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c | |||
@@ -1,5 +1,6 @@ | |||
1 | //#define DEBUG | 1 | //#define DEBUG |
2 | #include <linux/spinlock.h> | 2 | #include <linux/spinlock.h> |
3 | #include <linux/slab.h> | ||
3 | #include <linux/blkdev.h> | 4 | #include <linux/blkdev.h> |
4 | #include <linux/hdreg.h> | 5 | #include <linux/hdreg.h> |
5 | #include <linux/virtio.h> | 6 | #include <linux/virtio.h> |
@@ -243,10 +244,12 @@ static int index_to_minor(int index) | |||
243 | static int __devinit virtblk_probe(struct virtio_device *vdev) | 244 | static int __devinit virtblk_probe(struct virtio_device *vdev) |
244 | { | 245 | { |
245 | struct virtio_blk *vblk; | 246 | struct virtio_blk *vblk; |
247 | struct request_queue *q; | ||
246 | int err; | 248 | int err; |
247 | u64 cap; | 249 | u64 cap; |
248 | u32 v; | 250 | u32 v, blk_size, sg_elems, opt_io_size; |
249 | u32 blk_size, sg_elems; | 251 | u16 min_io_size; |
252 | u8 physical_block_exp, alignment_offset; | ||
250 | 253 | ||
251 | if (index_to_minor(index) >= 1 << MINORBITS) | 254 | if (index_to_minor(index) >= 1 << MINORBITS) |
252 | return -ENOSPC; | 255 | return -ENOSPC; |
@@ -293,13 +296,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
293 | goto out_mempool; | 296 | goto out_mempool; |
294 | } | 297 | } |
295 | 298 | ||
296 | vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); | 299 | q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); |
297 | if (!vblk->disk->queue) { | 300 | if (!q) { |
298 | err = -ENOMEM; | 301 | err = -ENOMEM; |
299 | goto out_put_disk; | 302 | goto out_put_disk; |
300 | } | 303 | } |
301 | 304 | ||
302 | vblk->disk->queue->queuedata = vblk; | 305 | q->queuedata = vblk; |
303 | 306 | ||
304 | if (index < 26) { | 307 | if (index < 26) { |
305 | sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); | 308 | sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); |
@@ -323,10 +326,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
323 | 326 | ||
324 | /* If barriers are supported, tell block layer that queue is ordered */ | 327 | /* If barriers are supported, tell block layer that queue is ordered */ |
325 | if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) | 328 | if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) |
326 | blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_DRAIN_FLUSH, | 329 | blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, |
327 | virtblk_prepare_flush); | 330 | virtblk_prepare_flush); |
328 | else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) | 331 | else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) |
329 | blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); | 332 | blk_queue_ordered(q, QUEUE_ORDERED_TAG, NULL); |
330 | 333 | ||
331 | /* If disk is read-only in the host, the guest should obey */ | 334 | /* If disk is read-only in the host, the guest should obey */ |
332 | if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) | 335 | if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) |
@@ -345,14 +348,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
345 | set_capacity(vblk->disk, cap); | 348 | set_capacity(vblk->disk, cap); |
346 | 349 | ||
347 | /* We can handle whatever the host told us to handle. */ | 350 | /* We can handle whatever the host told us to handle. */ |
348 | blk_queue_max_phys_segments(vblk->disk->queue, vblk->sg_elems-2); | 351 | blk_queue_max_segments(q, vblk->sg_elems-2); |
349 | blk_queue_max_hw_segments(vblk->disk->queue, vblk->sg_elems-2); | ||
350 | 352 | ||
351 | /* No need to bounce any requests */ | 353 | /* No need to bounce any requests */ |
352 | blk_queue_bounce_limit(vblk->disk->queue, BLK_BOUNCE_ANY); | 354 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); |
353 | 355 | ||
354 | /* No real sector limit. */ | 356 | /* No real sector limit. */ |
355 | blk_queue_max_sectors(vblk->disk->queue, -1U); | 357 | blk_queue_max_hw_sectors(q, -1U); |
356 | 358 | ||
357 | /* Host can optionally specify maximum segment size and number of | 359 | /* Host can optionally specify maximum segment size and number of |
358 | * segments. */ | 360 | * segments. */ |
@@ -360,16 +362,45 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
360 | offsetof(struct virtio_blk_config, size_max), | 362 | offsetof(struct virtio_blk_config, size_max), |
361 | &v); | 363 | &v); |
362 | if (!err) | 364 | if (!err) |
363 | blk_queue_max_segment_size(vblk->disk->queue, v); | 365 | blk_queue_max_segment_size(q, v); |
364 | else | 366 | else |
365 | blk_queue_max_segment_size(vblk->disk->queue, -1U); | 367 | blk_queue_max_segment_size(q, -1U); |
366 | 368 | ||
367 | /* Host can optionally specify the block size of the device */ | 369 | /* Host can optionally specify the block size of the device */ |
368 | err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, | 370 | err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, |
369 | offsetof(struct virtio_blk_config, blk_size), | 371 | offsetof(struct virtio_blk_config, blk_size), |
370 | &blk_size); | 372 | &blk_size); |
371 | if (!err) | 373 | if (!err) |
372 | blk_queue_logical_block_size(vblk->disk->queue, blk_size); | 374 | blk_queue_logical_block_size(q, blk_size); |
375 | else | ||
376 | blk_size = queue_logical_block_size(q); | ||
377 | |||
378 | /* Use topology information if available */ | ||
379 | err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, | ||
380 | offsetof(struct virtio_blk_config, physical_block_exp), | ||
381 | &physical_block_exp); | ||
382 | if (!err && physical_block_exp) | ||
383 | blk_queue_physical_block_size(q, | ||
384 | blk_size * (1 << physical_block_exp)); | ||
385 | |||
386 | err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, | ||
387 | offsetof(struct virtio_blk_config, alignment_offset), | ||
388 | &alignment_offset); | ||
389 | if (!err && alignment_offset) | ||
390 | blk_queue_alignment_offset(q, blk_size * alignment_offset); | ||
391 | |||
392 | err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, | ||
393 | offsetof(struct virtio_blk_config, min_io_size), | ||
394 | &min_io_size); | ||
395 | if (!err && min_io_size) | ||
396 | blk_queue_io_min(q, blk_size * min_io_size); | ||
397 | |||
398 | err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, | ||
399 | offsetof(struct virtio_blk_config, opt_io_size), | ||
400 | &opt_io_size); | ||
401 | if (!err && opt_io_size) | ||
402 | blk_queue_io_opt(q, blk_size * opt_io_size); | ||
403 | |||
373 | 404 | ||
374 | add_disk(vblk->disk); | 405 | add_disk(vblk->disk); |
375 | return 0; | 406 | return 0; |
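The new VIRTIO_BLK_F_TOPOLOGY block maps four optional config fields onto the queue's I/O topology: the physical block size is the logical block size scaled by a power-of-two exponent, and the alignment offset, minimum I/O size and optimal I/O size are all given in logical blocks. The arithmetic is easy to sanity-check in userspace; the sample field values below are invented, only the formulas mirror the driver code:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed example values a host might advertise; not from the patch. */
		unsigned int   blk_size = 512;           /* logical block size, bytes */
		unsigned char  physical_block_exp = 3;   /* physical = logical << exp */
		unsigned char  alignment_offset = 0;     /* in logical blocks */
		unsigned short min_io_size = 8;          /* in logical blocks */
		unsigned int   opt_io_size = 256;        /* in logical blocks */

		printf("physical block size: %u bytes\n", blk_size * (1u << physical_block_exp));
		printf("alignment offset:    %u bytes\n", blk_size * alignment_offset);
		printf("minimum I/O size:    %u bytes\n", blk_size * min_io_size);
		printf("optimal I/O size:    %u bytes\n", blk_size * opt_io_size);
		return 0;
	}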
@@ -404,7 +435,7 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) | |||
404 | kfree(vblk); | 435 | kfree(vblk); |
405 | } | 436 | } |
406 | 437 | ||
407 | static struct virtio_device_id id_table[] = { | 438 | static const struct virtio_device_id id_table[] = { |
408 | { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, | 439 | { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, |
409 | { 0 }, | 440 | { 0 }, |
410 | }; | 441 | }; |
@@ -412,7 +443,7 @@ static struct virtio_device_id id_table[] = { | |||
412 | static unsigned int features[] = { | 443 | static unsigned int features[] = { |
413 | VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, | 444 | VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, |
414 | VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, | 445 | VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, |
415 | VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH | 446 | VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY |
416 | }; | 447 | }; |
417 | 448 | ||
418 | /* | 449 | /* |
diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 0877d3628fda..18a80ff57ce8 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/blkpg.h> | 49 | #include <linux/blkpg.h> |
50 | #include <linux/delay.h> | 50 | #include <linux/delay.h> |
51 | #include <linux/io.h> | 51 | #include <linux/io.h> |
52 | #include <linux/gfp.h> | ||
52 | 53 | ||
53 | #include <asm/system.h> | 54 | #include <asm/system.h> |
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
@@ -169,13 +170,6 @@ static int __init xd_init(void) | |||
169 | 170 | ||
170 | init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog; | 171 | init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog; |
171 | 172 | ||
172 | if (!xd_dma_buffer) | ||
173 | xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); | ||
174 | if (!xd_dma_buffer) { | ||
175 | printk(KERN_ERR "xd: Out of memory.\n"); | ||
176 | return -ENOMEM; | ||
177 | } | ||
178 | |||
179 | err = -EBUSY; | 173 | err = -EBUSY; |
180 | if (register_blkdev(XT_DISK_MAJOR, "xd")) | 174 | if (register_blkdev(XT_DISK_MAJOR, "xd")) |
181 | goto out1; | 175 | goto out1; |
@@ -202,6 +196,19 @@ static int __init xd_init(void) | |||
202 | xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma); | 196 | xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma); |
203 | } | 197 | } |
204 | 198 | ||
199 | /* | ||
200 | * With the drive detected, xd_maxsectors should now be known. | ||
201 | * If xd_maxsectors is 0, nothing was detected and we fall through | ||
202 | * to return -ENODEV | ||
203 | */ | ||
204 | if (!xd_dma_buffer && xd_maxsectors) { | ||
205 | xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); | ||
206 | if (!xd_dma_buffer) { | ||
207 | printk(KERN_ERR "xd: Out of memory.\n"); | ||
208 | goto out3; | ||
209 | } | ||
210 | } | ||
211 | |||
205 | err = -ENODEV; | 212 | err = -ENODEV; |
206 | if (!xd_drives) | 213 | if (!xd_drives) |
207 | goto out3; | 214 | goto out3; |
@@ -236,7 +243,7 @@ static int __init xd_init(void) | |||
236 | } | 243 | } |
237 | 244 | ||
238 | /* xd_maxsectors depends on controller - so set after detection */ | 245 | /* xd_maxsectors depends on controller - so set after detection */ |
239 | blk_queue_max_sectors(xd_queue, xd_maxsectors); | 246 | blk_queue_max_hw_sectors(xd_queue, xd_maxsectors); |
240 | 247 | ||
241 | for (i = 0; i < xd_drives; i++) | 248 | for (i = 0; i < xd_drives; i++) |
242 | add_disk(xd_gendisk[i]); | 249 | add_disk(xd_gendisk[i]); |
@@ -249,15 +256,17 @@ out4: | |||
249 | for (i = 0; i < xd_drives; i++) | 256 | for (i = 0; i < xd_drives; i++) |
250 | put_disk(xd_gendisk[i]); | 257 | put_disk(xd_gendisk[i]); |
251 | out3: | 258 | out3: |
252 | release_region(xd_iobase,4); | 259 | if (xd_maxsectors) |
260 | release_region(xd_iobase,4); | ||
261 | |||
262 | if (xd_dma_buffer) | ||
263 | xd_dma_mem_free((unsigned long)xd_dma_buffer, | ||
264 | xd_maxsectors * 0x200); | ||
253 | out2: | 265 | out2: |
254 | blk_cleanup_queue(xd_queue); | 266 | blk_cleanup_queue(xd_queue); |
255 | out1a: | 267 | out1a: |
256 | unregister_blkdev(XT_DISK_MAJOR, "xd"); | 268 | unregister_blkdev(XT_DISK_MAJOR, "xd"); |
257 | out1: | 269 | out1: |
258 | if (xd_dma_buffer) | ||
259 | xd_dma_mem_free((unsigned long)xd_dma_buffer, | ||
260 | xd_maxsectors * 0x200); | ||
261 | return err; | 270 | return err; |
262 | Enomem: | 271 | Enomem: |
263 | err = -ENOMEM; | 272 | err = -ENOMEM; |
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b8578bb3f4c9..82ed403147c0 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -40,8 +40,10 @@ | |||
40 | #include <linux/hdreg.h> | 40 | #include <linux/hdreg.h> |
41 | #include <linux/cdrom.h> | 41 | #include <linux/cdrom.h> |
42 | #include <linux/module.h> | 42 | #include <linux/module.h> |
43 | #include <linux/slab.h> | ||
43 | #include <linux/scatterlist.h> | 44 | #include <linux/scatterlist.h> |
44 | 45 | ||
46 | #include <xen/xen.h> | ||
45 | #include <xen/xenbus.h> | 47 | #include <xen/xenbus.h> |
46 | #include <xen/grant_table.h> | 48 | #include <xen/grant_table.h> |
47 | #include <xen/events.h> | 49 | #include <xen/events.h> |
@@ -345,15 +347,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
345 | 347 | ||
346 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ | 348 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ |
347 | blk_queue_logical_block_size(rq, sector_size); | 349 | blk_queue_logical_block_size(rq, sector_size); |
348 | blk_queue_max_sectors(rq, 512); | 350 | blk_queue_max_hw_sectors(rq, 512); |
349 | 351 | ||
350 | /* Each segment in a request is up to an aligned page in size. */ | 352 | /* Each segment in a request is up to an aligned page in size. */ |
351 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); | 353 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); |
352 | blk_queue_max_segment_size(rq, PAGE_SIZE); | 354 | blk_queue_max_segment_size(rq, PAGE_SIZE); |
353 | 355 | ||
354 | /* Ensure a merged request will fit in a single I/O ring slot. */ | 356 | /* Ensure a merged request will fit in a single I/O ring slot. */ |
355 | blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 357 | blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); |
356 | blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
357 | 358 | ||
358 | /* Make sure buffer addresses are sector-aligned. */ | 359 | /* Make sure buffer addresses are sector-aligned. */ |
359 | blk_queue_dma_alignment(rq, 511); | 360 | blk_queue_dma_alignment(rq, 511); |
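The xlvbd_init_blk_queue() hunk sizes requests so a merged request always fits in a single I/O ring slot: one consolidated limit of BLKIF_MAX_SEGMENTS_PER_REQUEST page-aligned segments of at most a page each, under a 512-sector hardware cap. Assuming the usual 11 segments per ring slot and 4 KiB pages (neither value is stated in the patch), the segment limit rather than the sector cap is what actually bounds a merged request, as the small check below shows:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed: BLKIF_MAX_SEGMENTS_PER_REQUEST == 11, PAGE_SIZE == 4096. */
		unsigned int max_segments   = 11;
		unsigned int page_size      = 4096;
		unsigned int max_hw_sectors = 512;	/* blk_queue_max_hw_sectors(rq, 512) */

		unsigned int by_segments = max_segments * page_size;	/* 45056 bytes */
		unsigned int by_sectors  = max_hw_sectors * 512;	/* 262144 bytes */

		printf("segment-limited size: %u bytes\n", by_segments);
		printf("sector-limited size:  %u bytes\n", by_sectors);
		printf("effective cap:        %u bytes\n",
		       by_segments < by_sectors ? by_segments : by_sectors);
		return 0;
	}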
@@ -1049,7 +1050,7 @@ static const struct block_device_operations xlvbd_block_fops = | |||
1049 | }; | 1050 | }; |
1050 | 1051 | ||
1051 | 1052 | ||
1052 | static struct xenbus_device_id blkfront_ids[] = { | 1053 | static const struct xenbus_device_id blkfront_ids[] = { |
1053 | { "vbd" }, | 1054 | { "vbd" }, |
1054 | { "" } | 1055 | { "" } |
1055 | }; | 1056 | }; |
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index e5c5415eb45e..e1c95e208a66 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c | |||
@@ -1227,7 +1227,7 @@ static int __devexit ace_of_remove(struct of_device *op) | |||
1227 | } | 1227 | } |
1228 | 1228 | ||
1229 | /* Match table for of_platform binding */ | 1229 | /* Match table for of_platform binding */ |
1230 | static struct of_device_id ace_of_match[] __devinitdata = { | 1230 | static const struct of_device_id ace_of_match[] __devinitconst = { |
1231 | { .compatible = "xlnx,opb-sysace-1.00.b", }, | 1231 | { .compatible = "xlnx,opb-sysace-1.00.b", }, |
1232 | { .compatible = "xlnx,opb-sysace-1.00.c", }, | 1232 | { .compatible = "xlnx,opb-sysace-1.00.c", }, |
1233 | { .compatible = "xlnx,xps-sysace-1.00.a", }, | 1233 | { .compatible = "xlnx,xps-sysace-1.00.a", }, |
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 64f941e0f14b..9114654b54d9 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/module.h> | 33 | #include <linux/module.h> |
34 | #include <linux/blkdev.h> | 34 | #include <linux/blkdev.h> |
35 | #include <linux/bitops.h> | 35 | #include <linux/bitops.h> |
36 | #include <linux/slab.h> | ||
36 | 37 | ||
37 | #include <asm/setup.h> | 38 | #include <asm/setup.h> |
38 | #include <asm/amigahw.h> | 39 | #include <asm/amigahw.h> |