Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/DAC960.c | 9
-rw-r--r--  drivers/block/Kconfig | 2
-rw-r--r--  drivers/block/Makefile | 1
-rw-r--r--  drivers/block/amiflop.c | 1
-rw-r--r--  drivers/block/aoe/aoeblk.c | 1
-rw-r--r--  drivers/block/aoe/aoechr.c | 1
-rw-r--r--  drivers/block/aoe/aoecmd.c | 18
-rw-r--r--  drivers/block/aoe/aoedev.c | 1
-rw-r--r--  drivers/block/aoe/aoenet.c | 1
-rw-r--r--  drivers/block/ataflop.c | 5
-rw-r--r--  drivers/block/brd.c | 4
-rw-r--r--  drivers/block/cciss.c | 588
-rw-r--r--  drivers/block/cciss.h | 25
-rw-r--r--  drivers/block/cciss_cmd.h | 171
-rw-r--r--  drivers/block/cciss_scsi.c | 149
-rw-r--r--  drivers/block/cciss_scsi.h | 18
-rw-r--r--  drivers/block/cpqarray.c | 5
-rw-r--r--  drivers/block/drbd/Kconfig | 71
-rw-r--r--  drivers/block/drbd/Makefile | 5
-rw-r--r--  drivers/block/drbd/drbd_actlog.c | 1433
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 1328
-rw-r--r--  drivers/block/drbd/drbd_int.h | 2261
-rw-r--r--  drivers/block/drbd/drbd_main.c | 3716
-rw-r--r--  drivers/block/drbd/drbd_nl.c | 2367
-rw-r--r--  drivers/block/drbd/drbd_proc.c | 264
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 4462
-rw-r--r--  drivers/block/drbd/drbd_req.c | 1125
-rw-r--r--  drivers/block/drbd/drbd_req.h | 326
-rw-r--r--  drivers/block/drbd/drbd_strings.c | 113
-rw-r--r--  drivers/block/drbd/drbd_vli.h | 351
-rw-r--r--  drivers/block/drbd/drbd_worker.c | 1516
-rw-r--r--  drivers/block/drbd/drbd_wrappers.h | 91
-rw-r--r--  drivers/block/floppy.c | 1500
-rw-r--r--  drivers/block/hd.c | 3
-rw-r--r--  drivers/block/loop.c | 3
-rw-r--r--  drivers/block/mg_disk.c | 5
-rw-r--r--  drivers/block/nbd.c | 1
-rw-r--r--  drivers/block/osdblk.c | 13
-rw-r--r--  drivers/block/paride/pcd.c | 4
-rw-r--r--  drivers/block/paride/pd.c | 3
-rw-r--r--  drivers/block/paride/pf.c | 7
-rw-r--r--  drivers/block/paride/pt.c | 4
-rw-r--r--  drivers/block/pktcdvd.c | 131
-rw-r--r--  drivers/block/ps3disk.c | 6
-rw-r--r--  drivers/block/ps3vram.c | 18
-rw-r--r--  drivers/block/sunvdc.c | 5
-rw-r--r--  drivers/block/swim.c | 5
-rw-r--r--  drivers/block/swim3.c | 39
-rw-r--r--  drivers/block/sx8.c | 5
-rw-r--r--  drivers/block/ub.c | 8
-rw-r--r--  drivers/block/umem.c | 2
-rw-r--r--  drivers/block/viodasd.c | 91
-rw-r--r--  drivers/block/virtio_blk.c | 63
-rw-r--r--  drivers/block/xd.c | 33
-rw-r--r--  drivers/block/xen-blkfront.c | 9
-rw-r--r--  drivers/block/xsysace.c | 2
-rw-r--r--  drivers/block/z2ram.c | 1
57 files changed, 20858 insertions(+), 1532 deletions(-)
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index eb4fa1943944..c5f22bb0a48e 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -2533,9 +2533,8 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
2533 Controller->RequestQueue[n] = RequestQueue; 2533 Controller->RequestQueue[n] = RequestQueue;
2534 blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); 2534 blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
2535 RequestQueue->queuedata = Controller; 2535 RequestQueue->queuedata = Controller;
2536 blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit); 2536 blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
2537 blk_queue_max_phys_segments(RequestQueue, Controller->DriverScatterGatherLimit); 2537 blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
2538 blk_queue_max_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
2539 disk->queue = RequestQueue; 2538 disk->queue = RequestQueue;
2540 sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n); 2539 sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n);
2541 disk->major = MajorNumber; 2540 disk->major = MajorNumber;
@@ -7101,7 +7100,7 @@ static struct DAC960_privdata DAC960_BA_privdata = {
7101 7100
7102static struct DAC960_privdata DAC960_LP_privdata = { 7101static struct DAC960_privdata DAC960_LP_privdata = {
7103 .HardwareType = DAC960_LP_Controller, 7102 .HardwareType = DAC960_LP_Controller,
7104 .FirmwareType = DAC960_LP_Controller, 7103 .FirmwareType = DAC960_V2_Controller,
7105 .InterruptHandler = DAC960_LP_InterruptHandler, 7104 .InterruptHandler = DAC960_LP_InterruptHandler,
7106 .MemoryWindowSize = DAC960_LP_RegisterWindowSize, 7105 .MemoryWindowSize = DAC960_LP_RegisterWindowSize,
7107}; 7106};
@@ -7134,7 +7133,7 @@ static struct DAC960_privdata DAC960_P_privdata = {
7134 .MemoryWindowSize = DAC960_PD_RegisterWindowSize, 7133 .MemoryWindowSize = DAC960_PD_RegisterWindowSize,
7135}; 7134};
7136 7135
7137static struct pci_device_id DAC960_id_table[] = { 7136static const struct pci_device_id DAC960_id_table[] = {
7138 { 7137 {
7139 .vendor = PCI_VENDOR_ID_MYLEX, 7138 .vendor = PCI_VENDOR_ID_MYLEX,
7140 .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM, 7139 .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM,
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
271 instead, which can be configured to be on-disk compatible with the 271 instead, which can be configured to be on-disk compatible with the
272 cryptoloop device. 272 cryptoloop device.
273 273
274source "drivers/block/drbd/Kconfig"
275
274config BLK_DEV_NBD 276config BLK_DEV_NBD
275 tristate "Network block device support" 277 tristate "Network block device support"
276 depends on NET 278 depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
36obj-$(CONFIG_BLK_DEV_HD) += hd.o 36obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
39 40
40swim_mod-objs := swim.o swim_asm.o 41swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 055225839024..0182a22c423a 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -54,6 +54,7 @@
54 */ 54 */
55 55
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/slab.h>
57 58
58#include <linux/fd.h> 59#include <linux/fd.h>
59#include <linux/hdreg.h> 60#include <linux/hdreg.h>
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 3af97d4da2db..035cefe4045a 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -9,6 +9,7 @@
9#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/ioctl.h> 11#include <linux/ioctl.h>
12#include <linux/slab.h>
12#include <linux/genhd.h> 13#include <linux/genhd.h>
13#include <linux/netdevice.h> 14#include <linux/netdevice.h>
14#include "aoe.h" 15#include "aoe.h"
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 62141ec09a22..4a1b9e7464aa 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -8,6 +8,7 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/slab.h>
11#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
12#include <linux/skbuff.h> 13#include <linux/skbuff.h>
13#include "aoe.h" 14#include "aoe.h"
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 13bb69d2abb3..5674bd01d96d 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -5,6 +5,7 @@
5 */ 5 */
6 6
7#include <linux/ata.h> 7#include <linux/ata.h>
8#include <linux/slab.h>
8#include <linux/hdreg.h> 9#include <linux/hdreg.h>
9#include <linux/blkdev.h> 10#include <linux/blkdev.h>
10#include <linux/skbuff.h> 11#include <linux/skbuff.h>
@@ -735,21 +736,6 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
735 part_stat_unlock(); 736 part_stat_unlock();
736} 737}
737 738
738/*
739 * Ensure we don't create aliases in VI caches
740 */
741static inline void
742killalias(struct bio *bio)
743{
744 struct bio_vec *bv;
745 int i;
746
747 if (bio_data_dir(bio) == READ)
748 __bio_for_each_segment(bv, bio, i, 0) {
749 flush_dcache_page(bv->bv_page);
750 }
751}
752
753void 739void
754aoecmd_ata_rsp(struct sk_buff *skb) 740aoecmd_ata_rsp(struct sk_buff *skb)
755{ 741{
@@ -871,7 +857,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
871 if (buf->flags & BUFFL_FAIL) 857 if (buf->flags & BUFFL_FAIL)
872 bio_endio(buf->bio, -EIO); 858 bio_endio(buf->bio, -EIO);
873 else { 859 else {
874 killalias(buf->bio); 860 bio_flush_dcache_pages(buf->bio);
875 bio_endio(buf->bio, 0); 861 bio_endio(buf->bio, 0);
876 } 862 }
877 mempool_free(buf, d->bufpool); 863 mempool_free(buf, d->bufpool);
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index fa67027789aa..0849280bfc1c 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -8,6 +8,7 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/netdevice.h> 9#include <linux/netdevice.h>
10#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/slab.h>
11#include "aoe.h" 12#include "aoe.h"
12 13
13static void dummy_timer(ulong); 14static void dummy_timer(ulong);
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index ce0d62cd71b2..4d3bc0d49df5 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -4,6 +4,7 @@
4 * Ethernet portion of AoE driver 4 * Ethernet portion of AoE driver
5 */ 5 */
6 6
7#include <linux/gfp.h>
7#include <linux/hdreg.h> 8#include <linux/hdreg.h>
8#include <linux/blkdev.h> 9#include <linux/blkdev.h>
9#include <linux/netdevice.h> 10#include <linux/netdevice.h>
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 847a9e57570a..e35cf59cbfde 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1470,18 +1470,13 @@ repeat:
1470 1470
1471void do_fd_request(struct request_queue * q) 1471void do_fd_request(struct request_queue * q)
1472{ 1472{
1473 unsigned long flags;
1474
1475 DPRINT(("do_fd_request for pid %d\n",current->pid)); 1473 DPRINT(("do_fd_request for pid %d\n",current->pid));
1476 while( fdc_busy ) sleep_on( &fdc_wait ); 1474 while( fdc_busy ) sleep_on( &fdc_wait );
1477 fdc_busy = 1; 1475 fdc_busy = 1;
1478 stdma_lock(floppy_irq, NULL); 1476 stdma_lock(floppy_irq, NULL);
1479 1477
1480 atari_disable_irq( IRQ_MFP_FDC ); 1478 atari_disable_irq( IRQ_MFP_FDC );
1481 local_save_flags(flags); /* The request function is called with ints
1482 local_irq_disable(); * disabled... so must save the IPL for later */
1483 redo_fd_request(); 1479 redo_fd_request();
1484 local_irq_restore(flags);
1485 atari_enable_irq( IRQ_MFP_FDC ); 1480 atari_enable_irq( IRQ_MFP_FDC );
1486} 1481}
1487 1482
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 4f688434daf1..6081e81d5738 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,9 +15,9 @@
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/bio.h> 16#include <linux/bio.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/gfp.h>
19#include <linux/radix-tree.h> 18#include <linux/radix-tree.h>
20#include <linux/buffer_head.h> /* invalidate_bh_lrus() */ 19#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
20#include <linux/slab.h>
21 21
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23 23
@@ -434,7 +434,7 @@ static struct brd_device *brd_alloc(int i)
434 goto out_free_dev; 434 goto out_free_dev;
435 blk_queue_make_request(brd->brd_queue, brd_make_request); 435 blk_queue_make_request(brd->brd_queue, brd_make_request);
436 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL); 436 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL);
437 blk_queue_max_sectors(brd->brd_queue, 1024); 437 blk_queue_max_hw_sectors(brd->brd_queue, 1024);
438 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 438 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
439 439
440 disk = brd->brd_disk = alloc_disk(1 << part_shift); 440 disk = brd->brd_disk = alloc_disk(1 << part_shift);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 92b126394fa1..eb5ff0531cfb 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
179static int deregister_disk(ctlr_info_t *h, int drv_index, 179static int deregister_disk(ctlr_info_t *h, int drv_index,
180 int clear_all, int via_ioctl); 180 int clear_all, int via_ioctl);
181 181
182static void cciss_read_capacity(int ctlr, int logvol, int withirq, 182static void cciss_read_capacity(int ctlr, int logvol,
183 sector_t *total_size, unsigned int *block_size); 183 sector_t *total_size, unsigned int *block_size);
184static void cciss_read_capacity_16(int ctlr, int logvol, int withirq, 184static void cciss_read_capacity_16(int ctlr, int logvol,
185 sector_t *total_size, unsigned int *block_size); 185 sector_t *total_size, unsigned int *block_size);
186static void cciss_geometry_inquiry(int ctlr, int logvol, 186static void cciss_geometry_inquiry(int ctlr, int logvol,
187 int withirq, sector_t total_size, 187 sector_t total_size,
188 unsigned int block_size, InquiryData_struct *inq_buff, 188 unsigned int block_size, InquiryData_struct *inq_buff,
189 drive_info_struct *drv); 189 drive_info_struct *drv);
190static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, 190static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
191 __u32); 191 __u32);
192static void start_io(ctlr_info_t *h); 192static void start_io(ctlr_info_t *h);
193static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
194 __u8 page_code, unsigned char *scsi3addr, int cmd_type);
195static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, 193static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
196 __u8 page_code, unsigned char scsi3addr[], 194 __u8 page_code, unsigned char scsi3addr[],
197 int cmd_type); 195 int cmd_type);
@@ -259,6 +257,79 @@ static inline void removeQ(CommandList_struct *c)
259 hlist_del_init(&c->list); 257 hlist_del_init(&c->list);
260} 258}
261 259
260static void cciss_free_sg_chain_blocks(SGDescriptor_struct **cmd_sg_list,
261 int nr_cmds)
262{
263 int i;
264
265 if (!cmd_sg_list)
266 return;
267 for (i = 0; i < nr_cmds; i++) {
268 kfree(cmd_sg_list[i]);
269 cmd_sg_list[i] = NULL;
270 }
271 kfree(cmd_sg_list);
272}
273
274static SGDescriptor_struct **cciss_allocate_sg_chain_blocks(
275 ctlr_info_t *h, int chainsize, int nr_cmds)
276{
277 int j;
278 SGDescriptor_struct **cmd_sg_list;
279
280 if (chainsize <= 0)
281 return NULL;
282
283 cmd_sg_list = kmalloc(sizeof(*cmd_sg_list) * nr_cmds, GFP_KERNEL);
284 if (!cmd_sg_list)
285 return NULL;
286
287 /* Build up chain blocks for each command */
288 for (j = 0; j < nr_cmds; j++) {
289 /* Need a block of chainsized s/g elements. */
290 cmd_sg_list[j] = kmalloc((chainsize *
291 sizeof(*cmd_sg_list[j])), GFP_KERNEL);
292 if (!cmd_sg_list[j]) {
293 dev_err(&h->pdev->dev, "Cannot get memory "
294 "for s/g chains.\n");
295 goto clean;
296 }
297 }
298 return cmd_sg_list;
299clean:
300 cciss_free_sg_chain_blocks(cmd_sg_list, nr_cmds);
301 return NULL;
302}
303
304static void cciss_unmap_sg_chain_block(ctlr_info_t *h, CommandList_struct *c)
305{
306 SGDescriptor_struct *chain_sg;
307 u64bit temp64;
308
309 if (c->Header.SGTotal <= h->max_cmd_sgentries)
310 return;
311
312 chain_sg = &c->SG[h->max_cmd_sgentries - 1];
313 temp64.val32.lower = chain_sg->Addr.lower;
314 temp64.val32.upper = chain_sg->Addr.upper;
315 pci_unmap_single(h->pdev, temp64.val, chain_sg->Len, PCI_DMA_TODEVICE);
316}
317
318static void cciss_map_sg_chain_block(ctlr_info_t *h, CommandList_struct *c,
319 SGDescriptor_struct *chain_block, int len)
320{
321 SGDescriptor_struct *chain_sg;
322 u64bit temp64;
323
324 chain_sg = &c->SG[h->max_cmd_sgentries - 1];
325 chain_sg->Ext = CCISS_SG_CHAIN;
326 chain_sg->Len = len;
327 temp64.val = pci_map_single(h->pdev, chain_block, len,
328 PCI_DMA_TODEVICE);
329 chain_sg->Addr.lower = temp64.val32.lower;
330 chain_sg->Addr.upper = temp64.val32.upper;
331}
332
262#include "cciss_scsi.c" /* For SCSI tape support */ 333#include "cciss_scsi.c" /* For SCSI tape support */
263 334
264static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG", 335static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
@@ -339,6 +410,9 @@ static int cciss_seq_show(struct seq_file *seq, void *v)
339 if (*pos > h->highest_lun) 410 if (*pos > h->highest_lun)
340 return 0; 411 return 0;
341 412
413 if (drv == NULL) /* it's possible for h->drv[] to have holes. */
414 return 0;
415
342 if (drv->heads == 0) 416 if (drv->heads == 0)
343 return 0; 417 return 0;
344 418
@@ -424,12 +498,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
424 if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { 498 if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
425 struct seq_file *seq = file->private_data; 499 struct seq_file *seq = file->private_data;
426 ctlr_info_t *h = seq->private; 500 ctlr_info_t *h = seq->private;
427 int rc;
428 501
429 rc = cciss_engage_scsi(h->ctlr); 502 err = cciss_engage_scsi(h->ctlr);
430 if (rc != 0) 503 if (err == 0)
431 err = -rc;
432 else
433 err = length; 504 err = length;
434 } else 505 } else
435#endif /* CONFIG_CISS_SCSI_TAPE */ 506#endif /* CONFIG_CISS_SCSI_TAPE */
@@ -1346,26 +1417,27 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
1346 kfree(buff); 1417 kfree(buff);
1347 return -ENOMEM; 1418 return -ENOMEM;
1348 } 1419 }
1349 // Fill in the command type 1420 /* Fill in the command type */
1350 c->cmd_type = CMD_IOCTL_PEND; 1421 c->cmd_type = CMD_IOCTL_PEND;
1351 // Fill in Command Header 1422 /* Fill in Command Header */
1352 c->Header.ReplyQueue = 0; // unused in simple mode 1423 c->Header.ReplyQueue = 0; /* unused in simple mode */
1353 if (iocommand.buf_size > 0) // buffer to fill 1424 if (iocommand.buf_size > 0) /* buffer to fill */
1354 { 1425 {
1355 c->Header.SGList = 1; 1426 c->Header.SGList = 1;
1356 c->Header.SGTotal = 1; 1427 c->Header.SGTotal = 1;
1357 } else // no buffers to fill 1428 } else /* no buffers to fill */
1358 { 1429 {
1359 c->Header.SGList = 0; 1430 c->Header.SGList = 0;
1360 c->Header.SGTotal = 0; 1431 c->Header.SGTotal = 0;
1361 } 1432 }
1362 c->Header.LUN = iocommand.LUN_info; 1433 c->Header.LUN = iocommand.LUN_info;
1363 c->Header.Tag.lower = c->busaddr; // use the kernel address the cmd block for tag 1434 /* use the kernel address the cmd block for tag */
1435 c->Header.Tag.lower = c->busaddr;
1364 1436
1365 // Fill in Request block 1437 /* Fill in Request block */
1366 c->Request = iocommand.Request; 1438 c->Request = iocommand.Request;
1367 1439
1368 // Fill in the scatter gather information 1440 /* Fill in the scatter gather information */
1369 if (iocommand.buf_size > 0) { 1441 if (iocommand.buf_size > 0) {
1370 temp64.val = pci_map_single(host->pdev, buff, 1442 temp64.val = pci_map_single(host->pdev, buff,
1371 iocommand.buf_size, 1443 iocommand.buf_size,
@@ -1373,7 +1445,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
1373 c->SG[0].Addr.lower = temp64.val32.lower; 1445 c->SG[0].Addr.lower = temp64.val32.lower;
1374 c->SG[0].Addr.upper = temp64.val32.upper; 1446 c->SG[0].Addr.upper = temp64.val32.upper;
1375 c->SG[0].Len = iocommand.buf_size; 1447 c->SG[0].Len = iocommand.buf_size;
1376 c->SG[0].Ext = 0; // we are not chaining 1448 c->SG[0].Ext = 0; /* we are not chaining */
1377 } 1449 }
1378 c->waiting = &wait; 1450 c->waiting = &wait;
1379 1451
@@ -1657,9 +1729,11 @@ static void cciss_softirq_done(struct request *rq)
1657{ 1729{
1658 CommandList_struct *cmd = rq->completion_data; 1730 CommandList_struct *cmd = rq->completion_data;
1659 ctlr_info_t *h = hba[cmd->ctlr]; 1731 ctlr_info_t *h = hba[cmd->ctlr];
1732 SGDescriptor_struct *curr_sg = cmd->SG;
1660 unsigned long flags; 1733 unsigned long flags;
1661 u64bit temp64; 1734 u64bit temp64;
1662 int i, ddir; 1735 int i, ddir;
1736 int sg_index = 0;
1663 1737
1664 if (cmd->Request.Type.Direction == XFER_READ) 1738 if (cmd->Request.Type.Direction == XFER_READ)
1665 ddir = PCI_DMA_FROMDEVICE; 1739 ddir = PCI_DMA_FROMDEVICE;
@@ -1669,9 +1743,17 @@ static void cciss_softirq_done(struct request *rq)
1669 /* command did not need to be retried */ 1743 /* command did not need to be retried */
1670 /* unmap the DMA mapping for all the scatter gather elements */ 1744 /* unmap the DMA mapping for all the scatter gather elements */
1671 for (i = 0; i < cmd->Header.SGList; i++) { 1745 for (i = 0; i < cmd->Header.SGList; i++) {
1672 temp64.val32.lower = cmd->SG[i].Addr.lower; 1746 if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
1673 temp64.val32.upper = cmd->SG[i].Addr.upper; 1747 cciss_unmap_sg_chain_block(h, cmd);
1674 pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); 1748 /* Point to the next block */
1749 curr_sg = h->cmd_sg_list[cmd->cmdindex];
1750 sg_index = 0;
1751 }
1752 temp64.val32.lower = curr_sg[sg_index].Addr.lower;
1753 temp64.val32.upper = curr_sg[sg_index].Addr.upper;
1754 pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
1755 ddir);
1756 ++sg_index;
1675 } 1757 }
1676 1758
1677#ifdef CCISS_DEBUG 1759#ifdef CCISS_DEBUG
@@ -1701,7 +1783,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
1701 * via the inquiry page 0. Model, vendor, and rev are set to empty strings if 1783 * via the inquiry page 0. Model, vendor, and rev are set to empty strings if
1702 * they cannot be read. 1784 * they cannot be read.
1703 */ 1785 */
1704static void cciss_get_device_descr(int ctlr, int logvol, int withirq, 1786static void cciss_get_device_descr(int ctlr, int logvol,
1705 char *vendor, char *model, char *rev) 1787 char *vendor, char *model, char *rev)
1706{ 1788{
1707 int rc; 1789 int rc;
@@ -1717,14 +1799,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
1717 return; 1799 return;
1718 1800
1719 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 1801 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
1720 if (withirq) 1802 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0,
1721 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, 1803 scsi3addr, TYPE_CMD);
1722 sizeof(InquiryData_struct), 0,
1723 scsi3addr, TYPE_CMD);
1724 else
1725 rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
1726 sizeof(InquiryData_struct), 0,
1727 scsi3addr, TYPE_CMD);
1728 if (rc == IO_OK) { 1804 if (rc == IO_OK) {
1729 memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); 1805 memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
1730 vendor[VENDOR_LEN] = '\0'; 1806 vendor[VENDOR_LEN] = '\0';
@@ -1743,7 +1819,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
1743 * number cannot be had, for whatever reason, 16 bytes of 0xff 1819 * number cannot be had, for whatever reason, 16 bytes of 0xff
1744 * are returned instead. 1820 * are returned instead.
1745 */ 1821 */
1746static void cciss_get_serial_no(int ctlr, int logvol, int withirq, 1822static void cciss_get_serial_no(int ctlr, int logvol,
1747 unsigned char *serial_no, int buflen) 1823 unsigned char *serial_no, int buflen)
1748{ 1824{
1749#define PAGE_83_INQ_BYTES 64 1825#define PAGE_83_INQ_BYTES 64
@@ -1759,12 +1835,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
1759 return; 1835 return;
1760 memset(serial_no, 0, buflen); 1836 memset(serial_no, 0, buflen);
1761 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 1837 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
1762 if (withirq) 1838 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
1763 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, 1839 PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
1764 PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
1765 else
1766 rc = sendcmd(CISS_INQUIRY, ctlr, buf,
1767 PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
1768 if (rc == IO_OK) 1840 if (rc == IO_OK)
1769 memcpy(serial_no, &buf[8], buflen); 1841 memcpy(serial_no, &buf[8], buflen);
1770 kfree(buf); 1842 kfree(buf);
@@ -1793,12 +1865,9 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
1793 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); 1865 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
1794 1866
1795 /* This is a hardware imposed limit. */ 1867 /* This is a hardware imposed limit. */
1796 blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES); 1868 blk_queue_max_segments(disk->queue, h->maxsgentries);
1797
1798 /* This is a limit in the driver and could be eliminated. */
1799 blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
1800 1869
1801 blk_queue_max_sectors(disk->queue, h->cciss_max_sectors); 1870 blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors);
1802 1871
1803 blk_queue_softirq_done(disk->queue, cciss_softirq_done); 1872 blk_queue_softirq_done(disk->queue, cciss_softirq_done);
1804 1873
@@ -1852,18 +1921,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
1852 1921
1853 /* testing to see if 16-byte CDBs are already being used */ 1922 /* testing to see if 16-byte CDBs are already being used */
1854 if (h->cciss_read == CCISS_READ_16) { 1923 if (h->cciss_read == CCISS_READ_16) {
1855 cciss_read_capacity_16(h->ctlr, drv_index, 1, 1924 cciss_read_capacity_16(h->ctlr, drv_index,
1856 &total_size, &block_size); 1925 &total_size, &block_size);
1857 1926
1858 } else { 1927 } else {
1859 cciss_read_capacity(ctlr, drv_index, 1, 1928 cciss_read_capacity(ctlr, drv_index, &total_size, &block_size);
1860 &total_size, &block_size);
1861
1862 /* if read_capacity returns all F's this volume is >2TB */ 1929 /* if read_capacity returns all F's this volume is >2TB */
1863 /* in size so we switch to 16-byte CDB's for all */ 1930 /* in size so we switch to 16-byte CDB's for all */
1864 /* read/write ops */ 1931 /* read/write ops */
1865 if (total_size == 0xFFFFFFFFULL) { 1932 if (total_size == 0xFFFFFFFFULL) {
1866 cciss_read_capacity_16(ctlr, drv_index, 1, 1933 cciss_read_capacity_16(ctlr, drv_index,
1867 &total_size, &block_size); 1934 &total_size, &block_size);
1868 h->cciss_read = CCISS_READ_16; 1935 h->cciss_read = CCISS_READ_16;
1869 h->cciss_write = CCISS_WRITE_16; 1936 h->cciss_write = CCISS_WRITE_16;
@@ -1873,14 +1940,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
1873 } 1940 }
1874 } 1941 }
1875 1942
1876 cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size, 1943 cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size,
1877 inq_buff, drvinfo); 1944 inq_buff, drvinfo);
1878 drvinfo->block_size = block_size; 1945 drvinfo->block_size = block_size;
1879 drvinfo->nr_blocks = total_size + 1; 1946 drvinfo->nr_blocks = total_size + 1;
1880 1947
1881 cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor, 1948 cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor,
1882 drvinfo->model, drvinfo->rev); 1949 drvinfo->model, drvinfo->rev);
1883 cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no, 1950 cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no,
1884 sizeof(drvinfo->serial_no)); 1951 sizeof(drvinfo->serial_no));
1885 /* Save the lunid in case we deregister the disk, below. */ 1952 /* Save the lunid in case we deregister the disk, below. */
1886 memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, 1953 memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
@@ -2424,7 +2491,7 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
2424 c->Request.Type.Direction = XFER_READ; 2491 c->Request.Type.Direction = XFER_READ;
2425 c->Request.Timeout = 0; 2492 c->Request.Timeout = 0;
2426 c->Request.CDB[0] = cmd; 2493 c->Request.CDB[0] = cmd;
2427 c->Request.CDB[6] = (size >> 24) & 0xFF; //MSB 2494 c->Request.CDB[6] = (size >> 24) & 0xFF; /* MSB */
2428 c->Request.CDB[7] = (size >> 16) & 0xFF; 2495 c->Request.CDB[7] = (size >> 16) & 0xFF;
2429 c->Request.CDB[8] = (size >> 8) & 0xFF; 2496 c->Request.CDB[8] = (size >> 8) & 0xFF;
2430 c->Request.CDB[9] = size & 0xFF; 2497 c->Request.CDB[9] = size & 0xFF;
@@ -2531,6 +2598,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
2531 case 0: return IO_OK; /* no sense */ 2598 case 0: return IO_OK; /* no sense */
2532 case 1: return IO_OK; /* recovered error */ 2599 case 1: return IO_OK; /* recovered error */
2533 default: 2600 default:
2601 if (check_for_unit_attention(h, c))
2602 return IO_NEEDS_RETRY;
2534 printk(KERN_WARNING "cciss%d: cmd 0x%02x " 2603 printk(KERN_WARNING "cciss%d: cmd 0x%02x "
2535 "check condition, sense key = 0x%02x\n", 2604 "check condition, sense key = 0x%02x\n",
2536 h->ctlr, c->Request.CDB[0], 2605 h->ctlr, c->Request.CDB[0],
@@ -2672,7 +2741,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
2672} 2741}
2673 2742
2674static void cciss_geometry_inquiry(int ctlr, int logvol, 2743static void cciss_geometry_inquiry(int ctlr, int logvol,
2675 int withirq, sector_t total_size, 2744 sector_t total_size,
2676 unsigned int block_size, 2745 unsigned int block_size,
2677 InquiryData_struct *inq_buff, 2746 InquiryData_struct *inq_buff,
2678 drive_info_struct *drv) 2747 drive_info_struct *drv)
@@ -2683,21 +2752,15 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
2683 2752
2684 memset(inq_buff, 0, sizeof(InquiryData_struct)); 2753 memset(inq_buff, 0, sizeof(InquiryData_struct));
2685 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2754 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
2686 if (withirq) 2755 return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff,
2687 return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, 2756 sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
2688 inq_buff, sizeof(*inq_buff),
2689 0xC1, scsi3addr, TYPE_CMD);
2690 else
2691 return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
2692 sizeof(*inq_buff), 0xC1, scsi3addr,
2693 TYPE_CMD);
2694 if (return_code == IO_OK) { 2757 if (return_code == IO_OK) {
2695 if (inq_buff->data_byte[8] == 0xFF) { 2758 if (inq_buff->data_byte[8] == 0xFF) {
2696 printk(KERN_WARNING 2759 printk(KERN_WARNING
2697 "cciss: reading geometry failed, volume " 2760 "cciss: reading geometry failed, volume "
2698 "does not support reading geometry\n"); 2761 "does not support reading geometry\n");
2699 drv->heads = 255; 2762 drv->heads = 255;
2700 drv->sectors = 32; // Sectors per track 2763 drv->sectors = 32; /* Sectors per track */
2701 drv->cylinders = total_size + 1; 2764 drv->cylinders = total_size + 1;
2702 drv->raid_level = RAID_UNKNOWN; 2765 drv->raid_level = RAID_UNKNOWN;
2703 } else { 2766 } else {
@@ -2723,7 +2786,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
2723} 2786}
2724 2787
2725static void 2788static void
2726cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, 2789cciss_read_capacity(int ctlr, int logvol, sector_t *total_size,
2727 unsigned int *block_size) 2790 unsigned int *block_size)
2728{ 2791{
2729 ReadCapdata_struct *buf; 2792 ReadCapdata_struct *buf;
@@ -2737,14 +2800,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
2737 } 2800 }
2738 2801
2739 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2802 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
2740 if (withirq) 2803 return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf,
2741 return_code = sendcmd_withirq(CCISS_READ_CAPACITY, 2804 sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
2742 ctlr, buf, sizeof(ReadCapdata_struct),
2743 0, scsi3addr, TYPE_CMD);
2744 else
2745 return_code = sendcmd(CCISS_READ_CAPACITY,
2746 ctlr, buf, sizeof(ReadCapdata_struct),
2747 0, scsi3addr, TYPE_CMD);
2748 if (return_code == IO_OK) { 2805 if (return_code == IO_OK) {
2749 *total_size = be32_to_cpu(*(__be32 *) buf->total_size); 2806 *total_size = be32_to_cpu(*(__be32 *) buf->total_size);
2750 *block_size = be32_to_cpu(*(__be32 *) buf->block_size); 2807 *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2756,8 +2813,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
2756 kfree(buf); 2813 kfree(buf);
2757} 2814}
2758 2815
2759static void 2816static void cciss_read_capacity_16(int ctlr, int logvol,
2760cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size) 2817 sector_t *total_size, unsigned int *block_size)
2761{ 2818{
2762 ReadCapdata_struct_16 *buf; 2819 ReadCapdata_struct_16 *buf;
2763 int return_code; 2820 int return_code;
@@ -2770,16 +2827,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
2770 } 2827 }
2771 2828
2772 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2829 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
2773 if (withirq) { 2830 return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
2774 return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, 2831 ctlr, buf, sizeof(ReadCapdata_struct_16),
2775 ctlr, buf, sizeof(ReadCapdata_struct_16), 2832 0, scsi3addr, TYPE_CMD);
2776 0, scsi3addr, TYPE_CMD);
2777 }
2778 else {
2779 return_code = sendcmd(CCISS_READ_CAPACITY_16,
2780 ctlr, buf, sizeof(ReadCapdata_struct_16),
2781 0, scsi3addr, TYPE_CMD);
2782 }
2783 if (return_code == IO_OK) { 2833 if (return_code == IO_OK) {
2784 *total_size = be64_to_cpu(*(__be64 *) buf->total_size); 2834 *total_size = be64_to_cpu(*(__be64 *) buf->total_size);
2785 *block_size = be32_to_cpu(*(__be32 *) buf->block_size); 2835 *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2820,13 +2870,13 @@ static int cciss_revalidate(struct gendisk *disk)
2820 return 1; 2870 return 1;
2821 } 2871 }
2822 if (h->cciss_read == CCISS_READ_10) { 2872 if (h->cciss_read == CCISS_READ_10) {
2823 cciss_read_capacity(h->ctlr, logvol, 1, 2873 cciss_read_capacity(h->ctlr, logvol,
2824 &total_size, &block_size); 2874 &total_size, &block_size);
2825 } else { 2875 } else {
2826 cciss_read_capacity_16(h->ctlr, logvol, 1, 2876 cciss_read_capacity_16(h->ctlr, logvol,
2827 &total_size, &block_size); 2877 &total_size, &block_size);
2828 } 2878 }
2829 cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, 2879 cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size,
2830 inq_buff, drv); 2880 inq_buff, drv);
2831 2881
2832 blk_queue_logical_block_size(drv->queue, drv->block_size); 2882 blk_queue_logical_block_size(drv->queue, drv->block_size);
@@ -2837,167 +2887,6 @@ static int cciss_revalidate(struct gendisk *disk)
2837} 2887}
2838 2888
2839/* 2889/*
2840 * Wait polling for a command to complete.
2841 * The memory mapped FIFO is polled for the completion.
2842 * Used only at init time, interrupts from the HBA are disabled.
2843 */
2844static unsigned long pollcomplete(int ctlr)
2845{
2846 unsigned long done;
2847 int i;
2848
2849 /* Wait (up to 20 seconds) for a command to complete */
2850
2851 for (i = 20 * HZ; i > 0; i--) {
2852 done = hba[ctlr]->access.command_completed(hba[ctlr]);
2853 if (done == FIFO_EMPTY)
2854 schedule_timeout_uninterruptible(1);
2855 else
2856 return done;
2857 }
2858 /* Invalid address to tell caller we ran out of time */
2859 return 1;
2860}
2861
2862/* Send command c to controller h and poll for it to complete.
2863 * Turns interrupts off on the board. Used at driver init time
2864 * and during SCSI error recovery.
2865 */
2866static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
2867{
2868 int i;
2869 unsigned long complete;
2870 int status = IO_ERROR;
2871 u64bit buff_dma_handle;
2872
2873resend_cmd1:
2874
2875 /* Disable interrupt on the board. */
2876 h->access.set_intr_mask(h, CCISS_INTR_OFF);
2877
2878 /* Make sure there is room in the command FIFO */
2879 /* Actually it should be completely empty at this time */
2880 /* unless we are in here doing error handling for the scsi */
2881 /* tape side of the driver. */
2882 for (i = 200000; i > 0; i--) {
2883 /* if fifo isn't full go */
2884 if (!(h->access.fifo_full(h)))
2885 break;
2886 udelay(10);
2887 printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
2888 " waiting!\n", h->ctlr);
2889 }
2890 h->access.submit_command(h, c); /* Send the cmd */
2891 do {
2892 complete = pollcomplete(h->ctlr);
2893
2894#ifdef CCISS_DEBUG
2895 printk(KERN_DEBUG "cciss: command completed\n");
2896#endif /* CCISS_DEBUG */
2897
2898 if (complete == 1) {
2899 printk(KERN_WARNING
2900 "cciss cciss%d: SendCmd Timeout out, "
2901 "No command list address returned!\n", h->ctlr);
2902 status = IO_ERROR;
2903 break;
2904 }
2905
2906 /* Make sure it's the command we're expecting. */
2907 if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
2908 printk(KERN_WARNING "cciss%d: Unexpected command "
2909 "completion.\n", h->ctlr);
2910 continue;
2911 }
2912
2913 /* It is our command. If no error, we're done. */
2914 if (!(complete & CISS_ERROR_BIT)) {
2915 status = IO_OK;
2916 break;
2917 }
2918
2919 /* There is an error... */
2920
2921 /* if data overrun or underun on Report command ignore it */
2922 if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
2923 (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
2924 (c->Request.CDB[0] == CISS_INQUIRY)) &&
2925 ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
2926 (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
2927 complete = c->busaddr;
2928 status = IO_OK;
2929 break;
2930 }
2931
2932 if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
2933 printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
2934 h->ctlr, c);
2935 if (c->retry_count < MAX_CMD_RETRIES) {
2936 printk(KERN_WARNING "cciss%d: retrying %p\n",
2937 h->ctlr, c);
2938 c->retry_count++;
2939 /* erase the old error information */
2940 memset(c->err_info, 0, sizeof(c->err_info));
2941 goto resend_cmd1;
2942 }
2943 printk(KERN_WARNING "cciss%d: retried %p too many "
2944 "times\n", h->ctlr, c);
2945 status = IO_ERROR;
2946 break;
2947 }
2948
2949 if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
2950 printk(KERN_WARNING "cciss%d: command could not be "
2951 "aborted.\n", h->ctlr);
2952 status = IO_ERROR;
2953 break;
2954 }
2955
2956 if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
2957 status = check_target_status(h, c);
2958 break;
2959 }
2960
2961 printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
2962 printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
2963 c->Request.CDB[0], c->err_info->CommandStatus);
2964 status = IO_ERROR;
2965 break;
2966
2967 } while (1);
2968
2969 /* unlock the data buffer from DMA */
2970 buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
2971 buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
2972 pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
2973 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
2974 return status;
2975}
2976
2977/*
2978 * Send a command to the controller, and wait for it to complete.
2979 * Used at init time, and during SCSI error recovery.
2980 */
2981static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
2982 __u8 page_code, unsigned char *scsi3addr, int cmd_type)
2983{
2984 CommandList_struct *c;
2985 int status;
2986
2987 c = cmd_alloc(hba[ctlr], 1);
2988 if (!c) {
2989 printk(KERN_WARNING "cciss: unable to get memory");
2990 return IO_ERROR;
2991 }
2992 status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
2993 scsi3addr, cmd_type);
2994 if (status == IO_OK)
2995 status = sendcmd_core(hba[ctlr], c);
2996 cmd_free(hba[ctlr], c, 1);
2997 return status;
2998}
2999
3000/*
3001 * Map (physical) PCI mem into (virtual) kernel space 2890 * Map (physical) PCI mem into (virtual) kernel space
3002 */ 2891 */
3003static void __iomem *remap_pci_mem(ulong base, ulong size) 2892static void __iomem *remap_pci_mem(ulong base, ulong size)
@@ -3255,9 +3144,12 @@ static void do_cciss_request(struct request_queue *q)
3255 int seg; 3144 int seg;
3256 struct request *creq; 3145 struct request *creq;
3257 u64bit temp64; 3146 u64bit temp64;
3258 struct scatterlist tmp_sg[MAXSGENTRIES]; 3147 struct scatterlist *tmp_sg;
3148 SGDescriptor_struct *curr_sg;
3259 drive_info_struct *drv; 3149 drive_info_struct *drv;
3260 int i, dir; 3150 int i, dir;
3151 int sg_index = 0;
3152 int chained = 0;
3261 3153
3262 /* We call start_io here in case there is a command waiting on the 3154 /* We call start_io here in case there is a command waiting on the
3263 * queue that has not been sent. 3155 * queue that has not been sent.
@@ -3270,13 +3162,14 @@ static void do_cciss_request(struct request_queue *q)
3270 if (!creq) 3162 if (!creq)
3271 goto startio; 3163 goto startio;
3272 3164
3273 BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); 3165 BUG_ON(creq->nr_phys_segments > h->maxsgentries);
3274 3166
3275 if ((c = cmd_alloc(h, 1)) == NULL) 3167 if ((c = cmd_alloc(h, 1)) == NULL)
3276 goto full; 3168 goto full;
3277 3169
3278 blk_start_request(creq); 3170 blk_start_request(creq);
3279 3171
3172 tmp_sg = h->scatter_list[c->cmdindex];
3280 spin_unlock_irq(q->queue_lock); 3173 spin_unlock_irq(q->queue_lock);
3281 3174
3282 c->cmd_type = CMD_RWREQ; 3175 c->cmd_type = CMD_RWREQ;
@@ -3284,19 +3177,19 @@ static void do_cciss_request(struct request_queue *q)
3284 3177
3285 /* fill in the request */ 3178 /* fill in the request */
3286 drv = creq->rq_disk->private_data; 3179 drv = creq->rq_disk->private_data;
3287 c->Header.ReplyQueue = 0; // unused in simple mode 3180 c->Header.ReplyQueue = 0; /* unused in simple mode */
3288 /* got command from pool, so use the command block index instead */ 3181 /* got command from pool, so use the command block index instead */
3289 /* for direct lookups. */ 3182 /* for direct lookups. */
3290 /* The first 2 bits are reserved for controller error reporting. */ 3183 /* The first 2 bits are reserved for controller error reporting. */
3291 c->Header.Tag.lower = (c->cmdindex << 3); 3184 c->Header.Tag.lower = (c->cmdindex << 3);
3292 c->Header.Tag.lower |= 0x04; /* flag for direct lookup. */ 3185 c->Header.Tag.lower |= 0x04; /* flag for direct lookup. */
3293 memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID)); 3186 memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID));
3294 c->Request.CDBLen = 10; // 12 byte commands not in FW yet; 3187 c->Request.CDBLen = 10; /* 12 byte commands not in FW yet; */
3295 c->Request.Type.Type = TYPE_CMD; // It is a command. 3188 c->Request.Type.Type = TYPE_CMD; /* It is a command. */
3296 c->Request.Type.Attribute = ATTR_SIMPLE; 3189 c->Request.Type.Attribute = ATTR_SIMPLE;
3297 c->Request.Type.Direction = 3190 c->Request.Type.Direction =
3298 (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE; 3191 (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE;
3299 c->Request.Timeout = 0; // Don't time out 3192 c->Request.Timeout = 0; /* Don't time out */
3300 c->Request.CDB[0] = 3193 c->Request.CDB[0] =
3301 (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write; 3194 (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write;
3302 start_blk = blk_rq_pos(creq); 3195 start_blk = blk_rq_pos(creq);
@@ -3305,7 +3198,7 @@ static void do_cciss_request(struct request_queue *q)
3305 (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); 3198 (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
3306#endif /* CCISS_DEBUG */ 3199#endif /* CCISS_DEBUG */
3307 3200
3308 sg_init_table(tmp_sg, MAXSGENTRIES); 3201 sg_init_table(tmp_sg, h->maxsgentries);
3309 seg = blk_rq_map_sg(q, creq, tmp_sg); 3202 seg = blk_rq_map_sg(q, creq, tmp_sg);
3310 3203
3311 /* get the DMA records for the setup */ 3204 /* get the DMA records for the setup */
@@ -3314,33 +3207,54 @@ static void do_cciss_request(struct request_queue *q)
3314 else 3207 else
3315 dir = PCI_DMA_TODEVICE; 3208 dir = PCI_DMA_TODEVICE;
3316 3209
3210 curr_sg = c->SG;
3211 sg_index = 0;
3212 chained = 0;
3213
3317 for (i = 0; i < seg; i++) { 3214 for (i = 0; i < seg; i++) {
3318 c->SG[i].Len = tmp_sg[i].length; 3215 if (((sg_index+1) == (h->max_cmd_sgentries)) &&
3216 !chained && ((seg - i) > 1)) {
3217 /* Point to next chain block. */
3218 curr_sg = h->cmd_sg_list[c->cmdindex];
3219 sg_index = 0;
3220 chained = 1;
3221 }
3222 curr_sg[sg_index].Len = tmp_sg[i].length;
3319 temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), 3223 temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
3320 tmp_sg[i].offset, 3224 tmp_sg[i].offset,
3321 tmp_sg[i].length, dir); 3225 tmp_sg[i].length, dir);
3322 c->SG[i].Addr.lower = temp64.val32.lower; 3226 curr_sg[sg_index].Addr.lower = temp64.val32.lower;
3323 c->SG[i].Addr.upper = temp64.val32.upper; 3227 curr_sg[sg_index].Addr.upper = temp64.val32.upper;
3324 c->SG[i].Ext = 0; // we are not chaining 3228 curr_sg[sg_index].Ext = 0; /* we are not chaining */
3229 ++sg_index;
3325 } 3230 }
3231 if (chained)
3232 cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
3233 (seg - (h->max_cmd_sgentries - 1)) *
3234 sizeof(SGDescriptor_struct));
3235
3326 /* track how many SG entries we are using */ 3236 /* track how many SG entries we are using */
3327 if (seg > h->maxSG) 3237 if (seg > h->maxSG)
3328 h->maxSG = seg; 3238 h->maxSG = seg;
3329 3239
3330#ifdef CCISS_DEBUG 3240#ifdef CCISS_DEBUG
3331 printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", 3241 printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments "
3332 blk_rq_sectors(creq), seg); 3242 "chained[%d]\n",
3243 blk_rq_sectors(creq), seg, chained);
3333#endif /* CCISS_DEBUG */ 3244#endif /* CCISS_DEBUG */
3334 3245
3335 c->Header.SGList = c->Header.SGTotal = seg; 3246 c->Header.SGList = c->Header.SGTotal = seg + chained;
3247 if (seg > h->max_cmd_sgentries)
3248 c->Header.SGList = h->max_cmd_sgentries;
3249
3336 if (likely(blk_fs_request(creq))) { 3250 if (likely(blk_fs_request(creq))) {
3337 if(h->cciss_read == CCISS_READ_10) { 3251 if(h->cciss_read == CCISS_READ_10) {
3338 c->Request.CDB[1] = 0; 3252 c->Request.CDB[1] = 0;
3339 c->Request.CDB[2] = (start_blk >> 24) & 0xff; //MSB 3253 c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */
3340 c->Request.CDB[3] = (start_blk >> 16) & 0xff; 3254 c->Request.CDB[3] = (start_blk >> 16) & 0xff;
3341 c->Request.CDB[4] = (start_blk >> 8) & 0xff; 3255 c->Request.CDB[4] = (start_blk >> 8) & 0xff;
3342 c->Request.CDB[5] = start_blk & 0xff; 3256 c->Request.CDB[5] = start_blk & 0xff;
3343 c->Request.CDB[6] = 0; // (sect >> 24) & 0xff; MSB 3257 c->Request.CDB[6] = 0; /* (sect >> 24) & 0xff; MSB */
3344 c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff; 3258 c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff;
3345 c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff; 3259 c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff;
3346 c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0; 3260 c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0;
@@ -3349,7 +3263,7 @@ static void do_cciss_request(struct request_queue *q)
3349 3263
3350 c->Request.CDBLen = 16; 3264 c->Request.CDBLen = 16;
3351 c->Request.CDB[1]= 0; 3265 c->Request.CDB[1]= 0;
3352 c->Request.CDB[2]= (upper32 >> 24) & 0xff; //MSB 3266 c->Request.CDB[2]= (upper32 >> 24) & 0xff; /* MSB */
3353 c->Request.CDB[3]= (upper32 >> 16) & 0xff; 3267 c->Request.CDB[3]= (upper32 >> 16) & 0xff;
3354 c->Request.CDB[4]= (upper32 >> 8) & 0xff; 3268 c->Request.CDB[4]= (upper32 >> 8) & 0xff;
3355 c->Request.CDB[5]= upper32 & 0xff; 3269 c->Request.CDB[5]= upper32 & 0xff;
@@ -3427,6 +3341,7 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id)
3427 printk(KERN_WARNING 3341 printk(KERN_WARNING
3428 "cciss: controller cciss%d failed, stopping.\n", 3342 "cciss: controller cciss%d failed, stopping.\n",
3429 h->ctlr); 3343 h->ctlr);
3344 spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
3430 fail_all_cmds(h->ctlr); 3345 fail_all_cmds(h->ctlr);
3431 return IRQ_HANDLED; 3346 return IRQ_HANDLED;
3432 } 3347 }
@@ -3513,28 +3428,33 @@ static int add_to_scan_list(struct ctlr_info *h)
3513 * @h: Pointer to the controller. 3428 * @h: Pointer to the controller.
3514 * 3429 *
3515 * Removes the controller from the rescan queue if present. Blocks if 3430 * Removes the controller from the rescan queue if present. Blocks if
3516 * the controller is currently conducting a rescan. 3431 * the controller is currently conducting a rescan. The controller
3432 * can be in one of three states:
3433 * 1. Doesn't need a scan
3434 * 2. On the scan list, but not scanning yet (we remove it)
3435 * 3. Busy scanning (and not on the list). In this case we want to wait for
3436 * the scan to complete to make sure the scanning thread for this
3437 * controller is completely idle.
3517 **/ 3438 **/
3518static void remove_from_scan_list(struct ctlr_info *h) 3439static void remove_from_scan_list(struct ctlr_info *h)
3519{ 3440{
3520 struct ctlr_info *test_h, *tmp_h; 3441 struct ctlr_info *test_h, *tmp_h;
3521 int scanning = 0;
3522 3442
3523 mutex_lock(&scan_mutex); 3443 mutex_lock(&scan_mutex);
3524 list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { 3444 list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
3525 if (test_h == h) { 3445 if (test_h == h) { /* state 2. */
3526 list_del(&h->scan_list); 3446 list_del(&h->scan_list);
3527 complete_all(&h->scan_wait); 3447 complete_all(&h->scan_wait);
3528 mutex_unlock(&scan_mutex); 3448 mutex_unlock(&scan_mutex);
3529 return; 3449 return;
3530 } 3450 }
3531 } 3451 }
3532 if (&h->busy_scanning) 3452 if (h->busy_scanning) { /* state 3. */
3533 scanning = 0; 3453 mutex_unlock(&scan_mutex);
3534 mutex_unlock(&scan_mutex);
3535
3536 if (scanning)
3537 wait_for_completion(&h->scan_wait); 3454 wait_for_completion(&h->scan_wait);
3455 } else { /* state 1, nothing to do. */
3456 mutex_unlock(&scan_mutex);
3457 }
3538} 3458}
3539 3459
3540/** 3460/**
@@ -3573,13 +3493,11 @@ static int scan_thread(void *data)
3573 h->busy_scanning = 1; 3493 h->busy_scanning = 1;
3574 mutex_unlock(&scan_mutex); 3494 mutex_unlock(&scan_mutex);
3575 3495
3576 if (h) { 3496 rebuild_lun_table(h, 0, 0);
3577 rebuild_lun_table(h, 0, 0); 3497 complete_all(&h->scan_wait);
3578 complete_all(&h->scan_wait); 3498 mutex_lock(&scan_mutex);
3579 mutex_lock(&scan_mutex); 3499 h->busy_scanning = 0;
3580 h->busy_scanning = 0; 3500 mutex_unlock(&scan_mutex);
3581 mutex_unlock(&scan_mutex);
3582 }
3583 } 3501 }
3584 } 3502 }
3585 3503
@@ -3605,8 +3523,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
3605 case REPORT_LUNS_CHANGED: 3523 case REPORT_LUNS_CHANGED:
3606 printk(KERN_WARNING "cciss%d: report LUN data " 3524 printk(KERN_WARNING "cciss%d: report LUN data "
3607 "changed\n", h->ctlr); 3525 "changed\n", h->ctlr);
3608 add_to_scan_list(h); 3526 /*
3609 wake_up_process(cciss_scan_thread); 3527 * Here, we could call add_to_scan_list and wake up the scan thread,
3528 * except that it's quite likely that we will get more than one
3529 * REPORT_LUNS_CHANGED condition in quick succession, which means
3530 * that those which occur after the first one will likely happen
3531 * *during* the scan_thread's rescan. And the rescan code is not
3532 * robust enough to restart in the middle, undoing what it has already
3533 * done, and it's not clear that it's even possible to do this, since
3534 * part of what it does is notify the block layer, which starts
3535 * doing it's own i/o to read partition tables and so on, and the
3536 * driver doesn't have visibility to know what might need undoing.
3537 * In any event, if possible, it is horribly complicated to get right
3538 * so we just don't do it for now.
3539 *
3540 * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
3541 */
3610 return 1; 3542 return 1;
3611 break; 3543 break;
3612 case POWER_OR_RESET: 3544 case POWER_OR_RESET:
@@ -3888,6 +3820,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
3888 * leave a little room for ioctl calls. 3820 * leave a little room for ioctl calls.
3889 */ 3821 */
3890 c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); 3822 c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
3823 c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
3824
3825 /*
3826 * Limit native command to 32 s/g elements to save dma'able memory.
3827 * Howvever spec says if 0, use 31
3828 */
3829
3830 c->max_cmd_sgentries = 31;
3831 if (c->maxsgentries > 512) {
3832 c->max_cmd_sgentries = 32;
3833 c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1;
3834 c->maxsgentries -= 1; /* account for chain pointer */
3835 } else {
3836 c->maxsgentries = 31; /* Default to traditional value */
3837 c->chainsize = 0; /* traditional */
3838 }
3839
3891 c->product_name = products[prod_index].product_name; 3840 c->product_name = products[prod_index].product_name;
3892 c->access = *(products[prod_index].access); 3841 c->access = *(products[prod_index].access);
3893 c->nr_cmds = c->max_commands - 4; 3842 c->nr_cmds = c->max_commands - 4;
@@ -4214,6 +4163,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4214{ 4163{
4215 int i; 4164 int i;
4216 int j = 0; 4165 int j = 0;
4166 int k = 0;
4217 int rc; 4167 int rc;
4218 int dac, return_code; 4168 int dac, return_code;
4219 InquiryData_struct *inq_buff; 4169 InquiryData_struct *inq_buff;
@@ -4317,6 +4267,26 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4317 printk(KERN_ERR "cciss: out of memory"); 4267 printk(KERN_ERR "cciss: out of memory");
4318 goto clean4; 4268 goto clean4;
4319 } 4269 }
4270
4271 /* Need space for temp scatter list */
4272 hba[i]->scatter_list = kmalloc(hba[i]->max_commands *
4273 sizeof(struct scatterlist *),
4274 GFP_KERNEL);
4275 for (k = 0; k < hba[i]->nr_cmds; k++) {
4276 hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
4277 hba[i]->maxsgentries,
4278 GFP_KERNEL);
4279 if (hba[i]->scatter_list[k] == NULL) {
4280 printk(KERN_ERR "cciss%d: could not allocate "
4281 "s/g lists\n", i);
4282 goto clean4;
4283 }
4284 }
4285 hba[i]->cmd_sg_list = cciss_allocate_sg_chain_blocks(hba[i],
4286 hba[i]->chainsize, hba[i]->nr_cmds);
4287 if (!hba[i]->cmd_sg_list && hba[i]->chainsize > 0)
4288 goto clean4;
4289
4320 spin_lock_init(&hba[i]->lock); 4290 spin_lock_init(&hba[i]->lock);
4321 4291
4322 /* Initialize the pdev driver private data. 4292 /* Initialize the pdev driver private data.
@@ -4362,7 +4332,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4362 4332
4363 cciss_procinit(i); 4333 cciss_procinit(i);
4364 4334
4365 hba[i]->cciss_max_sectors = 2048; 4335 hba[i]->cciss_max_sectors = 8192;
4366 4336
4367 rebuild_lun_table(hba[i], 1, 0); 4337 rebuild_lun_table(hba[i], 1, 0);
4368 hba[i]->busy_initializing = 0; 4338 hba[i]->busy_initializing = 0;
@@ -4370,6 +4340,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4370 4340
4371clean4: 4341clean4:
4372 kfree(hba[i]->cmd_pool_bits); 4342 kfree(hba[i]->cmd_pool_bits);
4343 /* Free up sg elements */
4344 for (k = 0; k < hba[i]->nr_cmds; k++)
4345 kfree(hba[i]->scatter_list[k]);
4346 kfree(hba[i]->scatter_list);
4347 cciss_free_sg_chain_blocks(hba[i]->cmd_sg_list, hba[i]->nr_cmds);
4373 if (hba[i]->cmd_pool) 4348 if (hba[i]->cmd_pool)
4374 pci_free_consistent(hba[i]->pdev, 4349 pci_free_consistent(hba[i]->pdev,
4375 hba[i]->nr_cmds * sizeof(CommandList_struct), 4350 hba[i]->nr_cmds * sizeof(CommandList_struct),
@@ -4400,30 +4375,28 @@ clean_no_release_regions:
4400 4375
4401static void cciss_shutdown(struct pci_dev *pdev) 4376static void cciss_shutdown(struct pci_dev *pdev)
4402{ 4377{
4403 ctlr_info_t *tmp_ptr; 4378 ctlr_info_t *h;
4404 int i; 4379 char *flush_buf;
4405 char flush_buf[4];
4406 int return_code; 4380 int return_code;
4407 4381
4408 tmp_ptr = pci_get_drvdata(pdev); 4382 h = pci_get_drvdata(pdev);
4409 if (tmp_ptr == NULL) 4383 flush_buf = kzalloc(4, GFP_KERNEL);
4410 return; 4384 if (!flush_buf) {
4411 i = tmp_ptr->ctlr; 4385 printk(KERN_WARNING
4412 if (hba[i] == NULL) 4386 "cciss:%d cache not flushed, out of memory.\n",
4387 h->ctlr);
4413 return; 4388 return;
4414
4415 /* Turn board interrupts off and send the flush cache command */
4416 /* sendcmd will turn off interrupt, and send the flush...
4417 * To write all data in the battery backed cache to disks */
4418 memset(flush_buf, 0, 4);
4419 return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
4420 CTLR_LUNID, TYPE_CMD);
4421 if (return_code == IO_OK) {
4422 printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
4423 } else {
4424 printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
4425 } 4389 }
4426 free_irq(hba[i]->intr[2], hba[i]); 4390 /* write all data in the battery backed cache to disk */
4391 memset(flush_buf, 0, 4);
4392 return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf,
4393 4, 0, CTLR_LUNID, TYPE_CMD);
4394 kfree(flush_buf);
4395 if (return_code != IO_OK)
4396 printk(KERN_WARNING "cciss%d: Error flushing cache\n",
4397 h->ctlr);
4398 h->access.set_intr_mask(h, CCISS_INTR_OFF);
4399 free_irq(h->intr[2], h);
4427} 4400}
4428 4401
4429static void __devexit cciss_remove_one(struct pci_dev *pdev) 4402static void __devexit cciss_remove_one(struct pci_dev *pdev)
@@ -4485,6 +4458,11 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
4485 pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), 4458 pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
4486 hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); 4459 hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
4487 kfree(hba[i]->cmd_pool_bits); 4460 kfree(hba[i]->cmd_pool_bits);
4461 /* Free up sg elements */
4462 for (j = 0; j < hba[i]->nr_cmds; j++)
4463 kfree(hba[i]->scatter_list[j]);
4464 kfree(hba[i]->scatter_list);
4465 cciss_free_sg_chain_blocks(hba[i]->cmd_sg_list, hba[i]->nr_cmds);
4488 /* 4466 /*
4489 * Deliberately omit pci_disable_device(): it does something nasty to 4467 * Deliberately omit pci_disable_device(): it does something nasty to
4490 * Smart Array controllers that pci_enable_device does not undo 4468 * Smart Array controllers that pci_enable_device does not undo
@@ -4517,7 +4495,7 @@ static int __init cciss_init(void)
4517 * boundary. Given that we use pci_alloc_consistent() to allocate an 4495 * boundary. Given that we use pci_alloc_consistent() to allocate an
4518 * array of them, the size must be a multiple of 8 bytes. 4496 * array of them, the size must be a multiple of 8 bytes.
4519 */ 4497 */
4520 BUILD_BUG_ON(sizeof(CommandList_struct) % 8); 4498 BUILD_BUG_ON(sizeof(CommandList_struct) % COMMANDLIST_ALIGNMENT);
4521 4499
4522 printk(KERN_INFO DRIVER_NAME "\n"); 4500 printk(KERN_INFO DRIVER_NAME "\n");
4523 4501
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 31524cf42c77..c5d411174db0 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -55,12 +55,12 @@ typedef struct _drive_info_struct
55 char device_initialized; /* indicates whether dev is initialized */ 55 char device_initialized; /* indicates whether dev is initialized */
56} drive_info_struct; 56} drive_info_struct;
57 57
58struct ctlr_info 58struct ctlr_info
59{ 59{
60 int ctlr; 60 int ctlr;
61 char devname[8]; 61 char devname[8];
62 char *product_name; 62 char *product_name;
63 char firm_ver[4]; // Firmware version 63 char firm_ver[4]; /* Firmware version */
64 struct pci_dev *pdev; 64 struct pci_dev *pdev;
65 __u32 board_id; 65 __u32 board_id;
66 void __iomem *vaddr; 66 void __iomem *vaddr;
@@ -75,6 +75,16 @@ struct ctlr_info
75 int num_luns; 75 int num_luns;
76 int highest_lun; 76 int highest_lun;
 77 int usage_count; /* number of opens on all minor devices */ 77 int usage_count; /* number of opens on all minor devices */
78 /* Need space for temp sg list
79 * number of scatter/gathers supported
80 * number of scatter/gathers in chained block
81 */
82 struct scatterlist **scatter_list;
83 int maxsgentries;
84 int chainsize;
85 int max_cmd_sgentries;
86 SGDescriptor_struct **cmd_sg_list;
87
78# define DOORBELL_INT 0 88# define DOORBELL_INT 0
79# define PERF_MODE_INT 1 89# define PERF_MODE_INT 1
80# define SIMPLE_MODE_INT 2 90# define SIMPLE_MODE_INT 2
@@ -87,7 +97,7 @@ struct ctlr_info
87 BYTE cciss_write; 97 BYTE cciss_write;
88 BYTE cciss_read_capacity; 98 BYTE cciss_read_capacity;
89 99
90 // information about each logical volume 100 /* information about each logical volume */
91 drive_info_struct *drv[CISS_MAX_LUN]; 101 drive_info_struct *drv[CISS_MAX_LUN];
92 102
93 struct access_method access; 103 struct access_method access;
@@ -100,7 +110,7 @@ struct ctlr_info
100 unsigned int maxSG; 110 unsigned int maxSG;
101 spinlock_t lock; 111 spinlock_t lock;
102 112
103 //* pointers to command and error info pool */ 113 /* pointers to command and error info pool */
104 CommandList_struct *cmd_pool; 114 CommandList_struct *cmd_pool;
105 dma_addr_t cmd_pool_dhandle; 115 dma_addr_t cmd_pool_dhandle;
106 ErrorInfo_struct *errinfo_pool; 116 ErrorInfo_struct *errinfo_pool;
@@ -118,12 +128,10 @@ struct ctlr_info
118 */ 128 */
119 int next_to_run; 129 int next_to_run;
120 130
121 // Disk structures we need to pass back 131 /* Disk structures we need to pass back */
122 struct gendisk *gendisk[CISS_MAX_LUN]; 132 struct gendisk *gendisk[CISS_MAX_LUN];
123#ifdef CONFIG_CISS_SCSI_TAPE 133#ifdef CONFIG_CISS_SCSI_TAPE
124 void *scsi_ctlr; /* ptr to structure containing scsi related stuff */ 134 struct cciss_scsi_adapter_data_t *scsi_ctlr;
125 /* list of block side commands the scsi error handling sucked up */
126 /* and saved for later processing */
127#endif 135#endif
128 unsigned char alive; 136 unsigned char alive;
129 struct list_head scan_list; 137 struct list_head scan_list;
@@ -299,4 +307,3 @@ struct board_type {
299#define CCISS_LOCK(i) (&hba[i]->lock) 307#define CCISS_LOCK(i) (&hba[i]->lock)
300 308
301#endif /* CCISS_H */ 309#endif /* CCISS_H */
302
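
The fields added above carry the driver's new scatter/gather bookkeeping: scatter_list holds a per-command temporary struct scatterlist array sized to maxsgentries, while cmd_sg_list points at the chained descriptor blocks (chainsize entries each) used once a command needs more than max_cmd_sgentries embedded descriptors. A minimal sketch of that embedded/chained split, based only on what this hunk and the cciss_scatter_gather() hunk further down show (split_sg is an illustrative helper, not a driver symbol):

/* Illustrative only: how many descriptors stay embedded in the command
 * and how many spill into the chain block.  When chaining kicks in, the
 * last embedded slot is used for the chain descriptor written by
 * cciss_map_sg_chain_block(), so only max_cmd_sgentries - 1 data
 * entries remain inline. */
static void split_sg(int nsgs, int max_cmd_sgentries,
		     int *embedded, int *chained)
{
	if (nsgs <= max_cmd_sgentries) {
		*embedded = nsgs;
		*chained = 0;
	} else {
		*embedded = max_cmd_sgentries - 1;
		*chained = nsgs - (max_cmd_sgentries - 1);
	}
}
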
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index dbaed1ea0da3..e624ff959cb6 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -1,30 +1,16 @@
1#ifndef CCISS_CMD_H 1#ifndef CCISS_CMD_H
2#define CCISS_CMD_H 2#define CCISS_CMD_H
3//########################################################################### 3
4//DEFINES 4#include <linux/cciss_defs.h>
5//########################################################################### 5
6/* DEFINES */
6#define CISS_VERSION "1.00" 7#define CISS_VERSION "1.00"
7 8
8//general boundary defintions 9/* general boundary definitions */
9#define SENSEINFOBYTES 32//note that this value may vary between host implementations 10#define MAXSGENTRIES 32
10#define MAXSGENTRIES 31 11#define CCISS_SG_CHAIN 0x80000000
11#define MAXREPLYQS 256 12#define MAXREPLYQS 256
12 13
13//Command Status value
14#define CMD_SUCCESS 0x0000
15#define CMD_TARGET_STATUS 0x0001
16#define CMD_DATA_UNDERRUN 0x0002
17#define CMD_DATA_OVERRUN 0x0003
18#define CMD_INVALID 0x0004
19#define CMD_PROTOCOL_ERR 0x0005
20#define CMD_HARDWARE_ERR 0x0006
21#define CMD_CONNECTION_LOST 0x0007
22#define CMD_ABORTED 0x0008
23#define CMD_ABORT_FAILED 0x0009
24#define CMD_UNSOLICITED_ABORT 0x000A
25#define CMD_TIMEOUT 0x000B
26#define CMD_UNABORTABLE 0x000C
27
28/* Unit Attentions ASC's as defined for the MSA2012sa */ 14/* Unit Attentions ASC's as defined for the MSA2012sa */
29#define POWER_OR_RESET 0x29 15#define POWER_OR_RESET 0x29
30#define STATE_CHANGED 0x2a 16#define STATE_CHANGED 0x2a
@@ -48,30 +34,13 @@
48#define ASYM_ACCESS_CHANGED 0x06 34#define ASYM_ACCESS_CHANGED 0x06
49#define LUN_CAPACITY_CHANGED 0x09 35#define LUN_CAPACITY_CHANGED 0x09
50 36
51//transfer direction 37/* config space register offsets */
52#define XFER_NONE 0x00
53#define XFER_WRITE 0x01
54#define XFER_READ 0x02
55#define XFER_RSVD 0x03
56
57//task attribute
58#define ATTR_UNTAGGED 0x00
59#define ATTR_SIMPLE 0x04
60#define ATTR_HEADOFQUEUE 0x05
61#define ATTR_ORDERED 0x06
62#define ATTR_ACA 0x07
63
64//cdb type
65#define TYPE_CMD 0x00
66#define TYPE_MSG 0x01
67
68//config space register offsets
69#define CFG_VENDORID 0x00 38#define CFG_VENDORID 0x00
70#define CFG_DEVICEID 0x02 39#define CFG_DEVICEID 0x02
71#define CFG_I2OBAR 0x10 40#define CFG_I2OBAR 0x10
72#define CFG_MEM1BAR 0x14 41#define CFG_MEM1BAR 0x14
73 42
74//i2o space register offsets 43/* i2o space register offsets */
75#define I2O_IBDB_SET 0x20 44#define I2O_IBDB_SET 0x20
76#define I2O_IBDB_CLEAR 0x70 45#define I2O_IBDB_CLEAR 0x70
77#define I2O_INT_STATUS 0x30 46#define I2O_INT_STATUS 0x30
@@ -80,7 +49,7 @@
80#define I2O_OBPOST_Q 0x44 49#define I2O_OBPOST_Q 0x44
81#define I2O_DMA1_CFG 0x214 50#define I2O_DMA1_CFG 0x214
82 51
83//Configuration Table 52/* Configuration Table */
84#define CFGTBL_ChangeReq 0x00000001l 53#define CFGTBL_ChangeReq 0x00000001l
85#define CFGTBL_AccCmds 0x00000001l 54#define CFGTBL_AccCmds 0x00000001l
86 55
@@ -102,24 +71,17 @@ typedef union _u64bit
102 __u64 val; 71 __u64 val;
103} u64bit; 72} u64bit;
104 73
105// Type defs used in the following structs 74/* Type defs used in the following structs */
106#define BYTE __u8
107#define WORD __u16
108#define HWORD __u16
109#define DWORD __u32
110#define QWORD vals32 75#define QWORD vals32
111 76
112//########################################################################### 77/* STRUCTURES */
113//STRUCTURES
114//###########################################################################
115#define CISS_MAX_LUN 1024
116#define CISS_MAX_PHYS_LUN 1024 78#define CISS_MAX_PHYS_LUN 1024
 117// SCSI-3 Cmmands 79/* SCSI-3 Commands */
118 80
119#pragma pack(1) 81#pragma pack(1)
120 82
121#define CISS_INQUIRY 0x12 83#define CISS_INQUIRY 0x12
 122//Date returned 84/* Data returned */
123typedef struct _InquiryData_struct 85typedef struct _InquiryData_struct
124{ 86{
125 BYTE data_byte[36]; 87 BYTE data_byte[36];
@@ -127,7 +89,7 @@ typedef struct _InquiryData_struct
127 89
128#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */ 90#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */
129#define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */ 91#define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */
130// Data returned 92/* Data returned */
131typedef struct _ReportLUNdata_struct 93typedef struct _ReportLUNdata_struct
132{ 94{
133 BYTE LUNListLength[4]; 95 BYTE LUNListLength[4];
@@ -138,8 +100,8 @@ typedef struct _ReportLUNdata_struct
138#define CCISS_READ_CAPACITY 0x25 /* Read Capacity */ 100#define CCISS_READ_CAPACITY 0x25 /* Read Capacity */
139typedef struct _ReadCapdata_struct 101typedef struct _ReadCapdata_struct
140{ 102{
141 BYTE total_size[4]; // Total size in blocks 103 BYTE total_size[4]; /* Total size in blocks */
142 BYTE block_size[4]; // Size of blocks in bytes 104 BYTE block_size[4]; /* Size of blocks in bytes */
143} ReadCapdata_struct; 105} ReadCapdata_struct;
144 106
145#define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */ 107#define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */
@@ -171,52 +133,13 @@ typedef struct _ReadCapdata_struct_16
171#define CDB_LEN10 10 133#define CDB_LEN10 10
172#define CDB_LEN16 16 134#define CDB_LEN16 16
173 135
174// BMIC commands 136/* BMIC commands */
175#define BMIC_READ 0x26 137#define BMIC_READ 0x26
176#define BMIC_WRITE 0x27 138#define BMIC_WRITE 0x27
177#define BMIC_CACHE_FLUSH 0xc2 139#define BMIC_CACHE_FLUSH 0xc2
178#define CCISS_CACHE_FLUSH 0x01 //C2 was already being used by CCISS 140#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */
179
180//Command List Structure
181typedef union _SCSI3Addr_struct {
182 struct {
183 BYTE Dev;
184 BYTE Bus:6;
185 BYTE Mode:2; // b00
186 } PeripDev;
187 struct {
188 BYTE DevLSB;
189 BYTE DevMSB:6;
190 BYTE Mode:2; // b01
191 } LogDev;
192 struct {
193 BYTE Dev:5;
194 BYTE Bus:3;
195 BYTE Targ:6;
196 BYTE Mode:2; // b10
197 } LogUnit;
198} SCSI3Addr_struct;
199
200typedef struct _PhysDevAddr_struct {
201 DWORD TargetId:24;
202 DWORD Bus:6;
203 DWORD Mode:2;
204 SCSI3Addr_struct Target[2]; //2 level target device addr
205} PhysDevAddr_struct;
206
207typedef struct _LogDevAddr_struct {
208 DWORD VolId:30;
209 DWORD Mode:2;
210 BYTE reserved[4];
211} LogDevAddr_struct;
212
213typedef union _LUNAddr_struct {
214 BYTE LunAddrBytes[8];
215 SCSI3Addr_struct SCSI3Lun[4];
216 PhysDevAddr_struct PhysDev;
217 LogDevAddr_struct LogDev;
218} LUNAddr_struct;
219 141
142/* Command List Structure */
220#define CTLR_LUNID "\0\0\0\0\0\0\0\0" 143#define CTLR_LUNID "\0\0\0\0\0\0\0\0"
221 144
222typedef struct _CommandListHeader_struct { 145typedef struct _CommandListHeader_struct {
@@ -226,16 +149,6 @@ typedef struct _CommandListHeader_struct {
226 QWORD Tag; 149 QWORD Tag;
227 LUNAddr_struct LUN; 150 LUNAddr_struct LUN;
228} CommandListHeader_struct; 151} CommandListHeader_struct;
229typedef struct _RequestBlock_struct {
230 BYTE CDBLen;
231 struct {
232 BYTE Type:3;
233 BYTE Attribute:3;
234 BYTE Direction:2;
235 } Type;
236 HWORD Timeout;
237 BYTE CDB[16];
238} RequestBlock_struct;
239typedef struct _ErrDescriptor_struct { 152typedef struct _ErrDescriptor_struct {
240 QWORD Addr; 153 QWORD Addr;
241 DWORD Len; 154 DWORD Len;
@@ -246,28 +159,6 @@ typedef struct _SGDescriptor_struct {
246 DWORD Ext; 159 DWORD Ext;
247} SGDescriptor_struct; 160} SGDescriptor_struct;
248 161
249typedef union _MoreErrInfo_struct{
250 struct {
251 BYTE Reserved[3];
252 BYTE Type;
253 DWORD ErrorInfo;
254 }Common_Info;
255 struct{
256 BYTE Reserved[2];
257 BYTE offense_size;//size of offending entry
258 BYTE offense_num; //byte # of offense 0-base
259 DWORD offense_value;
260 }Invalid_Cmd;
261}MoreErrInfo_struct;
262typedef struct _ErrorInfo_struct {
263 BYTE ScsiStatus;
264 BYTE SenseLen;
265 HWORD CommandStatus;
266 DWORD ResidualCnt;
267 MoreErrInfo_struct MoreErrInfo;
268 BYTE SenseInfo[SENSEINFOBYTES];
269} ErrorInfo_struct;
270
271/* Command types */ 162/* Command types */
272#define CMD_RWREQ 0x00 163#define CMD_RWREQ 0x00
273#define CMD_IOCTL_PEND 0x01 164#define CMD_IOCTL_PEND 0x01
@@ -276,10 +167,18 @@ typedef struct _ErrorInfo_struct {
276#define CMD_MSG_TIMEOUT 0x05 167#define CMD_MSG_TIMEOUT 0x05
277#define CMD_MSG_STALE 0xff 168#define CMD_MSG_STALE 0xff
278 169
279/* This structure needs to be divisible by 8 for new 170/* This structure needs to be divisible by COMMANDLIST_ALIGNMENT
 280 * indexing method. 171 * because low bits of the address are used to indicate
172 * whether the tag contains an index or an address. PAD_32 and
173 * PAD_64 can be adjusted independently as needed for 32-bit
 174 * and 64-bit systems.
281 */ 175 */
282#define PADSIZE (sizeof(long) - 4) 176#define COMMANDLIST_ALIGNMENT (8)
177#define IS_64_BIT ((sizeof(long) - 4)/4)
178#define IS_32_BIT (!IS_64_BIT)
179#define PAD_32 (0)
180#define PAD_64 (4)
181#define PADSIZE (IS_32_BIT * PAD_32 + IS_64_BIT * PAD_64)
283typedef struct _CommandList_struct { 182typedef struct _CommandList_struct {
284 CommandListHeader_struct Header; 183 CommandListHeader_struct Header;
285 RequestBlock_struct Request; 184 RequestBlock_struct Request;
@@ -299,7 +198,7 @@ typedef struct _CommandList_struct {
299 char pad[PADSIZE]; 198 char pad[PADSIZE];
300} CommandList_struct; 199} CommandList_struct;
301 200
302//Configuration Table Structure 201/* Configuration Table Structure */
303typedef struct _HostWrite_struct { 202typedef struct _HostWrite_struct {
304 DWORD TransportRequest; 203 DWORD TransportRequest;
305 DWORD Reserved; 204 DWORD Reserved;
@@ -319,6 +218,10 @@ typedef struct _CfgTable_struct {
319 BYTE ServerName[16]; 218 BYTE ServerName[16];
320 DWORD HeartBeat; 219 DWORD HeartBeat;
321 DWORD SCSI_Prefetch; 220 DWORD SCSI_Prefetch;
221 DWORD MaxSGElements;
222 DWORD MaxLogicalUnits;
223 DWORD MaxPhysicalDrives;
224 DWORD MaxPhysicalDrivesPerLogicalUnit;
322} CfgTable_struct; 225} CfgTable_struct;
323#pragma pack() 226#pragma pack()
324#endif // CCISS_CMD_H 227#endif /* CCISS_CMD_H */
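
The arithmetic behind PADSIZE above: on a 32-bit kernel sizeof(long) is 4, so IS_64_BIT evaluates to 0 and PADSIZE is PAD_32 (no padding); on a 64-bit kernel sizeof(long) is 8, IS_64_BIT is 1, and PADSIZE is PAD_64 (4 bytes), keeping sizeof(CommandList_struct) a multiple of COMMANDLIST_ALIGNMENT, which cciss_init() now asserts with BUILD_BUG_ON. Keeping commands 8-byte aligned is what frees the low bits of the bus address for tagging; a generic sketch of that trick, with a hypothetical flag value and helper names that are not cciss constants:

/* Hypothetical illustration: with 8-byte-aligned commands the low three
 * bits of a command's bus address are always zero, so one of them can
 * mark a completion tag as carrying an index rather than an address. */
#define TAG_IS_INDEX	0x04UL	/* hypothetical low-bit flag */

static unsigned long make_index_tag(unsigned int cmdindex)
{
	return ((unsigned long)cmdindex << 3) | TAG_IS_INDEX;
}

static int tag_holds_index(unsigned long tag)
{
	return (tag & TAG_IS_INDEX) != 0;
}
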
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 3315268b4ec7..e1d0e2cfec72 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = {
84 .queuecommand = cciss_scsi_queue_command, 84 .queuecommand = cciss_scsi_queue_command,
85 .can_queue = SCSI_CCISS_CAN_QUEUE, 85 .can_queue = SCSI_CCISS_CAN_QUEUE,
86 .this_id = 7, 86 .this_id = 7,
87 .sg_tablesize = MAXSGENTRIES,
88 .cmd_per_lun = 1, 87 .cmd_per_lun = 1,
89 .use_clustering = DISABLE_CLUSTERING, 88 .use_clustering = DISABLE_CLUSTERING,
90 /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */ 89 /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */
@@ -93,11 +92,16 @@ static struct scsi_host_template cciss_driver_template = {
93}; 92};
94 93
95#pragma pack(1) 94#pragma pack(1)
95
96#define SCSI_PAD_32 0
97#define SCSI_PAD_64 0
98
96struct cciss_scsi_cmd_stack_elem_t { 99struct cciss_scsi_cmd_stack_elem_t {
97 CommandList_struct cmd; 100 CommandList_struct cmd;
98 ErrorInfo_struct Err; 101 ErrorInfo_struct Err;
99 __u32 busaddr; 102 __u32 busaddr;
100 __u32 pad; 103 int cmdindex;
104 u8 pad[IS_32_BIT * SCSI_PAD_32 + IS_64_BIT * SCSI_PAD_64];
101}; 105};
102 106
103#pragma pack() 107#pragma pack()
@@ -118,16 +122,15 @@ struct cciss_scsi_cmd_stack_t {
118struct cciss_scsi_adapter_data_t { 122struct cciss_scsi_adapter_data_t {
119 struct Scsi_Host *scsi_host; 123 struct Scsi_Host *scsi_host;
120 struct cciss_scsi_cmd_stack_t cmd_stack; 124 struct cciss_scsi_cmd_stack_t cmd_stack;
125 SGDescriptor_struct **cmd_sg_list;
121 int registered; 126 int registered;
122 spinlock_t lock; // to protect ccissscsi[ctlr]; 127 spinlock_t lock; // to protect ccissscsi[ctlr];
123}; 128};
124 129
125#define CPQ_TAPE_LOCK(ctlr, flags) spin_lock_irqsave( \ 130#define CPQ_TAPE_LOCK(ctlr, flags) spin_lock_irqsave( \
126 &(((struct cciss_scsi_adapter_data_t *) \ 131 &hba[ctlr]->scsi_ctlr->lock, flags);
127 hba[ctlr]->scsi_ctlr)->lock), flags);
128#define CPQ_TAPE_UNLOCK(ctlr, flags) spin_unlock_irqrestore( \ 132#define CPQ_TAPE_UNLOCK(ctlr, flags) spin_unlock_irqrestore( \
129 &(((struct cciss_scsi_adapter_data_t *) \ 133 &hba[ctlr]->scsi_ctlr->lock, flags);
130 hba[ctlr]->scsi_ctlr)->lock), flags);
131 134
132static CommandList_struct * 135static CommandList_struct *
133scsi_cmd_alloc(ctlr_info_t *h) 136scsi_cmd_alloc(ctlr_info_t *h)
@@ -143,7 +146,7 @@ scsi_cmd_alloc(ctlr_info_t *h)
143 struct cciss_scsi_cmd_stack_t *stk; 146 struct cciss_scsi_cmd_stack_t *stk;
144 u64bit temp64; 147 u64bit temp64;
145 148
146 sa = (struct cciss_scsi_adapter_data_t *) h->scsi_ctlr; 149 sa = h->scsi_ctlr;
147 stk = &sa->cmd_stack; 150 stk = &sa->cmd_stack;
148 151
149 if (stk->top < 0) 152 if (stk->top < 0)
@@ -154,6 +157,7 @@ scsi_cmd_alloc(ctlr_info_t *h)
154 memset(&c->Err, 0, sizeof(c->Err)); 157 memset(&c->Err, 0, sizeof(c->Err));
155 /* set physical addr of cmd and addr of scsi parameters */ 158 /* set physical addr of cmd and addr of scsi parameters */
156 c->cmd.busaddr = c->busaddr; 159 c->cmd.busaddr = c->busaddr;
160 c->cmd.cmdindex = c->cmdindex;
157 /* (__u32) (stk->cmd_pool_handle + 161 /* (__u32) (stk->cmd_pool_handle +
158 (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */ 162 (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */
159 163
@@ -182,7 +186,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd)
182 struct cciss_scsi_adapter_data_t *sa; 186 struct cciss_scsi_adapter_data_t *sa;
183 struct cciss_scsi_cmd_stack_t *stk; 187 struct cciss_scsi_cmd_stack_t *stk;
184 188
185 sa = (struct cciss_scsi_adapter_data_t *) h->scsi_ctlr; 189 sa = h->scsi_ctlr;
186 stk = &sa->cmd_stack; 190 stk = &sa->cmd_stack;
187 if (stk->top >= CMD_STACK_SIZE) { 191 if (stk->top >= CMD_STACK_SIZE) {
188 printk("cciss: scsi_cmd_free called too many times.\n"); 192 printk("cciss: scsi_cmd_free called too many times.\n");
@@ -199,24 +203,31 @@ scsi_cmd_stack_setup(int ctlr, struct cciss_scsi_adapter_data_t *sa)
199 struct cciss_scsi_cmd_stack_t *stk; 203 struct cciss_scsi_cmd_stack_t *stk;
200 size_t size; 204 size_t size;
201 205
206 sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(hba[ctlr],
207 hba[ctlr]->chainsize, CMD_STACK_SIZE);
208 if (!sa->cmd_sg_list && hba[ctlr]->chainsize > 0)
209 return -ENOMEM;
210
202 stk = &sa->cmd_stack; 211 stk = &sa->cmd_stack;
203 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; 212 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE;
204 213
205 // pci_alloc_consistent guarantees 32-bit DMA address will 214 /* Check alignment, see cciss_cmd.h near CommandList_struct def. */
206 // be used 215 BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0);
207 216 /* pci_alloc_consistent guarantees 32-bit DMA address will be used */
208 stk->pool = (struct cciss_scsi_cmd_stack_elem_t *) 217 stk->pool = (struct cciss_scsi_cmd_stack_elem_t *)
209 pci_alloc_consistent(hba[ctlr]->pdev, size, &stk->cmd_pool_handle); 218 pci_alloc_consistent(hba[ctlr]->pdev, size, &stk->cmd_pool_handle);
210 219
211 if (stk->pool == NULL) { 220 if (stk->pool == NULL) {
212 printk("stk->pool is null\n"); 221 cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE);
213 return -1; 222 sa->cmd_sg_list = NULL;
223 return -ENOMEM;
214 } 224 }
215 225
216 for (i=0; i<CMD_STACK_SIZE; i++) { 226 for (i=0; i<CMD_STACK_SIZE; i++) {
217 stk->elem[i] = &stk->pool[i]; 227 stk->elem[i] = &stk->pool[i];
218 stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + 228 stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
219 (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); 229 (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
230 stk->elem[i]->cmdindex = i;
220 } 231 }
221 stk->top = CMD_STACK_SIZE-1; 232 stk->top = CMD_STACK_SIZE-1;
222 return 0; 233 return 0;
@@ -229,7 +240,7 @@ scsi_cmd_stack_free(int ctlr)
229 struct cciss_scsi_cmd_stack_t *stk; 240 struct cciss_scsi_cmd_stack_t *stk;
230 size_t size; 241 size_t size;
231 242
232 sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr; 243 sa = hba[ctlr]->scsi_ctlr;
233 stk = &sa->cmd_stack; 244 stk = &sa->cmd_stack;
234 if (stk->top != CMD_STACK_SIZE-1) { 245 if (stk->top != CMD_STACK_SIZE-1) {
235 printk( "cciss: %d scsi commands are still outstanding.\n", 246 printk( "cciss: %d scsi commands are still outstanding.\n",
@@ -241,6 +252,7 @@ scsi_cmd_stack_free(int ctlr)
241 252
242 pci_free_consistent(hba[ctlr]->pdev, size, stk->pool, stk->cmd_pool_handle); 253 pci_free_consistent(hba[ctlr]->pdev, size, stk->pool, stk->cmd_pool_handle);
243 stk->pool = NULL; 254 stk->pool = NULL;
255 cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE);
244} 256}
245 257
246#if 0 258#if 0
@@ -530,8 +542,7 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
530 CPQ_TAPE_LOCK(ctlr, flags); 542 CPQ_TAPE_LOCK(ctlr, flags);
531 543
532 if (hostno != -1) /* if it's not the first time... */ 544 if (hostno != -1) /* if it's not the first time... */
533 sh = ((struct cciss_scsi_adapter_data_t *) 545 sh = hba[ctlr]->scsi_ctlr->scsi_host;
534 hba[ctlr]->scsi_ctlr)->scsi_host;
535 546
536 /* find any devices in ccissscsi[] that are not in 547 /* find any devices in ccissscsi[] that are not in
537 sd[] and remove them from ccissscsi[] */ 548 sd[] and remove them from ccissscsi[] */
@@ -702,7 +713,7 @@ cciss_scsi_setup(int cntl_num)
702 kfree(shba); 713 kfree(shba);
703 shba = NULL; 714 shba = NULL;
704 } 715 }
705 hba[cntl_num]->scsi_ctlr = (void *) shba; 716 hba[cntl_num]->scsi_ctlr = shba;
706 return; 717 return;
707} 718}
708 719
@@ -725,6 +736,8 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
725 ctlr = hba[cp->ctlr]; 736 ctlr = hba[cp->ctlr];
726 737
727 scsi_dma_unmap(cmd); 738 scsi_dma_unmap(cmd);
739 if (cp->Header.SGTotal > ctlr->max_cmd_sgentries)
740 cciss_unmap_sg_chain_block(ctlr, cp);
728 741
729 cmd->result = (DID_OK << 16); /* host byte */ 742 cmd->result = (DID_OK << 16); /* host byte */
730 cmd->result |= (COMMAND_COMPLETE << 8); /* msg byte */ 743 cmd->result |= (COMMAND_COMPLETE << 8); /* msg byte */
@@ -755,7 +768,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
755 cp, 768 cp,
756 ei->ScsiStatus); 769 ei->ScsiStatus);
757#endif 770#endif
758 cmd->result |= (ei->ScsiStatus < 1); 771 cmd->result |= (ei->ScsiStatus << 1);
759 } 772 }
760 else { /* scsi status is zero??? How??? */ 773 else { /* scsi status is zero??? How??? */
761 774
@@ -847,9 +860,10 @@ cciss_scsi_detect(int ctlr)
847 sh->io_port = 0; // good enough? FIXME, 860 sh->io_port = 0; // good enough? FIXME,
848 sh->n_io_port = 0; // I don't think we use these two... 861 sh->n_io_port = 0; // I don't think we use these two...
849 sh->this_id = SELF_SCSI_ID; 862 sh->this_id = SELF_SCSI_ID;
863 sh->sg_tablesize = hba[ctlr]->maxsgentries;
850 864
851 ((struct cciss_scsi_adapter_data_t *) 865 ((struct cciss_scsi_adapter_data_t *)
852 hba[ctlr]->scsi_ctlr)->scsi_host = (void *) sh; 866 hba[ctlr]->scsi_ctlr)->scsi_host = sh;
853 sh->hostdata[0] = (unsigned long) hba[ctlr]; 867 sh->hostdata[0] = (unsigned long) hba[ctlr];
854 sh->irq = hba[ctlr]->intr[SIMPLE_MODE_INT]; 868 sh->irq = hba[ctlr]->intr[SIMPLE_MODE_INT];
855 sh->unique_id = sh->irq; 869 sh->unique_id = sh->irq;
@@ -1364,34 +1378,54 @@ cciss_scsi_proc_info(struct Scsi_Host *sh,
1364 dma mapping and fills in the scatter gather entries of the 1378 dma mapping and fills in the scatter gather entries of the
1365 cciss command, cp. */ 1379 cciss command, cp. */
1366 1380
1367static void 1381static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *cp,
1368cciss_scatter_gather(struct pci_dev *pdev, 1382 struct scsi_cmnd *cmd)
1369 CommandList_struct *cp,
1370 struct scsi_cmnd *cmd)
1371{ 1383{
1372 unsigned int len; 1384 unsigned int len;
1373 struct scatterlist *sg; 1385 struct scatterlist *sg;
1374 __u64 addr64; 1386 __u64 addr64;
1375 int use_sg, i; 1387 int request_nsgs, i, chained, sg_index;
1376 1388 struct cciss_scsi_adapter_data_t *sa = h->scsi_ctlr;
1377 BUG_ON(scsi_sg_count(cmd) > MAXSGENTRIES); 1389 SGDescriptor_struct *curr_sg;
1378 1390
1379 use_sg = scsi_dma_map(cmd); 1391 BUG_ON(scsi_sg_count(cmd) > h->maxsgentries);
1380 if (use_sg) { /* not too many addrs? */ 1392
1381 scsi_for_each_sg(cmd, sg, use_sg, i) { 1393 chained = 0;
1394 sg_index = 0;
1395 curr_sg = cp->SG;
1396 request_nsgs = scsi_dma_map(cmd);
1397 if (request_nsgs) {
1398 scsi_for_each_sg(cmd, sg, request_nsgs, i) {
1399 if (sg_index + 1 == h->max_cmd_sgentries &&
1400 !chained && request_nsgs - i > 1) {
1401 chained = 1;
1402 sg_index = 0;
1403 curr_sg = sa->cmd_sg_list[cp->cmdindex];
1404 }
1382 addr64 = (__u64) sg_dma_address(sg); 1405 addr64 = (__u64) sg_dma_address(sg);
1383 len = sg_dma_len(sg); 1406 len = sg_dma_len(sg);
1384 cp->SG[i].Addr.lower = 1407 curr_sg[sg_index].Addr.lower =
1385 (__u32) (addr64 & (__u64) 0x00000000FFFFFFFF); 1408 (__u32) (addr64 & 0x0FFFFFFFFULL);
1386 cp->SG[i].Addr.upper = 1409 curr_sg[sg_index].Addr.upper =
1387 (__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF); 1410 (__u32) ((addr64 >> 32) & 0x0FFFFFFFFULL);
1388 cp->SG[i].Len = len; 1411 curr_sg[sg_index].Len = len;
1389 cp->SG[i].Ext = 0; // we are not chaining 1412 curr_sg[sg_index].Ext = 0;
1413 ++sg_index;
1390 } 1414 }
1415 if (chained)
1416 cciss_map_sg_chain_block(h, cp,
1417 sa->cmd_sg_list[cp->cmdindex],
1418 (request_nsgs - (h->max_cmd_sgentries - 1)) *
1419 sizeof(SGDescriptor_struct));
1391 } 1420 }
1392 1421 /* track how many SG entries we are using */
1393 cp->Header.SGList = (__u8) use_sg; /* no. SGs contig in this cmd */ 1422 if (request_nsgs > h->maxSG)
1394 cp->Header.SGTotal = (__u16) use_sg; /* total sgs in this cmd list */ 1423 h->maxSG = request_nsgs;
1424 cp->Header.SGTotal = (__u8) request_nsgs + chained;
1425 if (request_nsgs > h->max_cmd_sgentries)
1426 cp->Header.SGList = h->max_cmd_sgentries;
1427 else
1428 cp->Header.SGList = cp->Header.SGTotal;
1395 return; 1429 return;
1396} 1430}
1397 1431
@@ -1399,7 +1433,7 @@ cciss_scatter_gather(struct pci_dev *pdev,
1399static int 1433static int
1400cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *)) 1434cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *))
1401{ 1435{
1402 ctlr_info_t **c; 1436 ctlr_info_t *c;
1403 int ctlr, rc; 1437 int ctlr, rc;
1404 unsigned char scsi3addr[8]; 1438 unsigned char scsi3addr[8];
1405 CommandList_struct *cp; 1439 CommandList_struct *cp;
@@ -1407,8 +1441,8 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
1407 1441
1408 // Get the ptr to our adapter structure (hba[i]) out of cmd->host. 1442 // Get the ptr to our adapter structure (hba[i]) out of cmd->host.
1409 // We violate cmd->host privacy here. (Is there another way?) 1443 // We violate cmd->host privacy here. (Is there another way?)
1410 c = (ctlr_info_t **) &cmd->device->host->hostdata[0]; 1444 c = (ctlr_info_t *) cmd->device->host->hostdata[0];
1411 ctlr = (*c)->ctlr; 1445 ctlr = c->ctlr;
1412 1446
1413 rc = lookup_scsi3addr(ctlr, cmd->device->channel, cmd->device->id, 1447 rc = lookup_scsi3addr(ctlr, cmd->device->channel, cmd->device->id,
1414 cmd->device->lun, scsi3addr); 1448 cmd->device->lun, scsi3addr);
@@ -1431,7 +1465,7 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
1431 see what the device thinks of it. */ 1465 see what the device thinks of it. */
1432 1466
1433 spin_lock_irqsave(CCISS_LOCK(ctlr), flags); 1467 spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
1434 cp = scsi_cmd_alloc(*c); 1468 cp = scsi_cmd_alloc(c);
1435 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1469 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
1436 if (cp == NULL) { /* trouble... */ 1470 if (cp == NULL) { /* trouble... */
1437 printk("scsi_cmd_alloc returned NULL!\n"); 1471 printk("scsi_cmd_alloc returned NULL!\n");
@@ -1489,15 +1523,14 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
1489 BUG(); 1523 BUG();
1490 break; 1524 break;
1491 } 1525 }
1492 1526 cciss_scatter_gather(c, cp, cmd);
1493 cciss_scatter_gather((*c)->pdev, cp, cmd); // Fill the SG list
1494 1527
1495 /* Put the request on the tail of the request queue */ 1528 /* Put the request on the tail of the request queue */
1496 1529
1497 spin_lock_irqsave(CCISS_LOCK(ctlr), flags); 1530 spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
1498 addQ(&(*c)->reqQ, cp); 1531 addQ(&c->reqQ, cp);
1499 (*c)->Qdepth++; 1532 c->Qdepth++;
1500 start_io(*c); 1533 start_io(c);
1501 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1534 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
1502 1535
1503 /* the cmd'll come back via intr handler in complete_scsi_command() */ 1536 /* the cmd'll come back via intr handler in complete_scsi_command() */
@@ -1514,7 +1547,7 @@ cciss_unregister_scsi(int ctlr)
1514 /* we are being forcibly unloaded, and may not refuse. */ 1547 /* we are being forcibly unloaded, and may not refuse. */
1515 1548
1516 spin_lock_irqsave(CCISS_LOCK(ctlr), flags); 1549 spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
1517 sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr; 1550 sa = hba[ctlr]->scsi_ctlr;
1518 stk = &sa->cmd_stack; 1551 stk = &sa->cmd_stack;
1519 1552
1520 /* if we weren't ever actually registered, don't unregister */ 1553 /* if we weren't ever actually registered, don't unregister */
@@ -1541,13 +1574,13 @@ cciss_engage_scsi(int ctlr)
1541 unsigned long flags; 1574 unsigned long flags;
1542 1575
1543 spin_lock_irqsave(CCISS_LOCK(ctlr), flags); 1576 spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
1544 sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr; 1577 sa = hba[ctlr]->scsi_ctlr;
1545 stk = &sa->cmd_stack; 1578 stk = &sa->cmd_stack;
1546 1579
1547 if (sa->registered) { 1580 if (sa->registered) {
1548 printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); 1581 printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
1549 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1582 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
1550 return ENXIO; 1583 return -ENXIO;
1551 } 1584 }
1552 sa->registered = 1; 1585 sa->registered = 1;
1553 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1586 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
@@ -1654,14 +1687,14 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
1654 int rc; 1687 int rc;
1655 CommandList_struct *cmd_in_trouble; 1688 CommandList_struct *cmd_in_trouble;
1656 unsigned char lunaddr[8]; 1689 unsigned char lunaddr[8];
1657 ctlr_info_t **c; 1690 ctlr_info_t *c;
1658 int ctlr; 1691 int ctlr;
1659 1692
1660 /* find the controller to which the command to be aborted was sent */ 1693 /* find the controller to which the command to be aborted was sent */
1661 c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0]; 1694 c = (ctlr_info_t *) scsicmd->device->host->hostdata[0];
1662 if (c == NULL) /* paranoia */ 1695 if (c == NULL) /* paranoia */
1663 return FAILED; 1696 return FAILED;
1664 ctlr = (*c)->ctlr; 1697 ctlr = c->ctlr;
1665 printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr); 1698 printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr);
1666 /* find the command that's giving us trouble */ 1699 /* find the command that's giving us trouble */
1667 cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble; 1700 cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
@@ -1671,7 +1704,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
1671 /* send a reset to the SCSI LUN which the command was sent to */ 1704 /* send a reset to the SCSI LUN which the command was sent to */
1672 rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr, 1705 rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
1673 TYPE_MSG); 1706 TYPE_MSG);
1674 if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0) 1707 if (rc == 0 && wait_for_device_to_become_ready(c, lunaddr) == 0)
1675 return SUCCESS; 1708 return SUCCESS;
1676 printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr); 1709 printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
1677 return FAILED; 1710 return FAILED;
@@ -1682,14 +1715,14 @@ static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
1682 int rc; 1715 int rc;
1683 CommandList_struct *cmd_to_abort; 1716 CommandList_struct *cmd_to_abort;
1684 unsigned char lunaddr[8]; 1717 unsigned char lunaddr[8];
1685 ctlr_info_t **c; 1718 ctlr_info_t *c;
1686 int ctlr; 1719 int ctlr;
1687 1720
1688 /* find the controller to which the command to be aborted was sent */ 1721 /* find the controller to which the command to be aborted was sent */
1689 c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0]; 1722 c = (ctlr_info_t *) scsicmd->device->host->hostdata[0];
1690 if (c == NULL) /* paranoia */ 1723 if (c == NULL) /* paranoia */
1691 return FAILED; 1724 return FAILED;
1692 ctlr = (*c)->ctlr; 1725 ctlr = c->ctlr;
1693 printk(KERN_WARNING "cciss%d: aborting tardy SCSI cmd\n", ctlr); 1726 printk(KERN_WARNING "cciss%d: aborting tardy SCSI cmd\n", ctlr);
1694 1727
1695 /* find the command to be aborted */ 1728 /* find the command to be aborted */
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h
index 7b750245ae76..6d5822fe851a 100644
--- a/drivers/block/cciss_scsi.h
+++ b/drivers/block/cciss_scsi.h
@@ -25,16 +25,16 @@
25 25
26#include <scsi/scsicam.h> /* possibly irrelevant, since we don't show disks */ 26#include <scsi/scsicam.h> /* possibly irrelevant, since we don't show disks */
27 27
28 // the scsi id of the adapter... 28 /* the scsi id of the adapter... */
29#define SELF_SCSI_ID 15 29#define SELF_SCSI_ID 15
30 // 15 is somewhat arbitrary, since the scsi-2 bus 30 /* 15 is somewhat arbitrary, since the scsi-2 bus
31 // that's presented by the driver to the OS is 31 that's presented by the driver to the OS is
32 // fabricated. The "real" scsi-3 bus the 32 fabricated. The "real" scsi-3 bus the
33 // hardware presents is fabricated too. 33 hardware presents is fabricated too.
34 // The actual, honest-to-goodness physical 34 The actual, honest-to-goodness physical
35 // bus that the devices are attached to is not 35 bus that the devices are attached to is not
 36 // addressible natively, and may in fact turn 36 addressable natively, and may in fact turn
37 // out to be not scsi at all. 37 out to be not scsi at all. */
38 38
39#define SCSI_CCISS_CAN_QUEUE 2 39#define SCSI_CCISS_CAN_QUEUE 2
40 40
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 6422651ec364..91d11631cec9 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -448,11 +448,8 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
448 blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); 448 blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask);
449 449
450 /* This is a hardware imposed limit. */ 450 /* This is a hardware imposed limit. */
451 blk_queue_max_hw_segments(q, SG_MAX); 451 blk_queue_max_segments(q, SG_MAX);
452 452
453 /* This is a driver limit and could be eliminated. */
454 blk_queue_max_phys_segments(q, SG_MAX);
455
456 init_timer(&hba[i]->timer); 453 init_timer(&hba[i]->timer);
457 hba[i]->timer.expires = jiffies + IDA_TIMER; 454 hba[i]->timer.expires = jiffies + IDA_TIMER;
458 hba[i]->timer.data = (unsigned long)hba[i]; 455 hba[i]->timer.data = (unsigned long)hba[i];
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000000..df0983787390
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,71 @@
1#
2# DRBD device driver configuration
3#
4
5comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
6 depends on PROC_FS='n' || INET='n' || CONNECTOR='n'
7
8config BLK_DEV_DRBD
9 tristate "DRBD Distributed Replicated Block Device support"
10 depends on PROC_FS && INET && CONNECTOR
11 select LRU_CACHE
12 default n
13 help
14
15 NOTE: In order to authenticate connections you have to select
16 CRYPTO_HMAC and a hash function as well.
17
18 DRBD is a shared-nothing, synchronously replicated block device. It
19 is designed to serve as a building block for high availability
20 clusters and in this context, is a "drop-in" replacement for shared
21 storage. Simplistically, you could see it as a network RAID 1.
22
23 Each minor device has a role, which can be 'primary' or 'secondary'.
24 On the node with the primary device the application is supposed to
25 run and to access the device (/dev/drbdX). Every write is sent to
26 the local 'lower level block device' and, across the network, to the
27 node with the device in 'secondary' state. The secondary device
28 simply writes the data to its lower level block device.
29
30 DRBD can also be used in dual-Primary mode (device writable on both
31 nodes), which means it can exhibit shared disk semantics in a
32 shared-nothing cluster. Needless to say, on top of dual-Primary
33 DRBD utilizing a cluster file system is necessary to maintain for
34 cache coherency.
35
36 For automatic failover you need a cluster manager (e.g. heartbeat).
37 See also: http://www.drbd.org/, http://www.linux-ha.org
38
39 If unsure, say N.
40
41config DRBD_FAULT_INJECTION
42 bool "DRBD fault injection"
43 depends on BLK_DEV_DRBD
44 help
45
46 Say Y here if you want to simulate IO errors, in order to test DRBD's
47 behavior.
48
49 The actual simulation of IO errors is done by writing 3 values to
50 /sys/module/drbd/parameters/
51
52 enable_faults: bitmask of...
53 1 meta data write
54 2 read
55 4 resync data write
56 8 read
57 16 data write
58 32 data read
59 64 read ahead
60 128 kmalloc of bitmap
61 256 allocation of EE (epoch_entries)
62
63 fault_devs: bitmask of minor numbers
64 fault_rate: frequency in percent
65
66 Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
67 echo 16 > /sys/module/drbd/parameters/enable_faults
68 echo 1 > /sys/module/drbd/parameters/fault_devs
69 echo 5 > /sys/module/drbd/parameters/fault_rate
70
71 If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000000..0d3f337ff5ff
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,5 @@
1drbd-y := drbd_bitmap.o drbd_proc.o
2drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
3drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
4
5obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000000..df018990c422
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1433 @@
1/*
2 drbd_actlog.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/slab.h>
27#include <linux/drbd.h>
28#include "drbd_int.h"
29#include "drbd_wrappers.h"
30
31/* We maintain a trivial check sum in our on disk activity log.
32 * With that we can ensure correct operation even when the storage
 33 * device might do a partial (last) sector write while losing power.
34 */
35struct __packed al_transaction {
36 u32 magic;
37 u32 tr_number;
38 struct __packed {
39 u32 pos;
40 u32 extent; } updates[1 + AL_EXTENTS_PT];
41 u32 xor_sum;
42};
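
The xor_sum field is the trivial checksum the comment above refers to: the XOR of every extent number recorded in the transaction, written by w_al_write_transaction() and recomputed by drbd_al_read_tr() further down to reject a torn or partially written sector. A minimal sketch of that check, assuming a transaction whose fields have already been converted to CPU endianness (the on-disk layout is big-endian):

/* Sketch: verify the activity-log transaction checksum.  Assumes 'tr'
 * already holds CPU-endian values; the driver converts with
 * be32_to_cpu() as it reads each field. */
static int al_transaction_csum_ok(const struct al_transaction *tr)
{
	u32 xor_sum = 0;
	int i;

	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
		xor_sum ^= tr->updates[i].extent;

	return xor_sum == tr->xor_sum;
}
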
43
44struct update_odbm_work {
45 struct drbd_work w;
46 unsigned int enr;
47};
48
49struct update_al_work {
50 struct drbd_work w;
51 struct lc_element *al_ext;
52 struct completion event;
53 unsigned int enr;
54 /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
55 unsigned int old_enr;
56};
57
58struct drbd_atodb_wait {
59 atomic_t count;
60 struct completion io_done;
61 struct drbd_conf *mdev;
62 int error;
63};
64
65
66int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
67
68static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
69 struct drbd_backing_dev *bdev,
70 struct page *page, sector_t sector,
71 int rw, int size)
72{
73 struct bio *bio;
74 struct drbd_md_io md_io;
75 int ok;
76
77 md_io.mdev = mdev;
78 init_completion(&md_io.event);
79 md_io.error = 0;
80
81 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
82 rw |= (1 << BIO_RW_BARRIER);
83 rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
84
85 retry:
86 bio = bio_alloc(GFP_NOIO, 1);
87 bio->bi_bdev = bdev->md_bdev;
88 bio->bi_sector = sector;
89 ok = (bio_add_page(bio, page, size, 0) == size);
90 if (!ok)
91 goto out;
92 bio->bi_private = &md_io;
93 bio->bi_end_io = drbd_md_io_complete;
94 bio->bi_rw = rw;
95
96 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
97 bio_endio(bio, -EIO);
98 else
99 submit_bio(rw, bio);
100 wait_for_completion(&md_io.event);
101 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
102
103 /* check for unsupported barrier op.
104 * would rather check on EOPNOTSUPP, but that is not reliable.
105 * don't try again for ANY return value != 0 */
106 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
107 /* Try again with no barrier */
108 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
109 set_bit(MD_NO_BARRIER, &mdev->flags);
110 rw &= ~(1 << BIO_RW_BARRIER);
111 bio_put(bio);
112 goto retry;
113 }
114 out:
115 bio_put(bio);
116 return ok;
117}
118
119int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
120 sector_t sector, int rw)
121{
122 int logical_block_size, mask, ok;
123 int offset = 0;
124 struct page *iop = mdev->md_io_page;
125
126 D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
127
128 BUG_ON(!bdev->md_bdev);
129
130 logical_block_size = bdev_logical_block_size(bdev->md_bdev);
131 if (logical_block_size == 0)
132 logical_block_size = MD_SECTOR_SIZE;
133
134 /* in case logical_block_size != 512 [ s390 only? ] */
135 if (logical_block_size != MD_SECTOR_SIZE) {
136 mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
137 D_ASSERT(mask == 1 || mask == 3 || mask == 7);
138 D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
139 offset = sector & mask;
140 sector = sector & ~mask;
141 iop = mdev->md_io_tmpp;
142
143 if (rw & WRITE) {
144 /* these are GFP_KERNEL pages, pre-allocated
145 * on device initialization */
146 void *p = page_address(mdev->md_io_page);
147 void *hp = page_address(mdev->md_io_tmpp);
148
149 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
150 READ, logical_block_size);
151
152 if (unlikely(!ok)) {
153 dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
154 "READ [logical_block_size!=512]) failed!\n",
155 (unsigned long long)sector);
156 return 0;
157 }
158
159 memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
160 }
161 }
162
163 if (sector < drbd_md_first_sector(bdev) ||
164 sector > drbd_md_last_sector(bdev))
165 dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
166 current->comm, current->pid, __func__,
167 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
168
169 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
170 if (unlikely(!ok)) {
171 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
172 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
173 return 0;
174 }
175
176 if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
177 void *p = page_address(mdev->md_io_page);
178 void *hp = page_address(mdev->md_io_tmpp);
179
180 memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
181 }
182
183 return ok;
184}
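
The mask/offset arithmetic above covers meta-data devices whose logical block size is larger than MD_SECTOR_SIZE (512 bytes), the case the comment suspects on s390: with a 4096-byte logical block the mask becomes 7, the requested 512-byte sector is rounded down to the containing block, offset records which 512-byte slice inside it was meant, and a write turns into a read-modify-write through the md_io_tmpp bounce page. A small sketch of just that split:

/* Map a 512-byte sector onto a larger logical block: aligned start of
 * the block plus the slice index within it.  lbs must be a power-of-two
 * multiple of 512, as the D_ASSERTs above require. */
static void md_split_sector(sector_t sector, int lbs,
			    sector_t *aligned, int *slice)
{
	int mask = (lbs / 512) - 1;

	*slice = sector & mask;
	*aligned = sector & ~(sector_t)mask;
}
/* Example: lbs = 4096 gives mask = 7; sector 21 maps to aligned 16, slice 5. */
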
185
186static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
187{
188 struct lc_element *al_ext;
189 struct lc_element *tmp;
190 unsigned long al_flags = 0;
191
192 spin_lock_irq(&mdev->al_lock);
193 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
194 if (unlikely(tmp != NULL)) {
195 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
196 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
197 spin_unlock_irq(&mdev->al_lock);
198 return NULL;
199 }
200 }
201 al_ext = lc_get(mdev->act_log, enr);
202 al_flags = mdev->act_log->flags;
203 spin_unlock_irq(&mdev->al_lock);
204
205 /*
206 if (!al_ext) {
207 if (al_flags & LC_STARVING)
208 dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
209 if (al_flags & LC_DIRTY)
210 dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
211 }
212 */
213
214 return al_ext;
215}
216
217void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
218{
219 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
220 struct lc_element *al_ext;
221 struct update_al_work al_work;
222
223 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
224
225 wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
226
227 if (al_ext->lc_number != enr) {
228 /* drbd_al_write_transaction(mdev,al_ext,enr);
229 * recurses into generic_make_request(), which
230 * disallows recursion, bios being serialized on the
231 * current->bio_tail list now.
232 * we have to delegate updates to the activity log
233 * to the worker thread. */
234 init_completion(&al_work.event);
235 al_work.al_ext = al_ext;
236 al_work.enr = enr;
237 al_work.old_enr = al_ext->lc_number;
238 al_work.w.cb = w_al_write_transaction;
239 drbd_queue_work_front(&mdev->data.work, &al_work.w);
240 wait_for_completion(&al_work.event);
241
242 mdev->al_writ_cnt++;
243
244 spin_lock_irq(&mdev->al_lock);
245 lc_changed(mdev->act_log, al_ext);
246 spin_unlock_irq(&mdev->al_lock);
247 wake_up(&mdev->al_wait);
248 }
249}
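
As the comment inside drbd_al_begin_io() explains, the transaction write cannot be issued from inside generic_make_request() (bios are being serialized on current->bio_tail, so recursing would stall the submission path), so the update is delegated to DRBD's worker thread and the caller blocks on a completion. The same delegate-and-wait shape, sketched with a plain kernel workqueue instead of DRBD's private worker (the real code uses drbd_queue_work_front(), not schedule_work(); debug-object annotations are omitted for brevity):

#include <linux/workqueue.h>
#include <linux/completion.h>

struct deferred_update {
	struct work_struct w;
	struct completion done;
	/* ... payload describing the update ... */
};

static void do_update(struct work_struct *w)
{
	struct deferred_update *du = container_of(w, struct deferred_update, w);

	/* perform the update from the worker, outside the submit path */
	complete(&du->done);
}

static void update_and_wait(void)
{
	struct deferred_update du;

	init_completion(&du.done);
	INIT_WORK(&du.w, do_update);
	schedule_work(&du.w);
	wait_for_completion(&du.done);
}
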
250
251void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
252{
253 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
254 struct lc_element *extent;
255 unsigned long flags;
256
257 spin_lock_irqsave(&mdev->al_lock, flags);
258
259 extent = lc_find(mdev->act_log, enr);
260
261 if (!extent) {
262 spin_unlock_irqrestore(&mdev->al_lock, flags);
263 dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
264 return;
265 }
266
267 if (lc_put(mdev->act_log, extent) == 0)
268 wake_up(&mdev->al_wait);
269
270 spin_unlock_irqrestore(&mdev->al_lock, flags);
271}
272
273int
274w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
275{
276 struct update_al_work *aw = container_of(w, struct update_al_work, w);
277 struct lc_element *updated = aw->al_ext;
278 const unsigned int new_enr = aw->enr;
279 const unsigned int evicted = aw->old_enr;
280 struct al_transaction *buffer;
281 sector_t sector;
282 int i, n, mx;
283 unsigned int extent_nr;
284 u32 xor_sum = 0;
285
286 if (!get_ldev(mdev)) {
287 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
288 complete(&((struct update_al_work *)w)->event);
289 return 1;
290 }
291 /* do we have to do a bitmap write, first?
292 * TODO reduce maximum latency:
293 * submit both bios, then wait for both,
294 * instead of doing two synchronous sector writes. */
295 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
296 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
297
298 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
299 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
300
301 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
302 buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
303
304 n = lc_index_of(mdev->act_log, updated);
305
306 buffer->updates[0].pos = cpu_to_be32(n);
307 buffer->updates[0].extent = cpu_to_be32(new_enr);
308
309 xor_sum ^= new_enr;
310
311 mx = min_t(int, AL_EXTENTS_PT,
312 mdev->act_log->nr_elements - mdev->al_tr_cycle);
313 for (i = 0; i < mx; i++) {
314 unsigned idx = mdev->al_tr_cycle + i;
315 extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
316 buffer->updates[i+1].pos = cpu_to_be32(idx);
317 buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
318 xor_sum ^= extent_nr;
319 }
320 for (; i < AL_EXTENTS_PT; i++) {
321 buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
322 buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
323 xor_sum ^= LC_FREE;
324 }
325 mdev->al_tr_cycle += AL_EXTENTS_PT;
326 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
327 mdev->al_tr_cycle = 0;
328
329 buffer->xor_sum = cpu_to_be32(xor_sum);
330
331 sector = mdev->ldev->md.md_offset
332 + mdev->ldev->md.al_offset + mdev->al_tr_pos;
333
334 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
335 drbd_chk_io_error(mdev, 1, TRUE);
336
337 if (++mdev->al_tr_pos >
338 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
339 mdev->al_tr_pos = 0;
340
341 D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
342 mdev->al_tr_number++;
343
344 mutex_unlock(&mdev->md_io_mutex);
345
346 complete(&((struct update_al_work *)w)->event);
347 put_ldev(mdev);
348
349 return 1;
350}
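
Besides the freshly changed element in updates[0], each transaction also re-logs the next AL_EXTENTS_PT slots of the in-core activity log, with al_tr_cycle advancing that window so that, over successive transactions, every element gets written out again even if only a single extent stays hot; al_tr_pos cycles through the on-disk transaction slots in the same round-robin fashion. A small illustration of the cycling, with made-up sizes:

/* Illustrative only: advance the re-log window exactly as the
 * al_tr_cycle update above does.  nr_elements = 10 and
 * extents_per_tr = 4 are made-up figures, not DRBD constants. */
static int next_cycle(int cycle, int nr_elements, int extents_per_tr)
{
	cycle += extents_per_tr;
	if (cycle >= nr_elements)
		cycle = 0;
	return cycle;
}
/* Starting from 0 the windows begin at 0, 4, 8, 0, 4, ... so the whole
 * in-core log is re-logged within a handful of transactions. */
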
351
352/**
353 * drbd_al_read_tr() - Read a single transaction from the on disk activity log
354 * @mdev: DRBD device.
 355 * @bdev: Block device to read from.
356 * @b: pointer to an al_transaction.
357 * @index: On disk slot of the transaction to read.
358 *
359 * Returns -1 on IO error, 0 on checksum error and 1 upon success.
360 */
361static int drbd_al_read_tr(struct drbd_conf *mdev,
362 struct drbd_backing_dev *bdev,
363 struct al_transaction *b,
364 int index)
365{
366 sector_t sector;
367 int rv, i;
368 u32 xor_sum = 0;
369
370 sector = bdev->md.md_offset + bdev->md.al_offset + index;
371
 372 /* Don't process the error normally,
373 * as this is done before disk is attached! */
374 if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
375 return -1;
376
377 rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
378
379 for (i = 0; i < AL_EXTENTS_PT + 1; i++)
380 xor_sum ^= be32_to_cpu(b->updates[i].extent);
381 rv &= (xor_sum == be32_to_cpu(b->xor_sum));
382
383 return rv;
384}
385
386/**
387 * drbd_al_read_log() - Restores the activity log from its on disk representation.
388 * @mdev: DRBD device.
 389 * @bdev: Block device to read from.
390 *
391 * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
392 */
393int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
394{
395 struct al_transaction *buffer;
396 int i;
397 int rv;
398 int mx;
399 int active_extents = 0;
400 int transactions = 0;
401 int found_valid = 0;
402 int from = 0;
403 int to = 0;
404 u32 from_tnr = 0;
405 u32 to_tnr = 0;
406 u32 cnr;
407
408 mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
409
410 /* lock out all other meta data io for now,
411 * and make sure the page is mapped.
412 */
413 mutex_lock(&mdev->md_io_mutex);
414 buffer = page_address(mdev->md_io_page);
415
416 /* Find the valid transaction in the log */
417 for (i = 0; i <= mx; i++) {
418 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
419 if (rv == 0)
420 continue;
421 if (rv == -1) {
422 mutex_unlock(&mdev->md_io_mutex);
423 return 0;
424 }
425 cnr = be32_to_cpu(buffer->tr_number);
426
427 if (++found_valid == 1) {
428 from = i;
429 to = i;
430 from_tnr = cnr;
431 to_tnr = cnr;
432 continue;
433 }
434 if ((int)cnr - (int)from_tnr < 0) {
435 D_ASSERT(from_tnr - cnr + i - from == mx+1);
436 from = i;
437 from_tnr = cnr;
438 }
439 if ((int)cnr - (int)to_tnr > 0) {
440 D_ASSERT(cnr - to_tnr == i - to);
441 to = i;
442 to_tnr = cnr;
443 }
444 }
445
446 if (!found_valid) {
447 dev_warn(DEV, "No usable activity log found.\n");
448 mutex_unlock(&mdev->md_io_mutex);
449 return 1;
450 }
451
452 /* Read the valid transactions.
453 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
454 i = from;
455 while (1) {
456 int j, pos;
457 unsigned int extent_nr;
458 unsigned int trn;
459
460 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
461 ERR_IF(rv == 0) goto cancel;
462 if (rv == -1) {
463 mutex_unlock(&mdev->md_io_mutex);
464 return 0;
465 }
466
467 trn = be32_to_cpu(buffer->tr_number);
468
469 spin_lock_irq(&mdev->al_lock);
470
471 /* This loop runs backwards because in the cyclic
472 elements there might be an old version of the
473 updated element (in slot 0). So the element in slot 0
474 can overwrite old versions. */
475 for (j = AL_EXTENTS_PT; j >= 0; j--) {
476 pos = be32_to_cpu(buffer->updates[j].pos);
477 extent_nr = be32_to_cpu(buffer->updates[j].extent);
478
479 if (extent_nr == LC_FREE)
480 continue;
481
482 lc_set(mdev->act_log, extent_nr, pos);
483 active_extents++;
484 }
485 spin_unlock_irq(&mdev->al_lock);
486
487 transactions++;
488
489cancel:
490 if (i == to)
491 break;
492 i++;
493 if (i > mx)
494 i = 0;
495 }
496
497 mdev->al_tr_number = to_tnr+1;
498 mdev->al_tr_pos = to;
499 if (++mdev->al_tr_pos >
500 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
501 mdev->al_tr_pos = 0;
502
503 /* ok, we are done with it */
504 mutex_unlock(&mdev->md_io_mutex);
505
506 dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
507 transactions, active_extents);
508
509 return 1;
510}
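
The from/to search above orders 32-bit transaction numbers with a signed-difference comparison so that it still works when tr_number wraps around: (int)cnr - (int)from_tnr is negative when cnr is older and positive when it is newer, as long as the numbers in play span far less than 2^31. A tiny illustration of the wrap case:

/* Wrap-safe "is a newer than b" for 32-bit sequence numbers. */
static int seq_newer(u32 a, u32 b)
{
	return (s32)(a - b) > 0;
}
/* seq_newer(1, 0xfffffffe) is true: the unsigned difference wraps to 3,
 * i.e. 'a' is three transactions newer than 'b'. */
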
511
512static void atodb_endio(struct bio *bio, int error)
513{
514 struct drbd_atodb_wait *wc = bio->bi_private;
515 struct drbd_conf *mdev = wc->mdev;
516 struct page *page;
517 int uptodate = bio_flagged(bio, BIO_UPTODATE);
518
519 /* strange behavior of some lower level drivers...
520 * fail the request by clearing the uptodate flag,
521 * but do not return any error?! */
522 if (!error && !uptodate)
523 error = -EIO;
524
525 drbd_chk_io_error(mdev, error, TRUE);
526 if (error && wc->error == 0)
527 wc->error = error;
528
529 if (atomic_dec_and_test(&wc->count))
530 complete(&wc->io_done);
531
532 page = bio->bi_io_vec[0].bv_page;
533 put_page(page);
534 bio_put(bio);
535 mdev->bm_writ_cnt++;
536 put_ldev(mdev);
537}
538
539/* sector to word */
540#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
541
542/* activity log to on disk bitmap -- prepare bio unless that sector
543 * is already covered by previously prepared bios */
544static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
545 struct bio **bios,
546 unsigned int enr,
547 struct drbd_atodb_wait *wc) __must_hold(local)
548{
549 struct bio *bio;
550 struct page *page;
551 sector_t on_disk_sector;
552 unsigned int page_offset = PAGE_SIZE;
553 int offset;
554 int i = 0;
555 int err = -ENOMEM;
556
557 /* We always write aligned, full 4k blocks,
558 * so we can ignore the logical_block_size (for now) */
559 enr &= ~7U;
560 on_disk_sector = enr + mdev->ldev->md.md_offset
561 + mdev->ldev->md.bm_offset;
562
563 D_ASSERT(!(on_disk_sector & 7U));
564
565 /* Check if that enr is already covered by an already created bio.
566 * Caution, bios[] is not NULL terminated,
567 * but only initialized to all NULL.
568 * For completely scattered activity log,
569 * the last invocation iterates over all bios,
570 * and finds the last NULL entry.
571 */
572 while ((bio = bios[i])) {
573 if (bio->bi_sector == on_disk_sector)
574 return 0;
575 i++;
576 }
577 /* bios[i] == NULL, the next not yet used slot */
578
579 /* GFP_KERNEL, we are not in the write-out path */
580 bio = bio_alloc(GFP_KERNEL, 1);
581 if (bio == NULL)
582 return -ENOMEM;
583
584 if (i > 0) {
585 const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
586 page_offset = prev_bv->bv_offset + prev_bv->bv_len;
587 page = prev_bv->bv_page;
588 }
589 if (page_offset == PAGE_SIZE) {
590 page = alloc_page(__GFP_HIGHMEM);
591 if (page == NULL)
592 goto out_bio_put;
593 page_offset = 0;
594 } else {
595 get_page(page);
596 }
597
598 offset = S2W(enr);
599 drbd_bm_get_lel(mdev, offset,
600 min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
601 kmap(page) + page_offset);
602 kunmap(page);
603
604 bio->bi_private = wc;
605 bio->bi_end_io = atodb_endio;
606 bio->bi_bdev = mdev->ldev->md_bdev;
607 bio->bi_sector = on_disk_sector;
608
609 if (bio_add_page(bio, page, 4096, page_offset) != 4096)
610 goto out_put_page;
611
612 atomic_inc(&wc->count);
613 /* we already know that we may do this...
614 * get_ldev_if_state(mdev,D_ATTACHING);
615 * just get the extra reference, so that the local_cnt reflects
616 * the number of pending IO requests DRBD has at its backing device.
617 */
618 atomic_inc(&mdev->local_cnt);
619
620 bios[i] = bio;
621
622 return 0;
623
624out_put_page:
625 err = -EINVAL;
626 put_page(page);
627out_bio_put:
628 bio_put(bio);
629 return err;
630}
631
632/**
633 * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents
634 * @mdev: DRBD device.
635 *
636 * Called when we detach (unconfigure) local storage,
637 * or when we go from R_PRIMARY to R_SECONDARY role.
638 */
639void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
640{
641 int i, nr_elements;
642 unsigned int enr;
643 struct bio **bios;
644 struct drbd_atodb_wait wc;
645
646 ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
647 return; /* sorry, I don't have any act_log etc... */
648
649 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
650
651 nr_elements = mdev->act_log->nr_elements;
652
653 /* GFP_KERNEL, we are not in anyone's write-out path */
654 bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
655 if (!bios)
656 goto submit_one_by_one;
657
658 atomic_set(&wc.count, 0);
659 init_completion(&wc.io_done);
660 wc.mdev = mdev;
661 wc.error = 0;
662
663 for (i = 0; i < nr_elements; i++) {
664 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
665 if (enr == LC_FREE)
666 continue;
667 /* next statement also does atomic_inc wc.count and local_cnt */
668 if (atodb_prepare_unless_covered(mdev, bios,
669 enr/AL_EXT_PER_BM_SECT,
670 &wc))
671 goto free_bios_submit_one_by_one;
672 }
673
674 /* unnecessary optimization? */
675 lc_unlock(mdev->act_log);
676 wake_up(&mdev->al_wait);
677
678 /* all prepared, submit them */
679 for (i = 0; i < nr_elements; i++) {
680 if (bios[i] == NULL)
681 break;
682 if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
683 bios[i]->bi_rw = WRITE;
684 bio_endio(bios[i], -EIO);
685 } else {
686 submit_bio(WRITE, bios[i]);
687 }
688 }
689
690 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
691
692 /* always (try to) flush bitmap to stable storage */
693 drbd_md_flush(mdev);
694
695 /* In case we did not submit a single IO, do not wait for
696 * them to complete. (Because we would wait forever here.)
697 *
698 * In case we had IOs and they are already complete, there
699 * is no point in waiting anyway.
700 * Therefore this if () ... */
701 if (atomic_read(&wc.count))
702 wait_for_completion(&wc.io_done);
703
704 put_ldev(mdev);
705
706 kfree(bios);
707 return;
708
709 free_bios_submit_one_by_one:
710 /* free everything by calling the endio callback directly. */
711 for (i = 0; i < nr_elements && bios[i]; i++)
712 bio_endio(bios[i], 0);
713
714 kfree(bios);
715
716 submit_one_by_one:
717 dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
718
719 for (i = 0; i < mdev->act_log->nr_elements; i++) {
720 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
721 if (enr == LC_FREE)
722 continue;
723 /* Really slow: if we have al-extents 16..19 active,
724 * sector 4 will be written four times! Synchronous! */
725 drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
726 }
727
728 lc_unlock(mdev->act_log);
729 wake_up(&mdev->al_wait);
730 put_ldev(mdev);
731}
732
733/**
734 * drbd_al_apply_to_bm() - Sets the bitmap to dirty (1) where covered by active AL extents
735 * @mdev: DRBD device.
736 */
737void drbd_al_apply_to_bm(struct drbd_conf *mdev)
738{
739 unsigned int enr;
740 unsigned long add = 0;
741 char ppb[10];
742 int i;
743
744 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
745
746 for (i = 0; i < mdev->act_log->nr_elements; i++) {
747 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
748 if (enr == LC_FREE)
749 continue;
750 add += drbd_bm_ALe_set_all(mdev, enr);
751 }
752
753 lc_unlock(mdev->act_log);
754 wake_up(&mdev->al_wait);
755
756 dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
757 ppsize(ppb, Bit2KB(add)));
758}
759
760static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
761{
762 int rv;
763
764 spin_lock_irq(&mdev->al_lock);
765 rv = (al_ext->refcnt == 0);
766 if (likely(rv))
767 lc_del(mdev->act_log, al_ext);
768 spin_unlock_irq(&mdev->al_lock);
769
770 return rv;
771}
772
773/**
774 * drbd_al_shrink() - Removes all active extents from the activity log
775 * @mdev: DRBD device.
776 *
777 * Removes all active extents from the activity log, waiting until
778 * the reference count of each entry has dropped to 0 first, of course.
779 *
780 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
781 */
782void drbd_al_shrink(struct drbd_conf *mdev)
783{
784 struct lc_element *al_ext;
785 int i;
786
787 D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
788
789 for (i = 0; i < mdev->act_log->nr_elements; i++) {
790 al_ext = lc_element_by_index(mdev->act_log, i);
791 if (al_ext->lc_number == LC_FREE)
792 continue;
793 wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
794 }
795
796 wake_up(&mdev->al_wait);
797}
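Since drbd_al_shrink() requires the caller to already hold the activity log lock, a minimal caller sketch is shown below (not part of the original patch; the helper name is invented). It mirrors the lc_try_lock()/lc_unlock() pattern used elsewhere in this file:

static void example_shrink_act_log(struct drbd_conf *mdev)
{
	/* take the activity log lock the same way drbd_al_to_on_disk_bm() does */
	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
	drbd_al_shrink(mdev);		/* waits for each extent's refcnt to reach 0 */
	lc_unlock(mdev->act_log);
	wake_up(&mdev->al_wait);
}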
798
799static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
800{
801 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
802
803 if (!get_ldev(mdev)) {
804 if (__ratelimit(&drbd_ratelimit_state))
805 dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
806 kfree(udw);
807 return 1;
808 }
809
810 drbd_bm_write_sect(mdev, udw->enr);
811 put_ldev(mdev);
812
813 kfree(udw);
814
815 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
816 switch (mdev->state.conn) {
817 case C_SYNC_SOURCE: case C_SYNC_TARGET:
818 case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
819 drbd_resync_finished(mdev);
820 default:
821 /* nothing to do */
822 break;
823 }
824 }
825 drbd_bcast_sync_progress(mdev);
826
827 return 1;
828}
829
830
831/* ATTENTION. The AL's extents are 4MB each, while the extents in the
832 * resync LRU-cache are 16MB each.
833 * The caller of this function has to hold a get_ldev() reference.
834 *
835 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
836 */
837static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
838 int count, int success)
839{
840 struct lc_element *e;
841 struct update_odbm_work *udw;
842
843 unsigned int enr;
844
845 D_ASSERT(atomic_read(&mdev->local_cnt));
846
847 /* I simply assume that a sector/size pair never crosses
848 * a 16 MB extent border. (Currently this is true...) */
849 enr = BM_SECT_TO_EXT(sector);
850
851 e = lc_get(mdev->resync, enr);
852 if (e) {
853 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
854 if (ext->lce.lc_number == enr) {
855 if (success)
856 ext->rs_left -= count;
857 else
858 ext->rs_failed += count;
859 if (ext->rs_left < ext->rs_failed) {
860 dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
861 "rs_failed=%d count=%d\n",
862 (unsigned long long)sector,
863 ext->lce.lc_number, ext->rs_left,
864 ext->rs_failed, count);
865 dump_stack();
866
867 lc_put(mdev->resync, &ext->lce);
868 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
869 return;
870 }
871 } else {
872 /* Normally this element should be in the cache,
873 * since drbd_rs_begin_io() already pulled it in.
874 *
875 * But maybe an application write finished, and we set
876 * something outside the resync lru_cache in sync.
877 */
878 int rs_left = drbd_bm_e_weight(mdev, enr);
879 if (ext->flags != 0) {
880 dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
881 " -> %d[%u;00]\n",
882 ext->lce.lc_number, ext->rs_left,
883 ext->flags, enr, rs_left);
884 ext->flags = 0;
885 }
886 if (ext->rs_failed) {
887 dev_warn(DEV, "Kicking resync_lru element enr=%u "
888 "out with rs_failed=%d\n",
889 ext->lce.lc_number, ext->rs_failed);
890 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
891 }
892 ext->rs_left = rs_left;
893 ext->rs_failed = success ? 0 : count;
894 lc_changed(mdev->resync, &ext->lce);
895 }
896 lc_put(mdev->resync, &ext->lce);
897 /* no race, we are within the al_lock! */
898
899 if (ext->rs_left == ext->rs_failed) {
900 ext->rs_failed = 0;
901
902 udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
903 if (udw) {
904 udw->enr = ext->lce.lc_number;
905 udw->w.cb = w_update_odbm;
906 drbd_queue_work_front(&mdev->data.work, &udw->w);
907 } else {
908 dev_warn(DEV, "Could not kmalloc an udw\n");
909 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
910 }
911 }
912 } else {
913 dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
914 mdev->resync_locked,
915 mdev->resync->nr_elements,
916 mdev->resync->flags);
917 }
918}
919
920/* clear the bit corresponding to the piece of storage in question:
921 * size bytes of data starting from sector. Only clear the bits of the affected
922 * one or more _aligned_ BM_BLOCK_SIZE blocks.
923 *
924 * called by worker on C_SYNC_TARGET and receiver on C_SYNC_SOURCE.
925 *
926 */
927void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
928 const char *file, const unsigned int line)
929{
930 /* Is called from worker and receiver context _only_ */
931 unsigned long sbnr, ebnr, lbnr;
932 unsigned long count = 0;
933 sector_t esector, nr_sectors;
934 int wake_up = 0;
935 unsigned long flags;
936
937 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
938 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
939 (unsigned long long)sector, size);
940 return;
941 }
942 nr_sectors = drbd_get_capacity(mdev->this_bdev);
943 esector = sector + (size >> 9) - 1;
944
945 ERR_IF(sector >= nr_sectors) return;
946 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
947
948 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
949
950 /* we clear it (in sync).
951 * round up start sector, round down end sector. we make sure we only
952 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
953 if (unlikely(esector < BM_SECT_PER_BIT-1))
954 return;
955 if (unlikely(esector == (nr_sectors-1)))
956 ebnr = lbnr;
957 else
958 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
959 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
960
961 if (sbnr > ebnr)
962 return;
963
964 /*
965 * ok, (capacity & 7) != 0 sometimes, but who cares...
966 * we count rs_{total,left} in bits, not sectors.
967 */
968 spin_lock_irqsave(&mdev->al_lock, flags);
969 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
970 if (count) {
971 /* we need the lock for drbd_try_clear_on_disk_bm */
972 if (jiffies - mdev->rs_mark_time > HZ*10) {
973 /* should be rolling marks,
974 * but we estimate only anyway. */
975 if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
976 mdev->state.conn != C_PAUSED_SYNC_T &&
977 mdev->state.conn != C_PAUSED_SYNC_S) {
978 mdev->rs_mark_time = jiffies;
979 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
980 }
981 }
982 if (get_ldev(mdev)) {
983 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
984 put_ldev(mdev);
985 }
986 /* just wake_up unconditionally now; various lc_changed(),
987 * lc_put() in drbd_try_clear_on_disk_bm(). */
988 wake_up = 1;
989 }
990 spin_unlock_irqrestore(&mdev->al_lock, flags);
991 if (wake_up)
992 wake_up(&mdev->al_wait);
993}
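To make the rounding above concrete, here is a small worked example, assuming 512-byte sectors and 4 KB bitmap blocks (i.e. BM_SECT_PER_BIT == 8; the macro values are not visible in this hunk):

/* illustrative only:
 *  - sector=64, size=4096: esector=71, sbnr=BM_SECT_TO_BIT(71)==8,
 *    ebnr=BM_SECT_TO_BIT(64)==8  -> exactly bit 8 is cleared.
 *  - sector=60, size=4096: esector=67, sbnr=BM_SECT_TO_BIT(67)==8,
 *    ebnr=BM_SECT_TO_BIT(60)==7  -> sbnr > ebnr, nothing is cleared,
 *    because no full, aligned 4 KB block is covered by the request. */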
994
995/*
996 * this is intended to set one request worth of data out of sync.
997 * affects at least 1 bit,
998 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
999 *
1000 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
1001 * so this can be _any_ process.
1002 */
1003void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
1004 const char *file, const unsigned int line)
1005{
1006 unsigned long sbnr, ebnr, lbnr, flags;
1007 sector_t esector, nr_sectors;
1008 unsigned int enr, count;
1009 struct lc_element *e;
1010
1011 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1012 dev_err(DEV, "sector: %llus, size: %d\n",
1013 (unsigned long long)sector, size);
1014 return;
1015 }
1016
1017 if (!get_ldev(mdev))
1018 return; /* no disk, no metadata, no bitmap to set bits in */
1019
1020 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1021 esector = sector + (size >> 9) - 1;
1022
1023 ERR_IF(sector >= nr_sectors)
1024 goto out;
1025 ERR_IF(esector >= nr_sectors)
1026 esector = (nr_sectors-1);
1027
1028 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1029
1030 /* we set it out of sync,
1031 * we do not need to round anything here */
1032 sbnr = BM_SECT_TO_BIT(sector);
1033 ebnr = BM_SECT_TO_BIT(esector);
1034
1035 /* ok, (capacity & 7) != 0 sometimes, but who cares...
1036 * we count rs_{total,left} in bits, not sectors. */
1037 spin_lock_irqsave(&mdev->al_lock, flags);
1038 count = drbd_bm_set_bits(mdev, sbnr, ebnr);
1039
1040 enr = BM_SECT_TO_EXT(sector);
1041 e = lc_find(mdev->resync, enr);
1042 if (e)
1043 lc_entry(e, struct bm_extent, lce)->rs_left += count;
1044 spin_unlock_irqrestore(&mdev->al_lock, flags);
1045
1046out:
1047 put_ldev(mdev);
1048}
1049
1050static
1051struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
1052{
1053 struct lc_element *e;
1054 struct bm_extent *bm_ext;
1055 int wakeup = 0;
1056 unsigned long rs_flags;
1057
1058 spin_lock_irq(&mdev->al_lock);
1059 if (mdev->resync_locked > mdev->resync->nr_elements/2) {
1060 spin_unlock_irq(&mdev->al_lock);
1061 return NULL;
1062 }
1063 e = lc_get(mdev->resync, enr);
1064 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1065 if (bm_ext) {
1066 if (bm_ext->lce.lc_number != enr) {
1067 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1068 bm_ext->rs_failed = 0;
1069 lc_changed(mdev->resync, &bm_ext->lce);
1070 wakeup = 1;
1071 }
1072 if (bm_ext->lce.refcnt == 1)
1073 mdev->resync_locked++;
1074 set_bit(BME_NO_WRITES, &bm_ext->flags);
1075 }
1076 rs_flags = mdev->resync->flags;
1077 spin_unlock_irq(&mdev->al_lock);
1078 if (wakeup)
1079 wake_up(&mdev->al_wait);
1080
1081 if (!bm_ext) {
1082 if (rs_flags & LC_STARVING)
1083 dev_warn(DEV, "Have to wait for element"
1084 " (resync LRU too small?)\n");
1085 BUG_ON(rs_flags & LC_DIRTY);
1086 }
1087
1088 return bm_ext;
1089}
1090
1091static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
1092{
1093 struct lc_element *al_ext;
1094 int rv = 0;
1095
1096 spin_lock_irq(&mdev->al_lock);
1097 if (unlikely(enr == mdev->act_log->new_number))
1098 rv = 1;
1099 else {
1100 al_ext = lc_find(mdev->act_log, enr);
1101 if (al_ext) {
1102 if (al_ext->refcnt)
1103 rv = 1;
1104 }
1105 }
1106 spin_unlock_irq(&mdev->al_lock);
1107
1108 /*
1109 if (unlikely(rv)) {
1110 dev_info(DEV, "Delaying sync read until app's write is done\n");
1111 }
1112 */
1113 return rv;
1114}
1115
1116/**
1117 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
1118 * @mdev: DRBD device.
1119 * @sector: The sector number.
1120 *
1121 * This function sleeps on al_wait. Returns 1 on success, 0 if interrupted.
1122 */
1123int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1124{
1125 unsigned int enr = BM_SECT_TO_EXT(sector);
1126 struct bm_extent *bm_ext;
1127 int i, sig;
1128
1129 sig = wait_event_interruptible(mdev->al_wait,
1130 (bm_ext = _bme_get(mdev, enr)));
1131 if (sig)
1132 return 0;
1133
1134 if (test_bit(BME_LOCKED, &bm_ext->flags))
1135 return 1;
1136
1137 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1138 sig = wait_event_interruptible(mdev->al_wait,
1139 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
1140 if (sig) {
1141 spin_lock_irq(&mdev->al_lock);
1142 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1143 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1144 mdev->resync_locked--;
1145 wake_up(&mdev->al_wait);
1146 }
1147 spin_unlock_irq(&mdev->al_lock);
1148 return 0;
1149 }
1150 }
1151
1152 set_bit(BME_LOCKED, &bm_ext->flags);
1153
1154 return 1;
1155}
1156
1157/**
1158 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
1159 * @mdev: DRBD device.
1160 * @sector: The sector number.
1161 *
1162 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
1163 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
1164 * if there is still application IO going on in this area.
1165 */
1166int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1167{
1168 unsigned int enr = BM_SECT_TO_EXT(sector);
1169 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
1170 struct lc_element *e;
1171 struct bm_extent *bm_ext;
1172 int i;
1173
1174 spin_lock_irq(&mdev->al_lock);
1175 if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
1176 /* in case you have very heavily scattered io, it may
1177 * stall the syncer indefinitely if we give up the ref count
1178 * when we try again and requeue.
1179 *
1180 * if we don't give up the refcount, but the next time
1181 * we are scheduled this extent has been "synced" by new
1182 * application writes, we'd miss the lc_put on the
1183 * extent we keep the refcount on.
1184 * so we remembered which extent we had to try again, and
1185 * if the next requested one is something else, we do
1186 * the lc_put here...
1187 * we also have to wake_up
1188 */
1189 e = lc_find(mdev->resync, mdev->resync_wenr);
1190 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1191 if (bm_ext) {
1192 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1193 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1194 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1195 mdev->resync_wenr = LC_FREE;
1196 if (lc_put(mdev->resync, &bm_ext->lce) == 0)
1197 mdev->resync_locked--;
1198 wake_up(&mdev->al_wait);
1199 } else {
1200 dev_alert(DEV, "LOGIC BUG\n");
1201 }
1202 }
1203 /* TRY. */
1204 e = lc_try_get(mdev->resync, enr);
1205 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1206 if (bm_ext) {
1207 if (test_bit(BME_LOCKED, &bm_ext->flags))
1208 goto proceed;
1209 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
1210 mdev->resync_locked++;
1211 } else {
1212 /* we did set the BME_NO_WRITES,
1213 * but then could not set BME_LOCKED,
1214 * so we tried again.
1215 * drop the extra reference. */
1216 bm_ext->lce.refcnt--;
1217 D_ASSERT(bm_ext->lce.refcnt > 0);
1218 }
1219 goto check_al;
1220 } else {
1221 /* do we rather want to try later? */
1222 if (mdev->resync_locked > mdev->resync->nr_elements-3)
1223 goto try_again;
1224 /* Do or do not. There is no try. -- Yoda */
1225 e = lc_get(mdev->resync, enr);
1226 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1227 if (!bm_ext) {
1228 const unsigned long rs_flags = mdev->resync->flags;
1229 if (rs_flags & LC_STARVING)
1230 dev_warn(DEV, "Have to wait for element"
1231 " (resync LRU too small?)\n");
1232 BUG_ON(rs_flags & LC_DIRTY);
1233 goto try_again;
1234 }
1235 if (bm_ext->lce.lc_number != enr) {
1236 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1237 bm_ext->rs_failed = 0;
1238 lc_changed(mdev->resync, &bm_ext->lce);
1239 wake_up(&mdev->al_wait);
1240 D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
1241 }
1242 set_bit(BME_NO_WRITES, &bm_ext->flags);
1243 D_ASSERT(bm_ext->lce.refcnt == 1);
1244 mdev->resync_locked++;
1245 goto check_al;
1246 }
1247check_al:
1248 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1249 if (unlikely(al_enr+i == mdev->act_log->new_number))
1250 goto try_again;
1251 if (lc_is_used(mdev->act_log, al_enr+i))
1252 goto try_again;
1253 }
1254 set_bit(BME_LOCKED, &bm_ext->flags);
1255proceed:
1256 mdev->resync_wenr = LC_FREE;
1257 spin_unlock_irq(&mdev->al_lock);
1258 return 0;
1259
1260try_again:
1261 if (bm_ext)
1262 mdev->resync_wenr = enr;
1263 spin_unlock_irq(&mdev->al_lock);
1264 return -EAGAIN;
1265}
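For contrast with drbd_rs_begin_io() above, a hypothetical caller of this non-blocking variant might look like the sketch below (illustrative only; the real resync worker simply requeues its work on -EAGAIN instead of polling, and the helper name is invented):

static int example_lock_rs_extent(struct drbd_conf *mdev, sector_t sector)
{
	int err;

	while ((err = drbd_try_rs_begin_io(mdev, sector)) == -EAGAIN) {
		if (signal_pending(current))
			return -EINTR;
		/* application IO still active in this area; back off briefly */
		schedule_timeout_interruptible(HZ / 10);
	}
	return err;	/* 0 means the extent is now BME_LOCKED */
}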
1266
1267void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
1268{
1269 unsigned int enr = BM_SECT_TO_EXT(sector);
1270 struct lc_element *e;
1271 struct bm_extent *bm_ext;
1272 unsigned long flags;
1273
1274 spin_lock_irqsave(&mdev->al_lock, flags);
1275 e = lc_find(mdev->resync, enr);
1276 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1277 if (!bm_ext) {
1278 spin_unlock_irqrestore(&mdev->al_lock, flags);
1279 if (__ratelimit(&drbd_ratelimit_state))
1280 dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
1281 return;
1282 }
1283
1284 if (bm_ext->lce.refcnt == 0) {
1285 spin_unlock_irqrestore(&mdev->al_lock, flags);
1286 dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
1287 "but refcnt is 0!?\n",
1288 (unsigned long long)sector, enr);
1289 return;
1290 }
1291
1292 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1293 clear_bit(BME_LOCKED, &bm_ext->flags);
1294 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1295 mdev->resync_locked--;
1296 wake_up(&mdev->al_wait);
1297 }
1298
1299 spin_unlock_irqrestore(&mdev->al_lock, flags);
1300}
1301
1302/**
1303 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
1304 * @mdev: DRBD device.
1305 */
1306void drbd_rs_cancel_all(struct drbd_conf *mdev)
1307{
1308 spin_lock_irq(&mdev->al_lock);
1309
1310 if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
1311 lc_reset(mdev->resync);
1312 put_ldev(mdev);
1313 }
1314 mdev->resync_locked = 0;
1315 mdev->resync_wenr = LC_FREE;
1316 spin_unlock_irq(&mdev->al_lock);
1317 wake_up(&mdev->al_wait);
1318}
1319
1320/**
1321 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
1322 * @mdev: DRBD device.
1323 *
1324 * Returns 0 upon success, -EAGAIN if at least one reference count was
1325 * not zero.
1326 */
1327int drbd_rs_del_all(struct drbd_conf *mdev)
1328{
1329 struct lc_element *e;
1330 struct bm_extent *bm_ext;
1331 int i;
1332
1333 spin_lock_irq(&mdev->al_lock);
1334
1335 if (get_ldev_if_state(mdev, D_FAILED)) {
1336 /* ok, ->resync is there. */
1337 for (i = 0; i < mdev->resync->nr_elements; i++) {
1338 e = lc_element_by_index(mdev->resync, i);
1339 bm_ext = lc_entry(e, struct bm_extent, lce);
1340 if (bm_ext->lce.lc_number == LC_FREE)
1341 continue;
1342 if (bm_ext->lce.lc_number == mdev->resync_wenr) {
1343 dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
1344 " got 'synced' by application io\n",
1345 mdev->resync_wenr);
1346 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1347 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1348 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1349 mdev->resync_wenr = LC_FREE;
1350 lc_put(mdev->resync, &bm_ext->lce);
1351 }
1352 if (bm_ext->lce.refcnt != 0) {
1353 dev_info(DEV, "Retrying drbd_rs_del_all() later. "
1354 "refcnt=%d\n", bm_ext->lce.refcnt);
1355 put_ldev(mdev);
1356 spin_unlock_irq(&mdev->al_lock);
1357 return -EAGAIN;
1358 }
1359 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1360 D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
1361 lc_del(mdev->resync, &bm_ext->lce);
1362 }
1363 D_ASSERT(mdev->resync->used == 0);
1364 put_ldev(mdev);
1365 }
1366 spin_unlock_irq(&mdev->al_lock);
1367
1368 return 0;
1369}
1370
1371/**
1372 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
1373 * @mdev: DRBD device.
1374 * @sector: The sector number.
1375 * @size: Size of the failed IO operation, in bytes.
1376 */
1377void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1378{
1379 /* Is called from worker and receiver context _only_ */
1380 unsigned long sbnr, ebnr, lbnr;
1381 unsigned long count;
1382 sector_t esector, nr_sectors;
1383 int wake_up = 0;
1384
1385 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1386 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1387 (unsigned long long)sector, size);
1388 return;
1389 }
1390 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1391 esector = sector + (size >> 9) - 1;
1392
1393 ERR_IF(sector >= nr_sectors) return;
1394 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
1395
1396 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1397
1398 /*
1399 * round up start sector, round down end sector. we make sure we only
1400 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
1401 if (unlikely(esector < BM_SECT_PER_BIT-1))
1402 return;
1403 if (unlikely(esector == (nr_sectors-1)))
1404 ebnr = lbnr;
1405 else
1406 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
1407 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
1408
1409 if (sbnr > ebnr)
1410 return;
1411
1412 /*
1413 * ok, (capacity & 7) != 0 sometimes, but who cares...
1414 * we count rs_{total,left} in bits, not sectors.
1415 */
1416 spin_lock_irq(&mdev->al_lock);
1417 count = drbd_bm_count_bits(mdev, sbnr, ebnr);
1418 if (count) {
1419 mdev->rs_failed += count;
1420
1421 if (get_ldev(mdev)) {
1422 drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
1423 put_ldev(mdev);
1424 }
1425
1426 /* just wake_up unconditionally now; various lc_changed(),
1427 * lc_put() in drbd_try_clear_on_disk_bm(). */
1428 wake_up = 1;
1429 }
1430 spin_unlock_irq(&mdev->al_lock);
1431 if (wake_up)
1432 wake_up(&mdev->al_wait);
1433}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000000..3390716898d5
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1328 @@
1/*
2 drbd_bitmap.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/bitops.h>
26#include <linux/vmalloc.h>
27#include <linux/string.h>
28#include <linux/drbd.h>
29#include <linux/slab.h>
30#include <asm/kmap_types.h>
31#include "drbd_int.h"
32
33/* OPAQUE outside this file!
34 * interface defined in drbd_int.h
35
36 * convention:
37 * function name drbd_bm_... => used elsewhere, "public".
38 * function name bm_... => internal to implementation, "private".
39
40 * Note that since find_first_bit returns int, at the current granularity of
41 * the bitmap (4KB per bit), this implementation "only" supports up to
42 * 1<<(32+12) == 16 TB...
43 */
44
45/*
46 * NOTE
47 * Access to the *bm_pages is protected by bm_lock.
48 * It is safe to read the other members within the lock.
49 *
50 * drbd_bm_set_bits is called from bio_endio callbacks.
51 * We may be called with irq already disabled,
52 * so we need spin_lock_irqsave().
53 * And we need the kmap_atomic.
54 */
55struct drbd_bitmap {
56 struct page **bm_pages;
57 spinlock_t bm_lock;
58 /* WARNING unsigned long bm_*:
59 * 32bit number of bit offset is just enough for 512 MB bitmap.
60 * it will blow up if we make the bitmap bigger...
61 * not that it makes much sense to have a bitmap that large,
62 * rather change the granularity to 16k or 64k or something.
63 * (that implies other problems, however...)
64 */
65 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
66 unsigned long bm_bits;
67 size_t bm_words;
68 size_t bm_number_of_pages;
69 sector_t bm_dev_capacity;
70 struct mutex bm_change; /* serializes resize operations */
71
72 atomic_t bm_async_io;
73 wait_queue_head_t bm_io_wait;
74
75 unsigned long bm_flags;
76
77 /* debugging aid, in case we are still racy somewhere */
78 char *bm_why;
79 struct task_struct *bm_task;
80};
81
82/* definition of bits in bm_flags */
83#define BM_LOCKED 0
84#define BM_MD_IO_ERROR 1
85#define BM_P_VMALLOCED 2
86
87static int bm_is_locked(struct drbd_bitmap *b)
88{
89 return test_bit(BM_LOCKED, &b->bm_flags);
90}
91
92#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
93static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
94{
95 struct drbd_bitmap *b = mdev->bitmap;
96 if (!__ratelimit(&drbd_ratelimit_state))
97 return;
98 dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
99 current == mdev->receiver.task ? "receiver" :
100 current == mdev->asender.task ? "asender" :
101 current == mdev->worker.task ? "worker" : current->comm,
102 func, b->bm_why ?: "?",
103 b->bm_task == mdev->receiver.task ? "receiver" :
104 b->bm_task == mdev->asender.task ? "asender" :
105 b->bm_task == mdev->worker.task ? "worker" : "?");
106}
107
108void drbd_bm_lock(struct drbd_conf *mdev, char *why)
109{
110 struct drbd_bitmap *b = mdev->bitmap;
111 int trylock_failed;
112
113 if (!b) {
114 dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
115 return;
116 }
117
118 trylock_failed = !mutex_trylock(&b->bm_change);
119
120 if (trylock_failed) {
121 dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
122 current == mdev->receiver.task ? "receiver" :
123 current == mdev->asender.task ? "asender" :
124 current == mdev->worker.task ? "worker" : current->comm,
125 why, b->bm_why ?: "?",
126 b->bm_task == mdev->receiver.task ? "receiver" :
127 b->bm_task == mdev->asender.task ? "asender" :
128 b->bm_task == mdev->worker.task ? "worker" : "?");
129 mutex_lock(&b->bm_change);
130 }
131 if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
132 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
133
134 b->bm_why = why;
135 b->bm_task = current;
136}
137
138void drbd_bm_unlock(struct drbd_conf *mdev)
139{
140 struct drbd_bitmap *b = mdev->bitmap;
141 if (!b) {
142 dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
143 return;
144 }
145
146 if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
147 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
148
149 b->bm_why = NULL;
150 b->bm_task = NULL;
151 mutex_unlock(&b->bm_change);
152}
153
154/* word offset to long pointer */
155static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
156{
157 struct page *page;
158 unsigned long page_nr;
159
160 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
161 page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
162 BUG_ON(page_nr >= b->bm_number_of_pages);
163 page = b->bm_pages[page_nr];
164
165 return (unsigned long *) kmap_atomic(page, km);
166}
167
168static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
169{
170 return __bm_map_paddr(b, offset, KM_IRQ1);
171}
172
173static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
174{
175 kunmap_atomic(p_addr, km);
176};
177
178static void bm_unmap(unsigned long *p_addr)
179{
180 return __bm_unmap(p_addr, KM_IRQ1);
181}
182
183/* long word offset of _bitmap_ sector */
184#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
185/* word offset from start of bitmap to word number _in_page_
186 * modulo longs per page
187#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
188 hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
189 so do it explicitly:
190 */
191#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
192
193/* Long words per page */
194#define LWPP (PAGE_SIZE/sizeof(long))
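For concreteness, a worked example of the word-to-page mapping, assuming 4 KB pages and 64-bit longs (PAGE_SHIFT 12, LN2_BPL 6; these values are not shown in this hunk):

/* illustrative only: LWPP == 512, so word offset 1000 maps to
 * page_nr == 1000 >> (12 - 6 + 3) == 1000 >> 9 == 1 (see __bm_map_paddr)
 * at in-page word MLPP(1000) == 1000 & 511 == 488. */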
195
196/*
197 * actually most functions herein should take a struct drbd_bitmap*, not a
198 * struct drbd_conf*, but for the debug macros I like to have the mdev around
199 * to be able to report device-specific messages.
200 */
201
202static void bm_free_pages(struct page **pages, unsigned long number)
203{
204 unsigned long i;
205 if (!pages)
206 return;
207
208 for (i = 0; i < number; i++) {
209 if (!pages[i]) {
210 printk(KERN_ALERT "drbd: bm_free_pages tried to free "
211 "a NULL pointer; i=%lu n=%lu\n",
212 i, number);
213 continue;
214 }
215 __free_page(pages[i]);
216 pages[i] = NULL;
217 }
218}
219
220static void bm_vk_free(void *ptr, int v)
221{
222 if (v)
223 vfree(ptr);
224 else
225 kfree(ptr);
226}
227
228/*
229 * "have" and "want" are NUMBER OF PAGES.
230 */
231static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
232{
233 struct page **old_pages = b->bm_pages;
234 struct page **new_pages, *page;
235 unsigned int i, bytes, vmalloced = 0;
236 unsigned long have = b->bm_number_of_pages;
237
238 BUG_ON(have == 0 && old_pages != NULL);
239 BUG_ON(have != 0 && old_pages == NULL);
240
241 if (have == want)
242 return old_pages;
243
244 /* Trying kmalloc first, falling back to vmalloc.
245 * GFP_KERNEL is ok, as this is done when a lower level disk is
246 * "attached" to the drbd. Context is receiver thread or cqueue
247 * thread. As we have no disk yet, we are not in the IO path,
248 * not even the IO path of the peer. */
249 bytes = sizeof(struct page *)*want;
250 new_pages = kmalloc(bytes, GFP_KERNEL);
251 if (!new_pages) {
252 new_pages = vmalloc(bytes);
253 if (!new_pages)
254 return NULL;
255 vmalloced = 1;
256 }
257
258 memset(new_pages, 0, bytes);
259 if (want >= have) {
260 for (i = 0; i < have; i++)
261 new_pages[i] = old_pages[i];
262 for (; i < want; i++) {
263 page = alloc_page(GFP_HIGHUSER);
264 if (!page) {
265 bm_free_pages(new_pages + have, i - have);
266 bm_vk_free(new_pages, vmalloced);
267 return NULL;
268 }
269 new_pages[i] = page;
270 }
271 } else {
272 for (i = 0; i < want; i++)
273 new_pages[i] = old_pages[i];
274 /* NOT HERE, we are outside the spinlock!
275 bm_free_pages(old_pages + want, have - want);
276 */
277 }
278
279 if (vmalloced)
280 set_bit(BM_P_VMALLOCED, &b->bm_flags);
281 else
282 clear_bit(BM_P_VMALLOCED, &b->bm_flags);
283
284 return new_pages;
285}
286
287/*
288 * called on driver init only. TODO call when a device is created.
289 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
290 */
291int drbd_bm_init(struct drbd_conf *mdev)
292{
293 struct drbd_bitmap *b = mdev->bitmap;
294 WARN_ON(b != NULL);
295 b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
296 if (!b)
297 return -ENOMEM;
298 spin_lock_init(&b->bm_lock);
299 mutex_init(&b->bm_change);
300 init_waitqueue_head(&b->bm_io_wait);
301
302 mdev->bitmap = b;
303
304 return 0;
305}
306
307sector_t drbd_bm_capacity(struct drbd_conf *mdev)
308{
309 ERR_IF(!mdev->bitmap) return 0;
310 return mdev->bitmap->bm_dev_capacity;
311}
312
313/* called on driver unload. TODO: call when a device is destroyed.
314 */
315void drbd_bm_cleanup(struct drbd_conf *mdev)
316{
317 ERR_IF (!mdev->bitmap) return;
318 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
319 bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
320 kfree(mdev->bitmap);
321 mdev->bitmap = NULL;
322}
323
324/*
325 * since (b->bm_bits % BITS_PER_LONG) != 0,
326 * this masks out the remaining bits.
327 * Returns the number of bits cleared.
328 */
329static int bm_clear_surplus(struct drbd_bitmap *b)
330{
331 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
332 size_t w = b->bm_bits >> LN2_BPL;
333 int cleared = 0;
334 unsigned long *p_addr, *bm;
335
336 p_addr = bm_map_paddr(b, w);
337 bm = p_addr + MLPP(w);
338 if (w < b->bm_words) {
339 cleared = hweight_long(*bm & ~mask);
340 *bm &= mask;
341 w++; bm++;
342 }
343
344 if (w < b->bm_words) {
345 cleared += hweight_long(*bm);
346 *bm = 0;
347 }
348 bm_unmap(p_addr);
349 return cleared;
350}
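A short worked example of the masking above, assuming 64-bit longs and bm_bits == 100 (so bm_words == 2, per the word computation in drbd_bm_resize()):

/* illustrative only: mask == (1UL << (100 & 63)) - 1 == (1UL << 36) - 1,
 * so bits 0..35 of word 1 (bitmap bits 64..99) are kept, the surplus
 * bits 36..63 of that word are counted and cleared, and w becomes 2,
 * which is not < bm_words, so the second branch does nothing. */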
351
352static void bm_set_surplus(struct drbd_bitmap *b)
353{
354 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
355 size_t w = b->bm_bits >> LN2_BPL;
356 unsigned long *p_addr, *bm;
357
358 p_addr = bm_map_paddr(b, w);
359 bm = p_addr + MLPP(w);
360 if (w < b->bm_words) {
361 *bm |= ~mask;
362 bm++; w++;
363 }
364
365 if (w < b->bm_words) {
366 *bm = ~(0UL);
367 }
368 bm_unmap(p_addr);
369}
370
371static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
372{
373 unsigned long *p_addr, *bm, offset = 0;
374 unsigned long bits = 0;
375 unsigned long i, do_now;
376
377 while (offset < b->bm_words) {
378 i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
379 p_addr = __bm_map_paddr(b, offset, KM_USER0);
380 bm = p_addr + MLPP(offset);
381 while (i--) {
382#ifndef __LITTLE_ENDIAN
383 if (swap_endian)
384 *bm = lel_to_cpu(*bm);
385#endif
386 bits += hweight_long(*bm++);
387 }
388 __bm_unmap(p_addr, KM_USER0);
389 offset += do_now;
390 cond_resched();
391 }
392
393 return bits;
394}
395
396static unsigned long bm_count_bits(struct drbd_bitmap *b)
397{
398 return __bm_count_bits(b, 0);
399}
400
401static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
402{
403 return __bm_count_bits(b, 1);
404}
405
406/* offset and len in long words.*/
407static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
408{
409 unsigned long *p_addr, *bm;
410 size_t do_now, end;
411
412#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
413
414 end = offset + len;
415
416 if (end > b->bm_words) {
417 printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
418 return;
419 }
420
421 while (offset < end) {
422 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
423 p_addr = bm_map_paddr(b, offset);
424 bm = p_addr + MLPP(offset);
425 if (bm+do_now > p_addr + LWPP) {
426 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
427 p_addr, bm, (int)do_now);
428 break; /* breaks to after catch_oob_access_end() only! */
429 }
430 memset(bm, c, do_now * sizeof(long));
431 bm_unmap(p_addr);
432 offset += do_now;
433 }
434}
435
436/*
437 * make sure the bitmap has enough room for the attached storage,
438 * if necessary, resize.
439 * called whenever we may have changed the device size.
440 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
441 * In case this is actually a resize, we copy the old bitmap into the new one.
442 * Otherwise, the bitmap is initialized to all bits set.
443 */
444int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
445{
446 struct drbd_bitmap *b = mdev->bitmap;
447 unsigned long bits, words, owords, obits, *p_addr, *bm;
448 unsigned long want, have, onpages; /* number of pages */
449 struct page **npages, **opages = NULL;
450 int err = 0, growing;
451 int opages_vmalloced;
452
453 ERR_IF(!b) return -ENOMEM;
454
455 drbd_bm_lock(mdev, "resize");
456
457 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
458 (unsigned long long)capacity);
459
460 if (capacity == b->bm_dev_capacity)
461 goto out;
462
463 opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
464
465 if (capacity == 0) {
466 spin_lock_irq(&b->bm_lock);
467 opages = b->bm_pages;
468 onpages = b->bm_number_of_pages;
469 owords = b->bm_words;
470 b->bm_pages = NULL;
471 b->bm_number_of_pages =
472 b->bm_set =
473 b->bm_bits =
474 b->bm_words =
475 b->bm_dev_capacity = 0;
476 spin_unlock_irq(&b->bm_lock);
477 bm_free_pages(opages, onpages);
478 bm_vk_free(opages, opages_vmalloced);
479 goto out;
480 }
481 bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
482
483 /* if we used
484 words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
485 a 32bit host could present the wrong number of words
486 to a 64bit host.
487 */
488 words = ALIGN(bits, 64) >> LN2_BPL;
489
490 if (get_ldev(mdev)) {
491 D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
492 put_ldev(mdev);
493 }
494
495 /* one extra long to catch off-by-one errors */
496 want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
497 have = b->bm_number_of_pages;
498 if (want == have) {
499 D_ASSERT(b->bm_pages != NULL);
500 npages = b->bm_pages;
501 } else {
502 if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
503 npages = NULL;
504 else
505 npages = bm_realloc_pages(b, want);
506 }
507
508 if (!npages) {
509 err = -ENOMEM;
510 goto out;
511 }
512
513 spin_lock_irq(&b->bm_lock);
514 opages = b->bm_pages;
515 owords = b->bm_words;
516 obits = b->bm_bits;
517
518 growing = bits > obits;
519 if (opages)
520 bm_set_surplus(b);
521
522 b->bm_pages = npages;
523 b->bm_number_of_pages = want;
524 b->bm_bits = bits;
525 b->bm_words = words;
526 b->bm_dev_capacity = capacity;
527
528 if (growing) {
529 bm_memset(b, owords, 0xff, words-owords);
530 b->bm_set += bits - obits;
531 }
532
533 if (want < have) {
534 /* implicit: (opages != NULL) && (opages != npages) */
535 bm_free_pages(opages + want, have - want);
536 }
537
538 p_addr = bm_map_paddr(b, words);
539 bm = p_addr + MLPP(words);
540 *bm = DRBD_MAGIC;
541 bm_unmap(p_addr);
542
543 (void)bm_clear_surplus(b);
544
545 spin_unlock_irq(&b->bm_lock);
546 if (opages != npages)
547 bm_vk_free(opages, opages_vmalloced);
548 if (!growing)
549 b->bm_set = bm_count_bits(b);
550 dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
551
552 out:
553 drbd_bm_unlock(mdev);
554 return err;
555}
556
557/* inherently racy:
558 * if not protected by other means, return value may be out of date when
559 * leaving this function...
560 * we still need to lock it, since it is important that this returns
561 * bm_set == 0 precisely.
562 *
563 * maybe bm_set should be atomic_t ?
564 */
565static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
566{
567 struct drbd_bitmap *b = mdev->bitmap;
568 unsigned long s;
569 unsigned long flags;
570
571 ERR_IF(!b) return 0;
572 ERR_IF(!b->bm_pages) return 0;
573
574 spin_lock_irqsave(&b->bm_lock, flags);
575 s = b->bm_set;
576 spin_unlock_irqrestore(&b->bm_lock, flags);
577
578 return s;
579}
580
581unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
582{
583 unsigned long s;
584 /* if I don't have a disk, I don't know about out-of-sync status */
585 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
586 return 0;
587 s = _drbd_bm_total_weight(mdev);
588 put_ldev(mdev);
589 return s;
590}
591
592size_t drbd_bm_words(struct drbd_conf *mdev)
593{
594 struct drbd_bitmap *b = mdev->bitmap;
595 ERR_IF(!b) return 0;
596 ERR_IF(!b->bm_pages) return 0;
597
598 return b->bm_words;
599}
600
601unsigned long drbd_bm_bits(struct drbd_conf *mdev)
602{
603 struct drbd_bitmap *b = mdev->bitmap;
604 ERR_IF(!b) return 0;
605
606 return b->bm_bits;
607}
608
609/* merge number words from buffer into the bitmap starting at offset.
610 * buffer[i] is expected to be little endian unsigned long.
611 * bitmap must be locked by drbd_bm_lock.
612 * currently only used from receive_bitmap.
613 */
614void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
615 unsigned long *buffer)
616{
617 struct drbd_bitmap *b = mdev->bitmap;
618 unsigned long *p_addr, *bm;
619 unsigned long word, bits;
620 size_t end, do_now;
621
622 end = offset + number;
623
624 ERR_IF(!b) return;
625 ERR_IF(!b->bm_pages) return;
626 if (number == 0)
627 return;
628 WARN_ON(offset >= b->bm_words);
629 WARN_ON(end > b->bm_words);
630
631 spin_lock_irq(&b->bm_lock);
632 while (offset < end) {
633 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
634 p_addr = bm_map_paddr(b, offset);
635 bm = p_addr + MLPP(offset);
636 offset += do_now;
637 while (do_now--) {
638 bits = hweight_long(*bm);
639 word = *bm | lel_to_cpu(*buffer++);
640 *bm++ = word;
641 b->bm_set += hweight_long(word) - bits;
642 }
643 bm_unmap(p_addr);
644 }
645 /* with 32bit <-> 64bit cross-platform connect
646 * this is only correct for current usage,
647 * where we _know_ that we are 64 bit aligned,
648 * and know that this function is used in this way, too...
649 */
650 if (end == b->bm_words)
651 b->bm_set -= bm_clear_surplus(b);
652
653 spin_unlock_irq(&b->bm_lock);
654}
655
656/* copy number words from the bitmap starting at offset into the buffer.
657 * buffer[i] will be little endian unsigned long.
658 */
659void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
660 unsigned long *buffer)
661{
662 struct drbd_bitmap *b = mdev->bitmap;
663 unsigned long *p_addr, *bm;
664 size_t end, do_now;
665
666 end = offset + number;
667
668 ERR_IF(!b) return;
669 ERR_IF(!b->bm_pages) return;
670
671 spin_lock_irq(&b->bm_lock);
672 if ((offset >= b->bm_words) ||
673 (end > b->bm_words) ||
674 (number <= 0))
675 dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
676 (unsigned long) offset,
677 (unsigned long) number,
678 (unsigned long) b->bm_words);
679 else {
680 while (offset < end) {
681 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
682 p_addr = bm_map_paddr(b, offset);
683 bm = p_addr + MLPP(offset);
684 offset += do_now;
685 while (do_now--)
686 *buffer++ = cpu_to_lel(*bm++);
687 bm_unmap(p_addr);
688 }
689 }
690 spin_unlock_irq(&b->bm_lock);
691}
692
693/* set all bits in the bitmap */
694void drbd_bm_set_all(struct drbd_conf *mdev)
695{
696 struct drbd_bitmap *b = mdev->bitmap;
697 ERR_IF(!b) return;
698 ERR_IF(!b->bm_pages) return;
699
700 spin_lock_irq(&b->bm_lock);
701 bm_memset(b, 0, 0xff, b->bm_words);
702 (void)bm_clear_surplus(b);
703 b->bm_set = b->bm_bits;
704 spin_unlock_irq(&b->bm_lock);
705}
706
707/* clear all bits in the bitmap */
708void drbd_bm_clear_all(struct drbd_conf *mdev)
709{
710 struct drbd_bitmap *b = mdev->bitmap;
711 ERR_IF(!b) return;
712 ERR_IF(!b->bm_pages) return;
713
714 spin_lock_irq(&b->bm_lock);
715 bm_memset(b, 0, 0, b->bm_words);
716 b->bm_set = 0;
717 spin_unlock_irq(&b->bm_lock);
718}
719
720static void bm_async_io_complete(struct bio *bio, int error)
721{
722 struct drbd_bitmap *b = bio->bi_private;
723 int uptodate = bio_flagged(bio, BIO_UPTODATE);
724
725
726 /* strange behavior of some lower level drivers...
727 * fail the request by clearing the uptodate flag,
728 * but do not return any error?!
729 * do we want to WARN() on this? */
730 if (!error && !uptodate)
731 error = -EIO;
732
733 if (error) {
734 /* doh. what now?
735 * for now, set all bits, and flag MD_IO_ERROR */
736 __set_bit(BM_MD_IO_ERROR, &b->bm_flags);
737 }
738 if (atomic_dec_and_test(&b->bm_async_io))
739 wake_up(&b->bm_io_wait);
740
741 bio_put(bio);
742}
743
744static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
745{
746 /* we are process context. we always get a bio */
747 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
748 unsigned int len;
749 sector_t on_disk_sector =
750 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
751 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
752
753 /* this might happen with a very small
754 * flexible external meta data device */
755 len = min_t(unsigned int, PAGE_SIZE,
756 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
757
758 bio->bi_bdev = mdev->ldev->md_bdev;
759 bio->bi_sector = on_disk_sector;
760 bio_add_page(bio, b->bm_pages[page_nr], len, 0);
761 bio->bi_private = b;
762 bio->bi_end_io = bm_async_io_complete;
763
764 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
765 bio->bi_rw |= rw;
766 bio_endio(bio, -EIO);
767 } else {
768 submit_bio(rw, bio);
769 }
770}
771
772# if defined(__LITTLE_ENDIAN)
773 /* nothing to do, on disk == in memory */
774# define bm_cpu_to_lel(x) ((void)0)
775# else
776void bm_cpu_to_lel(struct drbd_bitmap *b)
777{
778 /* need to cpu_to_lel all the pages ...
779 * this may be optimized by using
780 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
781 * the following is still not optimal, but better than nothing */
782 unsigned int i;
783 unsigned long *p_addr, *bm;
784 if (b->bm_set == 0) {
785 /* no page at all; avoid swap if all is 0 */
786 i = b->bm_number_of_pages;
787 } else if (b->bm_set == b->bm_bits) {
788 /* only the last page */
789 i = b->bm_number_of_pages - 1;
790 } else {
791 /* all pages */
792 i = 0;
793 }
794 for (; i < b->bm_number_of_pages; i++) {
795 p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
796 for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
797 *bm = cpu_to_lel(*bm);
798 kunmap_atomic(p_addr, KM_USER0);
799 }
800}
801# endif
802/* lel_to_cpu == cpu_to_lel */
803# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
804
805/*
806 * bm_rw: read/write the whole bitmap from/to its on disk location.
807 */
808static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
809{
810 struct drbd_bitmap *b = mdev->bitmap;
811 /* sector_t sector; */
812 int bm_words, num_pages, i;
813 unsigned long now;
814 char ppb[10];
815 int err = 0;
816
817 WARN_ON(!bm_is_locked(b));
818
819 /* no spinlock here, the drbd_bm_lock should be enough! */
820
821 bm_words = drbd_bm_words(mdev);
822 num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
823
824 /* on disk bitmap is little endian */
825 if (rw == WRITE)
826 bm_cpu_to_lel(b);
827
828 now = jiffies;
829 atomic_set(&b->bm_async_io, num_pages);
830 __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
831
832 /* let the layers below us try to merge these bios... */
833 for (i = 0; i < num_pages; i++)
834 bm_page_io_async(mdev, b, i, rw);
835
836 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
837 wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
838
839 if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
840 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
841 drbd_chk_io_error(mdev, 1, TRUE);
842 err = -EIO;
843 }
844
845 now = jiffies;
846 if (rw == WRITE) {
847 /* swap back endianness */
848 bm_lel_to_cpu(b);
849 /* flush bitmap to stable storage */
850 drbd_md_flush(mdev);
851 } else /* rw == READ */ {
852 /* just read, if necessary adjust endianness */
853 b->bm_set = bm_count_bits_swap_endian(b);
854 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
855 jiffies - now);
856 }
857 now = b->bm_set;
858
859 dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
860 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
861
862 return err;
863}
864
865/**
866 * drbd_bm_read() - Read the whole bitmap from its on disk location.
867 * @mdev: DRBD device.
868 */
869int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
870{
871 return bm_rw(mdev, READ);
872}
873
874/**
875 * drbd_bm_write() - Write the whole bitmap to its on disk location.
876 * @mdev: DRBD device.
877 */
878int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
879{
880 return bm_rw(mdev, WRITE);
881}
882
883/**
884 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
885 * @mdev: DRBD device.
886 * @enr: Extent number in the resync lru (happens to be sector offset)
887 *
888 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
889 * by a single sector write. Therefore enr == sector offset from the
890 * start of the bitmap.
891 */
892int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
893{
894 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
895 + mdev->ldev->md.bm_offset;
896 int bm_words, num_words, offset;
897 int err = 0;
898
899 mutex_lock(&mdev->md_io_mutex);
900 bm_words = drbd_bm_words(mdev);
901 offset = S2W(enr); /* word offset into bitmap */
902 num_words = min(S2W(1), bm_words - offset);
903 if (num_words < S2W(1))
904 memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
905 drbd_bm_get_lel(mdev, offset, num_words,
906 page_address(mdev->md_io_page));
907 if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
908 int i;
909 err = -EIO;
910 dev_err(DEV, "IO ERROR writing bitmap sector %lu "
911 "(meta-disk sector %llus)\n",
912 enr, (unsigned long long)on_disk_sector);
913 drbd_chk_io_error(mdev, 1, TRUE);
914 for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
915 drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
916 }
917 mdev->bm_writ_cnt++;
918 mutex_unlock(&mdev->md_io_mutex);
919 return err;
920}
921
922/* NOTE
923 * find_first_bit returns int, we return unsigned long.
924 * should not make much difference anyway, but ...
925 *
926 * this returns a bit number, NOT a sector!
927 */
928#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
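A concrete walk through the bit-to-page mapping used below, assuming PAGE_SHIFT 12 and LN2_BPL 6 (so one page holds 1 << 15 == 32768 bits and BPP_MASK == 32767):

/* illustrative only: for bm_fo == 100000,
 * bit_offset == 100000 & ~BPP_MASK == 98304 (start of page 3),
 * its word offset 98304 >> 6 == 1536 maps to page 1536 >> 9 == 3,
 * and find_next_bit() starts at 100000 & BPP_MASK == 1696 in that page. */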
929static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
930 const int find_zero_bit, const enum km_type km)
931{
932 struct drbd_bitmap *b = mdev->bitmap;
933 unsigned long i = -1UL;
934 unsigned long *p_addr;
935 unsigned long bit_offset; /* bit offset of the mapped page. */
936
937 if (bm_fo > b->bm_bits) {
938 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
939 } else {
940 while (bm_fo < b->bm_bits) {
941 unsigned long offset;
942 bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
943 offset = bit_offset >> LN2_BPL; /* word offset of the page */
944 p_addr = __bm_map_paddr(b, offset, km);
945
946 if (find_zero_bit)
947 i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
948 else
949 i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
950
951 __bm_unmap(p_addr, km);
952 if (i < PAGE_SIZE*8) {
953 i = bit_offset + i;
954 if (i >= b->bm_bits)
955 break;
956 goto found;
957 }
958 bm_fo = bit_offset + PAGE_SIZE*8;
959 }
960 i = -1UL;
961 }
962 found:
963 return i;
964}
965
966static unsigned long bm_find_next(struct drbd_conf *mdev,
967 unsigned long bm_fo, const int find_zero_bit)
968{
969 struct drbd_bitmap *b = mdev->bitmap;
970 unsigned long i = -1UL;
971
972 ERR_IF(!b) return i;
973 ERR_IF(!b->bm_pages) return i;
974
975 spin_lock_irq(&b->bm_lock);
976 if (bm_is_locked(b))
977 bm_print_lock_info(mdev);
978
979 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
980
981 spin_unlock_irq(&b->bm_lock);
982 return i;
983}
984
985unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
986{
987 return bm_find_next(mdev, bm_fo, 0);
988}
989
990#if 0
991/* not yet needed for anything. */
992unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
993{
994 return bm_find_next(mdev, bm_fo, 1);
995}
996#endif
997
998/* does not spin_lock_irqsave.
999 * you must take drbd_bm_lock() first */
1000unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1001{
1002 /* WARN_ON(!bm_is_locked(mdev)); */
1003 return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1004}
1005
1006unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1007{
1008 /* WARN_ON(!bm_is_locked(mdev)); */
1009 return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1010}
1011
1012/* returns number of bits actually changed.
1013 * for val != 0, we change 0 -> 1, return code positive
1014 * for val == 0, we change 1 -> 0, return code negative
1015 * wants bitnr, not sector.
1016 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1017 * Must hold bitmap lock already. */
1018int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1019 unsigned long e, int val, const enum km_type km)
1020{
1021 struct drbd_bitmap *b = mdev->bitmap;
1022 unsigned long *p_addr = NULL;
1023 unsigned long bitnr;
1024 unsigned long last_page_nr = -1UL;
1025 int c = 0;
1026
1027 if (e >= b->bm_bits) {
1028 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1029 s, e, b->bm_bits);
1030 e = b->bm_bits ? b->bm_bits -1 : 0;
1031 }
1032 for (bitnr = s; bitnr <= e; bitnr++) {
1033 unsigned long offset = bitnr>>LN2_BPL;
1034 unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
1035 if (page_nr != last_page_nr) {
1036 if (p_addr)
1037 __bm_unmap(p_addr, km);
1038 p_addr = __bm_map_paddr(b, offset, km);
1039 last_page_nr = page_nr;
1040 }
1041 if (val)
1042 c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
1043 else
1044 c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
1045 }
1046 if (p_addr)
1047 __bm_unmap(p_addr, km);
1048 b->bm_set += c;
1049 return c;
1050}
1051
1052/* returns number of bits actually changed.
1053 * for val != 0, we change 0 -> 1, return code positive
1054 * for val == 0, we change 1 -> 0, return code negative
1055 * wants bitnr, not sector */
1056int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1057 const unsigned long e, int val)
1058{
1059 unsigned long flags;
1060 struct drbd_bitmap *b = mdev->bitmap;
1061 int c = 0;
1062
1063 ERR_IF(!b) return 1;
1064 ERR_IF(!b->bm_pages) return 0;
1065
1066 spin_lock_irqsave(&b->bm_lock, flags);
1067 if (bm_is_locked(b))
1068 bm_print_lock_info(mdev);
1069
1070 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
1071
1072 spin_unlock_irqrestore(&b->bm_lock, flags);
1073 return c;
1074}
1075
1076/* returns number of bits changed 0 -> 1 */
1077int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1078{
1079 return bm_change_bits_to(mdev, s, e, 1);
1080}
1081
1082/* returns number of bits changed 1 -> 0 */
1083int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1084{
1085 return -bm_change_bits_to(mdev, s, e, 0);
1086}
1087
1088/* sets all bits in full words,
1089 * from first_word up to, but not including, last_word */
1090static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1091 int page_nr, int first_word, int last_word)
1092{
1093 int i;
1094 int bits;
1095 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
1096 for (i = first_word; i < last_word; i++) {
1097 bits = hweight_long(paddr[i]);
1098 paddr[i] = ~0UL;
1099 b->bm_set += BITS_PER_LONG - bits;
1100 }
1101 kunmap_atomic(paddr, KM_USER0);
1102}
1103
1104/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
1105 * You must first drbd_bm_lock().
1106 * Can be called to set the whole bitmap in one go.
1107 * Sets bits from s to e _inclusive_. */
1108void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1109{
1110 /* First set_bit from the first bit (s)
1111 * up to the next long boundary (sl),
1112 * then assign full words up to the last long boundary (el),
1113 * then set_bit up to and including the last bit (e).
1114 *
1115 * Do not use memset, because we must account for changes,
 1116 * so we need to loop over the words with hweight() anyway.
1117 */
1118 unsigned long sl = ALIGN(s,BITS_PER_LONG);
1119 unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1120 int first_page;
1121 int last_page;
1122 int page_nr;
1123 int first_word;
1124 int last_word;
1125
1126 if (e - s <= 3*BITS_PER_LONG) {
1127 /* don't bother; el and sl may even be wrong. */
1128 __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
1129 return;
1130 }
1131
1132 /* difference is large enough that we can trust sl and el */
1133
1134 /* bits filling the current long */
1135 if (sl)
1136 __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
1137
1138 first_page = sl >> (3 + PAGE_SHIFT);
1139 last_page = el >> (3 + PAGE_SHIFT);
1140
1141 /* MLPP: modulo longs per page */
1142 /* LWPP: long words per page */
1143 first_word = MLPP(sl >> LN2_BPL);
1144 last_word = LWPP;
1145
1146 /* first and full pages, unless first page == last page */
1147 for (page_nr = first_page; page_nr < last_page; page_nr++) {
1148 bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1149 cond_resched();
1150 first_word = 0;
1151 }
1152
1153 /* last page (respectively only page, for first page == last page) */
1154 last_word = MLPP(el >> LN2_BPL);
1155 bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
1156
1157 /* possibly trailing bits.
1158 * example: (e & 63) == 63, el will be e+1.
1159 * if that even was the very last bit,
1160 * it would trigger an assert in __bm_change_bits_to()
1161 */
1162 if (el <= e)
1163 __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
1164}
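For illustration only, not part of the patch: the head/full-words/tail split described in the comment above can be mirrored in plain userspace C. set_range() below is a hypothetical stand-in that works on an ordinary array instead of the kmap'ed bitmap pages and does no accounting.

#include <stdio.h>
#include <string.h>

#define BPL (8 * sizeof(unsigned long))   /* bits per long */

/* Set bits [s, e] inclusive: single bits up to the next long
 * boundary, then whole words, then the remaining single bits. */
static void set_range(unsigned long *bm, unsigned long s, unsigned long e)
{
    unsigned long sl = (s + BPL - 1) & ~(unsigned long)(BPL - 1);  /* round s up */
    unsigned long el = (e + 1) & ~(unsigned long)(BPL - 1);        /* round e+1 down */
    unsigned long i;

    if (e - s <= 3 * BPL) {                       /* small range: bit by bit */
        for (i = s; i <= e; i++)
            bm[i / BPL] |= 1UL << (i % BPL);
        return;
    }
    for (i = s; i < sl; i++)                      /* head bits */
        bm[i / BPL] |= 1UL << (i % BPL);
    for (i = sl / BPL; i < el / BPL; i++)         /* full words */
        bm[i] = ~0UL;
    for (i = el; i <= e; i++)                     /* tail bits */
        bm[i / BPL] |= 1UL << (i % BPL);
}

int main(void)
{
    unsigned long bm[8];

    memset(bm, 0, sizeof(bm));
    set_range(bm, 3, 300);
    printf("word 0 = %lx\n", bm[0]);   /* bits 3..63 set */
    return 0;
}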
1165
1166/* returns bit state
1167 * wants bitnr, NOT sector.
1168 * inherently racy... area needs to be locked by means of {al,rs}_lru
1169 * 1 ... bit set
1170 * 0 ... bit not set
1171 * -1 ... first out of bounds access, stop testing for bits!
1172 */
1173int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1174{
1175 unsigned long flags;
1176 struct drbd_bitmap *b = mdev->bitmap;
1177 unsigned long *p_addr;
1178 int i;
1179
1180 ERR_IF(!b) return 0;
1181 ERR_IF(!b->bm_pages) return 0;
1182
1183 spin_lock_irqsave(&b->bm_lock, flags);
1184 if (bm_is_locked(b))
1185 bm_print_lock_info(mdev);
1186 if (bitnr < b->bm_bits) {
1187 unsigned long offset = bitnr>>LN2_BPL;
1188 p_addr = bm_map_paddr(b, offset);
1189 i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
1190 bm_unmap(p_addr);
1191 } else if (bitnr == b->bm_bits) {
1192 i = -1;
1193 } else { /* (bitnr > b->bm_bits) */
1194 dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1195 i = 0;
1196 }
1197
1198 spin_unlock_irqrestore(&b->bm_lock, flags);
1199 return i;
1200}
1201
1202/* returns number of bits set in the range [s, e] */
1203int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1204{
1205 unsigned long flags;
1206 struct drbd_bitmap *b = mdev->bitmap;
1207 unsigned long *p_addr = NULL, page_nr = -1;
1208 unsigned long bitnr;
1209 int c = 0;
1210 size_t w;
1211
1212 /* If this is called without a bitmap, that is a bug. But just to be
1213 * robust in case we screwed up elsewhere, in that case pretend there
1214 * was one dirty bit in the requested area, so we won't try to do a
1215 * local read there (no bitmap probably implies no disk) */
1216 ERR_IF(!b) return 1;
1217 ERR_IF(!b->bm_pages) return 1;
1218
1219 spin_lock_irqsave(&b->bm_lock, flags);
1220 if (bm_is_locked(b))
1221 bm_print_lock_info(mdev);
1222 for (bitnr = s; bitnr <= e; bitnr++) {
1223 w = bitnr >> LN2_BPL;
1224 if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
1225 page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
1226 if (p_addr)
1227 bm_unmap(p_addr);
1228 p_addr = bm_map_paddr(b, w);
1229 }
1230 ERR_IF (bitnr >= b->bm_bits) {
1231 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1232 } else {
1233 c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1234 }
1235 }
1236 if (p_addr)
1237 bm_unmap(p_addr);
1238 spin_unlock_irqrestore(&b->bm_lock, flags);
1239 return c;
1240}
1241
1242
1243/* inherently racy...
1244 * return value may be already out-of-date when this function returns.
 1245 * but the general usage is that this is only used during a cstate when bits are
 1246 * only cleared, not set, and we typically only care about the case when the return
 1247 * value is zero, or we have already "locked" this "bitmap extent" by other means.
1248 *
1249 * enr is bm-extent number, since we chose to name one sector (512 bytes)
1250 * worth of the bitmap a "bitmap extent".
1251 *
1252 * TODO
1253 * I think since we use it like a reference count, we should use the real
1254 * reference count of some bitmap extent element from some lru instead...
1255 *
1256 */
1257int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1258{
1259 struct drbd_bitmap *b = mdev->bitmap;
1260 int count, s, e;
1261 unsigned long flags;
1262 unsigned long *p_addr, *bm;
1263
1264 ERR_IF(!b) return 0;
1265 ERR_IF(!b->bm_pages) return 0;
1266
1267 spin_lock_irqsave(&b->bm_lock, flags);
1268 if (bm_is_locked(b))
1269 bm_print_lock_info(mdev);
1270
1271 s = S2W(enr);
1272 e = min((size_t)S2W(enr+1), b->bm_words);
1273 count = 0;
1274 if (s < b->bm_words) {
1275 int n = e-s;
1276 p_addr = bm_map_paddr(b, s);
1277 bm = p_addr + MLPP(s);
1278 while (n--)
1279 count += hweight_long(*bm++);
1280 bm_unmap(p_addr);
1281 } else {
1282 dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1283 }
1284 spin_unlock_irqrestore(&b->bm_lock, flags);
1285 return count;
1286}
1287
1288/* set all bits covered by the AL-extent al_enr */
1289unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1290{
1291 struct drbd_bitmap *b = mdev->bitmap;
1292 unsigned long *p_addr, *bm;
1293 unsigned long weight;
1294 int count, s, e, i, do_now;
1295 ERR_IF(!b) return 0;
1296 ERR_IF(!b->bm_pages) return 0;
1297
1298 spin_lock_irq(&b->bm_lock);
1299 if (bm_is_locked(b))
1300 bm_print_lock_info(mdev);
1301 weight = b->bm_set;
1302
1303 s = al_enr * BM_WORDS_PER_AL_EXT;
1304 e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
1305 /* assert that s and e are on the same page */
1306 D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
1307 == s >> (PAGE_SHIFT - LN2_BPL + 3));
1308 count = 0;
1309 if (s < b->bm_words) {
1310 i = do_now = e-s;
1311 p_addr = bm_map_paddr(b, s);
1312 bm = p_addr + MLPP(s);
1313 while (i--) {
1314 count += hweight_long(*bm);
1315 *bm = -1UL;
1316 bm++;
1317 }
1318 bm_unmap(p_addr);
1319 b->bm_set += do_now*BITS_PER_LONG - count;
1320 if (e == b->bm_words)
1321 b->bm_set -= bm_clear_surplus(b);
1322 } else {
1323 dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
1324 }
1325 weight = b->bm_set - weight;
1326 spin_unlock_irq(&b->bm_lock);
1327 return weight;
1328}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000000..e5e86a781820
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2261 @@
1/*
2 drbd_int.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#ifndef _DRBD_INT_H
27#define _DRBD_INT_H
28
29#include <linux/compiler.h>
30#include <linux/types.h>
31#include <linux/version.h>
32#include <linux/list.h>
33#include <linux/sched.h>
34#include <linux/bitops.h>
35#include <linux/slab.h>
36#include <linux/crypto.h>
37#include <linux/ratelimit.h>
38#include <linux/tcp.h>
39#include <linux/mutex.h>
40#include <linux/major.h>
41#include <linux/blkdev.h>
42#include <linux/genhd.h>
43#include <net/tcp.h>
44#include <linux/lru_cache.h>
45
46#ifdef __CHECKER__
47# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
48# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
49# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
50# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
51#else
52# define __protected_by(x)
53# define __protected_read_by(x)
54# define __protected_write_by(x)
55# define __must_hold(x)
56#endif
57
58#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
59
60/* module parameter, defined in drbd_main.c */
61extern unsigned int minor_count;
62extern int disable_sendpage;
63extern int allow_oos;
64extern unsigned int cn_idx;
65
66#ifdef CONFIG_DRBD_FAULT_INJECTION
67extern int enable_faults;
68extern int fault_rate;
69extern int fault_devs;
70#endif
71
72extern char usermode_helper[];
73
74
75#ifndef TRUE
76#define TRUE 1
77#endif
78#ifndef FALSE
79#define FALSE 0
80#endif
81
82/* I don't remember why XCPU ...
83 * This is used to wake the asender,
 84 * and to interrupt the sending task
85 * on disconnect.
86 */
87#define DRBD_SIG SIGXCPU
88
89/* This is used to stop/restart our threads.
90 * Cannot use SIGTERM nor SIGKILL, since these
 91 * are sent out by init on runlevel changes.
 92 * I chose SIGHUP for now.
93 */
94#define DRBD_SIGKILL SIGHUP
95
96/* All EEs on the free list should have ID_VACANT (== 0)
97 * freshly allocated EEs get !ID_VACANT (== 1)
98 * so if it says "cannot dereference null pointer at address 0x00000001",
99 * it is most likely one of these :( */
100
101#define ID_IN_SYNC (4711ULL)
102#define ID_OUT_OF_SYNC (4712ULL)
103
104#define ID_SYNCER (-1ULL)
105#define ID_VACANT 0
106#define is_syncer_block_id(id) ((id) == ID_SYNCER)
107
108struct drbd_conf;
109
110
111/* to shorten dev_warn(DEV, "msg"); and relatives statements */
112#define DEV (disk_to_dev(mdev->vdisk))
113
114#define D_ASSERT(exp) if (!(exp)) \
115 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
116
117#define ERR_IF(exp) if (({ \
118 int _b = (exp) != 0; \
119 if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \
120 __func__, #exp, __FILE__, __LINE__); \
121 _b; \
122 }))
123
124/* Defines to control fault insertion */
125enum {
126 DRBD_FAULT_MD_WR = 0, /* meta data write */
127 DRBD_FAULT_MD_RD = 1, /* read */
128 DRBD_FAULT_RS_WR = 2, /* resync */
129 DRBD_FAULT_RS_RD = 3,
130 DRBD_FAULT_DT_WR = 4, /* data */
131 DRBD_FAULT_DT_RD = 5,
132 DRBD_FAULT_DT_RA = 6, /* data read ahead */
133 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
134 DRBD_FAULT_AL_EE = 8, /* alloc ee */
135
136 DRBD_FAULT_MAX,
137};
138
139#ifdef CONFIG_DRBD_FAULT_INJECTION
140extern unsigned int
141_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
142static inline int
143drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
144 return fault_rate &&
145 (enable_faults & (1<<type)) &&
146 _drbd_insert_fault(mdev, type);
147}
148#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
149
150#else
151#define FAULT_ACTIVE(_m, _t) (0)
152#endif
153
154/* integer division, round _UP_ to the next integer */
155#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
156/* usual integer division */
157#define div_floor(A, B) ((A)/(B))
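A trivial userspace check of the two helpers above (illustrative only):

#include <stdio.h>

#define div_ceil(A, B)  ((A)/(B) + ((A)%(B) ? 1 : 0))
#define div_floor(A, B) ((A)/(B))

int main(void)
{
    printf("div_ceil(7, 3)  = %d\n", div_ceil(7, 3));   /* 3: rounds up */
    printf("div_floor(7, 3) = %d\n", div_floor(7, 3));  /* 2: plain integer division */
    return 0;
}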
158
159/* drbd_meta-data.c (still in drbd_main.c) */
160/* 4th incarnation of the disk layout. */
161#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
162
163extern struct drbd_conf **minor_table;
164extern struct ratelimit_state drbd_ratelimit_state;
165
166/* on the wire */
167enum drbd_packets {
168 /* receiver (data socket) */
169 P_DATA = 0x00,
170 P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
171 P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
172 P_BARRIER = 0x03,
173 P_BITMAP = 0x04,
174 P_BECOME_SYNC_TARGET = 0x05,
175 P_BECOME_SYNC_SOURCE = 0x06,
176 P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
177 P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
178 P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
179 P_SYNC_PARAM = 0x0a,
180 P_PROTOCOL = 0x0b,
181 P_UUIDS = 0x0c,
182 P_SIZES = 0x0d,
183 P_STATE = 0x0e,
184 P_SYNC_UUID = 0x0f,
185 P_AUTH_CHALLENGE = 0x10,
186 P_AUTH_RESPONSE = 0x11,
187 P_STATE_CHG_REQ = 0x12,
188
 189 /* asender (meta socket) */
190 P_PING = 0x13,
191 P_PING_ACK = 0x14,
192 P_RECV_ACK = 0x15, /* Used in protocol B */
193 P_WRITE_ACK = 0x16, /* Used in protocol C */
194 P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
195 P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
196 P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
197 P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
198 P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
199 P_BARRIER_ACK = 0x1c,
200 P_STATE_CHG_REPLY = 0x1d,
201
202 /* "new" commands, no longer fitting into the ordering scheme above */
203
204 P_OV_REQUEST = 0x1e, /* data socket */
205 P_OV_REPLY = 0x1f,
206 P_OV_RESULT = 0x20, /* meta socket */
207 P_CSUM_RS_REQUEST = 0x21, /* data socket */
208 P_RS_IS_IN_SYNC = 0x22, /* meta socket */
209 P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
210 P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
211
212 P_MAX_CMD = 0x25,
213 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
214 P_MAX_OPT_CMD = 0x101,
215
216 /* special command ids for handshake */
217
218 P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
219 P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
220
221 P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
222};
223
224static inline const char *cmdname(enum drbd_packets cmd)
225{
226 /* THINK may need to become several global tables
227 * when we want to support more than
228 * one PRO_VERSION */
229 static const char *cmdnames[] = {
230 [P_DATA] = "Data",
231 [P_DATA_REPLY] = "DataReply",
232 [P_RS_DATA_REPLY] = "RSDataReply",
233 [P_BARRIER] = "Barrier",
234 [P_BITMAP] = "ReportBitMap",
235 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
236 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
237 [P_UNPLUG_REMOTE] = "UnplugRemote",
238 [P_DATA_REQUEST] = "DataRequest",
239 [P_RS_DATA_REQUEST] = "RSDataRequest",
240 [P_SYNC_PARAM] = "SyncParam",
241 [P_SYNC_PARAM89] = "SyncParam89",
242 [P_PROTOCOL] = "ReportProtocol",
243 [P_UUIDS] = "ReportUUIDs",
244 [P_SIZES] = "ReportSizes",
245 [P_STATE] = "ReportState",
246 [P_SYNC_UUID] = "ReportSyncUUID",
247 [P_AUTH_CHALLENGE] = "AuthChallenge",
248 [P_AUTH_RESPONSE] = "AuthResponse",
249 [P_PING] = "Ping",
250 [P_PING_ACK] = "PingAck",
251 [P_RECV_ACK] = "RecvAck",
252 [P_WRITE_ACK] = "WriteAck",
253 [P_RS_WRITE_ACK] = "RSWriteAck",
254 [P_DISCARD_ACK] = "DiscardAck",
255 [P_NEG_ACK] = "NegAck",
256 [P_NEG_DREPLY] = "NegDReply",
257 [P_NEG_RS_DREPLY] = "NegRSDReply",
258 [P_BARRIER_ACK] = "BarrierAck",
259 [P_STATE_CHG_REQ] = "StateChgRequest",
260 [P_STATE_CHG_REPLY] = "StateChgReply",
261 [P_OV_REQUEST] = "OVRequest",
262 [P_OV_REPLY] = "OVReply",
263 [P_OV_RESULT] = "OVResult",
264 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
265 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
266 [P_COMPRESSED_BITMAP] = "CBitmap",
267 [P_MAX_CMD] = NULL,
268 };
269
270 if (cmd == P_HAND_SHAKE_M)
271 return "HandShakeM";
272 if (cmd == P_HAND_SHAKE_S)
273 return "HandShakeS";
274 if (cmd == P_HAND_SHAKE)
275 return "HandShake";
276 if (cmd >= P_MAX_CMD)
277 return "Unknown";
278 return cmdnames[cmd];
279}
280
281/* for sending/receiving the bitmap,
282 * possibly in some encoding scheme */
283struct bm_xfer_ctx {
284 /* "const"
285 * stores total bits and long words
286 * of the bitmap, so we don't need to
287 * call the accessor functions over and again. */
288 unsigned long bm_bits;
289 unsigned long bm_words;
290 /* during xfer, current position within the bitmap */
291 unsigned long bit_offset;
292 unsigned long word_offset;
293
294 /* statistics; index: (h->command == P_BITMAP) */
295 unsigned packets[2];
296 unsigned bytes[2];
297};
298
299extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
300 const char *direction, struct bm_xfer_ctx *c);
301
302static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
303{
304 /* word_offset counts "native long words" (32 or 64 bit),
305 * aligned at 64 bit.
306 * Encoded packet may end at an unaligned bit offset.
307 * In case a fallback clear text packet is transmitted in
308 * between, we adjust this offset back to the last 64bit
309 * aligned "native long word", which makes coding and decoding
310 * the plain text bitmap much more convenient. */
311#if BITS_PER_LONG == 64
312 c->word_offset = c->bit_offset >> 6;
313#elif BITS_PER_LONG == 32
314 c->word_offset = c->bit_offset >> 5;
315 c->word_offset &= ~(1UL);
316#else
317# error "unsupported BITS_PER_LONG"
318#endif
319}
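Illustrative only, not part of the patch: recomputing the conversion above for a 32-bit host shows how the word offset is pulled back to the previous 64-bit boundary before a plain text packet is decoded.

#include <stdio.h>

int main(void)
{
    unsigned long bit_offset  = 100;
    unsigned long word_offset = bit_offset >> 5;   /* 32-bit words: 3 */

    word_offset &= ~1UL;                           /* keep 64-bit aligned: 2 */
    printf("bit %lu -> word %lu\n", bit_offset, word_offset);
    return 0;
}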
320
321#ifndef __packed
322#define __packed __attribute__((packed))
323#endif
324
325/* This is the layout for a packet on the wire.
326 * The byteorder is the network byte order.
327 * (except block_id and barrier fields.
328 * these are pointers to local structs
329 * and have no relevance for the partner,
330 * which just echoes them as received.)
331 *
332 * NOTE that the payload starts at a long aligned offset,
333 * regardless of 32 or 64 bit arch!
334 */
335struct p_header {
336 u32 magic;
337 u16 command;
338 u16 length; /* bytes of data after this header */
339 u8 payload[0];
340} __packed;
341/* 8 bytes. packet FIXED for the next century! */
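A userspace sketch of filling such an 8-byte header in network byte order; the magic value below is a stand-in, not the real DRBD_MAGIC, and struct hdr is a local mirror rather than the driver's p_header.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct hdr {
    uint32_t magic;
    uint16_t command;
    uint16_t length;   /* bytes of payload following the header */
} __attribute__((packed));

int main(void)
{
    struct hdr h;

    h.magic   = htonl(0x12345678);   /* stand-in magic, network byte order */
    h.command = htons(0x13);         /* P_PING, which carries no payload */
    h.length  = htons(0);
    printf("header size = %zu bytes\n", sizeof(h));
    return 0;
}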
342
343/*
344 * short commands, packets without payload, plain p_header:
345 * P_PING
346 * P_PING_ACK
347 * P_BECOME_SYNC_TARGET
348 * P_BECOME_SYNC_SOURCE
349 * P_UNPLUG_REMOTE
350 */
351
352/*
353 * commands with out-of-struct payload:
354 * P_BITMAP (no additional fields)
355 * P_DATA, P_DATA_REPLY (see p_data)
356 * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
357 */
358
359/* these defines must not be changed without changing the protocol version */
360#define DP_HARDBARRIER 1
361#define DP_RW_SYNC 2
362#define DP_MAY_SET_IN_SYNC 4
363
364struct p_data {
365 struct p_header head;
366 u64 sector; /* 64 bits sector number */
367 u64 block_id; /* to identify the request in protocol B&C */
368 u32 seq_num;
369 u32 dp_flags;
370} __packed;
371
372/*
373 * commands which share a struct:
374 * p_block_ack:
375 * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
376 * P_DISCARD_ACK (proto C, two-primaries conflict detection)
377 * p_block_req:
378 * P_DATA_REQUEST, P_RS_DATA_REQUEST
379 */
380struct p_block_ack {
381 struct p_header head;
382 u64 sector;
383 u64 block_id;
384 u32 blksize;
385 u32 seq_num;
386} __packed;
387
388
389struct p_block_req {
390 struct p_header head;
391 u64 sector;
392 u64 block_id;
393 u32 blksize;
394 u32 pad; /* to multiple of 8 Byte */
395} __packed;
396
397/*
398 * commands with their own struct for additional fields:
399 * P_HAND_SHAKE
400 * P_BARRIER
401 * P_BARRIER_ACK
402 * P_SYNC_PARAM
403 * ReportParams
404 */
405
406struct p_handshake {
407 struct p_header head; /* 8 bytes */
408 u32 protocol_min;
409 u32 feature_flags;
410 u32 protocol_max;
411
 412 /* should be more than enough for future enhancements.
 413 * For now, feature_flags and the reserverd array shall be zero.
414 */
415
416 u32 _pad;
417 u64 reserverd[7];
418} __packed;
419/* 80 bytes, FIXED for the next century */
420
421struct p_barrier {
422 struct p_header head;
423 u32 barrier; /* barrier number _handle_ only */
424 u32 pad; /* to multiple of 8 Byte */
425} __packed;
426
427struct p_barrier_ack {
428 struct p_header head;
429 u32 barrier;
430 u32 set_size;
431} __packed;
432
433struct p_rs_param {
434 struct p_header head;
435 u32 rate;
436
437 /* Since protocol version 88 and higher. */
438 char verify_alg[0];
439} __packed;
440
441struct p_rs_param_89 {
442 struct p_header head;
443 u32 rate;
444 /* protocol version 89: */
445 char verify_alg[SHARED_SECRET_MAX];
446 char csums_alg[SHARED_SECRET_MAX];
447} __packed;
448
449enum drbd_conn_flags {
450 CF_WANT_LOSE = 1,
451 CF_DRY_RUN = 2,
452};
453
454struct p_protocol {
455 struct p_header head;
456 u32 protocol;
457 u32 after_sb_0p;
458 u32 after_sb_1p;
459 u32 after_sb_2p;
460 u32 conn_flags;
461 u32 two_primaries;
462
463 /* Since protocol version 87 and higher. */
464 char integrity_alg[0];
465
466} __packed;
467
468struct p_uuids {
469 struct p_header head;
470 u64 uuid[UI_EXTENDED_SIZE];
471} __packed;
472
473struct p_rs_uuid {
474 struct p_header head;
475 u64 uuid;
476} __packed;
477
478struct p_sizes {
479 struct p_header head;
480 u64 d_size; /* size of disk */
481 u64 u_size; /* user requested size */
482 u64 c_size; /* current exported size */
483 u32 max_segment_size; /* Maximal size of a BIO */
484 u32 queue_order_type;
485} __packed;
486
487struct p_state {
488 struct p_header head;
489 u32 state;
490} __packed;
491
492struct p_req_state {
493 struct p_header head;
494 u32 mask;
495 u32 val;
496} __packed;
497
498struct p_req_state_reply {
499 struct p_header head;
500 u32 retcode;
501} __packed;
502
503struct p_drbd06_param {
504 u64 size;
505 u32 state;
506 u32 blksize;
507 u32 protocol;
508 u32 version;
509 u32 gen_cnt[5];
510 u32 bit_map_gen[5];
511} __packed;
512
513struct p_discard {
514 struct p_header head;
515 u64 block_id;
516 u32 seq_num;
517 u32 pad;
518} __packed;
519
520/* Valid values for the encoding field.
521 * Bump proto version when changing this. */
522enum drbd_bitmap_code {
523 /* RLE_VLI_Bytes = 0,
524 * and other bit variants had been defined during
525 * algorithm evaluation. */
526 RLE_VLI_Bits = 2,
527};
528
529struct p_compressed_bm {
530 struct p_header head;
531 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
532 * (encoding & 0x80): polarity (set/unset) of first runlength
533 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
534 * used to pad up to head.length bytes
535 */
536 u8 encoding;
537
538 u8 code[0];
539} __packed;
540
541/* DCBP: Drbd Compressed Bitmap Packet ... */
542static inline enum drbd_bitmap_code
543DCBP_get_code(struct p_compressed_bm *p)
544{
545 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
546}
547
548static inline void
549DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
550{
551 BUG_ON(code & ~0xf);
552 p->encoding = (p->encoding & ~0xf) | code;
553}
554
555static inline int
556DCBP_get_start(struct p_compressed_bm *p)
557{
558 return (p->encoding & 0x80) != 0;
559}
560
561static inline void
562DCBP_set_start(struct p_compressed_bm *p, int set)
563{
564 p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
565}
566
567static inline int
568DCBP_get_pad_bits(struct p_compressed_bm *p)
569{
570 return (p->encoding >> 4) & 0x7;
571}
572
573static inline void
574DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
575{
576 BUG_ON(n & ~0x7);
577 p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
578}
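An illustrative round trip through the encoding byte documented above (low nibble: code, bits 4-6: pad bits, bit 7: polarity of the first run length); plain userspace C, not part of the patch.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint8_t enc = 0;

    enc |= 2;        /* low nibble: code = RLE_VLI_Bits */
    enc |= 5 << 4;   /* bits 4-6: five trailing pad bits */
    enc |= 0x80;     /* bit 7: first run length describes set bits */

    /* prints: code=2 pad=5 start=1 */
    printf("code=%d pad=%d start=%d\n",
           enc & 0x0f, (enc >> 4) & 0x7, (enc & 0x80) != 0);
    return 0;
}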
579
580/* one bitmap packet, including the p_header,
 581 * should fit within one _architecture independent_ page,
 582 * so we need to use the fixed 4KiB page size
 583 * that most architectures have used for a long time.
584 */
585#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
586#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
587#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
588#if (PAGE_SIZE < 4096)
589/* drbd_send_bitmap / receive_bitmap would break horribly */
590#error "PAGE_SIZE too small"
591#endif
592
593union p_polymorph {
594 struct p_header header;
595 struct p_handshake handshake;
596 struct p_data data;
597 struct p_block_ack block_ack;
598 struct p_barrier barrier;
599 struct p_barrier_ack barrier_ack;
600 struct p_rs_param_89 rs_param_89;
601 struct p_protocol protocol;
602 struct p_sizes sizes;
603 struct p_uuids uuids;
604 struct p_state state;
605 struct p_req_state req_state;
606 struct p_req_state_reply req_state_reply;
607 struct p_block_req block_req;
608} __packed;
609
610/**********************************************************************/
611enum drbd_thread_state {
612 None,
613 Running,
614 Exiting,
615 Restarting
616};
617
618struct drbd_thread {
619 spinlock_t t_lock;
620 struct task_struct *task;
621 struct completion stop;
622 enum drbd_thread_state t_state;
623 int (*function) (struct drbd_thread *);
624 struct drbd_conf *mdev;
625 int reset_cpu_mask;
626};
627
628static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
629{
630 /* THINK testing the t_state seems to be uncritical in all cases
631 * (but thread_{start,stop}), so we can read it *without* the lock.
632 * --lge */
633
634 smp_rmb();
635 return thi->t_state;
636}
637
638
639/*
640 * Having this as the first member of a struct provides sort of "inheritance".
641 * "derived" structs can be "drbd_queue_work()"ed.
642 * The callback should know and cast back to the descendant struct.
643 * drbd_request and drbd_epoch_entry are descendants of drbd_work.
644 */
645struct drbd_work;
646typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
647struct drbd_work {
648 struct list_head list;
649 drbd_work_cb cb;
650};
651
652struct drbd_tl_epoch;
653struct drbd_request {
654 struct drbd_work w;
655 struct drbd_conf *mdev;
656
657 /* if local IO is not allowed, will be NULL.
658 * if local IO _is_ allowed, holds the locally submitted bio clone,
659 * or, after local IO completion, the ERR_PTR(error).
660 * see drbd_endio_pri(). */
661 struct bio *private_bio;
662
663 struct hlist_node colision;
664 sector_t sector;
665 unsigned int size;
666 unsigned int epoch; /* barrier_nr */
667
668 /* barrier_nr: used to check on "completion" whether this req was in
669 * the current epoch, and we therefore have to close it,
670 * starting a new epoch...
671 */
672
673 /* up to here, the struct layout is identical to drbd_epoch_entry;
674 * we might be able to use that to our advantage... */
675
676 struct list_head tl_requests; /* ring list in the transfer log */
677 struct bio *master_bio; /* master bio pointer */
678 unsigned long rq_state; /* see comments above _req_mod() */
679 int seq_num;
680 unsigned long start_time;
681};
682
683struct drbd_tl_epoch {
684 struct drbd_work w;
685 struct list_head requests; /* requests before */
686 struct drbd_tl_epoch *next; /* pointer to the next barrier */
687 unsigned int br_number; /* the barriers identifier. */
688 int n_req; /* number of requests attached before this barrier */
689};
690
691struct drbd_request;
692
693/* These Tl_epoch_entries may be in one of 6 lists:
694 active_ee .. data packet being written
695 sync_ee .. syncer block being written
696 done_ee .. block written, need to send P_WRITE_ACK
697 read_ee .. [RS]P_DATA_REQUEST being read
698*/
699
700struct drbd_epoch {
701 struct list_head list;
702 unsigned int barrier_nr;
703 atomic_t epoch_size; /* increased on every request added. */
704 atomic_t active; /* increased on every req. added, and dec on every finished. */
705 unsigned long flags;
706};
707
708/* drbd_epoch flag bits */
709enum {
710 DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
711 DE_BARRIER_IN_NEXT_EPOCH_DONE,
712 DE_CONTAINS_A_BARRIER,
713 DE_HAVE_BARRIER_NUMBER,
714 DE_IS_FINISHING,
715};
716
717enum epoch_event {
718 EV_PUT,
719 EV_GOT_BARRIER_NR,
720 EV_BARRIER_DONE,
721 EV_BECAME_LAST,
722 EV_CLEANUP = 32, /* used as flag */
723};
724
725struct drbd_epoch_entry {
726 struct drbd_work w;
727 struct drbd_conf *mdev;
728 struct bio *private_bio;
729 struct hlist_node colision;
730 sector_t sector;
731 unsigned int size;
732 struct drbd_epoch *epoch;
733
734 /* up to here, the struct layout is identical to drbd_request;
735 * we might be able to use that to our advantage... */
736
737 unsigned int flags;
738 u64 block_id;
739};
740
741struct drbd_wq_barrier {
742 struct drbd_work w;
743 struct completion done;
744};
745
746struct digest_info {
747 int digest_size;
748 void *digest;
749};
750
751/* ee flag bits */
752enum {
753 __EE_CALL_AL_COMPLETE_IO,
754 __EE_CONFLICT_PENDING,
755 __EE_MAY_SET_IN_SYNC,
756 __EE_IS_BARRIER,
757};
758#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
759#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
760#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
761#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
762
763/* global flag bits */
764enum {
 765 CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
766 SIGNAL_ASENDER, /* whether asender wants to be interrupted */
767 SEND_PING, /* whether asender should send a ping asap */
768
769 STOP_SYNC_TIMER, /* tell timer to cancel itself */
770 UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
771 UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
772 MD_DIRTY, /* current uuids and flags not yet on disk */
773 DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
774 USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
775 CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
776 CL_ST_CHG_SUCCESS,
777 CL_ST_CHG_FAIL,
778 CRASHED_PRIMARY, /* This node was a crashed primary.
779 * Gets cleared when the state.conn
780 * goes into C_CONNECTED state. */
781 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
782 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
783 CONSIDER_RESYNC,
784
785 MD_NO_BARRIER, /* meta data device does not support barriers,
786 so don't even try */
787 SUSPEND_IO, /* suspend application io */
788 BITMAP_IO, /* suspend application io;
789 once no more io in flight, start bitmap io */
790 BITMAP_IO_QUEUED, /* Started bitmap IO */
791 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
792 NET_CONGESTED, /* The data socket is congested */
793
794 CONFIG_PENDING, /* serialization of (re)configuration requests.
795 * if set, also prevents the device from dying */
796 DEVICE_DYING, /* device became unconfigured,
797 * but worker thread is still handling the cleanup.
 798 * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed,
799 * while this is set. */
800 RESIZE_PENDING, /* Size change detected locally, waiting for the response from
801 * the peer, if it changed there as well. */
802 CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
803 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
804};
805
806struct drbd_bitmap; /* opaque for drbd_conf */
807
808/* TODO sort members for performance
809 * MAYBE group them further */
810
811/* THINK maybe we actually want to use the default "event/%s" worker threads
812 * or similar in linux 2.6, which uses per cpu data and threads.
813 *
814 * To be general, this might need a spin_lock member.
815 * For now, please use the mdev->req_lock to protect list_head,
816 * see drbd_queue_work below.
817 */
818struct drbd_work_queue {
819 struct list_head q;
820 struct semaphore s; /* producers up it, worker down()s it */
821 spinlock_t q_lock; /* to protect the list. */
822};
823
824struct drbd_socket {
825 struct drbd_work_queue work;
826 struct mutex mutex;
827 struct socket *socket;
828 /* this way we get our
829 * send/receive buffers off the stack */
830 union p_polymorph sbuf;
831 union p_polymorph rbuf;
832};
833
834struct drbd_md {
835 u64 md_offset; /* sector offset to 'super' block */
836
837 u64 la_size_sect; /* last agreed size, unit sectors */
838 u64 uuid[UI_SIZE];
839 u64 device_uuid;
840 u32 flags;
841 u32 md_size_sect;
842
843 s32 al_offset; /* signed relative sector offset to al area */
844 s32 bm_offset; /* signed relative sector offset to bitmap */
845
846 /* u32 al_nr_extents; important for restoring the AL
847 * is stored into sync_conf.al_extents, which in turn
848 * gets applied to act_log->nr_elements
849 */
850};
851
852/* for sync_conf and other types... */
853#define NL_PACKET(name, number, fields) struct name { fields };
854#define NL_INTEGER(pn,pr,member) int member;
855#define NL_INT64(pn,pr,member) __u64 member;
856#define NL_BIT(pn,pr,member) unsigned member:1;
857#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
858#include "linux/drbd_nl.h"
859
860struct drbd_backing_dev {
861 struct block_device *backing_bdev;
862 struct block_device *md_bdev;
863 struct file *lo_file;
864 struct file *md_file;
865 struct drbd_md md;
866 struct disk_conf dc; /* The user provided config... */
867 sector_t known_size; /* last known size of that backing device */
868};
869
870struct drbd_md_io {
871 struct drbd_conf *mdev;
872 struct completion event;
873 int error;
874};
875
876struct bm_io_work {
877 struct drbd_work w;
878 char *why;
879 int (*io_fn)(struct drbd_conf *mdev);
880 void (*done)(struct drbd_conf *mdev, int rv);
881};
882
883enum write_ordering_e {
884 WO_none,
885 WO_drain_io,
886 WO_bdev_flush,
887 WO_bio_barrier
888};
889
890struct drbd_conf {
891 /* things that are stored as / read from meta data on disk */
892 unsigned long flags;
893
894 /* configured by drbdsetup */
895 struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
896 struct syncer_conf sync_conf;
897 struct drbd_backing_dev *ldev __protected_by(local);
898
899 sector_t p_size; /* partner's disk size */
900 struct request_queue *rq_queue;
901 struct block_device *this_bdev;
902 struct gendisk *vdisk;
903
904 struct drbd_socket data; /* data/barrier/cstate/parameter packets */
905 struct drbd_socket meta; /* ping/ack (metadata) packets */
906 int agreed_pro_version; /* actually used protocol version */
907 unsigned long last_received; /* in jiffies, either socket */
908 unsigned int ko_count;
909 struct drbd_work resync_work,
910 unplug_work,
911 md_sync_work;
912 struct timer_list resync_timer;
913 struct timer_list md_sync_timer;
914
915 /* Used after attach while negotiating new disk state. */
916 union drbd_state new_state_tmp;
917
918 union drbd_state state;
919 wait_queue_head_t misc_wait;
920 wait_queue_head_t state_wait; /* upon each state change. */
921 unsigned int send_cnt;
922 unsigned int recv_cnt;
923 unsigned int read_cnt;
924 unsigned int writ_cnt;
925 unsigned int al_writ_cnt;
926 unsigned int bm_writ_cnt;
927 atomic_t ap_bio_cnt; /* Requests we need to complete */
928 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
929 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
 930 atomic_t unacked_cnt; /* Need to send replies for */
931 atomic_t local_cnt; /* Waiting for local completion */
932 atomic_t net_cnt; /* Users of net_conf */
933 spinlock_t req_lock;
934 struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
935 struct drbd_tl_epoch *newest_tle;
936 struct drbd_tl_epoch *oldest_tle;
937 struct list_head out_of_sequence_requests;
938 struct hlist_head *tl_hash;
939 unsigned int tl_hash_s;
940
941 /* blocks to sync in this run [unit BM_BLOCK_SIZE] */
942 unsigned long rs_total;
943 /* number of sync IOs that failed in this run */
944 unsigned long rs_failed;
945 /* Syncer's start time [unit jiffies] */
946 unsigned long rs_start;
947 /* cumulated time in PausedSyncX state [unit jiffies] */
948 unsigned long rs_paused;
949 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
950 unsigned long rs_mark_left;
 951 /* mark's time [unit jiffies] */
952 unsigned long rs_mark_time;
 953 /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
954 unsigned long rs_same_csum;
955
956 /* where does the admin want us to start? (sector) */
957 sector_t ov_start_sector;
958 /* where are we now? (sector) */
959 sector_t ov_position;
960 /* Start sector of out of sync range (to merge printk reporting). */
961 sector_t ov_last_oos_start;
962 /* size of out-of-sync range in sectors. */
963 sector_t ov_last_oos_size;
964 unsigned long ov_left; /* in bits */
965 struct crypto_hash *csums_tfm;
966 struct crypto_hash *verify_tfm;
967
968 struct drbd_thread receiver;
969 struct drbd_thread worker;
970 struct drbd_thread asender;
971 struct drbd_bitmap *bitmap;
972 unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
973
974 /* Used to track operations of resync... */
975 struct lru_cache *resync;
976 /* Number of locked elements in resync LRU */
977 unsigned int resync_locked;
978 /* resync extent number waiting for application requests */
979 unsigned int resync_wenr;
980
981 int open_cnt;
982 u64 *p_uuid;
983 struct drbd_epoch *current_epoch;
984 spinlock_t epoch_lock;
985 unsigned int epochs;
986 enum write_ordering_e write_ordering;
987 struct list_head active_ee; /* IO in progress */
988 struct list_head sync_ee; /* IO in progress */
989 struct list_head done_ee; /* send ack */
990 struct list_head read_ee; /* IO in progress */
991 struct list_head net_ee; /* zero-copy network send in progress */
 992 struct hlist_head *ee_hash; /* is protected by req_lock! */
993 unsigned int ee_hash_s;
994
995 /* this one is protected by ee_lock, single thread */
996 struct drbd_epoch_entry *last_write_w_barrier;
997
998 int next_barrier_nr;
 999 struct hlist_head *app_reads_hash; /* is protected by req_lock */
1000 struct list_head resync_reads;
1001 atomic_t pp_in_use;
1002 wait_queue_head_t ee_wait;
1003 struct page *md_io_page; /* one page buffer for md_io */
1004 struct page *md_io_tmpp; /* for logical_block_size != 512 */
1005 struct mutex md_io_mutex; /* protects the md_io_buffer */
1006 spinlock_t al_lock;
1007 wait_queue_head_t al_wait;
1008 struct lru_cache *act_log; /* activity log */
1009 unsigned int al_tr_number;
1010 int al_tr_cycle;
1011 int al_tr_pos; /* position of the next transaction in the journal */
1012 struct crypto_hash *cram_hmac_tfm;
1013 struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
1014 struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
1015 void *int_dig_out;
1016 void *int_dig_in;
1017 void *int_dig_vv;
1018 wait_queue_head_t seq_wait;
1019 atomic_t packet_seq;
1020 unsigned int peer_seq;
1021 spinlock_t peer_seq_lock;
1022 unsigned int minor;
1023 unsigned long comm_bm_set; /* communicated number of set bits. */
1024 cpumask_var_t cpu_mask;
1025 struct bm_io_work bm_io_work;
1026 u64 ed_uuid; /* UUID of the exposed data */
1027 struct mutex state_mutex;
 1028 char congestion_reason; /* Why we were congested... */
1029};
1030
1031static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
1032{
1033 struct drbd_conf *mdev;
1034
1035 mdev = minor < minor_count ? minor_table[minor] : NULL;
1036
1037 return mdev;
1038}
1039
1040static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
1041{
1042 return mdev->minor;
1043}
1044
 1045/* returns 1 if it was successful,
1046 * returns 0 if there was no data socket.
1047 * so wherever you are going to use the data.socket, e.g. do
1048 * if (!drbd_get_data_sock(mdev))
1049 * return 0;
1050 * CODE();
1051 * drbd_put_data_sock(mdev);
1052 */
1053static inline int drbd_get_data_sock(struct drbd_conf *mdev)
1054{
1055 mutex_lock(&mdev->data.mutex);
1056 /* drbd_disconnect() could have called drbd_free_sock()
1057 * while we were waiting in down()... */
1058 if (unlikely(mdev->data.socket == NULL)) {
1059 mutex_unlock(&mdev->data.mutex);
1060 return 0;
1061 }
1062 return 1;
1063}
1064
1065static inline void drbd_put_data_sock(struct drbd_conf *mdev)
1066{
1067 mutex_unlock(&mdev->data.mutex);
1068}
1069
1070/*
1071 * function declarations
1072 *************************/
1073
1074/* drbd_main.c */
1075
1076enum chg_state_flags {
1077 CS_HARD = 1,
1078 CS_VERBOSE = 2,
1079 CS_WAIT_COMPLETE = 4,
1080 CS_SERIALIZE = 8,
1081 CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
1082};
1083
1084extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1085extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
1086 union drbd_state mask, union drbd_state val);
1087extern void drbd_force_state(struct drbd_conf *, union drbd_state,
1088 union drbd_state);
1089extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
1090 union drbd_state, enum chg_state_flags);
1091extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
1092 enum chg_state_flags, struct completion *done);
1093extern void print_st_err(struct drbd_conf *, union drbd_state,
1094 union drbd_state, int);
1095extern int drbd_thread_start(struct drbd_thread *thi);
1096extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
1097#ifdef CONFIG_SMP
1098extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
1099extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
1100#else
1101#define drbd_thread_current_set_cpu(A) ({})
1102#define drbd_calc_cpu_mask(A) ({})
1103#endif
1104extern void drbd_free_resources(struct drbd_conf *mdev);
1105extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
1106 unsigned int set_size);
1107extern void tl_clear(struct drbd_conf *mdev);
1108extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
1109extern void drbd_free_sock(struct drbd_conf *mdev);
1110extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1111 void *buf, size_t size, unsigned msg_flags);
1112extern int drbd_send_protocol(struct drbd_conf *mdev);
1113extern int drbd_send_uuids(struct drbd_conf *mdev);
1114extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1115extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
1116extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
1117extern int _drbd_send_state(struct drbd_conf *mdev);
1118extern int drbd_send_state(struct drbd_conf *mdev);
1119extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1120 enum drbd_packets cmd, struct p_header *h,
1121 size_t size, unsigned msg_flags);
1122#define USE_DATA_SOCKET 1
1123#define USE_META_SOCKET 0
1124extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1125 enum drbd_packets cmd, struct p_header *h,
1126 size_t size);
1127extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
1128 char *data, size_t size);
1129extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
1130extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
1131 u32 set_size);
1132extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
1133 struct drbd_epoch_entry *e);
1134extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
1135 struct p_block_req *rp);
1136extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1137 struct p_data *dp);
1138extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1139 sector_t sector, int blksize, u64 block_id);
1140extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
1141 struct drbd_epoch_entry *e);
1142extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
1143extern int _drbd_send_barrier(struct drbd_conf *mdev,
1144 struct drbd_tl_epoch *barrier);
1145extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1146 sector_t sector, int size, u64 block_id);
1147extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
1148 sector_t sector,int size,
1149 void *digest, int digest_size,
1150 enum drbd_packets cmd);
1151extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
1152
1153extern int drbd_send_bitmap(struct drbd_conf *mdev);
1154extern int _drbd_send_bitmap(struct drbd_conf *mdev);
1155extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
1156extern void drbd_free_bc(struct drbd_backing_dev *ldev);
1157extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1158
1159/* drbd_meta-data.c (still in drbd_main.c) */
1160extern void drbd_md_sync(struct drbd_conf *mdev);
1161extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1162/* maybe define them below as inline? */
1163extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1164extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1165extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1166extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1167extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
1168extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
1169extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
1170extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
1171extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
1172extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1173 int (*io_fn)(struct drbd_conf *),
1174 void (*done)(struct drbd_conf *, int),
1175 char *why);
1176extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1177extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1178extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1179
1180
1181/* Meta data layout
1182 We reserve a 128MB Block (4k aligned)
1183 * either at the end of the backing device
1184 * or on a separate meta data device. */
1185
1186#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
1187/* The following numbers are sectors */
1188#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
1189#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
1190/* Allows up to about 3.8TB */
1191#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
1192
 1193/* Since the smallest IO unit is usually 512 bytes */
1194#define MD_SECTOR_SHIFT 9
1195#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
1196
1197/* activity log */
1198#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
1199#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
1200#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
1201
1202#if BITS_PER_LONG == 32
1203#define LN2_BPL 5
1204#define cpu_to_lel(A) cpu_to_le32(A)
1205#define lel_to_cpu(A) le32_to_cpu(A)
1206#elif BITS_PER_LONG == 64
1207#define LN2_BPL 6
1208#define cpu_to_lel(A) cpu_to_le64(A)
1209#define lel_to_cpu(A) le64_to_cpu(A)
1210#else
1211#error "LN2 of BITS_PER_LONG unknown!"
1212#endif
1213
1214/* resync bitmap */
1215/* 16MB sized 'bitmap extent' to track syncer usage */
1216struct bm_extent {
1217 int rs_left; /* number of bits set (out of sync) in this extent. */
1218 int rs_failed; /* number of failed resync requests in this extent. */
1219 unsigned long flags;
1220 struct lc_element lce;
1221};
1222
1223#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
1224#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
1225
1226/* drbd_bitmap.c */
1227/*
1228 * We need to store one bit for a block.
1229 * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
1230 * Bit 0 ==> local node thinks this block is binary identical on both nodes
1231 * Bit 1 ==> local node thinks this block needs to be synced.
1232 */
1233
1234#define BM_BLOCK_SHIFT 12 /* 4k per bit */
1235#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
1236/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
1237 * per sector of on disk bitmap */
1238#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
1239#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
1240
1241#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
1242#error "HAVE YOU FIXED drbdmeta AS WELL??"
1243#endif
1244
1245/* thus many _storage_ sectors are described by one bit */
1246#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
1247#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
1248#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
1249
1250/* bit to represented kilo byte conversion */
1251#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
1252
1253/* in which _bitmap_ extent (resp. sector) the bit for a certain
 1254 * _storage_ sector is located */
1255#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
1256
 1257/* how many _storage_ sectors we have per bitmap sector */
1258#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
1259#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
1260
1261/* in one sector of the bitmap, we have this many activity_log extents. */
1262#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1263#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
1264
1265#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1266#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
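To make the geometry above concrete, a small userspace recomputation (illustrative only): one bit covers 4KiB of storage, one 512-byte on-disk bitmap sector therefore covers 16MiB, and a 1GB device needs the 32KiB bitmap mentioned in the comment further up.

#include <stdio.h>

#define BM_BLOCK_SHIFT  12                                      /* 4KiB per bitmap bit */
#define MD_SECTOR_SHIFT 9                                       /* 512-byte sectors */
#define BM_EXT_SHIFT    (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3)  /* = 24 */

int main(void)
{
    unsigned long long dev_bytes = 1ULL << 30;                  /* 1GB example device */
    unsigned long long bits = dev_bytes >> BM_BLOCK_SHIFT;      /* one bit per 4KiB */

    printf("storage sectors per bit : %llu\n", 1ULL << (BM_BLOCK_SHIFT - 9));
    printf("storage per bm sector   : %llu MiB\n", (1ULL << BM_EXT_SHIFT) >> 20);
    printf("bitmap for a 1GB device : %llu KiB\n", (bits / 8) >> 10);
    return 0;
}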
1267
1268/* the extent in "PER_EXTENT" below is an activity log extent
1269 * we need that many (long words/bytes) to store the bitmap
1270 * of one AL_EXTENT_SIZE chunk of storage.
1271 * we can store the bitmap for that many AL_EXTENTS within
1272 * one sector of the _on_disk_ bitmap:
1273 * bit 0 bit 37 bit 38 bit (512*8)-1
1274 * ...|........|........|.. // ..|........|
1275 * sect. 0 `296 `304 ^(512*8*8)-1
1276 *
1277#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
1278#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
1279#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
1280 */
1281
1282#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
1283#define DRBD_MAX_SECTORS_BM \
1284 ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
1285#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
1286#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
1287#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
1288#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
1289#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
1290#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
1291#else
1292#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
1293/* 16 TB in units of sectors */
1294#if BITS_PER_LONG == 32
1295/* adjust by one page worth of bitmap,
1296 * so we won't wrap around in drbd_bm_find_next_bit.
 1297 * you should use a 64bit OS for that much storage anyway. */
1298#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
1299#else
1300#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
1301#endif
1302#endif
1303
1304/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
 1305 * With a value of 6, all IO in one 32K block makes it to the same slot of the
1306 * hash table. */
1307#define HT_SHIFT 6
1308#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
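A quick illustration of the grouping this shift produces (a userspace sketch, not the driver's actual hash function): two sectors inside the same 32K block (DRBD_MAX_SEGMENT_SIZE = 1 << 15 bytes) share the value of sector >> HT_SHIFT and therefore hash to the same slot.

#include <stdio.h>

#define HT_SHIFT 6   /* 64 sectors of 512 bytes = one 32K block */

int main(void)
{
    unsigned long long s1 = 1000, s2 = 1010;   /* both inside sectors 960..1023 */

    /* prints: 15 15 */
    printf("%llu %llu\n", s1 >> HT_SHIFT, s2 >> HT_SHIFT);
    return 0;
}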
1309
1310/* Number of elements in the app_reads_hash */
1311#define APP_R_HSIZE 15
1312
1313extern int drbd_bm_init(struct drbd_conf *mdev);
1314extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
1315extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1316extern void drbd_bm_set_all(struct drbd_conf *mdev);
1317extern void drbd_bm_clear_all(struct drbd_conf *mdev);
1318extern int drbd_bm_set_bits(
1319 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1320extern int drbd_bm_clear_bits(
1321 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1322/* bm_set_bits variant for use while holding drbd_bm_lock */
1323extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
1324 const unsigned long s, const unsigned long e);
1325extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
1326extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1327extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
1328extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1329extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1330extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1331 unsigned long al_enr);
1332extern size_t drbd_bm_words(struct drbd_conf *mdev);
1333extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
1334extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
1335extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1336/* bm_find_next variants for use while you hold drbd_bm_lock() */
1337extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1338extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
1339extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
1340extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1341/* for receive_bitmap */
1342extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
1343 size_t number, unsigned long *buffer);
1344/* for _drbd_send_bitmap and drbd_bm_write_sect */
1345extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
1346 size_t number, unsigned long *buffer);
1347
1348extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
1349extern void drbd_bm_unlock(struct drbd_conf *mdev);
1350
1351extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1352/* drbd_main.c */
1353
1354extern struct kmem_cache *drbd_request_cache;
1355extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
1356extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
1357extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
1358extern mempool_t *drbd_request_mempool;
1359extern mempool_t *drbd_ee_mempool;
1360
1361extern struct page *drbd_pp_pool; /* drbd's page pool */
1362extern spinlock_t drbd_pp_lock;
1363extern int drbd_pp_vacant;
1364extern wait_queue_head_t drbd_pp_wait;
1365
1366extern rwlock_t global_state_lock;
1367
1368extern struct drbd_conf *drbd_new_device(unsigned int minor);
1369extern void drbd_free_mdev(struct drbd_conf *mdev);
1370
1371extern int proc_details;
1372
1373/* drbd_req */
1374extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
1375extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
1376extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
1377extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1378
1379
1380/* drbd_nl.c */
1381extern void drbd_suspend_io(struct drbd_conf *mdev);
1382extern void drbd_resume_io(struct drbd_conf *mdev);
1383extern char *ppsize(char *buf, unsigned long long size);
1384extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
1385enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1386extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local);
1387extern void resync_after_online_grow(struct drbd_conf *);
1388extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1389extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
1390 int force);
1391enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1392extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
1393
1394/* drbd_worker.c */
1395extern int drbd_worker(struct drbd_thread *thi);
1396extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
1397extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
1398extern void resume_next_sg(struct drbd_conf *mdev);
1399extern void suspend_other_sg(struct drbd_conf *mdev);
1400extern int drbd_resync_finished(struct drbd_conf *mdev);
1401/* maybe rather drbd_main.c ? */
1402extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1403 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1404extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1405
1406static inline void ov_oos_print(struct drbd_conf *mdev)
1407{
1408 if (mdev->ov_last_oos_size) {
1409 dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
1410 (unsigned long long)mdev->ov_last_oos_start,
1411 (unsigned long)mdev->ov_last_oos_size);
1412 }
1413	mdev->ov_last_oos_size = 0;
1414}
1415
1416
1417extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
1418/* worker callbacks */
1419extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
1420extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
1421extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
1422extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
1423extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
1424extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
1425extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
1426extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
1427extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
1428extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
1429extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
1430extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
1431extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
1432extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
1433extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
1434extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
1435extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
1436extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1437
1438extern void resync_timer_fn(unsigned long data);
1439
1440/* drbd_receiver.c */
1441extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
1442extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1443 u64 id,
1444 sector_t sector,
1445 unsigned int data_size,
1446 gfp_t gfp_mask) __must_hold(local);
1447extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
1448extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1449 struct list_head *head);
1450extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1451 struct list_head *head);
1452extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
1453extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
1454extern void drbd_flush_workqueue(struct drbd_conf *mdev);
1455
1456/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
1457 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
1458static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
1459 char __user *optval, int optlen)
1460{
1461 int err;
1462 if (level == SOL_SOCKET)
1463 err = sock_setsockopt(sock, level, optname, optval, optlen);
1464 else
1465 err = sock->ops->setsockopt(sock, level, optname, optval,
1466 optlen);
1467 return err;
1468}
1469
1470static inline void drbd_tcp_cork(struct socket *sock)
1471{
1472 int __user val = 1;
1473 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1474 (char __user *)&val, sizeof(val));
1475}
1476
1477static inline void drbd_tcp_uncork(struct socket *sock)
1478{
1479 int __user val = 0;
1480 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1481 (char __user *)&val, sizeof(val));
1482}
1483
1484static inline void drbd_tcp_nodelay(struct socket *sock)
1485{
1486 int __user val = 1;
1487 (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
1488 (char __user *)&val, sizeof(val));
1489}
1490
1491static inline void drbd_tcp_quickack(struct socket *sock)
1492{
1493 int __user val = 1;
1494 (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
1495 (char __user *)&val, sizeof(val));
1496}
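/* Illustrative sketch, not part of the original patch: the cork/uncork
 * pair above is meant to bracket a burst of small sends on one socket,
 * so that TCP can coalesce them into fewer segments.  A hypothetical
 * caller (shown purely for illustration) would look like:
 *
 *	drbd_tcp_cork(mdev->data.socket);
 *	// ... send several small packets on the data socket ...
 *	drbd_tcp_uncork(mdev->data.socket);  // flush them in one burst
 */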
1497
1498void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
1499
1500/* drbd_proc.c */
1501extern struct proc_dir_entry *drbd_proc;
1502extern const struct file_operations drbd_proc_fops;
1503extern const char *drbd_conn_str(enum drbd_conns s);
1504extern const char *drbd_role_str(enum drbd_role s);
1505
1506/* drbd_actlog.c */
1507extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
1508extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
1509extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
1510extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1511extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1512extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
1513extern int drbd_rs_del_all(struct drbd_conf *mdev);
1514extern void drbd_rs_failed_io(struct drbd_conf *mdev,
1515 sector_t sector, int size);
1516extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
1517extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
1518 int size, const char *file, const unsigned int line);
1519#define drbd_set_in_sync(mdev, sector, size) \
1520 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
1521extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1522 int size, const char *file, const unsigned int line);
1523#define drbd_set_out_of_sync(mdev, sector, size) \
1524 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1525extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
1526extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
1527extern void drbd_al_shrink(struct drbd_conf *mdev);
1528
1529
1530/* drbd_nl.c */
1531
1532void drbd_nl_cleanup(void);
1533int __init drbd_nl_init(void);
1534void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
1535void drbd_bcast_sync_progress(struct drbd_conf *mdev);
1536void drbd_bcast_ee(struct drbd_conf *mdev,
1537 const char *reason, const int dgs,
1538 const char* seen_hash, const char* calc_hash,
1539 const struct drbd_epoch_entry* e);
1540
1541
1542/**
1543 * DOC: DRBD State macros
1544 *
1545 * These macros are used to express state changes in easily readable form.
1546 *
1547 * The NS macros expand to a mask and a value that can be bit-or'ed onto the
1548 * current state once the spinlock (req_lock) has been taken.
1549 *
1550 * The _NS macros are used for state functions that get called with the
1551 * spinlock held. These macros expand directly to the new state value.
1552 *
1553 * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
1554 * to express state changes that affect more than one aspect of the state.
1555 *
1556 * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
1557 * means that the network connection was established and that the peer
1558 * is in secondary role.
1559 */
1560#define role_MASK R_MASK
1561#define peer_MASK R_MASK
1562#define disk_MASK D_MASK
1563#define pdsk_MASK D_MASK
1564#define conn_MASK C_MASK
1565#define susp_MASK 1
1566#define user_isp_MASK 1
1567#define aftr_isp_MASK 1
1568
1569#define NS(T, S) \
1570 ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
1571 ({ union drbd_state val; val.i = 0; val.T = (S); val; })
1572#define NS2(T1, S1, T2, S2) \
1573 ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
1574 mask.T2 = T2##_MASK; mask; }), \
1575 ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
1576 val.T2 = (S2); val; })
1577#define NS3(T1, S1, T2, S2, T3, S3) \
1578 ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
1579 mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
1580 ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
1581 val.T2 = (S2); val.T3 = (S3); val; })
1582
1583#define _NS(D, T, S) \
1584 D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
1585#define _NS2(D, T1, S1, T2, S2) \
1586 D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
1587 __ns.T2 = (S2); __ns; })
1588#define _NS3(D, T1, S1, T2, S2, T3, S3) \
1589 D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
1590 __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
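/* Illustrative example, not part of the original patch: a request to
 * change only the connection field, e.g.
 *
 *	drbd_request_state(mdev, NS(conn, C_DISCONNECTING));
 *
 * expands NS() into a mask with only conn_MASK set plus a value with
 * conn == C_DISCONNECTING, so all other fields of the state are left
 * untouched by the change. */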
1591
1592/*
1593 * inline helper functions
1594 *************************/
1595
1596static inline void drbd_state_lock(struct drbd_conf *mdev)
1597{
1598 wait_event(mdev->misc_wait,
1599 !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
1600}
1601
1602static inline void drbd_state_unlock(struct drbd_conf *mdev)
1603{
1604 clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
1605 wake_up(&mdev->misc_wait);
1606}
1607
1608static inline int _drbd_set_state(struct drbd_conf *mdev,
1609 union drbd_state ns, enum chg_state_flags flags,
1610 struct completion *done)
1611{
1612 int rv;
1613
1614 read_lock(&global_state_lock);
1615 rv = __drbd_set_state(mdev, ns, flags, done);
1616 read_unlock(&global_state_lock);
1617
1618 return rv;
1619}
1620
1621/**
1622 * drbd_request_state() - Request a state change
1623 * @mdev: DRBD device.
1624 * @mask: mask of state bits to change.
1625 * @val: value of new state bits.
1626 *
1627 * This is the most graceful way of requesting a state change. It is
1628 * quite verbose in case the state change is not possible, and all those
1629 * state changes are globally serialized.
1630 */
1631static inline int drbd_request_state(struct drbd_conf *mdev,
1632 union drbd_state mask,
1633 union drbd_state val)
1634{
1635 return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
1636}
1637
1638#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
1639static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
1640{
1641 switch (mdev->ldev->dc.on_io_error) {
1642 case EP_PASS_ON:
1643 if (!forcedetach) {
1644 if (printk_ratelimit())
1645 				dev_err(DEV, "Local IO failed in %s. "
1646 "Passing error on...\n", where);
1647 break;
1648 }
1649 /* NOTE fall through to detach case if forcedetach set */
1650 case EP_DETACH:
1651 case EP_CALL_HELPER:
1652 if (mdev->state.disk > D_FAILED) {
1653 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1654 			dev_err(DEV, "Local IO failed in %s. "
1655 "Detaching...\n", where);
1656 }
1657 break;
1658 }
1659}
1660
1661/**
1662 * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
1663 * @mdev: DRBD device.
1664 * @error: Error code passed to the IO completion callback
1665 * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
1666 *
1667 * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
1668 */
1669#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
1670static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
1671 int error, int forcedetach, const char *where)
1672{
1673 if (error) {
1674 unsigned long flags;
1675 spin_lock_irqsave(&mdev->req_lock, flags);
1676 __drbd_chk_io_error_(mdev, forcedetach, where);
1677 spin_unlock_irqrestore(&mdev->req_lock, flags);
1678 }
1679}
1680
1681
1682/**
1683 * drbd_md_first_sector() - Returns the first sector number of the meta data area
1684 * @bdev: Meta data block device.
1685 *
1686 * BTW, for internal meta data, this happens to be the maximum capacity
1687 * we could agree upon with our peer node.
1688 */
1689static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1690{
1691 switch (bdev->dc.meta_dev_idx) {
1692 case DRBD_MD_INDEX_INTERNAL:
1693 case DRBD_MD_INDEX_FLEX_INT:
1694 return bdev->md.md_offset + bdev->md.bm_offset;
1695 case DRBD_MD_INDEX_FLEX_EXT:
1696 default:
1697 return bdev->md.md_offset;
1698 }
1699}
1700
1701/**
1702 * drbd_md_last_sector() - Return the last sector number of the meta data area
1703 * @bdev: Meta data block device.
1704 */
1705static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1706{
1707 switch (bdev->dc.meta_dev_idx) {
1708 case DRBD_MD_INDEX_INTERNAL:
1709 case DRBD_MD_INDEX_FLEX_INT:
1710 return bdev->md.md_offset + MD_AL_OFFSET - 1;
1711 case DRBD_MD_INDEX_FLEX_EXT:
1712 default:
1713 return bdev->md.md_offset + bdev->md.md_size_sect;
1714 }
1715}
1716
1717/* Returns the number of 512 byte sectors of the device */
1718static inline sector_t drbd_get_capacity(struct block_device *bdev)
1719{
1720 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
1721 return bdev ? bdev->bd_inode->i_size >> 9 : 0;
1722}
1723
1724/**
1725 * drbd_get_max_capacity() - Returns the capacity we announce to our peer
1726 * @bdev: Meta data block device.
1727 *
1728 * returns the capacity we announce to our peer. we clip ourselves at the
1729 * various MAX_SECTORS, because if we don't, the current implementation will
1730 * oops sooner or later
1731 */
1732static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1733{
1734 sector_t s;
1735 switch (bdev->dc.meta_dev_idx) {
1736 case DRBD_MD_INDEX_INTERNAL:
1737 case DRBD_MD_INDEX_FLEX_INT:
1738 s = drbd_get_capacity(bdev->backing_bdev)
1739 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1740 drbd_md_first_sector(bdev))
1741 : 0;
1742 break;
1743 case DRBD_MD_INDEX_FLEX_EXT:
1744 s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1745 drbd_get_capacity(bdev->backing_bdev));
1746 /* clip at maximum size the meta device can support */
1747 s = min_t(sector_t, s,
1748 BM_EXT_TO_SECT(bdev->md.md_size_sect
1749 - bdev->md.bm_offset));
1750 break;
1751 default:
1752 s = min_t(sector_t, DRBD_MAX_SECTORS,
1753 drbd_get_capacity(bdev->backing_bdev));
1754 }
1755 return s;
1756}
1757
1758/**
1759 * drbd_md_ss__() - Return the sector number of our meta data super block
1760 * @mdev: DRBD device.
1761 * @bdev: Meta data block device.
1762 */
1763static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
1764 struct drbd_backing_dev *bdev)
1765{
1766 switch (bdev->dc.meta_dev_idx) {
1767 default: /* external, some index */
1768 return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
1769 case DRBD_MD_INDEX_INTERNAL:
1770 /* with drbd08, internal meta data is always "flexible" */
1771 case DRBD_MD_INDEX_FLEX_INT:
1772 /* sizeof(struct md_on_disk_07) == 4k
1773 * position: last 4k aligned block of 4k size */
1774 if (!bdev->backing_bdev) {
1775 if (__ratelimit(&drbd_ratelimit_state)) {
1776 dev_err(DEV, "bdev->backing_bdev==NULL\n");
1777 dump_stack();
1778 }
1779 return 0;
1780 }
1781 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
1782 - MD_AL_OFFSET;
1783 case DRBD_MD_INDEX_FLEX_EXT:
1784 return 0;
1785 }
1786}
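/* Illustrative example, not part of the original patch: with flexible
 * internal meta data on a backing device of 2000000007 sectors, the
 * capacity is first rounded down to a 4k boundary (& ~7ULL gives
 * 2000000000), and the super block then starts MD_AL_OFFSET sectors
 * below that, i.e. in the last 4k aligned 4k block of the device. */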
1787
1788static inline void
1789_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1790{
1791 list_add_tail(&w->list, &q->q);
1792 up(&q->s);
1793}
1794
1795static inline void
1796drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
1797{
1798 unsigned long flags;
1799 spin_lock_irqsave(&q->q_lock, flags);
1800 list_add(&w->list, &q->q);
1801 up(&q->s); /* within the spinlock,
1802 see comment near end of drbd_worker() */
1803 spin_unlock_irqrestore(&q->q_lock, flags);
1804}
1805
1806static inline void
1807drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1808{
1809 unsigned long flags;
1810 spin_lock_irqsave(&q->q_lock, flags);
1811 list_add_tail(&w->list, &q->q);
1812 up(&q->s); /* within the spinlock,
1813 see comment near end of drbd_worker() */
1814 spin_unlock_irqrestore(&q->q_lock, flags);
1815}
1816
1817static inline void wake_asender(struct drbd_conf *mdev)
1818{
1819 if (test_bit(SIGNAL_ASENDER, &mdev->flags))
1820 force_sig(DRBD_SIG, mdev->asender.task);
1821}
1822
1823static inline void request_ping(struct drbd_conf *mdev)
1824{
1825 set_bit(SEND_PING, &mdev->flags);
1826 wake_asender(mdev);
1827}
1828
1829static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
1830 enum drbd_packets cmd)
1831{
1832 struct p_header h;
1833 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
1834}
1835
1836static inline int drbd_send_ping(struct drbd_conf *mdev)
1837{
1838 struct p_header h;
1839 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
1840}
1841
1842static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1843{
1844 struct p_header h;
1845 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
1846}
1847
1848static inline void drbd_thread_stop(struct drbd_thread *thi)
1849{
1850 _drbd_thread_stop(thi, FALSE, TRUE);
1851}
1852
1853static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
1854{
1855 _drbd_thread_stop(thi, FALSE, FALSE);
1856}
1857
1858static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
1859{
1860 _drbd_thread_stop(thi, TRUE, FALSE);
1861}
1862
1863/* counts how many answer packets we expect from our peer,
1864 * for either explicit application requests,
1865 * or implicit barrier packets as necessary.
1866 * increased:
1867 * w_send_barrier
1868 * _req_mod(req, queue_for_net_write or queue_for_net_read);
1869 * it is much easier and equally valid to count what we queue for the
1870 * worker, even before it actually was queued or sent.
1871 * (drbd_make_request_common; recovery path on read io-error)
1872 * decreased:
1873 * got_BarrierAck (respective tl_clear, tl_clear_barrier)
1874 * _req_mod(req, data_received)
1875 * [from receive_DataReply]
1876 * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
1877 * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
1878 * for some reason it is NOT decreased in got_NegAck,
1879 * but in the resulting cleanup code from report_params.
1880 * we should try to remember the reason for that...
1881 * _req_mod(req, send_failed or send_canceled)
1882 * _req_mod(req, connection_lost_while_pending)
1883 * [from tl_clear_barrier]
1884 */
1885static inline void inc_ap_pending(struct drbd_conf *mdev)
1886{
1887 atomic_inc(&mdev->ap_pending_cnt);
1888}
1889
1890#define ERR_IF_CNT_IS_NEGATIVE(which) \
1891 if (atomic_read(&mdev->which) < 0) \
1892 dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
1893 __func__ , __LINE__ , \
1894 atomic_read(&mdev->which))
1895
1896#define dec_ap_pending(mdev) do { \
1897 typecheck(struct drbd_conf *, mdev); \
1898 if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
1899 wake_up(&mdev->misc_wait); \
1900 ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
1901
1902/* counts how many resync-related answers we still expect from the peer
1903 * increase decrease
1904 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
1905 * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
1906 * (or P_NEG_ACK with ID_SYNCER)
1907 */
1908static inline void inc_rs_pending(struct drbd_conf *mdev)
1909{
1910 atomic_inc(&mdev->rs_pending_cnt);
1911}
1912
1913#define dec_rs_pending(mdev) do { \
1914 typecheck(struct drbd_conf *, mdev); \
1915 atomic_dec(&mdev->rs_pending_cnt); \
1916 ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
1917
1918/* counts how many answers we still need to send to the peer.
1919 * increased on
1920 * receive_Data unless protocol A;
1921 * we need to send a P_RECV_ACK (proto B)
1922 * or P_WRITE_ACK (proto C)
1923 * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
1924 * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
1925 * receive_Barrier_* we need to send a P_BARRIER_ACK
1926 */
1927static inline void inc_unacked(struct drbd_conf *mdev)
1928{
1929 atomic_inc(&mdev->unacked_cnt);
1930}
1931
1932#define dec_unacked(mdev) do { \
1933 typecheck(struct drbd_conf *, mdev); \
1934 atomic_dec(&mdev->unacked_cnt); \
1935 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
1936
1937#define sub_unacked(mdev, n) do { \
1938 typecheck(struct drbd_conf *, mdev); \
1939 atomic_sub(n, &mdev->unacked_cnt); \
1940 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
1941
1942
1943static inline void put_net_conf(struct drbd_conf *mdev)
1944{
1945 if (atomic_dec_and_test(&mdev->net_cnt))
1946 wake_up(&mdev->misc_wait);
1947}
1948
1949/**
1950 * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
1951 * @mdev: DRBD device.
1952 *
1953 * You have to call put_net_conf() when finished working with mdev->net_conf.
1954 */
1955static inline int get_net_conf(struct drbd_conf *mdev)
1956{
1957 int have_net_conf;
1958
1959 atomic_inc(&mdev->net_cnt);
1960 have_net_conf = mdev->state.conn >= C_UNCONNECTED;
1961 if (!have_net_conf)
1962 put_net_conf(mdev);
1963 return have_net_conf;
1964}
1965
1966/**
1967 * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
1968 * @M: DRBD device.
1969 *
1970 * You have to call put_ldev() when finished working with mdev->ldev.
1971 */
1972#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
1973#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
1974
1975static inline void put_ldev(struct drbd_conf *mdev)
1976{
1977 __release(local);
1978 if (atomic_dec_and_test(&mdev->local_cnt))
1979 wake_up(&mdev->misc_wait);
1980 D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
1981}
1982
1983#ifndef __CHECKER__
1984static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
1985{
1986 int io_allowed;
1987
1988 atomic_inc(&mdev->local_cnt);
1989 io_allowed = (mdev->state.disk >= mins);
1990 if (!io_allowed)
1991 put_ldev(mdev);
1992 return io_allowed;
1993}
1994#else
1995extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
1996#endif
1997
1998/* you must have a "get_ldev" reference */
1999static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
2000 unsigned long *bits_left, unsigned int *per_mil_done)
2001{
2002 /*
2003 * this is to break it at compile time when we change that
2004 * (we may feel 4TB maximum storage per drbd is not enough)
2005 */
2006 typecheck(unsigned long, mdev->rs_total);
2007
2008 /* note: both rs_total and rs_left are in bits, i.e. in
2009 * units of BM_BLOCK_SIZE.
2010 * for the percentage, we don't care. */
2011
2012 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2013 /* >> 10 to prevent overflow,
2014 * +1 to prevent division by zero */
2015 if (*bits_left > mdev->rs_total) {
2016 /* doh. maybe a logic bug somewhere.
2017 * may also be just a race condition
2018 * between this and a disconnect during sync.
2019 * for now, just prevent in-kernel buffer overflow.
2020 */
2021 smp_rmb();
2022 dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
2023 drbd_conn_str(mdev->state.conn),
2024 *bits_left, mdev->rs_total, mdev->rs_failed);
2025 *per_mil_done = 0;
2026 } else {
2027 /* make sure the calculation happens in long context */
2028 unsigned long tmp = 1000UL -
2029 (*bits_left >> 10)*1000UL
2030 / ((mdev->rs_total >> 10) + 1UL);
2031 *per_mil_done = tmp;
2032 }
2033}
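/* Worked example, added for illustration: with rs_total = 1048576 bits
 * and bits_left = 524288 (half of the blocks still out of sync), the
 * formula above yields
 *	1000 - (524288 >> 10) * 1000 / ((1048576 >> 10) + 1)
 *	= 1000 - 512000 / 1025 = 1000 - 499 = 501,
 * i.e. 501 per mil, roughly 50% done. */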
2034
2035
2036/* this throttles on-the-fly application requests
2037 * according to max_buffers settings;
2038 * maybe re-implement using semaphores? */
2039static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
2040{
2041 int mxb = 1000000; /* arbitrary limit on open requests */
2042 if (get_net_conf(mdev)) {
2043 mxb = mdev->net_conf->max_buffers;
2044 put_net_conf(mdev);
2045 }
2046 return mxb;
2047}
2048
2049static inline int drbd_state_is_stable(union drbd_state s)
2050{
2051
2052 /* DO NOT add a default clause, we want the compiler to warn us
2053 * for any newly introduced state we may have forgotten to add here */
2054
2055 switch ((enum drbd_conns)s.conn) {
2056 /* new io only accepted when there is no connection, ... */
2057 case C_STANDALONE:
2058 case C_WF_CONNECTION:
2059 /* ... or there is a well established connection. */
2060 case C_CONNECTED:
2061 case C_SYNC_SOURCE:
2062 case C_SYNC_TARGET:
2063 case C_VERIFY_S:
2064 case C_VERIFY_T:
2065 case C_PAUSED_SYNC_S:
2066 case C_PAUSED_SYNC_T:
2067 /* maybe stable, look at the disk state */
2068 break;
2069
2070 	/* no new io accepted during transitional states
2071 * like handshake or teardown */
2072 case C_DISCONNECTING:
2073 case C_UNCONNECTED:
2074 case C_TIMEOUT:
2075 case C_BROKEN_PIPE:
2076 case C_NETWORK_FAILURE:
2077 case C_PROTOCOL_ERROR:
2078 case C_TEAR_DOWN:
2079 case C_WF_REPORT_PARAMS:
2080 case C_STARTING_SYNC_S:
2081 case C_STARTING_SYNC_T:
2082 case C_WF_BITMAP_S:
2083 case C_WF_BITMAP_T:
2084 case C_WF_SYNC_UUID:
2085 case C_MASK:
2086 /* not "stable" */
2087 return 0;
2088 }
2089
2090 switch ((enum drbd_disk_state)s.disk) {
2091 case D_DISKLESS:
2092 case D_INCONSISTENT:
2093 case D_OUTDATED:
2094 case D_CONSISTENT:
2095 case D_UP_TO_DATE:
2096 /* disk state is stable as well. */
2097 break;
2098
2099 	/* no new io accepted during transitional states */
2100 case D_ATTACHING:
2101 case D_FAILED:
2102 case D_NEGOTIATING:
2103 case D_UNKNOWN:
2104 case D_MASK:
2105 /* not "stable" */
2106 return 0;
2107 }
2108
2109 return 1;
2110}
2111
2112static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2113{
2114 int mxb = drbd_get_max_buffers(mdev);
2115
2116 if (mdev->state.susp)
2117 return 0;
2118 if (test_bit(SUSPEND_IO, &mdev->flags))
2119 return 0;
2120
2121 /* to avoid potential deadlock or bitmap corruption,
2122 * in various places, we only allow new application io
2123 * to start during "stable" states. */
2124
2125 /* no new io accepted when attaching or detaching the disk */
2126 if (!drbd_state_is_stable(mdev->state))
2127 return 0;
2128
2129 /* since some older kernels don't have atomic_add_unless,
2130 * and we are within the spinlock anyways, we have this workaround. */
2131 if (atomic_read(&mdev->ap_bio_cnt) > mxb)
2132 return 0;
2133 if (test_bit(BITMAP_IO, &mdev->flags))
2134 return 0;
2135 return 1;
2136}
2137
2138/* I'd like to use wait_event_lock_irq,
2139 * but I'm not sure when it got introduced,
2140 * and not sure whether it takes 3 or 4 arguments */
2141static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2142{
2143 /* compare with after_state_ch,
2144 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
2145 DEFINE_WAIT(wait);
2146
2147 /* we wait here
2148 * as long as the device is suspended
2149 * until the bitmap is no longer on the fly during connection
2150 	 * handshake, and as long as we would exceed the max_buffer limit.
2151 *
2152 * to avoid races with the reconnect code,
2153 * we need to atomic_inc within the spinlock. */
2154
2155 spin_lock_irq(&mdev->req_lock);
2156 while (!__inc_ap_bio_cond(mdev)) {
2157 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
2158 spin_unlock_irq(&mdev->req_lock);
2159 schedule();
2160 finish_wait(&mdev->misc_wait, &wait);
2161 spin_lock_irq(&mdev->req_lock);
2162 }
2163 atomic_add(one_or_two, &mdev->ap_bio_cnt);
2164 spin_unlock_irq(&mdev->req_lock);
2165}
2166
2167static inline void dec_ap_bio(struct drbd_conf *mdev)
2168{
2169 int mxb = drbd_get_max_buffers(mdev);
2170 int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
2171
2172 D_ASSERT(ap_bio >= 0);
2173 /* this currently does wake_up for every dec_ap_bio!
2174 * maybe rather introduce some type of hysteresis?
2175 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
2176 if (ap_bio < mxb)
2177 wake_up(&mdev->misc_wait);
2178 if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
2179 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
2180 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
2181 }
2182}
2183
2184static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
2185{
2186 mdev->ed_uuid = val;
2187}
2188
2189static inline int seq_cmp(u32 a, u32 b)
2190{
2191 /* we assume wrap around at 32bit.
2192 * for wrap around at 24bit (old atomic_t),
2193 * we'd have to
2194 * a <<= 8; b <<= 8;
2195 */
2196 return (s32)(a) - (s32)(b);
2197}
2198#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
2199#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
2200#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
2201#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
2202/* CAUTION: please no side effects in arguments! */
2203#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
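/* Worked example, added for illustration: with 32bit wrap around,
 *	seq_cmp(0x00000001, 0xffffffff) == (s32)1 - (s32)-1 == 2 > 0,
 * so a sequence number that just wrapped is still ordered after one
 * issued shortly before the wrap, and seq_max() picks the wrapped one. */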
2204
2205static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
2206{
2207 unsigned int m;
2208 spin_lock(&mdev->peer_seq_lock);
2209 m = seq_max(mdev->peer_seq, new_seq);
2210 mdev->peer_seq = m;
2211 spin_unlock(&mdev->peer_seq_lock);
2212 if (m == new_seq)
2213 wake_up(&mdev->seq_wait);
2214}
2215
2216static inline void drbd_update_congested(struct drbd_conf *mdev)
2217{
2218 struct sock *sk = mdev->data.socket->sk;
2219 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
2220 set_bit(NET_CONGESTED, &mdev->flags);
2221}
2222
2223static inline int drbd_queue_order_type(struct drbd_conf *mdev)
2224{
2225 /* sorry, we currently have no working implementation
2226 * of distributed TCQ stuff */
2227#ifndef QUEUE_ORDERED_NONE
2228#define QUEUE_ORDERED_NONE 0
2229#endif
2230 return QUEUE_ORDERED_NONE;
2231}
2232
2233static inline void drbd_blk_run_queue(struct request_queue *q)
2234{
2235 if (q && q->unplug_fn)
2236 q->unplug_fn(q);
2237}
2238
2239static inline void drbd_kick_lo(struct drbd_conf *mdev)
2240{
2241 if (get_ldev(mdev)) {
2242 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
2243 put_ldev(mdev);
2244 }
2245}
2246
2247static inline void drbd_md_flush(struct drbd_conf *mdev)
2248{
2249 int r;
2250
2251 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2252 return;
2253
2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
2255 if (r) {
2256 set_bit(MD_NO_BARRIER, &mdev->flags);
2257 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2258 }
2259}
2260
2261#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000000..93d1f9b469d4
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3716 @@
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *);
70
71int drbd_init(void);
72static int drbd_open(struct block_device *bdev, fmode_t mode);
73static int drbd_release(struct gendisk *gd, fmode_t mode);
74static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76 union drbd_state ns, enum chg_state_flags flags);
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
81MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82 "Lars Ellenberg <lars@linbit.com>");
83MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84MODULE_VERSION(REL_VERSION);
85MODULE_LICENSE("GPL");
86MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
87MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88
89#include <linux/moduleparam.h>
90/* allow_open_on_secondary */
91MODULE_PARM_DESC(allow_oos, "DONT USE!");
92/* thanks to these macros, if compiled into the kernel (not-module),
93 * this becomes the boot parameter drbd.minor_count */
94module_param(minor_count, uint, 0444);
95module_param(disable_sendpage, bool, 0644);
96module_param(allow_oos, bool, 0);
97module_param(cn_idx, uint, 0444);
98module_param(proc_details, int, 0644);
99
100#ifdef CONFIG_DRBD_FAULT_INJECTION
101int enable_faults;
102int fault_rate;
103static int fault_count;
104int fault_devs;
105/* bitmap of enabled faults */
106module_param(enable_faults, int, 0664);
107/* fault rate % value - applies to all enabled faults */
108module_param(fault_rate, int, 0664);
109/* count of faults inserted */
110module_param(fault_count, int, 0664);
111/* bitmap of devices to insert faults on */
112module_param(fault_devs, int, 0644);
113#endif
114
115/* module parameter, defined */
116unsigned int minor_count = 32;
117int disable_sendpage;
118int allow_oos;
119unsigned int cn_idx = CN_IDX_DRBD;
120int proc_details; /* Detail level in proc drbd*/
121
122/* Module parameter for setting the user mode helper program
123 * to run. Default is /sbin/drbdadm */
124char usermode_helper[80] = "/sbin/drbdadm";
125
126module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
128/* in 2.6.x, our device mapping and config info contains our virtual gendisks
129 * as member "struct gendisk *vdisk;"
130 */
131struct drbd_conf **minor_table;
132
133struct kmem_cache *drbd_request_cache;
134struct kmem_cache *drbd_ee_cache; /* epoch entries */
135struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
136struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
137mempool_t *drbd_request_mempool;
138mempool_t *drbd_ee_mempool;
139
140/* I do not use a standard mempool, because:
141 1) I want to hand out the pre-allocated objects first.
142 2) I want to be able to interrupt sleeping allocation with a signal.
143 Note: This is a singly linked list; the next pointer is the private
144 member of struct page.
145 */
146struct page *drbd_pp_pool;
147spinlock_t drbd_pp_lock;
148int drbd_pp_vacant;
149wait_queue_head_t drbd_pp_wait;
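/* Illustrative sketch, not part of the original patch: the pool is a
 * LIFO chained through page_private(), roughly
 *
 *	set_page_private(page, (unsigned long)drbd_pp_pool);
 *	drbd_pp_pool = page;				// push one page
 *
 *	page = drbd_pp_pool;
 *	drbd_pp_pool = (struct page *)page_private(page);	// pop one page
 *
 * with drbd_pp_lock protecting these manipulations. */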
150
151DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
153static const struct block_device_operations drbd_ops = {
154 .owner = THIS_MODULE,
155 .open = drbd_open,
156 .release = drbd_release,
157};
158
159#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
160
161#ifdef __CHECKER__
162/* When checking with sparse, if this is an inline function, sparse will
163 give tons of false positives. When this is a real function, sparse works.
164 */
165int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166{
167 int io_allowed;
168
169 atomic_inc(&mdev->local_cnt);
170 io_allowed = (mdev->state.disk >= mins);
171 if (!io_allowed) {
172 if (atomic_dec_and_test(&mdev->local_cnt))
173 wake_up(&mdev->misc_wait);
174 }
175 return io_allowed;
176}
177
178#endif
179
180/**
181 * DOC: The transfer log
182 *
183 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
184 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185 * of the list. There is always at least one &struct drbd_tl_epoch object.
186 *
187 * Each &struct drbd_tl_epoch has a circular, doubly linked list of requests
188 * attached.
189 */
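/* Illustrative picture, added for clarity (not part of the original
 * patch):
 *
 *	mdev->oldest_tle                              mdev->newest_tle
 *	      |                                             |
 *	      v                                             v
 *	[ epoch n ] --next--> [ epoch n+1 ] --next--> [ epoch n+2 ] --> NULL
 *	 (requests)            (requests)              (requests)
 *
 * Barrier acks retire epochs from the oldest end (tl_release), while new
 * barriers are appended at the newest end (_tl_add_barrier). */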
190static int tl_init(struct drbd_conf *mdev)
191{
192 struct drbd_tl_epoch *b;
193
194 /* during device minor initialization, we may well use GFP_KERNEL */
195 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196 if (!b)
197 return 0;
198 INIT_LIST_HEAD(&b->requests);
199 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL;
201 b->br_number = 4711;
202 b->n_req = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205 mdev->oldest_tle = b;
206 mdev->newest_tle = b;
207 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209 mdev->tl_hash = NULL;
210 mdev->tl_hash_s = 0;
211
212 return 1;
213}
214
215static void tl_cleanup(struct drbd_conf *mdev)
216{
217 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219 kfree(mdev->oldest_tle);
220 mdev->oldest_tle = NULL;
221 kfree(mdev->unused_spare_tle);
222 mdev->unused_spare_tle = NULL;
223 kfree(mdev->tl_hash);
224 mdev->tl_hash = NULL;
225 mdev->tl_hash_s = 0;
226}
227
228/**
229 * _tl_add_barrier() - Adds a barrier to the transfer log
230 * @mdev: DRBD device.
231 * @new: Barrier to be added before the current head of the TL.
232 *
233 * The caller must hold the req_lock.
234 */
235void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236{
237 struct drbd_tl_epoch *newest_before;
238
239 INIT_LIST_HEAD(&new->requests);
240 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL;
243 new->n_req = 0;
244
245 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased
247 * when using TCQ for our write ordering code */
248 new->br_number = (newest_before->br_number+1) ?: 1;
249 if (mdev->newest_tle != new) {
250 mdev->newest_tle->next = new;
251 mdev->newest_tle = new;
252 }
253}
254
255/**
256 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257 * @mdev: DRBD device.
258 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259 * @set_size: Expected number of requests before that barrier.
260 *
261 * In case the passed barrier_nr or set_size does not match the oldest
262 * &struct drbd_tl_epoch objects this function will cause a termination
263 * of the connection.
264 */
265void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266 unsigned int set_size)
267{
268 struct drbd_tl_epoch *b, *nob; /* next old barrier */
269 struct list_head *le, *tle;
270 struct drbd_request *r;
271
272 spin_lock_irq(&mdev->req_lock);
273
274 b = mdev->oldest_tle;
275
276 /* first some paranoia code */
277 if (b == NULL) {
278 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279 barrier_nr);
280 goto bail;
281 }
282 if (b->br_number != barrier_nr) {
283 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284 barrier_nr, b->br_number);
285 goto bail;
286 }
287 if (b->n_req != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
289 barrier_nr, set_size, b->n_req);
290 goto bail;
291 }
292
293 /* Clean up list of requests processed during current epoch */
294 list_for_each_safe(le, tle, &b->requests) {
295 r = list_entry(le, struct drbd_request, tl_requests);
296 _req_mod(r, barrier_acked);
297 }
298 /* There could be requests on the list waiting for completion
299 of the write to the local disk. To avoid corruptions of
300	   of the write to the local disk. To avoid corruptions of
301	   slab's data structures we have to remove the list's head.
301
302 Also there could have been a barrier ack out of sequence, overtaking
303	   the write acks - which would be a bug and violate write ordering.
304 To not deadlock in case we lose connection while such requests are
305 still pending, we need some way to find them for the
306	   _req_mod(connection_lost_while_pending).
307
308 These have been list_move'd to the out_of_sequence_requests list in
309 _req_mod(, barrier_acked) above.
310 */
311 list_del_init(&b->requests);
312
313 nob = b->next;
314 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315 _tl_add_barrier(mdev, b);
316 if (nob)
317 mdev->oldest_tle = nob;
318 /* if nob == NULL b was the only barrier, and becomes the new
319 barrier. Therefore mdev->oldest_tle points already to b */
320 } else {
321 D_ASSERT(nob != NULL);
322 mdev->oldest_tle = nob;
323 kfree(b);
324 }
325
326 spin_unlock_irq(&mdev->req_lock);
327 dec_ap_pending(mdev);
328
329 return;
330
331bail:
332 spin_unlock_irq(&mdev->req_lock);
333 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334}
335
336
337/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
339 * @mdev: DRBD device.
340 *
341 * This is called after the connection to the peer was lost. The storage covered
342 * by the requests on the transfer log gets marked as out of sync. Called from the
343 * receiver thread and the worker thread.
344 */
345void tl_clear(struct drbd_conf *mdev)
346{
347 struct drbd_tl_epoch *b, *tmp;
348 struct list_head *le, *tle;
349 struct drbd_request *r;
350 int new_initial_bnr = net_random();
351
352 spin_lock_irq(&mdev->req_lock);
353
354 b = mdev->oldest_tle;
355 while (b) {
356 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock.
359 * But this is easier for now. */
360 _req_mod(r, connection_lost_while_pending);
361 }
362 tmp = b->next;
363
364 /* there could still be requests on that ring list,
365 * in case local io is still pending */
366 list_del(&b->requests);
367
368 /* dec_ap_pending corresponding to queue_barrier.
369 * the newest barrier may not have been queued yet,
370 * in which case w.cb is still NULL. */
371 if (b->w.cb != NULL)
372 dec_ap_pending(mdev);
373
374 if (b == mdev->newest_tle) {
375 /* recycle, but reinit! */
376 D_ASSERT(tmp == NULL);
377 INIT_LIST_HEAD(&b->requests);
378 INIT_LIST_HEAD(&b->w.list);
379 b->w.cb = NULL;
380 b->br_number = new_initial_bnr;
381 b->n_req = 0;
382
383 mdev->oldest_tle = b;
384 break;
385 }
386 kfree(b);
387 b = tmp;
388 }
389
390 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
392
393 /* but just in case, clean it up anyways! */
394 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
395 r = list_entry(le, struct drbd_request, tl_requests);
396 /* It would be nice to complete outside of spinlock.
397 * But this is easier for now. */
398 _req_mod(r, connection_lost_while_pending);
399 }
400
401 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags);
403
404 spin_unlock_irq(&mdev->req_lock);
405}
406
407/**
408 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
409 * @mdev: DRBD device.
410 * @os: old (current) state.
411 * @ns: new (wanted) state.
412 */
413static int cl_wide_st_chg(struct drbd_conf *mdev,
414 union drbd_state os, union drbd_state ns)
415{
416 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
417 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
418 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
419 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
420 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
421 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
422 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423}
424
425int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
426 union drbd_state mask, union drbd_state val)
427{
428 unsigned long flags;
429 union drbd_state os, ns;
430 int rv;
431
432 spin_lock_irqsave(&mdev->req_lock, flags);
433 os = mdev->state;
434 ns.i = (os.i & ~mask.i) | val.i;
435 rv = _drbd_set_state(mdev, ns, f, NULL);
436 ns = mdev->state;
437 spin_unlock_irqrestore(&mdev->req_lock, flags);
438
439 return rv;
440}
441
442/**
443 * drbd_force_state() - Impose a change which happens outside our control on our state
444 * @mdev: DRBD device.
445 * @mask: mask of state bits to change.
446 * @val: value of new state bits.
447 */
448void drbd_force_state(struct drbd_conf *mdev,
449 union drbd_state mask, union drbd_state val)
450{
451 drbd_change_state(mdev, CS_HARD, mask, val);
452}
453
454static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
455static int is_valid_state_transition(struct drbd_conf *,
456 union drbd_state, union drbd_state);
457static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458 union drbd_state ns, int *warn_sync_abort);
459int drbd_send_state_req(struct drbd_conf *,
460 union drbd_state, union drbd_state);
461
462static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
463 union drbd_state mask, union drbd_state val)
464{
465 union drbd_state os, ns;
466 unsigned long flags;
467 int rv;
468
469 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470 return SS_CW_SUCCESS;
471
472 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
473 return SS_CW_FAILED_BY_PEER;
474
475 rv = 0;
476 spin_lock_irqsave(&mdev->req_lock, flags);
477 os = mdev->state;
478 ns.i = (os.i & ~mask.i) | val.i;
479 ns = sanitize_state(mdev, os, ns, NULL);
480
481 if (!cl_wide_st_chg(mdev, os, ns))
482 rv = SS_CW_NO_NEED;
483 if (!rv) {
484 rv = is_valid_state(mdev, ns);
485 if (rv == SS_SUCCESS) {
486 rv = is_valid_state_transition(mdev, ns, os);
487 if (rv == SS_SUCCESS)
488 rv = 0; /* cont waiting, otherwise fail. */
489 }
490 }
491 spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493 return rv;
494}
495
496/**
497 * drbd_req_state() - Perform an eventually cluster wide state change
498 * @mdev: DRBD device.
499 * @mask: mask of state bits to change.
500 * @val: value of new state bits.
501 * @f: flags
502 *
503 * Should not be called directly, use drbd_request_state() or
504 * _drbd_request_state().
505 */
506static int drbd_req_state(struct drbd_conf *mdev,
507 union drbd_state mask, union drbd_state val,
508 enum chg_state_flags f)
509{
510 struct completion done;
511 unsigned long flags;
512 union drbd_state os, ns;
513 int rv;
514
515 init_completion(&done);
516
517 if (f & CS_SERIALIZE)
518 mutex_lock(&mdev->state_mutex);
519
520 spin_lock_irqsave(&mdev->req_lock, flags);
521 os = mdev->state;
522 ns.i = (os.i & ~mask.i) | val.i;
523 ns = sanitize_state(mdev, os, ns, NULL);
524
525 if (cl_wide_st_chg(mdev, os, ns)) {
526 rv = is_valid_state(mdev, ns);
527 if (rv == SS_SUCCESS)
528 rv = is_valid_state_transition(mdev, ns, os);
529 spin_unlock_irqrestore(&mdev->req_lock, flags);
530
531 if (rv < SS_SUCCESS) {
532 if (f & CS_VERBOSE)
533 print_st_err(mdev, os, ns, rv);
534 goto abort;
535 }
536
537 drbd_state_lock(mdev);
538 if (!drbd_send_state_req(mdev, mask, val)) {
539 drbd_state_unlock(mdev);
540 rv = SS_CW_FAILED_BY_PEER;
541 if (f & CS_VERBOSE)
542 print_st_err(mdev, os, ns, rv);
543 goto abort;
544 }
545
546 wait_event(mdev->state_wait,
547 (rv = _req_st_cond(mdev, mask, val)));
548
549 if (rv < SS_SUCCESS) {
550 drbd_state_unlock(mdev);
551 if (f & CS_VERBOSE)
552 print_st_err(mdev, os, ns, rv);
553 goto abort;
554 }
555 spin_lock_irqsave(&mdev->req_lock, flags);
556 os = mdev->state;
557 ns.i = (os.i & ~mask.i) | val.i;
558 rv = _drbd_set_state(mdev, ns, f, &done);
559 drbd_state_unlock(mdev);
560 } else {
561 rv = _drbd_set_state(mdev, ns, f, &done);
562 }
563
564 spin_unlock_irqrestore(&mdev->req_lock, flags);
565
566 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
567 D_ASSERT(current != mdev->worker.task);
568 wait_for_completion(&done);
569 }
570
571abort:
572 if (f & CS_SERIALIZE)
573 mutex_unlock(&mdev->state_mutex);
574
575 return rv;
576}
577
578/**
579 * _drbd_request_state() - Request a state change (with flags)
580 * @mdev: DRBD device.
581 * @mask: mask of state bits to change.
582 * @val: value of new state bits.
583 * @f: flags
584 *
585 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
586 * flag, or when logging of failed state change requests is not desired.
587 */
588int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
589 union drbd_state val, enum chg_state_flags f)
590{
591 int rv;
592
593 wait_event(mdev->state_wait,
594 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
595
596 return rv;
597}
598
599static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
600{
601 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
602 name,
603 drbd_conn_str(ns.conn),
604 drbd_role_str(ns.role),
605 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-'
612 );
613}
614
615void print_st_err(struct drbd_conf *mdev,
616 union drbd_state os, union drbd_state ns, int err)
617{
618 if (err == SS_IN_TRANSIENT_STATE)
619 return;
620 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
621 print_st(mdev, " state", os);
622 print_st(mdev, "wanted", ns);
623}
624
625
626#define drbd_peer_str drbd_role_str
627#define drbd_pdsk_str drbd_disk_str
628
629#define drbd_susp_str(A) ((A) ? "1" : "0")
630#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
631#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
632#define drbd_user_isp_str(A) ((A) ? "1" : "0")
633
634#define PSC(A) \
635 ({ if (ns.A != os.A) { \
636 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
637 drbd_##A##_str(os.A), \
638 drbd_##A##_str(ns.A)); \
639 } })
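/* Illustrative example, not part of the original patch: if only the
 * connection state changes from C_WF_CONNECTION to C_CONNECTED, the
 * PSC() calls in __drbd_set_state() build a line roughly like
 *	conn( WFConnection -> Connected )
 * which is then printed via dev_info(). */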
640
641/**
642 * is_valid_state() - Returns an SS_ error code if ns is not valid
643 * @mdev: DRBD device.
644 * @ns: State to consider.
645 */
646static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647{
648 /* See drbd_state_sw_errors in drbd_strings.c */
649
650 enum drbd_fencing_p fp;
651 int rv = SS_SUCCESS;
652
653 fp = FP_DONT_CARE;
654 if (get_ldev(mdev)) {
655 fp = mdev->ldev->dc.fencing;
656 put_ldev(mdev);
657 }
658
659 if (get_net_conf(mdev)) {
660 if (!mdev->net_conf->two_primaries &&
661 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
662 rv = SS_TWO_PRIMARIES;
663 put_net_conf(mdev);
664 }
665
666 if (rv <= 0)
667 /* already found a reason to abort */;
668 else if (ns.role == R_SECONDARY && mdev->open_cnt)
669 rv = SS_DEVICE_IN_USE;
670
671 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
672 rv = SS_NO_UP_TO_DATE_DISK;
673
674 else if (fp >= FP_RESOURCE &&
675 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
676 rv = SS_PRIMARY_NOP;
677
678 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
679 rv = SS_NO_UP_TO_DATE_DISK;
680
681 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
682 rv = SS_NO_LOCAL_DISK;
683
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK;
686
687 else if ((ns.conn == C_CONNECTED ||
688 ns.conn == C_WF_BITMAP_S ||
689 ns.conn == C_SYNC_SOURCE ||
690 ns.conn == C_PAUSED_SYNC_S) &&
691 ns.disk == D_OUTDATED)
692 rv = SS_CONNECTED_OUTDATES;
693
694 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
695 (mdev->sync_conf.verify_alg[0] == 0))
696 rv = SS_NO_VERIFY_ALG;
697
698 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
699 mdev->agreed_pro_version < 88)
700 rv = SS_NOT_SUPPORTED;
701
702 return rv;
703}
704
705/**
706 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
707 * @mdev: DRBD device.
708 * @ns: new state.
709 * @os: old state.
710 */
711static int is_valid_state_transition(struct drbd_conf *mdev,
712 union drbd_state ns, union drbd_state os)
713{
714 int rv = SS_SUCCESS;
715
716 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
717 os.conn > C_CONNECTED)
718 rv = SS_RESYNC_RUNNING;
719
720 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
721 rv = SS_ALREADY_STANDALONE;
722
723 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
724 rv = SS_IS_DISKLESS;
725
726 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
727 rv = SS_NO_NET_CONFIG;
728
729 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
730 rv = SS_LOWER_THAN_OUTDATED;
731
732 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
733 rv = SS_IN_TRANSIENT_STATE;
734
735 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
736 rv = SS_IN_TRANSIENT_STATE;
737
738 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
739 rv = SS_NEED_CONNECTION;
740
741 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
742 ns.conn != os.conn && os.conn > C_CONNECTED)
743 rv = SS_RESYNC_RUNNING;
744
745 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
746 os.conn < C_CONNECTED)
747 rv = SS_NEED_CONNECTION;
748
749 return rv;
750}
751
752/**
753 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
754 * @mdev: DRBD device.
755 * @os: old state.
756 * @ns: new state.
757 * @warn_sync_abort:
758 *
759 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
760 * to D_UNKNOWN. This rule and many more along those lines are in this function.
761 */
762static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
763 union drbd_state ns, int *warn_sync_abort)
764{
765 enum drbd_fencing_p fp;
766
767 fp = FP_DONT_CARE;
768 if (get_ldev(mdev)) {
769 fp = mdev->ldev->dc.fencing;
770 put_ldev(mdev);
771 }
772
773 	/* Do not let network error states be set on a device whose network part is not configured */
774 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
775 os.conn <= C_DISCONNECTING)
776 ns.conn = os.conn;
777
778 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
779 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
780 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
781 ns.conn = os.conn;
782
783 /* After C_DISCONNECTING only C_STANDALONE may follow */
784 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
785 ns.conn = os.conn;
786
787 if (ns.conn < C_CONNECTED) {
788 ns.peer_isp = 0;
789 ns.peer = R_UNKNOWN;
790 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
791 ns.pdsk = D_UNKNOWN;
792 }
793
794 /* Clear the aftr_isp when becoming unconfigured */
795 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
796 ns.aftr_isp = 0;
797
798 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
799 ns.pdsk = D_UNKNOWN;
800
801 /* Abort resync if a disk fails/detaches */
802 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
803 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
804 if (warn_sync_abort)
805 *warn_sync_abort = 1;
806 ns.conn = C_CONNECTED;
807 }
808
809 if (ns.conn >= C_CONNECTED &&
810 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
811 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
812 switch (ns.conn) {
813 case C_WF_BITMAP_T:
814 case C_PAUSED_SYNC_T:
815 ns.disk = D_OUTDATED;
816 break;
817 case C_CONNECTED:
818 case C_WF_BITMAP_S:
819 case C_SYNC_SOURCE:
820 case C_PAUSED_SYNC_S:
821 ns.disk = D_UP_TO_DATE;
822 break;
823 case C_SYNC_TARGET:
824 ns.disk = D_INCONSISTENT;
825 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
826 break;
827 }
828 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
829 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
830 }
831
832 if (ns.conn >= C_CONNECTED &&
833 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
834 switch (ns.conn) {
835 case C_CONNECTED:
836 case C_WF_BITMAP_T:
837 case C_PAUSED_SYNC_T:
838 case C_SYNC_TARGET:
839 ns.pdsk = D_UP_TO_DATE;
840 break;
841 case C_WF_BITMAP_S:
842 case C_PAUSED_SYNC_S:
843 ns.pdsk = D_OUTDATED;
844 break;
845 case C_SYNC_SOURCE:
846 ns.pdsk = D_INCONSISTENT;
847 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
848 break;
849 }
850 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
851 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
852 }
853
854 /* Connection breaks down before we finished "Negotiating" */
855 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
856 get_ldev_if_state(mdev, D_NEGOTIATING)) {
857 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
858 ns.disk = mdev->new_state_tmp.disk;
859 ns.pdsk = mdev->new_state_tmp.pdsk;
860 } else {
861 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
862 ns.disk = D_DISKLESS;
863 ns.pdsk = D_UNKNOWN;
864 }
865 put_ldev(mdev);
866 }
867
868 if (fp == FP_STONITH &&
869 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
870 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
871 ns.susp = 1;
872
873 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
874 if (ns.conn == C_SYNC_SOURCE)
875 ns.conn = C_PAUSED_SYNC_S;
876 if (ns.conn == C_SYNC_TARGET)
877 ns.conn = C_PAUSED_SYNC_T;
878 } else {
879 if (ns.conn == C_PAUSED_SYNC_S)
880 ns.conn = C_SYNC_SOURCE;
881 if (ns.conn == C_PAUSED_SYNC_T)
882 ns.conn = C_SYNC_TARGET;
883 }
884
885 return ns;
886}
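/*
 * Rough ordering of the rules above, for illustration: connection-state
 * fixups come first (a network error may not skip C_UNCONNECTED /
 * C_DISCONNECTING), then disk and peer-disk states are implied from the
 * connection state, and finally any set isp bit rewrites C_SYNC_SOURCE /
 * C_SYNC_TARGET into their paused variants (and clearing all isp bits
 * does the reverse).
 */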
887
888/* helper for __drbd_set_state */
889static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
890{
891 if (cs == C_VERIFY_T) {
892 /* starting online verify from an arbitrary position
893 * does not fit well into the existing protocol.
894 * on C_VERIFY_T, we initialize ov_left and friends
895 * implicitly in receive_DataRequest once the
896 * first P_OV_REQUEST is received */
897 mdev->ov_start_sector = ~(sector_t)0;
898 } else {
899 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
900 if (bit >= mdev->rs_total)
901 mdev->ov_start_sector =
902 BM_BIT_TO_SECT(mdev->rs_total - 1);
903 mdev->ov_position = mdev->ov_start_sector;
904 }
905}
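/*
 * For orientation (assuming the usual 4 KiB bitmap granularity, i.e.
 * 8 sectors per bitmap bit): BM_SECT_TO_BIT(sector) is sector >> 3 and
 * BM_BIT_TO_SECT(bit) is bit << 3, so clamping to rs_total - 1 above
 * keeps ov_start_sector within the last bitmap bit of the device.
 */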
906
907/**
908 * __drbd_set_state() - Set a new DRBD state
909 * @mdev: DRBD device.
910 * @ns: new state.
911 * @flags: Flags
912 * @done: Optional completion that will be completed after after_state_ch() has finished
913 *
914 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
915 */
916int __drbd_set_state(struct drbd_conf *mdev,
917 union drbd_state ns, enum chg_state_flags flags,
918 struct completion *done)
919{
920 union drbd_state os;
921 int rv = SS_SUCCESS;
922 int warn_sync_abort = 0;
923 struct after_state_chg_work *ascw;
924
925 os = mdev->state;
926
927 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
928
929 if (ns.i == os.i)
930 return SS_NOTHING_TO_DO;
931
932 if (!(flags & CS_HARD)) {
933 /* pre-state-change checks ; only look at ns */
934 /* See drbd_state_sw_errors in drbd_strings.c */
935
936 rv = is_valid_state(mdev, ns);
937 if (rv < SS_SUCCESS) {
938 /* If the old state was illegal as well, then let
939 this happen...*/
940
941 if (is_valid_state(mdev, os) == rv) {
942 dev_err(DEV, "Considering state change from bad state. "
943 "Error would be: '%s'\n",
944 drbd_set_st_err_str(rv));
945 print_st(mdev, "old", os);
946 print_st(mdev, "new", ns);
947 rv = is_valid_state_transition(mdev, ns, os);
948 }
949 } else
950 rv = is_valid_state_transition(mdev, ns, os);
951 }
952
953 if (rv < SS_SUCCESS) {
954 if (flags & CS_VERBOSE)
955 print_st_err(mdev, os, ns, rv);
956 return rv;
957 }
958
959 if (warn_sync_abort)
960 dev_warn(DEV, "Resync aborted.\n");
961
962 {
963 char *pbp, pb[300];
964 pbp = pb;
965 *pbp = 0;
966 PSC(role);
967 PSC(peer);
968 PSC(conn);
969 PSC(disk);
970 PSC(pdsk);
971 PSC(susp);
972 PSC(aftr_isp);
973 PSC(peer_isp);
974 PSC(user_isp);
975 dev_info(DEV, "%s\n", pb);
976 }
977
978 /* solve the race between becoming unconfigured,
979 * worker doing the cleanup, and
980 * admin reconfiguring us:
981 * on (re)configure, first set CONFIG_PENDING,
982 * then wait for a potentially exiting worker,
983 * start the worker, and schedule one no_op.
984 * then proceed with configuration.
985 */
986 if (ns.disk == D_DISKLESS &&
987 ns.conn == C_STANDALONE &&
988 ns.role == R_SECONDARY &&
989 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
990 set_bit(DEVICE_DYING, &mdev->flags);
991
992 mdev->state.i = ns.i;
993 wake_up(&mdev->misc_wait);
994 wake_up(&mdev->state_wait);
995
996 /* post-state-change actions */
997 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
998 set_bit(STOP_SYNC_TIMER, &mdev->flags);
999 mod_timer(&mdev->resync_timer, jiffies);
1000 }
1001
1002 /* aborted verify run. log the last position */
1003 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1004 ns.conn < C_CONNECTED) {
1005 mdev->ov_start_sector =
1006 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1007 dev_info(DEV, "Online Verify reached sector %llu\n",
1008 (unsigned long long)mdev->ov_start_sector);
1009 }
1010
1011 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1012 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1013 dev_info(DEV, "Syncer continues.\n");
1014 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1015 if (ns.conn == C_SYNC_TARGET) {
1016 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1017 mod_timer(&mdev->resync_timer, jiffies);
1018 /* This if (!test_bit) is only needed for the case
1019 that a device that has ceased to use its timer,
1020 i.e. it is already in drbd_resync_finished(), gets
1021 paused and resumed. */
1022 }
1023 }
1024
1025 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1026 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1027 dev_info(DEV, "Resync suspended\n");
1028 mdev->rs_mark_time = jiffies;
1029 if (ns.conn == C_PAUSED_SYNC_T)
1030 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1031 }
1032
1033 if (os.conn == C_CONNECTED &&
1034 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1035 mdev->ov_position = 0;
1036 mdev->rs_total =
1037 mdev->rs_mark_left = drbd_bm_bits(mdev);
1038 if (mdev->agreed_pro_version >= 90)
1039 set_ov_position(mdev, ns.conn);
1040 else
1041 mdev->ov_start_sector = 0;
1042 mdev->ov_left = mdev->rs_total
1043 - BM_SECT_TO_BIT(mdev->ov_position);
1044 mdev->rs_start =
1045 mdev->rs_mark_time = jiffies;
1046 mdev->ov_last_oos_size = 0;
1047 mdev->ov_last_oos_start = 0;
1048
1049 if (ns.conn == C_VERIFY_S) {
1050 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1051 (unsigned long long)mdev->ov_position);
1052 mod_timer(&mdev->resync_timer, jiffies);
1053 }
1054 }
1055
1056 if (get_ldev(mdev)) {
1057 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1058 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1059 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1060
1061 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1062 mdf |= MDF_CRASHED_PRIMARY;
1063 if (mdev->state.role == R_PRIMARY ||
1064 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1065 mdf |= MDF_PRIMARY_IND;
1066 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1067 mdf |= MDF_CONNECTED_IND;
1068 if (mdev->state.disk > D_INCONSISTENT)
1069 mdf |= MDF_CONSISTENT;
1070 if (mdev->state.disk > D_OUTDATED)
1071 mdf |= MDF_WAS_UP_TO_DATE;
1072 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1073 mdf |= MDF_PEER_OUT_DATED;
1074 if (mdf != mdev->ldev->md.flags) {
1075 mdev->ldev->md.flags = mdf;
1076 drbd_md_mark_dirty(mdev);
1077 }
1078 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1079 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1080 put_ldev(mdev);
1081 }
1082
1083 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1084 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1085 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1086 set_bit(CONSIDER_RESYNC, &mdev->flags);
1087
1088 /* Receiver should clean up itself */
1089 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1090 drbd_thread_stop_nowait(&mdev->receiver);
1091
1092 /* Now that the receiver has finished cleaning up after itself, it should die */
1093 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1094 drbd_thread_stop_nowait(&mdev->receiver);
1095
1096 /* Upon network failure, we need to restart the receiver. */
1097 if (os.conn > C_TEAR_DOWN &&
1098 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1099 drbd_thread_restart_nowait(&mdev->receiver);
1100
1101 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1102 if (ascw) {
1103 ascw->os = os;
1104 ascw->ns = ns;
1105 ascw->flags = flags;
1106 ascw->w.cb = w_after_state_ch;
1107 ascw->done = done;
1108 drbd_queue_work(&mdev->data.work, &ascw->w);
1109 } else {
1110 dev_warn(DEV, "Could not kmalloc an ascw\n");
1111 }
1112
1113 return rv;
1114}
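/*
 * Illustrative only: callers normally do not invoke __drbd_set_state()
 * directly but go through the request/force helpers together with the
 * NS()/_NS() mask/value macros, e.g.
 *
 *	_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
 *	_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
 *
 * as used further down in this file.
 */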
1115
1116static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1117{
1118 struct after_state_chg_work *ascw =
1119 container_of(w, struct after_state_chg_work, w);
1120 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1121 if (ascw->flags & CS_WAIT_COMPLETE) {
1122 D_ASSERT(ascw->done != NULL);
1123 complete(ascw->done);
1124 }
1125 kfree(ascw);
1126
1127 return 1;
1128}
1129
1130static void abw_start_sync(struct drbd_conf *mdev, int rv)
1131{
1132 if (rv) {
1133 dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1134 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1135 return;
1136 }
1137
1138 switch (mdev->state.conn) {
1139 case C_STARTING_SYNC_T:
1140 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1141 break;
1142 case C_STARTING_SYNC_S:
1143 drbd_start_resync(mdev, C_SYNC_SOURCE);
1144 break;
1145 }
1146}
1147
1148/**
1149 * after_state_ch() - Perform after state change actions that may sleep
1150 * @mdev: DRBD device.
1151 * @os: old state.
1152 * @ns: new state.
1153 * @flags: Flags
1154 */
1155static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1156 union drbd_state ns, enum chg_state_flags flags)
1157{
1158 enum drbd_fencing_p fp;
1159
1160 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1161 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1162 if (mdev->p_uuid)
1163 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1164 }
1165
1166 fp = FP_DONT_CARE;
1167 if (get_ldev(mdev)) {
1168 fp = mdev->ldev->dc.fencing;
1169 put_ldev(mdev);
1170 }
1171
1172 /* Inform userspace about the change... */
1173 drbd_bcast_state(mdev, ns);
1174
1175 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1176 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1177 drbd_khelper(mdev, "pri-on-incon-degr");
1178
1179 /* Here we have the actions that are performed after a
1180 state change. This function might sleep */
1181
1182 if (fp == FP_STONITH && ns.susp) {
1183 /* case1: The outdate peer handler is successful:
1184 * case2: The connection was established again: */
1185 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1186 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1187 tl_clear(mdev);
1188 spin_lock_irq(&mdev->req_lock);
1189 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1190 spin_unlock_irq(&mdev->req_lock);
1191 }
1192 }
1193 /* Do not change the order of the if above and the two below... */
1194 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1195 drbd_send_uuids(mdev);
1196 drbd_send_state(mdev);
1197 }
1198 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1199 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1200
1201 /* Lost contact to peer's copy of the data */
1202 if ((os.pdsk >= D_INCONSISTENT &&
1203 os.pdsk != D_UNKNOWN &&
1204 os.pdsk != D_OUTDATED)
1205 && (ns.pdsk < D_INCONSISTENT ||
1206 ns.pdsk == D_UNKNOWN ||
1207 ns.pdsk == D_OUTDATED)) {
1208 kfree(mdev->p_uuid);
1209 mdev->p_uuid = NULL;
1210 if (get_ldev(mdev)) {
1211 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1212 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1213 drbd_uuid_new_current(mdev);
1214 drbd_send_uuids(mdev);
1215 }
1216 put_ldev(mdev);
1217 }
1218 }
1219
1220 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1221 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1222 drbd_uuid_new_current(mdev);
1223
1224 /* Diskless peer becomes secondary */
1225 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1226 drbd_al_to_on_disk_bm(mdev);
1227 put_ldev(mdev);
1228 }
1229
1230 /* Last part of the attaching process ... */
1231 if (ns.conn >= C_CONNECTED &&
1232 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1233 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1234 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1235 drbd_send_sizes(mdev, 0); /* to start sync... */
1236 drbd_send_uuids(mdev);
1237 drbd_send_state(mdev);
1238 }
1239
1240 /* We want to pause/continue resync, tell peer. */
1241 if (ns.conn >= C_CONNECTED &&
1242 ((os.aftr_isp != ns.aftr_isp) ||
1243 (os.user_isp != ns.user_isp)))
1244 drbd_send_state(mdev);
1245
1246 /* In case one of the isp bits got set, suspend other devices. */
1247 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1248 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1249 suspend_other_sg(mdev);
1250
1251 /* Make sure the peer gets informed about any state
1252 changes (ISP bits) that happened while we were in WFReportParams. */
1253 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1254 drbd_send_state(mdev);
1255
1256 /* We are in the process of starting a full sync... */
1257 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1258 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1259 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1260
1261 /* We are invalidating ourselves... */
1262 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1263 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1264 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1265
1266 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1267 enum drbd_io_error_p eh;
1268
1269 eh = EP_PASS_ON;
1270 if (get_ldev_if_state(mdev, D_FAILED)) {
1271 eh = mdev->ldev->dc.on_io_error;
1272 put_ldev(mdev);
1273 }
1274
1275 drbd_rs_cancel_all(mdev);
1276 /* since get_ldev() only works as long as disk >= D_INCONSISTENT,
1277 and the disk is D_FAILED here, local_cnt can only go down; it
1278 cannot increase... It will reach zero */
1279 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1280 mdev->rs_total = 0;
1281 mdev->rs_failed = 0;
1282 atomic_set(&mdev->rs_pending_cnt, 0);
1283
1284 spin_lock_irq(&mdev->req_lock);
1285 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1286 spin_unlock_irq(&mdev->req_lock);
1287
1288 if (eh == EP_CALL_HELPER)
1289 drbd_khelper(mdev, "local-io-error");
1290 }
1291
1292 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1293
1294 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1295 if (drbd_send_state(mdev))
1296 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1297 else
1298 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1299 }
1300
1301 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1302 lc_destroy(mdev->resync);
1303 mdev->resync = NULL;
1304 lc_destroy(mdev->act_log);
1305 mdev->act_log = NULL;
1306 __no_warn(local,
1307 drbd_free_bc(mdev->ldev);
1308 mdev->ldev = NULL;);
1309
1310 if (mdev->md_io_tmpp)
1311 __free_page(mdev->md_io_tmpp);
1312 }
1313
1314 /* Disks got bigger while they were detached */
1315 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1316 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1317 if (ns.conn == C_CONNECTED)
1318 resync_after_online_grow(mdev);
1319 }
1320
1321 /* A resync finished or aborted, wake paused devices... */
1322 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1323 (os.peer_isp && !ns.peer_isp) ||
1324 (os.user_isp && !ns.user_isp))
1325 resume_next_sg(mdev);
1326
1327 /* Upon network connection, we need to start the receiver */
1328 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1329 drbd_thread_start(&mdev->receiver);
1330
1331 /* Terminate worker thread if we are unconfigured - it will be
1332 restarted as needed... */
1333 if (ns.disk == D_DISKLESS &&
1334 ns.conn == C_STANDALONE &&
1335 ns.role == R_SECONDARY) {
1336 if (os.aftr_isp != ns.aftr_isp)
1337 resume_next_sg(mdev);
1338 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1339 if (test_bit(DEVICE_DYING, &mdev->flags))
1340 drbd_thread_stop_nowait(&mdev->worker);
1341 }
1342
1343 drbd_md_sync(mdev);
1344}
1345
1346
1347static int drbd_thread_setup(void *arg)
1348{
1349 struct drbd_thread *thi = (struct drbd_thread *) arg;
1350 struct drbd_conf *mdev = thi->mdev;
1351 unsigned long flags;
1352 int retval;
1353
1354restart:
1355 retval = thi->function(thi);
1356
1357 spin_lock_irqsave(&thi->t_lock, flags);
1358
1359 /* if the receiver has been "Exiting", the last thing it did
1360 * was set the conn state to "StandAlone",
1361 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1362 * and receiver thread will be "started".
1363 * drbd_thread_start needs to set "Restarting" in that case.
1364 * t_state check and assignment needs to be within the same spinlock,
1365 * so either thread_start sees Exiting, and can remap to Restarting,
1366 * or thread_start sees None, and can proceed as normal.
1367 */
1368
1369 if (thi->t_state == Restarting) {
1370 dev_info(DEV, "Restarting %s\n", current->comm);
1371 thi->t_state = Running;
1372 spin_unlock_irqrestore(&thi->t_lock, flags);
1373 goto restart;
1374 }
1375
1376 thi->task = NULL;
1377 thi->t_state = None;
1378 smp_mb();
1379 complete(&thi->stop);
1380 spin_unlock_irqrestore(&thi->t_lock, flags);
1381
1382 dev_info(DEV, "Terminating %s\n", current->comm);
1383
1384 /* Release mod reference taken when thread was started */
1385 module_put(THIS_MODULE);
1386 return retval;
1387}
1388
1389static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1390 int (*func) (struct drbd_thread *))
1391{
1392 spin_lock_init(&thi->t_lock);
1393 thi->task = NULL;
1394 thi->t_state = None;
1395 thi->function = func;
1396 thi->mdev = mdev;
1397}
1398
1399int drbd_thread_start(struct drbd_thread *thi)
1400{
1401 struct drbd_conf *mdev = thi->mdev;
1402 struct task_struct *nt;
1403 unsigned long flags;
1404
1405 const char *me =
1406 thi == &mdev->receiver ? "receiver" :
1407 thi == &mdev->asender ? "asender" :
1408 thi == &mdev->worker ? "worker" : "NONSENSE";
1409
1410 /* is used from state engine doing drbd_thread_stop_nowait,
1411 * while holding the req lock irqsave */
1412 spin_lock_irqsave(&thi->t_lock, flags);
1413
1414 switch (thi->t_state) {
1415 case None:
1416 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1417 me, current->comm, current->pid);
1418
1419 /* Get ref on module for thread - this is released when thread exits */
1420 if (!try_module_get(THIS_MODULE)) {
1421 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1422 spin_unlock_irqrestore(&thi->t_lock, flags);
1423 return FALSE;
1424 }
1425
1426 init_completion(&thi->stop);
1427 D_ASSERT(thi->task == NULL);
1428 thi->reset_cpu_mask = 1;
1429 thi->t_state = Running;
1430 spin_unlock_irqrestore(&thi->t_lock, flags);
1431 flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1432
1433 nt = kthread_create(drbd_thread_setup, (void *) thi,
1434 "drbd%d_%s", mdev_to_minor(mdev), me);
1435
1436 if (IS_ERR(nt)) {
1437 dev_err(DEV, "Couldn't start thread\n");
1438
1439 module_put(THIS_MODULE);
1440 return FALSE;
1441 }
1442 spin_lock_irqsave(&thi->t_lock, flags);
1443 thi->task = nt;
1444 thi->t_state = Running;
1445 spin_unlock_irqrestore(&thi->t_lock, flags);
1446 wake_up_process(nt);
1447 break;
1448 case Exiting:
1449 thi->t_state = Restarting;
1450 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1451 me, current->comm, current->pid);
1452 /* fall through */
1453 case Running:
1454 case Restarting:
1455 default:
1456 spin_unlock_irqrestore(&thi->t_lock, flags);
1457 break;
1458 }
1459
1460 return TRUE;
1461}
1462
1463
1464void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1465{
1466 unsigned long flags;
1467
1468 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1469
1470 /* may be called from state engine, holding the req lock irqsave */
1471 spin_lock_irqsave(&thi->t_lock, flags);
1472
1473 if (thi->t_state == None) {
1474 spin_unlock_irqrestore(&thi->t_lock, flags);
1475 if (restart)
1476 drbd_thread_start(thi);
1477 return;
1478 }
1479
1480 if (thi->t_state != ns) {
1481 if (thi->task == NULL) {
1482 spin_unlock_irqrestore(&thi->t_lock, flags);
1483 return;
1484 }
1485
1486 thi->t_state = ns;
1487 smp_mb();
1488 init_completion(&thi->stop);
1489 if (thi->task != current)
1490 force_sig(DRBD_SIGKILL, thi->task);
1491
1492 }
1493
1494 spin_unlock_irqrestore(&thi->t_lock, flags);
1495
1496 if (wait)
1497 wait_for_completion(&thi->stop);
1498}
1499
1500#ifdef CONFIG_SMP
1501/**
1502 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1503 * @mdev: DRBD device.
1504 *
1505 * Forces all threads of a device onto the same CPU. This is beneficial for
1506 * DRBD's performance. May be overridden by the user's configuration.
1507 */
1508void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1509{
1510 int ord, cpu;
1511
1512 /* user override. */
1513 if (cpumask_weight(mdev->cpu_mask))
1514 return;
1515
1516 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1517 for_each_online_cpu(cpu) {
1518 if (ord-- == 0) {
1519 cpumask_set_cpu(cpu, mdev->cpu_mask);
1520 return;
1521 }
1522 }
1523 /* should not be reached */
1524 cpumask_setall(mdev->cpu_mask);
1525}
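/*
 * Worked example: with four online CPUs, minors 0..3 get pinned to the
 * 1st..4th online CPU respectively, and minor 4 wraps around to the
 * first one again (ord = 4 % 4 = 0).
 */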
1526
1527/**
1528 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1529 * @mdev: DRBD device.
1530 *
1531 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1532 * prematurely.
1533 */
1534void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1535{
1536 struct task_struct *p = current;
1537 struct drbd_thread *thi =
1538 p == mdev->asender.task ? &mdev->asender :
1539 p == mdev->receiver.task ? &mdev->receiver :
1540 p == mdev->worker.task ? &mdev->worker :
1541 NULL;
1542 ERR_IF(thi == NULL)
1543 return;
1544 if (!thi->reset_cpu_mask)
1545 return;
1546 thi->reset_cpu_mask = 0;
1547 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1548}
1549#endif
1550
1551/* the appropriate socket mutex must be held already */
1552int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1553 enum drbd_packets cmd, struct p_header *h,
1554 size_t size, unsigned msg_flags)
1555{
1556 int sent, ok;
1557
1558 ERR_IF(!h) return FALSE;
1559 ERR_IF(!size) return FALSE;
1560
1561 h->magic = BE_DRBD_MAGIC;
1562 h->command = cpu_to_be16(cmd);
1563 h->length = cpu_to_be16(size-sizeof(struct p_header));
1564
1565 sent = drbd_send(mdev, sock, h, size, msg_flags);
1566
1567 ok = (sent == size);
1568 if (!ok)
1569 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1570 cmdname(cmd), (int)size, sent);
1571 return ok;
1572}
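/*
 * Note that h->length carries only the payload size, i.e. the caller's
 * size minus sizeof(struct p_header); e.g. drbd_send_state() below passes
 * sizeof(struct p_state), so the peer sees
 * length == sizeof(struct p_state) - sizeof(struct p_header).
 */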
1573
1574/* don't pass the socket. we may only look at it
1575 * when we hold the appropriate socket mutex.
1576 */
1577int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1578 enum drbd_packets cmd, struct p_header *h, size_t size)
1579{
1580 int ok = 0;
1581 struct socket *sock;
1582
1583 if (use_data_socket) {
1584 mutex_lock(&mdev->data.mutex);
1585 sock = mdev->data.socket;
1586 } else {
1587 mutex_lock(&mdev->meta.mutex);
1588 sock = mdev->meta.socket;
1589 }
1590
1591 /* drbd_disconnect() could have called drbd_free_sock()
1592 * while we were waiting in down()... */
1593 if (likely(sock != NULL))
1594 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1595
1596 if (use_data_socket)
1597 mutex_unlock(&mdev->data.mutex);
1598 else
1599 mutex_unlock(&mdev->meta.mutex);
1600 return ok;
1601}
1602
1603int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1604 size_t size)
1605{
1606 struct p_header h;
1607 int ok;
1608
1609 h.magic = BE_DRBD_MAGIC;
1610 h.command = cpu_to_be16(cmd);
1611 h.length = cpu_to_be16(size);
1612
1613 if (!drbd_get_data_sock(mdev))
1614 return 0;
1615
1616 ok = (sizeof(h) ==
1617 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1618 ok = ok && (size ==
1619 drbd_send(mdev, mdev->data.socket, data, size, 0));
1620
1621 drbd_put_data_sock(mdev);
1622
1623 return ok;
1624}
1625
1626int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1627{
1628 struct p_rs_param_89 *p;
1629 struct socket *sock;
1630 int size, rv;
1631 const int apv = mdev->agreed_pro_version;
1632
1633 size = apv <= 87 ? sizeof(struct p_rs_param)
1634 : apv == 88 ? sizeof(struct p_rs_param)
1635 + strlen(mdev->sync_conf.verify_alg) + 1
1636 : /* 89 */ sizeof(struct p_rs_param_89);
1637
1638 /* used from admin command context and receiver/worker context.
1639 * to avoid kmalloc, grab the socket right here,
1640 * then use the pre-allocated sbuf there */
1641 mutex_lock(&mdev->data.mutex);
1642 sock = mdev->data.socket;
1643
1644 if (likely(sock != NULL)) {
1645 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1646
1647 p = &mdev->data.sbuf.rs_param_89;
1648
1649 /* initialize verify_alg and csums_alg */
1650 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1651
1652 p->rate = cpu_to_be32(sc->rate);
1653
1654 if (apv >= 88)
1655 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1656 if (apv >= 89)
1657 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1658
1659 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1660 } else
1661 rv = 0; /* not ok */
1662
1663 mutex_unlock(&mdev->data.mutex);
1664
1665 return rv;
1666}
1667
1668int drbd_send_protocol(struct drbd_conf *mdev)
1669{
1670 struct p_protocol *p;
1671 int size, cf, rv;
1672
1673 size = sizeof(struct p_protocol);
1674
1675 if (mdev->agreed_pro_version >= 87)
1676 size += strlen(mdev->net_conf->integrity_alg) + 1;
1677
1678 /* we must not recurse into our own queue,
1679 * as that is blocked during handshake */
1680 p = kmalloc(size, GFP_NOIO);
1681 if (p == NULL)
1682 return 0;
1683
1684 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1685 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1686 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1687 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1688 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1689
1690 cf = 0;
1691 if (mdev->net_conf->want_lose)
1692 cf |= CF_WANT_LOSE;
1693 if (mdev->net_conf->dry_run) {
1694 if (mdev->agreed_pro_version >= 92)
1695 cf |= CF_DRY_RUN;
1696 else {
1697 dev_err(DEV, "--dry-run is not supported by peer");
1698 kfree(p);
1699 return 0;
1700 }
1701 }
1702 p->conn_flags = cpu_to_be32(cf);
1703
1704 if (mdev->agreed_pro_version >= 87)
1705 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1706
1707 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1708 (struct p_header *)p, size);
1709 kfree(p);
1710 return rv;
1711}
1712
1713int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1714{
1715 struct p_uuids p;
1716 int i;
1717
1718 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1719 return 1;
1720
1721 for (i = UI_CURRENT; i < UI_SIZE; i++)
1722 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1723
1724 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1725 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1726 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1727 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1728 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1729 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1730
1731 put_ldev(mdev);
1732
1733 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1734 (struct p_header *)&p, sizeof(p));
1735}
1736
1737int drbd_send_uuids(struct drbd_conf *mdev)
1738{
1739 return _drbd_send_uuids(mdev, 0);
1740}
1741
1742int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1743{
1744 return _drbd_send_uuids(mdev, 8);
1745}
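/*
 * uuid_flags bits used above, for reference: 1 == net_conf->want_lose is
 * set, 2 == we were a crashed primary, 4 == new_state_tmp.disk is
 * D_INCONSISTENT, 8 == skip the initial sync
 * (see drbd_send_uuids_skip_initial_sync()).
 */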
1746
1747
1748int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1749{
1750 struct p_rs_uuid p;
1751
1752 p.uuid = cpu_to_be64(val);
1753
1754 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1755 (struct p_header *)&p, sizeof(p));
1756}
1757
1758int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1759{
1760 struct p_sizes p;
1761 sector_t d_size, u_size;
1762 int q_order_type;
1763 int ok;
1764
1765 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1766 D_ASSERT(mdev->ldev->backing_bdev);
1767 d_size = drbd_get_max_capacity(mdev->ldev);
1768 u_size = mdev->ldev->dc.disk_size;
1769 q_order_type = drbd_queue_order_type(mdev);
1770 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1771 put_ldev(mdev);
1772 } else {
1773 d_size = 0;
1774 u_size = 0;
1775 q_order_type = QUEUE_ORDERED_NONE;
1776 }
1777
1778 p.d_size = cpu_to_be64(d_size);
1779 p.u_size = cpu_to_be64(u_size);
1780 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1781 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1782 p.queue_order_type = cpu_to_be32(q_order_type);
1783
1784 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1785 (struct p_header *)&p, sizeof(p));
1786 return ok;
1787}
1788
1789/**
1790 * drbd_send_state() - Sends the drbd state to the peer
1791 * @mdev: DRBD device.
1792 */
1793int drbd_send_state(struct drbd_conf *mdev)
1794{
1795 struct socket *sock;
1796 struct p_state p;
1797 int ok = 0;
1798
1799 /* Grab the state lock so we won't send state if we're in the middle
1800 * of a cluster-wide state change on another thread */
1801 drbd_state_lock(mdev);
1802
1803 mutex_lock(&mdev->data.mutex);
1804
1805 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1806 sock = mdev->data.socket;
1807
1808 if (likely(sock != NULL)) {
1809 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1810 (struct p_header *)&p, sizeof(p), 0);
1811 }
1812
1813 mutex_unlock(&mdev->data.mutex);
1814
1815 drbd_state_unlock(mdev);
1816 return ok;
1817}
1818
1819int drbd_send_state_req(struct drbd_conf *mdev,
1820 union drbd_state mask, union drbd_state val)
1821{
1822 struct p_req_state p;
1823
1824 p.mask = cpu_to_be32(mask.i);
1825 p.val = cpu_to_be32(val.i);
1826
1827 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1828 (struct p_header *)&p, sizeof(p));
1829}
1830
1831int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1832{
1833 struct p_req_state_reply p;
1834
1835 p.retcode = cpu_to_be32(retcode);
1836
1837 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1838 (struct p_header *)&p, sizeof(p));
1839}
1840
1841int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1842 struct p_compressed_bm *p,
1843 struct bm_xfer_ctx *c)
1844{
1845 struct bitstream bs;
1846 unsigned long plain_bits;
1847 unsigned long tmp;
1848 unsigned long rl;
1849 unsigned len;
1850 unsigned toggle;
1851 int bits;
1852
1853 /* may we use this feature? */
1854 if ((mdev->sync_conf.use_rle == 0) ||
1855 (mdev->agreed_pro_version < 90))
1856 return 0;
1857
1858 if (c->bit_offset >= c->bm_bits)
1859 return 0; /* nothing to do. */
1860
1861 /* use at most this many bytes */
1862 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1863 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1864 /* plain bits covered in this code string */
1865 plain_bits = 0;
1866
1867 /* p->encoding & 0x80 stores whether the first run length is set.
1868 * bit offset is implicit.
1869 * start with toggle == 2 to be able to tell the first iteration */
1870 toggle = 2;
1871
1872 /* see how many plain bits we can stuff into one packet
1873 * using RLE and VLI. */
1874 do {
1875 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1876 : _drbd_bm_find_next(mdev, c->bit_offset);
1877 if (tmp == -1UL)
1878 tmp = c->bm_bits;
1879 rl = tmp - c->bit_offset;
1880
1881 if (toggle == 2) { /* first iteration */
1882 if (rl == 0) {
1883 /* the first checked bit was set,
1884 * store start value, */
1885 DCBP_set_start(p, 1);
1886 /* but skip encoding of zero run length */
1887 toggle = !toggle;
1888 continue;
1889 }
1890 DCBP_set_start(p, 0);
1891 }
1892
1893 /* paranoia: catch zero runlength.
1894 * can only happen if bitmap is modified while we scan it. */
1895 if (rl == 0) {
1896 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1897 "t:%u bo:%lu\n", toggle, c->bit_offset);
1898 return -1;
1899 }
1900
1901 bits = vli_encode_bits(&bs, rl);
1902 if (bits == -ENOBUFS) /* buffer full */
1903 break;
1904 if (bits <= 0) {
1905 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1906 return 0;
1907 }
1908
1909 toggle = !toggle;
1910 plain_bits += rl;
1911 c->bit_offset = tmp;
1912 } while (c->bit_offset < c->bm_bits);
1913
1914 len = bs.cur.b - p->code + !!bs.cur.bit;
1915
1916 if (plain_bits < (len << 3)) {
1917 /* incompressible with this method.
1918 * we need to rewind both word and bit position. */
1919 c->bit_offset -= plain_bits;
1920 bm_xfer_ctx_bit_to_word_offset(c);
1921 c->bit_offset = c->word_offset * BITS_PER_LONG;
1922 return 0;
1923 }
1924
1925 /* RLE + VLI was able to compress it just fine.
1926 * update c->word_offset. */
1927 bm_xfer_ctx_bit_to_word_offset(c);
1928
1929 /* store pad_bits */
1930 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1931
1932 return len;
1933}
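/*
 * Encoding sketch, assuming a bitmap that starts 0,0,0,1,1,0,...: the
 * first run is three clear bits, so DCBP_set_start(p, 0) is used and the
 * run lengths 3, 2, ... are VLI-encoded into p->code, toggling between
 * clear and set runs.  Had the bitmap started with a set bit, the start
 * flag would be 1 and the (zero-length) first clear run skipped.
 */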
1934
1935enum { OK, FAILED, DONE }
1936send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1937 struct p_header *h, struct bm_xfer_ctx *c)
1938{
1939 struct p_compressed_bm *p = (void*)h;
1940 unsigned long num_words;
1941 int len;
1942 int ok;
1943
1944 len = fill_bitmap_rle_bits(mdev, p, c);
1945
1946 if (len < 0)
1947 return FAILED;
1948
1949 if (len) {
1950 DCBP_set_code(p, RLE_VLI_Bits);
1951 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1952 sizeof(*p) + len, 0);
1953
1954 c->packets[0]++;
1955 c->bytes[0] += sizeof(*p) + len;
1956
1957 if (c->bit_offset >= c->bm_bits)
1958 len = 0; /* DONE */
1959 } else {
1960 /* was not compressible.
1961 * send a buffer full of plain text bits instead. */
1962 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1963 len = num_words * sizeof(long);
1964 if (len)
1965 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1966 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1967 h, sizeof(struct p_header) + len, 0);
1968 c->word_offset += num_words;
1969 c->bit_offset = c->word_offset * BITS_PER_LONG;
1970
1971 c->packets[1]++;
1972 c->bytes[1] += sizeof(struct p_header) + len;
1973
1974 if (c->bit_offset > c->bm_bits)
1975 c->bit_offset = c->bm_bits;
1976 }
1977 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1978
1979 if (ok == DONE)
1980 INFO_bm_xfer_stats(mdev, "send", c);
1981 return ok;
1982}
1983
1984/* See the comment at receive_bitmap() */
1985int _drbd_send_bitmap(struct drbd_conf *mdev)
1986{
1987 struct bm_xfer_ctx c;
1988 struct p_header *p;
1989 int ret;
1990
1991 ERR_IF(!mdev->bitmap) return FALSE;
1992
1993 /* maybe we should use some per thread scratch page,
1994 * and allocate that during initial device creation? */
1995 p = (struct p_header *) __get_free_page(GFP_NOIO);
1996 if (!p) {
1997 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
1998 return FALSE;
1999 }
2000
2001 if (get_ldev(mdev)) {
2002 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2003 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2004 drbd_bm_set_all(mdev);
2005 if (drbd_bm_write(mdev)) {
2006 /* write_bm did fail! Leave the full sync flag set in the meta data,
2007 * but otherwise process as per normal - need to tell the other
2008 * side that a full resync is required! */
2009 dev_err(DEV, "Failed to write bitmap to disk!\n");
2010 } else {
2011 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2012 drbd_md_sync(mdev);
2013 }
2014 }
2015 put_ldev(mdev);
2016 }
2017
2018 c = (struct bm_xfer_ctx) {
2019 .bm_bits = drbd_bm_bits(mdev),
2020 .bm_words = drbd_bm_words(mdev),
2021 };
2022
2023 do {
2024 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2025 } while (ret == OK);
2026
2027 free_page((unsigned long) p);
2028 return (ret == DONE);
2029}
2030
2031int drbd_send_bitmap(struct drbd_conf *mdev)
2032{
2033 int err;
2034
2035 if (!drbd_get_data_sock(mdev))
2036 return -1;
2037 err = !_drbd_send_bitmap(mdev);
2038 drbd_put_data_sock(mdev);
2039 return err;
2040}
2041
2042int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2043{
2044 int ok;
2045 struct p_barrier_ack p;
2046
2047 p.barrier = barrier_nr;
2048 p.set_size = cpu_to_be32(set_size);
2049
2050 if (mdev->state.conn < C_CONNECTED)
2051 return FALSE;
2052 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2053 (struct p_header *)&p, sizeof(p));
2054 return ok;
2055}
2056
2057/**
2058 * _drbd_send_ack() - Sends an ack packet
2059 * @mdev: DRBD device.
2060 * @cmd: Packet command code.
2061 * @sector: sector, needs to be in big endian byte order
2062 * @blksize: size in byte, needs to be in big endian byte order
2063 * @block_id: Id, big endian byte order
2064 */
2065static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2066 u64 sector,
2067 u32 blksize,
2068 u64 block_id)
2069{
2070 int ok;
2071 struct p_block_ack p;
2072
2073 p.sector = sector;
2074 p.block_id = block_id;
2075 p.blksize = blksize;
2076 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2077
2078 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2079 return FALSE;
2080 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2081 (struct p_header *)&p, sizeof(p));
2082 return ok;
2083}
2084
2085int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2086 struct p_data *dp)
2087{
2088 const int header_size = sizeof(struct p_data)
2089 - sizeof(struct p_header);
2090 int data_size = ((struct p_header *)dp)->length - header_size;
2091
2092 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2093 dp->block_id);
2094}
2095
2096int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2097 struct p_block_req *rp)
2098{
2099 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2100}
2101
2102/**
2103 * drbd_send_ack() - Sends an ack packet
2104 * @mdev: DRBD device.
2105 * @cmd: Packet command code.
2106 * @e: Epoch entry.
2107 */
2108int drbd_send_ack(struct drbd_conf *mdev,
2109 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2110{
2111 return _drbd_send_ack(mdev, cmd,
2112 cpu_to_be64(e->sector),
2113 cpu_to_be32(e->size),
2114 e->block_id);
2115}
2116
2117/* This function misuses the block_id field to signal if the blocks
2118 * are in sync or not. */
2119int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2120 sector_t sector, int blksize, u64 block_id)
2121{
2122 return _drbd_send_ack(mdev, cmd,
2123 cpu_to_be64(sector),
2124 cpu_to_be32(blksize),
2125 cpu_to_be64(block_id));
2126}
2127
2128int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2129 sector_t sector, int size, u64 block_id)
2130{
2131 int ok;
2132 struct p_block_req p;
2133
2134 p.sector = cpu_to_be64(sector);
2135 p.block_id = block_id;
2136 p.blksize = cpu_to_be32(size);
2137
2138 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2139 (struct p_header *)&p, sizeof(p));
2140 return ok;
2141}
2142
2143int drbd_send_drequest_csum(struct drbd_conf *mdev,
2144 sector_t sector, int size,
2145 void *digest, int digest_size,
2146 enum drbd_packets cmd)
2147{
2148 int ok;
2149 struct p_block_req p;
2150
2151 p.sector = cpu_to_be64(sector);
2152 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2153 p.blksize = cpu_to_be32(size);
2154
2155 p.head.magic = BE_DRBD_MAGIC;
2156 p.head.command = cpu_to_be16(cmd);
2157 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2158
2159 mutex_lock(&mdev->data.mutex);
2160
2161 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2162 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2163
2164 mutex_unlock(&mdev->data.mutex);
2165
2166 return ok;
2167}
2168
2169int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2170{
2171 int ok;
2172 struct p_block_req p;
2173
2174 p.sector = cpu_to_be64(sector);
2175 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2176 p.blksize = cpu_to_be32(size);
2177
2178 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2179 (struct p_header *)&p, sizeof(p));
2180 return ok;
2181}
2182
2183/* called on sndtimeo
2184 * returns FALSE if we should retry,
2185 * TRUE if we think the connection is dead
2186 */
2187static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2188{
2189 int drop_it;
2190 /* long elapsed = (long)(jiffies - mdev->last_received); */
2191
2192 drop_it = mdev->meta.socket == sock
2193 || !mdev->asender.task
2194 || get_t_state(&mdev->asender) != Running
2195 || mdev->state.conn < C_CONNECTED;
2196
2197 if (drop_it)
2198 return TRUE;
2199
2200 drop_it = !--mdev->ko_count;
2201 if (!drop_it) {
2202 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2203 current->comm, current->pid, mdev->ko_count);
2204 request_ping(mdev);
2205 }
2206
2207 return drop_it; /* && (mdev->state == R_PRIMARY) */
2208}
2209
2210/* The idea of sendpage seems to be to put some kind of reference
2211 * to the page into the skb, and to hand it over to the NIC. In
2212 * this process get_page() gets called.
2213 *
2214 * As soon as the page was really sent over the network put_page()
2215 * gets called by some part of the network layer. [ NIC driver? ]
2216 *
2217 * [ get_page() / put_page() increment/decrement the count. If count
2218 * reaches 0 the page will be freed. ]
2219 *
2220 * This works nicely with pages from FSs.
2221 * But this means that in protocol A we might signal IO completion too early!
2222 *
2223 * In order not to corrupt data during a resync we must make sure
2224 * that we do not reuse our own buffer pages (EEs) too early, therefore
2225 * we have the net_ee list.
2226 *
2227 * XFS seems to have problems, still, it submits pages with page_count == 0!
2228 * As a workaround, we disable sendpage on pages
2229 * with page_count == 0 or PageSlab.
2230 */
2231static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2232 int offset, size_t size)
2233{
2234 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2235 kunmap(page);
2236 if (sent == size)
2237 mdev->send_cnt += size>>9;
2238 return sent == size;
2239}
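/* The >>9 above converts bytes to 512-byte sectors; send_cnt is
 * accounted in sectors. */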
2240
2241static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2242 int offset, size_t size)
2243{
2244 mm_segment_t oldfs = get_fs();
2245 int sent, ok;
2246 int len = size;
2247
2248 /* e.g. XFS meta- & log-data is in slab pages, which have a
2249 * page_count of 0 and/or have PageSlab() set.
2250 * we cannot use send_page for those, as that does get_page();
2251 * put_page(); and would cause either a VM_BUG directly, or
2252 * __page_cache_release a page that would actually still be referenced
2253 * by someone, leading to some obscure delayed Oops somewhere else. */
2254 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2255 return _drbd_no_send_page(mdev, page, offset, size);
2256
2257 drbd_update_congested(mdev);
2258 set_fs(KERNEL_DS);
2259 do {
2260 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2261 offset, len,
2262 MSG_NOSIGNAL);
2263 if (sent == -EAGAIN) {
2264 if (we_should_drop_the_connection(mdev,
2265 mdev->data.socket))
2266 break;
2267 else
2268 continue;
2269 }
2270 if (sent <= 0) {
2271 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2272 __func__, (int)size, len, sent);
2273 break;
2274 }
2275 len -= sent;
2276 offset += sent;
2277 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2278 set_fs(oldfs);
2279 clear_bit(NET_CONGESTED, &mdev->flags);
2280
2281 ok = (len == 0);
2282 if (likely(ok))
2283 mdev->send_cnt += size>>9;
2284 return ok;
2285}
2286
2287static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2288{
2289 struct bio_vec *bvec;
2290 int i;
2291 __bio_for_each_segment(bvec, bio, i, 0) {
2292 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2293 bvec->bv_offset, bvec->bv_len))
2294 return 0;
2295 }
2296 return 1;
2297}
2298
2299static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2300{
2301 struct bio_vec *bvec;
2302 int i;
2303 __bio_for_each_segment(bvec, bio, i, 0) {
2304 if (!_drbd_send_page(mdev, bvec->bv_page,
2305 bvec->bv_offset, bvec->bv_len))
2306 return 0;
2307 }
2308
2309 return 1;
2310}
2311
2312/* Used to send write requests
2313 * R_PRIMARY -> Peer (P_DATA)
2314 */
2315int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2316{
2317 int ok = 1;
2318 struct p_data p;
2319 unsigned int dp_flags = 0;
2320 void *dgb;
2321 int dgs;
2322
2323 if (!drbd_get_data_sock(mdev))
2324 return 0;
2325
2326 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2327 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2328
2329 p.head.magic = BE_DRBD_MAGIC;
2330 p.head.command = cpu_to_be16(P_DATA);
2331 p.head.length =
2332 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2333
2334 p.sector = cpu_to_be64(req->sector);
2335 p.block_id = (unsigned long)req;
2336 p.seq_num = cpu_to_be32(req->seq_num =
2337 atomic_add_return(1, &mdev->packet_seq));
2338 dp_flags = 0;
2339
2340 /* NOTE: no need to check if barriers supported here as we would
2341 * not pass the test in make_request_common in that case
2342 */
2343 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2344 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2345 /* dp_flags |= DP_HARDBARRIER; */
2346 }
2347 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2348 dp_flags |= DP_RW_SYNC;
2349 /* for now handle SYNCIO and UNPLUG
2350 * as if they still were one and the same flag */
2351 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2352 dp_flags |= DP_RW_SYNC;
2353 if (mdev->state.conn >= C_SYNC_SOURCE &&
2354 mdev->state.conn <= C_PAUSED_SYNC_T)
2355 dp_flags |= DP_MAY_SET_IN_SYNC;
2356
2357 p.dp_flags = cpu_to_be32(dp_flags);
2358 set_bit(UNPLUG_REMOTE, &mdev->flags);
2359 ok = (sizeof(p) ==
2360 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2361 if (ok && dgs) {
2362 dgb = mdev->int_dig_out;
2363 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2364 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2365 }
2366 if (ok) {
2367 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2368 ok = _drbd_send_bio(mdev, req->master_bio);
2369 else
2370 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2371 }
2372
2373 drbd_put_data_sock(mdev);
2374 return ok;
2375}
2376
2377/* answer packet, used to send data back for read requests:
2378 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2379 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2380 */
2381int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2382 struct drbd_epoch_entry *e)
2383{
2384 int ok;
2385 struct p_data p;
2386 void *dgb;
2387 int dgs;
2388
2389 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2390 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2391
2392 p.head.magic = BE_DRBD_MAGIC;
2393 p.head.command = cpu_to_be16(cmd);
2394 p.head.length =
2395 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2396
2397 p.sector = cpu_to_be64(e->sector);
2398 p.block_id = e->block_id;
2399 /* p.seq_num = 0; No sequence numbers here.. */
2400
2401 /* Only called by our kernel thread.
2402 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2403 * in response to admin command or module unload.
2404 */
2405 if (!drbd_get_data_sock(mdev))
2406 return 0;
2407
2408 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2409 sizeof(p), MSG_MORE);
2410 if (ok && dgs) {
2411 dgb = mdev->int_dig_out;
2412 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2413 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2414 }
2415 if (ok)
2416 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2417
2418 drbd_put_data_sock(mdev);
2419 return ok;
2420}
2421
2422/*
2423 drbd_send distinguishes two cases:
2424
2425 Packets sent via the data socket "sock"
2426 and packets sent via the meta data socket "msock"
2427
2428 sock msock
2429 -----------------+-------------------------+------------------------------
2430 timeout conf.timeout / 2 conf.timeout / 2
2431 timeout action send a ping via msock Abort communication
2432 and close all sockets
2433*/
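/*
 * In addition to the timeouts above, sends on the data socket are guarded
 * by mdev->ko_count (reloaded from net_conf->ko_count in drbd_send()):
 * each sndtimeo expiry decrements it via we_should_drop_the_connection(),
 * requesting a ping while it is still above zero; only once it reaches
 * zero is the connection declared dead.
 */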
2434
2435/*
2436 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2437 */
2438int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2439 void *buf, size_t size, unsigned msg_flags)
2440{
2441 struct kvec iov;
2442 struct msghdr msg;
2443 int rv, sent = 0;
2444
2445 if (!sock)
2446 return -1000;
2447
2448 /* THINK if (signal_pending) return ... ? */
2449
2450 iov.iov_base = buf;
2451 iov.iov_len = size;
2452
2453 msg.msg_name = NULL;
2454 msg.msg_namelen = 0;
2455 msg.msg_control = NULL;
2456 msg.msg_controllen = 0;
2457 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2458
2459 if (sock == mdev->data.socket) {
2460 mdev->ko_count = mdev->net_conf->ko_count;
2461 drbd_update_congested(mdev);
2462 }
2463 do {
2464 /* STRANGE
2465 * tcp_sendmsg does _not_ use its size parameter at all ?
2466 *
2467 * -EAGAIN on timeout, -EINTR on signal.
2468 */
2469/* THINK
2470 * do we need to block DRBD_SIG if sock == &meta.socket ??
2471 * otherwise wake_asender() might interrupt some send_*Ack !
2472 */
2473 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2474 if (rv == -EAGAIN) {
2475 if (we_should_drop_the_connection(mdev, sock))
2476 break;
2477 else
2478 continue;
2479 }
2480 D_ASSERT(rv != 0);
2481 if (rv == -EINTR) {
2482 flush_signals(current);
2483 rv = 0;
2484 }
2485 if (rv < 0)
2486 break;
2487 sent += rv;
2488 iov.iov_base += rv;
2489 iov.iov_len -= rv;
2490 } while (sent < size);
2491
2492 if (sock == mdev->data.socket)
2493 clear_bit(NET_CONGESTED, &mdev->flags);
2494
2495 if (rv <= 0) {
2496 if (rv != -EAGAIN) {
2497 dev_err(DEV, "%s_sendmsg returned %d\n",
2498 sock == mdev->meta.socket ? "msock" : "sock",
2499 rv);
2500 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2501 } else
2502 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2503 }
2504
2505 return sent;
2506}
2507
2508static int drbd_open(struct block_device *bdev, fmode_t mode)
2509{
2510 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2511 unsigned long flags;
2512 int rv = 0;
2513
2514 spin_lock_irqsave(&mdev->req_lock, flags);
2515 /* to have a stable mdev->state.role
2516 * and no race with updating open_cnt */
2517
2518 if (mdev->state.role != R_PRIMARY) {
2519 if (mode & FMODE_WRITE)
2520 rv = -EROFS;
2521 else if (!allow_oos)
2522 rv = -EMEDIUMTYPE;
2523 }
2524
2525 if (!rv)
2526 mdev->open_cnt++;
2527 spin_unlock_irqrestore(&mdev->req_lock, flags);
2528
2529 return rv;
2530}
2531
2532static int drbd_release(struct gendisk *gd, fmode_t mode)
2533{
2534 struct drbd_conf *mdev = gd->private_data;
2535 mdev->open_cnt--;
2536 return 0;
2537}
2538
2539static void drbd_unplug_fn(struct request_queue *q)
2540{
2541 struct drbd_conf *mdev = q->queuedata;
2542
2543 /* unplug FIRST */
2544 spin_lock_irq(q->queue_lock);
2545 blk_remove_plug(q);
2546 spin_unlock_irq(q->queue_lock);
2547
2548 /* only if connected */
2549 spin_lock_irq(&mdev->req_lock);
2550 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2551 D_ASSERT(mdev->state.role == R_PRIMARY);
2552 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2553 /* add to the data.work queue,
2554 * unless already queued.
2555 * XXX this might be a good addition to drbd_queue_work
2556 * anyways, to detect "double queuing" ... */
2557 if (list_empty(&mdev->unplug_work.list))
2558 drbd_queue_work(&mdev->data.work,
2559 &mdev->unplug_work);
2560 }
2561 }
2562 spin_unlock_irq(&mdev->req_lock);
2563
2564 if (mdev->state.disk >= D_INCONSISTENT)
2565 drbd_kick_lo(mdev);
2566}
2567
2568static void drbd_set_defaults(struct drbd_conf *mdev)
2569{
2570 mdev->sync_conf.after = DRBD_AFTER_DEF;
2571 mdev->sync_conf.rate = DRBD_RATE_DEF;
2572 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2573 mdev->state = (union drbd_state) {
2574 { .role = R_SECONDARY,
2575 .peer = R_UNKNOWN,
2576 .conn = C_STANDALONE,
2577 .disk = D_DISKLESS,
2578 .pdsk = D_UNKNOWN,
2579 .susp = 0
2580 } };
2581}
2582
2583void drbd_init_set_defaults(struct drbd_conf *mdev)
2584{
2585 /* the memset(,0,) did most of this.
2586 * note: only assignments, no allocation in here */
2587
2588 drbd_set_defaults(mdev);
2589
2590 /* for now, we do NOT yet support it,
2591 * even though we start some framework
2592 * to eventually support barriers */
2593 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2594
2595 atomic_set(&mdev->ap_bio_cnt, 0);
2596 atomic_set(&mdev->ap_pending_cnt, 0);
2597 atomic_set(&mdev->rs_pending_cnt, 0);
2598 atomic_set(&mdev->unacked_cnt, 0);
2599 atomic_set(&mdev->local_cnt, 0);
2600 atomic_set(&mdev->net_cnt, 0);
2601 atomic_set(&mdev->packet_seq, 0);
2602 atomic_set(&mdev->pp_in_use, 0);
2603
2604 mutex_init(&mdev->md_io_mutex);
2605 mutex_init(&mdev->data.mutex);
2606 mutex_init(&mdev->meta.mutex);
2607 sema_init(&mdev->data.work.s, 0);
2608 sema_init(&mdev->meta.work.s, 0);
2609 mutex_init(&mdev->state_mutex);
2610
2611 spin_lock_init(&mdev->data.work.q_lock);
2612 spin_lock_init(&mdev->meta.work.q_lock);
2613
2614 spin_lock_init(&mdev->al_lock);
2615 spin_lock_init(&mdev->req_lock);
2616 spin_lock_init(&mdev->peer_seq_lock);
2617 spin_lock_init(&mdev->epoch_lock);
2618
2619 INIT_LIST_HEAD(&mdev->active_ee);
2620 INIT_LIST_HEAD(&mdev->sync_ee);
2621 INIT_LIST_HEAD(&mdev->done_ee);
2622 INIT_LIST_HEAD(&mdev->read_ee);
2623 INIT_LIST_HEAD(&mdev->net_ee);
2624 INIT_LIST_HEAD(&mdev->resync_reads);
2625 INIT_LIST_HEAD(&mdev->data.work.q);
2626 INIT_LIST_HEAD(&mdev->meta.work.q);
2627 INIT_LIST_HEAD(&mdev->resync_work.list);
2628 INIT_LIST_HEAD(&mdev->unplug_work.list);
2629 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2630 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2631 mdev->resync_work.cb = w_resync_inactive;
2632 mdev->unplug_work.cb = w_send_write_hint;
2633 mdev->md_sync_work.cb = w_md_sync;
2634 mdev->bm_io_work.w.cb = w_bitmap_io;
2635 init_timer(&mdev->resync_timer);
2636 init_timer(&mdev->md_sync_timer);
2637 mdev->resync_timer.function = resync_timer_fn;
2638 mdev->resync_timer.data = (unsigned long) mdev;
2639 mdev->md_sync_timer.function = md_sync_timer_fn;
2640 mdev->md_sync_timer.data = (unsigned long) mdev;
2641
2642 init_waitqueue_head(&mdev->misc_wait);
2643 init_waitqueue_head(&mdev->state_wait);
2644 init_waitqueue_head(&mdev->ee_wait);
2645 init_waitqueue_head(&mdev->al_wait);
2646 init_waitqueue_head(&mdev->seq_wait);
2647
2648 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2649 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2650 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2651
2652 mdev->agreed_pro_version = PRO_VERSION_MAX;
2653 mdev->write_ordering = WO_bio_barrier;
2654 mdev->resync_wenr = LC_FREE;
2655}
2656
2657void drbd_mdev_cleanup(struct drbd_conf *mdev)
2658{
2659 if (mdev->receiver.t_state != None)
2660 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2661 mdev->receiver.t_state);
2662
2663 /* no need to lock it, I'm the only thread alive */
2664 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2665 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2666 mdev->al_writ_cnt =
2667 mdev->bm_writ_cnt =
2668 mdev->read_cnt =
2669 mdev->recv_cnt =
2670 mdev->send_cnt =
2671 mdev->writ_cnt =
2672 mdev->p_size =
2673 mdev->rs_start =
2674 mdev->rs_total =
2675 mdev->rs_failed =
2676 mdev->rs_mark_left =
2677 mdev->rs_mark_time = 0;
2678 D_ASSERT(mdev->net_conf == NULL);
2679
2680 drbd_set_my_capacity(mdev, 0);
2681 if (mdev->bitmap) {
2682 /* maybe never allocated. */
2683 drbd_bm_resize(mdev, 0);
2684 drbd_bm_cleanup(mdev);
2685 }
2686
2687 drbd_free_resources(mdev);
2688
2689 /*
2690 * currently we call drbd_init_ee() only on module load, so
2691 * we may call drbd_release_ee() only on module unload!
2692 */
2693 D_ASSERT(list_empty(&mdev->active_ee));
2694 D_ASSERT(list_empty(&mdev->sync_ee));
2695 D_ASSERT(list_empty(&mdev->done_ee));
2696 D_ASSERT(list_empty(&mdev->read_ee));
2697 D_ASSERT(list_empty(&mdev->net_ee));
2698 D_ASSERT(list_empty(&mdev->resync_reads));
2699 D_ASSERT(list_empty(&mdev->data.work.q));
2700 D_ASSERT(list_empty(&mdev->meta.work.q));
2701 D_ASSERT(list_empty(&mdev->resync_work.list));
2702 D_ASSERT(list_empty(&mdev->unplug_work.list));
2703
2704}
2705
2706
2707static void drbd_destroy_mempools(void)
2708{
2709 struct page *page;
2710
2711 while (drbd_pp_pool) {
2712 page = drbd_pp_pool;
2713 drbd_pp_pool = (struct page *)page_private(page);
2714 __free_page(page);
2715 drbd_pp_vacant--;
2716 }
2717
2718 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2719
2720 if (drbd_ee_mempool)
2721 mempool_destroy(drbd_ee_mempool);
2722 if (drbd_request_mempool)
2723 mempool_destroy(drbd_request_mempool);
2724 if (drbd_ee_cache)
2725 kmem_cache_destroy(drbd_ee_cache);
2726 if (drbd_request_cache)
2727 kmem_cache_destroy(drbd_request_cache);
2728 if (drbd_bm_ext_cache)
2729 kmem_cache_destroy(drbd_bm_ext_cache);
2730 if (drbd_al_ext_cache)
2731 kmem_cache_destroy(drbd_al_ext_cache);
2732
2733 drbd_ee_mempool = NULL;
2734 drbd_request_mempool = NULL;
2735 drbd_ee_cache = NULL;
2736 drbd_request_cache = NULL;
2737 drbd_bm_ext_cache = NULL;
2738 drbd_al_ext_cache = NULL;
2739
2740 return;
2741}
2742
2743static int drbd_create_mempools(void)
2744{
2745 struct page *page;
2746 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2747 int i;
2748
2749 /* prepare our caches and mempools */
2750 drbd_request_mempool = NULL;
2751 drbd_ee_cache = NULL;
2752 drbd_request_cache = NULL;
2753 drbd_bm_ext_cache = NULL;
2754 drbd_al_ext_cache = NULL;
2755 drbd_pp_pool = NULL;
2756
2757 /* caches */
2758 drbd_request_cache = kmem_cache_create(
2759 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2760 if (drbd_request_cache == NULL)
2761 goto Enomem;
2762
2763 drbd_ee_cache = kmem_cache_create(
2764 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2765 if (drbd_ee_cache == NULL)
2766 goto Enomem;
2767
2768 drbd_bm_ext_cache = kmem_cache_create(
2769 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2770 if (drbd_bm_ext_cache == NULL)
2771 goto Enomem;
2772
2773 drbd_al_ext_cache = kmem_cache_create(
2774 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2775 if (drbd_al_ext_cache == NULL)
2776 goto Enomem;
2777
2778 /* mempools */
2779 drbd_request_mempool = mempool_create(number,
2780 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2781 if (drbd_request_mempool == NULL)
2782 goto Enomem;
2783
2784 drbd_ee_mempool = mempool_create(number,
2785 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
 2786	if (drbd_ee_mempool == NULL)
2787 goto Enomem;
2788
2789 /* drbd's page pool */
2790 spin_lock_init(&drbd_pp_lock);
2791
2792 for (i = 0; i < number; i++) {
2793 page = alloc_page(GFP_HIGHUSER);
2794 if (!page)
2795 goto Enomem;
2796 set_page_private(page, (unsigned long)drbd_pp_pool);
2797 drbd_pp_pool = page;
2798 }
2799 drbd_pp_vacant = number;
2800
2801 return 0;
2802
2803Enomem:
2804 drbd_destroy_mempools(); /* in case we allocated some */
2805 return -ENOMEM;
2806}
2807
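/*
 * Editor's note (not part of the patch): drbd_create_mempools() above chains
 * its pre-allocated pages into a singly linked list through the page_private
 * field.  A minimal sketch of how a page would be popped off that list again,
 * assuming the same drbd_pp_pool/drbd_pp_vacant globals and the drbd_pp_lock
 * initialized above (locking simplified, function name made up for
 * illustration; the real consumer lives elsewhere in the driver):
 */
static struct page *drbd_pp_pool_pop_sketch(void)
{
	struct page *page = NULL;

	spin_lock(&drbd_pp_lock);
	if (drbd_pp_pool) {
		page = drbd_pp_pool;
		/* next pointer is stashed in page_private, as set up above */
		drbd_pp_pool = (struct page *)page_private(page);
		set_page_private(page, 0);
		drbd_pp_vacant--;
	}
	spin_unlock(&drbd_pp_lock);
	return page;
}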
2808static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2809 void *unused)
2810{
2811 /* just so we have it. you never know what interesting things we
2812 * might want to do here some day...
2813 */
2814
2815 return NOTIFY_DONE;
2816}
2817
2818static struct notifier_block drbd_notifier = {
2819 .notifier_call = drbd_notify_sys,
2820};
2821
2822static void drbd_release_ee_lists(struct drbd_conf *mdev)
2823{
2824 int rr;
2825
2826 rr = drbd_release_ee(mdev, &mdev->active_ee);
2827 if (rr)
2828 dev_err(DEV, "%d EEs in active list found!\n", rr);
2829
2830 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2831 if (rr)
2832 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2833
2834 rr = drbd_release_ee(mdev, &mdev->read_ee);
2835 if (rr)
2836 dev_err(DEV, "%d EEs in read list found!\n", rr);
2837
2838 rr = drbd_release_ee(mdev, &mdev->done_ee);
2839 if (rr)
2840 dev_err(DEV, "%d EEs in done list found!\n", rr);
2841
2842 rr = drbd_release_ee(mdev, &mdev->net_ee);
2843 if (rr)
2844 dev_err(DEV, "%d EEs in net list found!\n", rr);
2845}
2846
2847/* caution. no locking.
2848 * currently only used from module cleanup code. */
2849static void drbd_delete_device(unsigned int minor)
2850{
2851 struct drbd_conf *mdev = minor_to_mdev(minor);
2852
2853 if (!mdev)
2854 return;
2855
2856 /* paranoia asserts */
2857 if (mdev->open_cnt != 0)
 2858		dev_err(DEV, "open_cnt = %d in %s:%u\n", mdev->open_cnt,
 2859			__FILE__, __LINE__);
2860
2861 ERR_IF (!list_empty(&mdev->data.work.q)) {
2862 struct list_head *lp;
2863 list_for_each(lp, &mdev->data.work.q) {
2864 dev_err(DEV, "lp = %p\n", lp);
2865 }
2866 };
2867 /* end paranoia asserts */
2868
2869 del_gendisk(mdev->vdisk);
2870
2871 /* cleanup stuff that may have been allocated during
2872 * device (re-)configuration or state changes */
2873
2874 if (mdev->this_bdev)
2875 bdput(mdev->this_bdev);
2876
2877 drbd_free_resources(mdev);
2878
2879 drbd_release_ee_lists(mdev);
2880
2881 /* should be free'd on disconnect? */
2882 kfree(mdev->ee_hash);
2883 /*
2884 mdev->ee_hash_s = 0;
2885 mdev->ee_hash = NULL;
2886 */
2887
2888 lc_destroy(mdev->act_log);
2889 lc_destroy(mdev->resync);
2890
2891 kfree(mdev->p_uuid);
2892 /* mdev->p_uuid = NULL; */
2893
2894 kfree(mdev->int_dig_out);
2895 kfree(mdev->int_dig_in);
2896 kfree(mdev->int_dig_vv);
2897
2898 /* cleanup the rest that has been
2899 * allocated from drbd_new_device
2900 * and actually free the mdev itself */
2901 drbd_free_mdev(mdev);
2902}
2903
2904static void drbd_cleanup(void)
2905{
2906 unsigned int i;
2907
2908 unregister_reboot_notifier(&drbd_notifier);
2909
2910 drbd_nl_cleanup();
2911
2912 if (minor_table) {
2913 if (drbd_proc)
2914 remove_proc_entry("drbd", NULL);
2915 i = minor_count;
2916 while (i--)
2917 drbd_delete_device(i);
2918 drbd_destroy_mempools();
2919 }
2920
2921 kfree(minor_table);
2922
2923 unregister_blkdev(DRBD_MAJOR, "drbd");
2924
2925 printk(KERN_INFO "drbd: module cleanup done.\n");
2926}
2927
2928/**
2929 * drbd_congested() - Callback for pdflush
2930 * @congested_data: User data
2931 * @bdi_bits: Bits pdflush is currently interested in
2932 *
2933 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2934 */
2935static int drbd_congested(void *congested_data, int bdi_bits)
2936{
2937 struct drbd_conf *mdev = congested_data;
2938 struct request_queue *q;
2939 char reason = '-';
2940 int r = 0;
2941
2942 if (!__inc_ap_bio_cond(mdev)) {
2943 /* DRBD has frozen IO */
2944 r = bdi_bits;
2945 reason = 'd';
2946 goto out;
2947 }
2948
2949 if (get_ldev(mdev)) {
2950 q = bdev_get_queue(mdev->ldev->backing_bdev);
2951 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2952 put_ldev(mdev);
2953 if (r)
2954 reason = 'b';
2955 }
2956
2957 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2958 r |= (1 << BDI_async_congested);
2959 reason = reason == 'b' ? 'a' : 'n';
2960 }
2961
2962out:
2963 mdev->congestion_reason = reason;
2964 return r;
2965}
2966
2967struct drbd_conf *drbd_new_device(unsigned int minor)
2968{
2969 struct drbd_conf *mdev;
2970 struct gendisk *disk;
2971 struct request_queue *q;
2972
2973 /* GFP_KERNEL, we are outside of all write-out paths */
2974 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2975 if (!mdev)
2976 return NULL;
2977 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2978 goto out_no_cpumask;
2979
2980 mdev->minor = minor;
2981
2982 drbd_init_set_defaults(mdev);
2983
2984 q = blk_alloc_queue(GFP_KERNEL);
2985 if (!q)
2986 goto out_no_q;
2987 mdev->rq_queue = q;
2988 q->queuedata = mdev;
2989
2990 disk = alloc_disk(1);
2991 if (!disk)
2992 goto out_no_disk;
2993 mdev->vdisk = disk;
2994
2995 set_disk_ro(disk, TRUE);
2996
2997 disk->queue = q;
2998 disk->major = DRBD_MAJOR;
2999 disk->first_minor = minor;
3000 disk->fops = &drbd_ops;
3001 sprintf(disk->disk_name, "drbd%d", minor);
3002 disk->private_data = mdev;
3003
3004 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3005 /* we have no partitions. we contain only ourselves. */
3006 mdev->this_bdev->bd_contains = mdev->this_bdev;
3007
3008 q->backing_dev_info.congested_fn = drbd_congested;
3009 q->backing_dev_info.congested_data = mdev;
3010
3011 blk_queue_make_request(q, drbd_make_request_26);
3012 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3013 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3014 blk_queue_merge_bvec(q, drbd_merge_bvec);
3015 q->queue_lock = &mdev->req_lock; /* needed since we use */
3016 /* plugging on a queue, that actually has no requests! */
3017 q->unplug_fn = drbd_unplug_fn;
3018
3019 mdev->md_io_page = alloc_page(GFP_KERNEL);
3020 if (!mdev->md_io_page)
3021 goto out_no_io_page;
3022
3023 if (drbd_bm_init(mdev))
3024 goto out_no_bitmap;
3025 /* no need to lock access, we are still initializing this minor device. */
3026 if (!tl_init(mdev))
3027 goto out_no_tl;
3028
3029 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3030 if (!mdev->app_reads_hash)
3031 goto out_no_app_reads;
3032
3033 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3034 if (!mdev->current_epoch)
3035 goto out_no_epoch;
3036
3037 INIT_LIST_HEAD(&mdev->current_epoch->list);
3038 mdev->epochs = 1;
3039
3040 return mdev;
3041
3042/* out_whatever_else:
3043 kfree(mdev->current_epoch); */
3044out_no_epoch:
3045 kfree(mdev->app_reads_hash);
3046out_no_app_reads:
3047 tl_cleanup(mdev);
3048out_no_tl:
3049 drbd_bm_cleanup(mdev);
3050out_no_bitmap:
3051 __free_page(mdev->md_io_page);
3052out_no_io_page:
3053 put_disk(disk);
3054out_no_disk:
3055 blk_cleanup_queue(q);
3056out_no_q:
3057 free_cpumask_var(mdev->cpu_mask);
3058out_no_cpumask:
3059 kfree(mdev);
3060 return NULL;
3061}
3062
3063/* counterpart of drbd_new_device.
3064 * last part of drbd_delete_device. */
3065void drbd_free_mdev(struct drbd_conf *mdev)
3066{
3067 kfree(mdev->current_epoch);
3068 kfree(mdev->app_reads_hash);
3069 tl_cleanup(mdev);
3070 if (mdev->bitmap) /* should no longer be there. */
3071 drbd_bm_cleanup(mdev);
3072 __free_page(mdev->md_io_page);
3073 put_disk(mdev->vdisk);
3074 blk_cleanup_queue(mdev->rq_queue);
3075 free_cpumask_var(mdev->cpu_mask);
3076 kfree(mdev);
3077}
3078
3079
3080int __init drbd_init(void)
3081{
3082 int err;
3083
3084 if (sizeof(struct p_handshake) != 80) {
3085 printk(KERN_ERR
3086 "drbd: never change the size or layout "
3087 "of the HandShake packet.\n");
3088 return -EINVAL;
3089 }
3090
3091 if (1 > minor_count || minor_count > 255) {
3092 printk(KERN_ERR
3093 "drbd: invalid minor_count (%d)\n", minor_count);
3094#ifdef MODULE
3095 return -EINVAL;
3096#else
3097 minor_count = 8;
3098#endif
3099 }
3100
3101 err = drbd_nl_init();
3102 if (err)
3103 return err;
3104
3105 err = register_blkdev(DRBD_MAJOR, "drbd");
3106 if (err) {
3107 printk(KERN_ERR
3108 "drbd: unable to register block device major %d\n",
3109 DRBD_MAJOR);
3110 return err;
3111 }
3112
3113 register_reboot_notifier(&drbd_notifier);
3114
3115 /*
3116 * allocate all necessary structs
3117 */
3118 err = -ENOMEM;
3119
3120 init_waitqueue_head(&drbd_pp_wait);
3121
3122 drbd_proc = NULL; /* play safe for drbd_cleanup */
3123 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3124 GFP_KERNEL);
3125 if (!minor_table)
3126 goto Enomem;
3127
3128 err = drbd_create_mempools();
3129 if (err)
3130 goto Enomem;
3131
3132 drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
3133 if (!drbd_proc) {
3134 printk(KERN_ERR "drbd: unable to register proc file\n");
3135 goto Enomem;
3136 }
3137
3138 rwlock_init(&global_state_lock);
3139
3140 printk(KERN_INFO "drbd: initialized. "
3141 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3142 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3143 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3144 printk(KERN_INFO "drbd: registered as block device major %d\n",
3145 DRBD_MAJOR);
3146 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3147
3148 return 0; /* Success! */
3149
3150Enomem:
3151 drbd_cleanup();
3152 if (err == -ENOMEM)
3153 /* currently always the case */
3154 printk(KERN_ERR "drbd: ran out of memory\n");
3155 else
3156 printk(KERN_ERR "drbd: initialization failure\n");
3157 return err;
3158}
3159
3160void drbd_free_bc(struct drbd_backing_dev *ldev)
3161{
3162 if (ldev == NULL)
3163 return;
3164
3165 bd_release(ldev->backing_bdev);
3166 bd_release(ldev->md_bdev);
3167
3168 fput(ldev->lo_file);
3169 fput(ldev->md_file);
3170
3171 kfree(ldev);
3172}
3173
3174void drbd_free_sock(struct drbd_conf *mdev)
3175{
3176 if (mdev->data.socket) {
3177 mutex_lock(&mdev->data.mutex);
3178 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3179 sock_release(mdev->data.socket);
3180 mdev->data.socket = NULL;
3181 mutex_unlock(&mdev->data.mutex);
3182 }
3183 if (mdev->meta.socket) {
3184 mutex_lock(&mdev->meta.mutex);
3185 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3186 sock_release(mdev->meta.socket);
3187 mdev->meta.socket = NULL;
3188 mutex_unlock(&mdev->meta.mutex);
3189 }
3190}
3191
3192
3193void drbd_free_resources(struct drbd_conf *mdev)
3194{
3195 crypto_free_hash(mdev->csums_tfm);
3196 mdev->csums_tfm = NULL;
3197 crypto_free_hash(mdev->verify_tfm);
3198 mdev->verify_tfm = NULL;
3199 crypto_free_hash(mdev->cram_hmac_tfm);
3200 mdev->cram_hmac_tfm = NULL;
3201 crypto_free_hash(mdev->integrity_w_tfm);
3202 mdev->integrity_w_tfm = NULL;
3203 crypto_free_hash(mdev->integrity_r_tfm);
3204 mdev->integrity_r_tfm = NULL;
3205
3206 drbd_free_sock(mdev);
3207
3208 __no_warn(local,
3209 drbd_free_bc(mdev->ldev);
3210 mdev->ldev = NULL;);
3211}
3212
3213/* meta data management */
3214
3215struct meta_data_on_disk {
3216 u64 la_size; /* last agreed size. */
3217 u64 uuid[UI_SIZE]; /* UUIDs. */
3218 u64 device_uuid;
3219 u64 reserved_u64_1;
3220 u32 flags; /* MDF */
3221 u32 magic;
3222 u32 md_size_sect;
3223 u32 al_offset; /* offset to this block */
3224 u32 al_nr_extents; /* important for restoring the AL */
3225 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3226 u32 bm_offset; /* offset to the bitmap, from here */
3227 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3228 u32 reserved_u32[4];
3229
3230} __packed;
3231
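/*
 * Editor's note (not part of the patch): all multi-byte fields of struct
 * meta_data_on_disk are stored big-endian on disk -- note the cpu_to_be32/64
 * conversions in drbd_md_sync() below and the matching be32/64_to_cpu calls
 * in drbd_md_read() -- and the structure is written out through the single
 * preallocated mdev->md_io_page, zeroed to 512 bytes before filling.
 */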
3232/**
3233 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3234 * @mdev: DRBD device.
3235 */
3236void drbd_md_sync(struct drbd_conf *mdev)
3237{
3238 struct meta_data_on_disk *buffer;
3239 sector_t sector;
3240 int i;
3241
3242 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3243 return;
3244 del_timer(&mdev->md_sync_timer);
3245
3246 /* We use here D_FAILED and not D_ATTACHING because we try to write
3247 * metadata even if we detach due to a disk failure! */
3248 if (!get_ldev_if_state(mdev, D_FAILED))
3249 return;
3250
3251 mutex_lock(&mdev->md_io_mutex);
3252 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3253 memset(buffer, 0, 512);
3254
3255 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3256 for (i = UI_CURRENT; i < UI_SIZE; i++)
3257 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3258 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3259 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3260
3261 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3262 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3263 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3264 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3265 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3266
3267 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3268
3269 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3270 sector = mdev->ldev->md.md_offset;
3271
3272 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3273 clear_bit(MD_DIRTY, &mdev->flags);
3274 } else {
3275 /* this was a try anyways ... */
3276 dev_err(DEV, "meta data update failed!\n");
3277
3278 drbd_chk_io_error(mdev, 1, TRUE);
3279 }
3280
3281 /* Update mdev->ldev->md.la_size_sect,
3282 * since we updated it on metadata. */
3283 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3284
3285 mutex_unlock(&mdev->md_io_mutex);
3286 put_ldev(mdev);
3287}
3288
3289/**
3290 * drbd_md_read() - Reads in the meta data super block
3291 * @mdev: DRBD device.
3292 * @bdev: Device from which the meta data should be read in.
3293 *
3294 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3295 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3296 */
3297int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3298{
3299 struct meta_data_on_disk *buffer;
3300 int i, rv = NO_ERROR;
3301
3302 if (!get_ldev_if_state(mdev, D_ATTACHING))
3303 return ERR_IO_MD_DISK;
3304
3305 mutex_lock(&mdev->md_io_mutex);
3306 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3307
3308 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 3309	/* NOTE: can't do normal error processing here as this is
3310 called BEFORE disk is attached */
3311 dev_err(DEV, "Error while reading metadata.\n");
3312 rv = ERR_IO_MD_DISK;
3313 goto err;
3314 }
3315
3316 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3317 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3318 rv = ERR_MD_INVALID;
3319 goto err;
3320 }
3321 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3322 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3323 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3324 rv = ERR_MD_INVALID;
3325 goto err;
3326 }
3327 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3328 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3329 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3330 rv = ERR_MD_INVALID;
3331 goto err;
3332 }
3333 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3334 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3335 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3336 rv = ERR_MD_INVALID;
3337 goto err;
3338 }
3339
3340 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3341 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3342 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3343 rv = ERR_MD_INVALID;
3344 goto err;
3345 }
3346
3347 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3348 for (i = UI_CURRENT; i < UI_SIZE; i++)
3349 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3350 bdev->md.flags = be32_to_cpu(buffer->flags);
3351 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3352 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3353
3354 if (mdev->sync_conf.al_extents < 7)
3355 mdev->sync_conf.al_extents = 127;
3356
3357 err:
3358 mutex_unlock(&mdev->md_io_mutex);
3359 put_ldev(mdev);
3360
3361 return rv;
3362}
3363
3364/**
3365 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3366 * @mdev: DRBD device.
3367 *
3368 * Call this function if you change anything that should be written to
 3369 * the meta-data super block. This function sets MD_DIRTY, and starts a
 3370 * timer that ensures drbd_md_sync() gets called within five seconds.
3371 */
3372void drbd_md_mark_dirty(struct drbd_conf *mdev)
3373{
3374 set_bit(MD_DIRTY, &mdev->flags);
3375 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3376}
3377
3378
3379static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3380{
3381 int i;
3382
3383 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3384 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3385}
3386
3387void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3388{
3389 if (idx == UI_CURRENT) {
3390 if (mdev->state.role == R_PRIMARY)
3391 val |= 1;
3392 else
3393 val &= ~((u64)1);
3394
3395 drbd_set_ed_uuid(mdev, val);
3396 }
3397
3398 mdev->ldev->md.uuid[idx] = val;
3399 drbd_md_mark_dirty(mdev);
3400}
3401
3402
3403void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3404{
3405 if (mdev->ldev->md.uuid[idx]) {
3406 drbd_uuid_move_history(mdev);
3407 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3408 }
3409 _drbd_uuid_set(mdev, idx, val);
3410}
3411
3412/**
3413 * drbd_uuid_new_current() - Creates a new current UUID
3414 * @mdev: DRBD device.
3415 *
3416 * Creates a new current UUID, and rotates the old current UUID into
3417 * the bitmap slot. Causes an incremental resync upon next connect.
3418 */
3419void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3420{
3421 u64 val;
3422
3423 dev_info(DEV, "Creating new current UUID\n");
3424 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3425 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3426
3427 get_random_bytes(&val, sizeof(u64));
3428 _drbd_uuid_set(mdev, UI_CURRENT, val);
3429}
3430
3431void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3432{
3433 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3434 return;
3435
3436 if (val == 0) {
3437 drbd_uuid_move_history(mdev);
3438 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3439 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3440 } else {
3441 if (mdev->ldev->md.uuid[UI_BITMAP])
 3442			dev_warn(DEV, "bm UUID already set\n");
3443
3444 mdev->ldev->md.uuid[UI_BITMAP] = val;
3445 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3446
3447 }
3448 drbd_md_mark_dirty(mdev);
3449}
3450
3451/**
3452 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3453 * @mdev: DRBD device.
3454 *
3455 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3456 */
3457int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3458{
3459 int rv = -EIO;
3460
3461 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3462 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3463 drbd_md_sync(mdev);
3464 drbd_bm_set_all(mdev);
3465
3466 rv = drbd_bm_write(mdev);
3467
3468 if (!rv) {
3469 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3470 drbd_md_sync(mdev);
3471 }
3472
3473 put_ldev(mdev);
3474 }
3475
3476 return rv;
3477}
3478
3479/**
3480 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3481 * @mdev: DRBD device.
3482 *
3483 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3484 */
3485int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3486{
3487 int rv = -EIO;
3488
3489 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3490 drbd_bm_clear_all(mdev);
3491 rv = drbd_bm_write(mdev);
3492 put_ldev(mdev);
3493 }
3494
3495 return rv;
3496}
3497
3498static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3499{
3500 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3501 int rv;
3502
3503 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3504
3505 drbd_bm_lock(mdev, work->why);
3506 rv = work->io_fn(mdev);
3507 drbd_bm_unlock(mdev);
3508
3509 clear_bit(BITMAP_IO, &mdev->flags);
3510 wake_up(&mdev->misc_wait);
3511
3512 if (work->done)
3513 work->done(mdev, rv);
3514
3515 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3516 work->why = NULL;
3517
3518 return 1;
3519}
3520
3521/**
3522 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3523 * @mdev: DRBD device.
3524 * @io_fn: IO callback to be called when bitmap IO is possible
3525 * @done: callback to be called after the bitmap IO was performed
3526 * @why: Descriptive text of the reason for doing the IO
3527 *
 3528 * While IO on the bitmap happens we freeze application IO, thus ensuring
 3529 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3530 * called from worker context. It MUST NOT be used while a previous such
3531 * work is still pending!
3532 */
3533void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3534 int (*io_fn)(struct drbd_conf *),
3535 void (*done)(struct drbd_conf *, int),
3536 char *why)
3537{
3538 D_ASSERT(current == mdev->worker.task);
3539
3540 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3541 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3542 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3543 if (mdev->bm_io_work.why)
3544 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3545 why, mdev->bm_io_work.why);
3546
3547 mdev->bm_io_work.io_fn = io_fn;
3548 mdev->bm_io_work.done = done;
3549 mdev->bm_io_work.why = why;
3550
3551 set_bit(BITMAP_IO, &mdev->flags);
3552 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3553 if (list_empty(&mdev->bm_io_work.w.list)) {
3554 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3555 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3556 } else
3557 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3558 }
3559}
3560
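/*
 * Editor's note (not part of the patch): a hypothetical caller running in
 * worker context would use the helpers above roughly like this; the done
 * callback, wrapper function and "why" string below are made up purely for
 * illustration:
 */
static void example_done(struct drbd_conf *mdev, int rv)
{
	if (rv)
		dev_err(DEV, "example bitmap IO failed: %d\n", rv);
}

static void example_queue_full_sync(struct drbd_conf *mdev)
{
	/* must run in the worker; see the asserts in drbd_queue_bitmap_io() */
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
			     &example_done, "example: set all bits");
}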
3561/**
3562 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3563 * @mdev: DRBD device.
3564 * @io_fn: IO callback to be called when bitmap IO is possible
3565 * @why: Descriptive text of the reason for doing the IO
3566 *
 3567 * Freezes application IO while the actual IO operation runs. This
 3568 * function MAY NOT be called from worker context.
3569 */
3570int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3571{
3572 int rv;
3573
3574 D_ASSERT(current != mdev->worker.task);
3575
3576 drbd_suspend_io(mdev);
3577
3578 drbd_bm_lock(mdev, why);
3579 rv = io_fn(mdev);
3580 drbd_bm_unlock(mdev);
3581
3582 drbd_resume_io(mdev);
3583
3584 return rv;
3585}
3586
3587void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3588{
3589 if ((mdev->ldev->md.flags & flag) != flag) {
3590 drbd_md_mark_dirty(mdev);
3591 mdev->ldev->md.flags |= flag;
3592 }
3593}
3594
3595void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3596{
3597 if ((mdev->ldev->md.flags & flag) != 0) {
3598 drbd_md_mark_dirty(mdev);
3599 mdev->ldev->md.flags &= ~flag;
3600 }
3601}
3602int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3603{
3604 return (bdev->md.flags & flag) != 0;
3605}
3606
3607static void md_sync_timer_fn(unsigned long data)
3608{
3609 struct drbd_conf *mdev = (struct drbd_conf *) data;
3610
3611 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3612}
3613
3614static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3615{
3616 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3617 drbd_md_sync(mdev);
3618
3619 return 1;
3620}
3621
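/*
 * Editor's note (not part of the patch): the meta data dirty/sync flow
 * implemented by the functions above, summarized:
 *
 *   drbd_md_mark_dirty()  sets MD_DIRTY and arms md_sync_timer (5 seconds)
 *   md_sync_timer_fn()    fires and queues md_sync_work at the front of
 *                         the data work queue
 *   w_md_sync()           runs in the worker and calls drbd_md_sync()
 *   drbd_md_sync()        writes the super block only if MD_DIRTY is still
 *                         set, i.e. nobody flushed it in the meantime
 */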
3622#ifdef CONFIG_DRBD_FAULT_INJECTION
3623/* Fault insertion support including random number generator shamelessly
3624 * stolen from kernel/rcutorture.c */
3625struct fault_random_state {
3626 unsigned long state;
3627 unsigned long count;
3628};
3629
3630#define FAULT_RANDOM_MULT 39916801 /* prime */
3631#define FAULT_RANDOM_ADD 479001701 /* prime */
3632#define FAULT_RANDOM_REFRESH 10000
3633
3634/*
3635 * Crude but fast random-number generator. Uses a linear congruential
3636 * generator, with occasional help from get_random_bytes().
3637 */
3638static unsigned long
3639_drbd_fault_random(struct fault_random_state *rsp)
3640{
3641 long refresh;
3642
3643 if (!rsp->count--) {
3644 get_random_bytes(&refresh, sizeof(refresh));
3645 rsp->state += refresh;
3646 rsp->count = FAULT_RANDOM_REFRESH;
3647 }
3648 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3649 return swahw32(rsp->state);
3650}
3651
3652static char *
3653_drbd_fault_str(unsigned int type) {
3654 static char *_faults[] = {
3655 [DRBD_FAULT_MD_WR] = "Meta-data write",
3656 [DRBD_FAULT_MD_RD] = "Meta-data read",
3657 [DRBD_FAULT_RS_WR] = "Resync write",
3658 [DRBD_FAULT_RS_RD] = "Resync read",
3659 [DRBD_FAULT_DT_WR] = "Data write",
3660 [DRBD_FAULT_DT_RD] = "Data read",
3661 [DRBD_FAULT_DT_RA] = "Data read ahead",
3662 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3663 [DRBD_FAULT_AL_EE] = "EE allocation"
3664 };
3665
3666 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3667}
3668
3669unsigned int
3670_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3671{
3672 static struct fault_random_state rrs = {0, 0};
3673
3674 unsigned int ret = (
3675 (fault_devs == 0 ||
3676 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3677 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3678
3679 if (ret) {
3680 fault_count++;
3681
3682 if (printk_ratelimit())
3683 dev_warn(DEV, "***Simulating %s failure\n",
3684 _drbd_fault_str(type));
3685 }
3686
3687 return ret;
3688}
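/*
 * Editor's note (not part of the patch): a hypothetical call site inside the
 * same CONFIG_DRBD_FAULT_INJECTION block could use the helper above roughly
 * like this to fail a meta data write on purpose (the function name below is
 * made up for illustration; fault_rate == 0 disables injection entirely):
 */
static int example_md_write_should_fail(struct drbd_conf *mdev)
{
	return fault_rate && _drbd_insert_fault(mdev, DRBD_FAULT_MD_WR);
}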
3689#endif
3690
3691const char *drbd_buildtag(void)
3692{
3693 /* DRBD built from external sources has here a reference to the
3694 git hash of the source code. */
3695
3696 static char buildtag[38] = "\0uilt-in";
3697
3698 if (buildtag[0] == 0) {
3699#ifdef CONFIG_MODULES
3700 if (THIS_MODULE != NULL)
3701 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3702 else
3703#endif
3704 buildtag[0] = 'b';
3705 }
3706
3707 return buildtag;
3708}
3709
3710module_init(drbd_init)
3711module_exit(drbd_cleanup)
3712
3713EXPORT_SYMBOL(drbd_conn_str);
3714EXPORT_SYMBOL(drbd_role_str);
3715EXPORT_SYMBOL(drbd_disk_str);
3716EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000000..6429d2b19e06
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2367 @@
1/*
2 drbd_nl.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/in.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/slab.h>
32#include <linux/connector.h>
33#include <linux/blkpg.h>
34#include <linux/cpumask.h>
35#include "drbd_int.h"
36#include "drbd_wrappers.h"
37#include <asm/unaligned.h>
38#include <linux/drbd_tag_magic.h>
39#include <linux/drbd_limits.h>
40
41static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
42static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
43static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
44
45/* see get_sb_bdev and bd_claim */
46static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
47
48/* Generate the tag_list to struct functions */
49#define NL_PACKET(name, number, fields) \
50static int name ## _from_tags(struct drbd_conf *mdev, \
51 unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
52static int name ## _from_tags(struct drbd_conf *mdev, \
53 unsigned short *tags, struct name *arg) \
54{ \
55 int tag; \
56 int dlen; \
57 \
58 while ((tag = get_unaligned(tags++)) != TT_END) { \
59 dlen = get_unaligned(tags++); \
60 switch (tag_number(tag)) { \
61 fields \
62 default: \
63 if (tag & T_MANDATORY) { \
64 dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
65 return 0; \
66 } \
67 } \
68 tags = (unsigned short *)((char *)tags + dlen); \
69 } \
70 return 1; \
71}
72#define NL_INTEGER(pn, pr, member) \
73 case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
74 arg->member = get_unaligned((int *)(tags)); \
75 break;
76#define NL_INT64(pn, pr, member) \
77 case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
78 arg->member = get_unaligned((u64 *)(tags)); \
79 break;
80#define NL_BIT(pn, pr, member) \
81 case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
82 arg->member = *(char *)(tags) ? 1 : 0; \
83 break;
84#define NL_STRING(pn, pr, member, len) \
85 case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
86 if (dlen > len) { \
87 dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
88 #member, dlen, (unsigned int)len); \
89 return 0; \
90 } \
91 arg->member ## _len = dlen; \
92 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
93 break;
94#include "linux/drbd_nl.h"
95
96/* Generate the struct to tag_list functions */
97#define NL_PACKET(name, number, fields) \
98static unsigned short* \
99name ## _to_tags(struct drbd_conf *mdev, \
100 struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
101static unsigned short* \
102name ## _to_tags(struct drbd_conf *mdev, \
103 struct name *arg, unsigned short *tags) \
104{ \
105 fields \
106 return tags; \
107}
108
109#define NL_INTEGER(pn, pr, member) \
110 put_unaligned(pn | pr | TT_INTEGER, tags++); \
111 put_unaligned(sizeof(int), tags++); \
112 put_unaligned(arg->member, (int *)tags); \
113 tags = (unsigned short *)((char *)tags+sizeof(int));
114#define NL_INT64(pn, pr, member) \
115 put_unaligned(pn | pr | TT_INT64, tags++); \
116 put_unaligned(sizeof(u64), tags++); \
117 put_unaligned(arg->member, (u64 *)tags); \
118 tags = (unsigned short *)((char *)tags+sizeof(u64));
119#define NL_BIT(pn, pr, member) \
120 put_unaligned(pn | pr | TT_BIT, tags++); \
121 put_unaligned(sizeof(char), tags++); \
122 *(char *)tags = arg->member; \
123 tags = (unsigned short *)((char *)tags+sizeof(char));
124#define NL_STRING(pn, pr, member, len) \
125 put_unaligned(pn | pr | TT_STRING, tags++); \
126 put_unaligned(arg->member ## _len, tags++); \
127 memcpy(tags, arg->member, arg->member ## _len); \
128 tags = (unsigned short *)((char *)tags + arg->member ## _len);
129#include "linux/drbd_nl.h"
130
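/*
 * Editor's note (not part of the patch): the two generated helper families
 * above walk/produce a flat tag list of 16-bit words:
 *
 *   [ tag = number | flags | TT_type ][ dlen ][ dlen bytes payload ] ... [ TT_END ]
 *
 * A hand-rolled encoder for a single integer field, mirroring the
 * NL_INTEGER() macro, might look like this (function name made up for
 * illustration):
 */
static unsigned short *example_put_int_tag(unsigned short *tags, int pn, int val)
{
	put_unaligned(pn | TT_INTEGER, tags++);	/* tag number and type     */
	put_unaligned(sizeof(int), tags++);	/* payload length in bytes */
	put_unaligned(val, (int *)tags);	/* the payload itself      */
	return (unsigned short *)((char *)tags + sizeof(int));
}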
131void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
132void drbd_nl_send_reply(struct cn_msg *, int);
133
134int drbd_khelper(struct drbd_conf *mdev, char *cmd)
135{
136 char *envp[] = { "HOME=/",
137 "TERM=linux",
138 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
139 NULL, /* Will be set to address family */
140 NULL, /* Will be set to address */
141 NULL };
142
143 char mb[12], af[20], ad[60], *afs;
144 char *argv[] = {usermode_helper, cmd, mb, NULL };
145 int ret;
146
147 snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
148
149 if (get_net_conf(mdev)) {
150 switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
151 case AF_INET6:
152 afs = "ipv6";
153 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
154 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
155 break;
156 case AF_INET:
157 afs = "ipv4";
158 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
159 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
160 break;
161 default:
162 afs = "ssocks";
163 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
164 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
165 }
166 snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
167 envp[3]=af;
168 envp[4]=ad;
169 put_net_conf(mdev);
170 }
171
172 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
173
174 drbd_bcast_ev_helper(mdev, cmd);
175 ret = call_usermodehelper(usermode_helper, argv, envp, 1);
176 if (ret)
177 dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
178 usermode_helper, cmd, mb,
179 (ret >> 8) & 0xff, ret);
180 else
181 dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
182 usermode_helper, cmd, mb,
183 (ret >> 8) & 0xff, ret);
184
185 if (ret < 0) /* Ignore any ERRNOs we got. */
186 ret = 0;
187
188 return ret;
189}
190
191enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
192{
193 char *ex_to_string;
194 int r;
195 enum drbd_disk_state nps;
196 enum drbd_fencing_p fp;
197
198 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
199
200 if (get_ldev_if_state(mdev, D_CONSISTENT)) {
201 fp = mdev->ldev->dc.fencing;
202 put_ldev(mdev);
203 } else {
204 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
205 return mdev->state.pdsk;
206 }
207
208 if (fp == FP_STONITH)
209 _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
210
211 r = drbd_khelper(mdev, "fence-peer");
212
213 switch ((r>>8) & 0xff) {
214 case 3: /* peer is inconsistent */
215 ex_to_string = "peer is inconsistent or worse";
216 nps = D_INCONSISTENT;
217 break;
218 case 4: /* peer got outdated, or was already outdated */
219 ex_to_string = "peer was fenced";
220 nps = D_OUTDATED;
221 break;
222 case 5: /* peer was down */
223 if (mdev->state.disk == D_UP_TO_DATE) {
224 /* we will(have) create(d) a new UUID anyways... */
225 ex_to_string = "peer is unreachable, assumed to be dead";
226 nps = D_OUTDATED;
227 } else {
228 ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
229 nps = mdev->state.pdsk;
230 }
231 break;
232 case 6: /* Peer is primary, voluntarily outdate myself.
233 * This is useful when an unconnected R_SECONDARY is asked to
234 * become R_PRIMARY, but finds the other peer being active. */
235 ex_to_string = "peer is active";
236 dev_warn(DEV, "Peer is primary, outdating myself.\n");
237 nps = D_UNKNOWN;
238 _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
239 break;
240 case 7:
241 if (fp != FP_STONITH)
242 dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
243 ex_to_string = "peer was stonithed";
244 nps = D_OUTDATED;
245 break;
246 default:
247 /* The script is broken ... */
248 nps = D_UNKNOWN;
249 dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
250 return nps;
251 }
252
253 dev_info(DEV, "fence-peer helper returned %d (%s)\n",
254 (r>>8) & 0xff, ex_to_string);
255 return nps;
256}
257
258
259int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
260{
261 const int max_tries = 4;
262 int r = 0;
263 int try = 0;
264 int forced = 0;
265 union drbd_state mask, val;
266 enum drbd_disk_state nps;
267
268 if (new_role == R_PRIMARY)
269 request_ping(mdev); /* Detect a dead peer ASAP */
270
271 mutex_lock(&mdev->state_mutex);
272
273 mask.i = 0; mask.role = R_MASK;
274 val.i = 0; val.role = new_role;
275
276 while (try++ < max_tries) {
277 r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
278
279 /* in case we first succeeded to outdate,
280 * but now suddenly could establish a connection */
281 if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
282 val.pdsk = 0;
283 mask.pdsk = 0;
284 continue;
285 }
286
287 if (r == SS_NO_UP_TO_DATE_DISK && force &&
288 (mdev->state.disk < D_UP_TO_DATE &&
289 mdev->state.disk >= D_INCONSISTENT)) {
290 mask.disk = D_MASK;
291 val.disk = D_UP_TO_DATE;
292 forced = 1;
293 continue;
294 }
295
296 if (r == SS_NO_UP_TO_DATE_DISK &&
297 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
298 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
299 nps = drbd_try_outdate_peer(mdev);
300
301 if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
302 val.disk = D_UP_TO_DATE;
303 mask.disk = D_MASK;
304 }
305
306 val.pdsk = nps;
307 mask.pdsk = D_MASK;
308
309 continue;
310 }
311
312 if (r == SS_NOTHING_TO_DO)
313 goto fail;
314 if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
315 nps = drbd_try_outdate_peer(mdev);
316
317 if (force && nps > D_OUTDATED) {
318 dev_warn(DEV, "Forced into split brain situation!\n");
319 nps = D_OUTDATED;
320 }
321
322 mask.pdsk = D_MASK;
323 val.pdsk = nps;
324
325 continue;
326 }
327 if (r == SS_TWO_PRIMARIES) {
328 /* Maybe the peer is detected as dead very soon...
329 retry at most once more in this case. */
330 __set_current_state(TASK_INTERRUPTIBLE);
331 schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
332 if (try < max_tries)
333 try = max_tries - 1;
334 continue;
335 }
336 if (r < SS_SUCCESS) {
337 r = _drbd_request_state(mdev, mask, val,
338 CS_VERBOSE + CS_WAIT_COMPLETE);
339 if (r < SS_SUCCESS)
340 goto fail;
341 }
342 break;
343 }
344
345 if (r < SS_SUCCESS)
346 goto fail;
347
348 if (forced)
349 dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
350
351 /* Wait until nothing is on the fly :) */
352 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
353
354 if (new_role == R_SECONDARY) {
355 set_disk_ro(mdev->vdisk, TRUE);
356 if (get_ldev(mdev)) {
357 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
358 put_ldev(mdev);
359 }
360 } else {
361 if (get_net_conf(mdev)) {
362 mdev->net_conf->want_lose = 0;
363 put_net_conf(mdev);
364 }
365 set_disk_ro(mdev->vdisk, FALSE);
366 if (get_ldev(mdev)) {
367 if (((mdev->state.conn < C_CONNECTED ||
368 mdev->state.pdsk <= D_FAILED)
369 && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
370 drbd_uuid_new_current(mdev);
371
372 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
373 put_ldev(mdev);
374 }
375 }
376
377 if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
378 drbd_al_to_on_disk_bm(mdev);
379 put_ldev(mdev);
380 }
381
382 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
383 /* if this was forced, we should consider sync */
384 if (forced)
385 drbd_send_uuids(mdev);
386 drbd_send_state(mdev);
387 }
388
389 drbd_md_sync(mdev);
390
391 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
392 fail:
393 mutex_unlock(&mdev->state_mutex);
394 return r;
395}
396
397
398static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
399 struct drbd_nl_cfg_reply *reply)
400{
401 struct primary primary_args;
402
403 memset(&primary_args, 0, sizeof(struct primary));
404 if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
405 reply->ret_code = ERR_MANDATORY_TAG;
406 return 0;
407 }
408
409 reply->ret_code =
410 drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
411
412 return 0;
413}
414
415static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
416 struct drbd_nl_cfg_reply *reply)
417{
418 reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
419
420 return 0;
421}
422
423/* initializes the md.*_offset members, so we are able to find
424 * the on disk meta data */
425static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
426 struct drbd_backing_dev *bdev)
427{
428 sector_t md_size_sect = 0;
429 switch (bdev->dc.meta_dev_idx) {
430 default:
431 /* v07 style fixed size indexed meta data */
432 bdev->md.md_size_sect = MD_RESERVED_SECT;
433 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
434 bdev->md.al_offset = MD_AL_OFFSET;
435 bdev->md.bm_offset = MD_BM_OFFSET;
436 break;
437 case DRBD_MD_INDEX_FLEX_EXT:
438 /* just occupy the full device; unit: sectors */
439 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
440 bdev->md.md_offset = 0;
441 bdev->md.al_offset = MD_AL_OFFSET;
442 bdev->md.bm_offset = MD_BM_OFFSET;
443 break;
444 case DRBD_MD_INDEX_INTERNAL:
445 case DRBD_MD_INDEX_FLEX_INT:
446 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
447 /* al size is still fixed */
448 bdev->md.al_offset = -MD_AL_MAX_SIZE;
 449	/* we need (slightly less than) ~ this many bitmap sectors: */
450 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
451 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
452 md_size_sect = BM_SECT_TO_EXT(md_size_sect);
453 md_size_sect = ALIGN(md_size_sect, 8);
454
455 /* plus the "drbd meta data super block",
456 * and the activity log; */
457 md_size_sect += MD_BM_OFFSET;
458
459 bdev->md.md_size_sect = md_size_sect;
460 /* bitmap offset is adjusted by 'super' block size */
461 bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
462 break;
463 }
464}
465
466char *ppsize(char *buf, unsigned long long size)
467{
468 /* Needs 9 bytes at max. */
469 static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
470 int base = 0;
471 while (size >= 10000) {
472 /* shift + round */
473 size = (size >> 10) + !!(size & (1<<9));
474 base++;
475 }
476 sprintf(buf, "%lu %cB", (long)size, units[base]);
477
478 return buf;
479}
480
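/*
 * Editor's note (not part of the patch): a short worked example for the
 * helper above.  ppsize() is called with a size in KB (see the size>>1
 * callers below).  For size = 2097152 (2 GiB worth of KB) the loop shifts
 * once (2097152 >> 10 = 2048, no rounding carry), ends with base = 1, and
 * the result is "2048 MB".  Sizes below 10000 are printed unshifted, e.g.
 * ppsize(buf, 8192) yields "8192 KB".
 */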
481/* there is still a theoretical deadlock when called from receiver
 482 * on a D_INCONSISTENT R_PRIMARY:
483 * remote READ does inc_ap_bio, receiver would need to receive answer
484 * packet from remote to dec_ap_bio again.
485 * receiver receive_sizes(), comes here,
486 * waits for ap_bio_cnt == 0. -> deadlock.
487 * but this cannot happen, actually, because:
488 * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
489 * (not connected, or bad/no disk on peer):
490 * see drbd_fail_request_early, ap_bio_cnt is zero.
491 * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
492 * peer may not initiate a resize.
493 */
494void drbd_suspend_io(struct drbd_conf *mdev)
495{
496 set_bit(SUSPEND_IO, &mdev->flags);
497 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
498}
499
500void drbd_resume_io(struct drbd_conf *mdev)
501{
502 clear_bit(SUSPEND_IO, &mdev->flags);
503 wake_up(&mdev->misc_wait);
504}
505
506/**
507 * drbd_determine_dev_size() - Sets the right device size obeying all constraints
508 * @mdev: DRBD device.
509 *
510 * Returns 0 on success, negative return values indicate errors.
511 * You should call drbd_md_sync() after calling this function.
512 */
513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local)
514{
515 sector_t prev_first_sect, prev_size; /* previous meta location */
516 sector_t la_size;
517 sector_t size;
518 char ppb[10];
519
520 int md_moved, la_size_changed;
521 enum determine_dev_size rv = unchanged;
522
523 /* race:
524 * application request passes inc_ap_bio,
525 * but then cannot get an AL-reference.
526 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
527 *
528 * to avoid that:
529 * Suspend IO right here.
530 * still lock the act_log to not trigger ASSERTs there.
531 */
532 drbd_suspend_io(mdev);
533
534 /* no wait necessary anymore, actually we could assert that */
535 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
536
537 prev_first_sect = drbd_md_first_sector(mdev->ldev);
538 prev_size = mdev->ldev->md.md_size_sect;
539 la_size = mdev->ldev->md.la_size_sect;
540
541 /* TODO: should only be some assert here, not (re)init... */
542 drbd_md_set_sector_offsets(mdev, mdev->ldev);
543
544 size = drbd_new_dev_size(mdev, mdev->ldev, force);
545
546 if (drbd_get_capacity(mdev->this_bdev) != size ||
547 drbd_bm_capacity(mdev) != size) {
548 int err;
549 err = drbd_bm_resize(mdev, size);
550 if (unlikely(err)) {
551 /* currently there is only one error: ENOMEM! */
552 size = drbd_bm_capacity(mdev)>>1;
553 if (size == 0) {
554 dev_err(DEV, "OUT OF MEMORY! "
555 "Could not allocate bitmap!\n");
556 } else {
557 dev_err(DEV, "BM resizing failed. "
558 "Leaving size unchanged at size = %lu KB\n",
559 (unsigned long)size);
560 }
561 rv = dev_size_error;
562 }
563 /* racy, see comments above. */
564 drbd_set_my_capacity(mdev, size);
565 mdev->ldev->md.la_size_sect = size;
566 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
567 (unsigned long long)size>>1);
568 }
569 if (rv == dev_size_error)
570 goto out;
571
572 la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
573
574 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
575 || prev_size != mdev->ldev->md.md_size_sect;
576
577 if (la_size_changed || md_moved) {
578 drbd_al_shrink(mdev); /* All extents inactive. */
579 dev_info(DEV, "Writing the whole bitmap, %s\n",
580 la_size_changed && md_moved ? "size changed and md moved" :
581 la_size_changed ? "size changed" : "md moved");
582 rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
583 drbd_md_mark_dirty(mdev);
584 }
585
586 if (size > la_size)
587 rv = grew;
588 if (size < la_size)
589 rv = shrunk;
590out:
591 lc_unlock(mdev->act_log);
592 wake_up(&mdev->al_wait);
593 drbd_resume_io(mdev);
594
595 return rv;
596}
597
598sector_t
599drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
600{
601 sector_t p_size = mdev->p_size; /* partner's disk size. */
602 sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
603 sector_t m_size; /* my size */
604 sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
605 sector_t size = 0;
606
607 m_size = drbd_get_max_capacity(bdev);
608
609 if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) {
610 dev_warn(DEV, "Resize while not connected was forced by the user!\n");
611 p_size = m_size;
612 }
613
614 if (p_size && m_size) {
615 size = min_t(sector_t, p_size, m_size);
616 } else {
617 if (la_size) {
618 size = la_size;
619 if (m_size && m_size < size)
620 size = m_size;
621 if (p_size && p_size < size)
622 size = p_size;
623 } else {
624 if (m_size)
625 size = m_size;
626 if (p_size)
627 size = p_size;
628 }
629 }
630
631 if (size == 0)
632 dev_err(DEV, "Both nodes diskless!\n");
633
634 if (u_size) {
635 if (u_size > size)
636 dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
637 (unsigned long)u_size>>1, (unsigned long)size>>1);
638 else
639 size = u_size;
640 }
641
642 return size;
643}
644
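/*
 * Editor's note (not part of the patch): the negotiation above reduces to
 * "take the smaller of my usable size and the peer's size; if the peer size
 * is unknown, fall back to the last agreed size; a non-zero user supplied
 * disk_size only ever shrinks the result".  Purely illustrative values in
 * sectors: m_size = 2000000, p_size = 1500000, u_size = 0 gives
 * size = 1500000; with p_size = 0 and la_size = 1200000 the result is
 * min(la_size, m_size) = 1200000.
 */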
645/**
646 * drbd_check_al_size() - Ensures that the AL is of the right size
647 * @mdev: DRBD device.
648 *
649 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
650 * failed, and 0 on success. You should call drbd_md_sync() after you called
651 * this function.
652 */
653static int drbd_check_al_size(struct drbd_conf *mdev)
654{
655 struct lru_cache *n, *t;
656 struct lc_element *e;
657 unsigned int in_use;
658 int i;
659
660 ERR_IF(mdev->sync_conf.al_extents < 7)
661 mdev->sync_conf.al_extents = 127;
662
663 if (mdev->act_log &&
664 mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
665 return 0;
666
667 in_use = 0;
668 t = mdev->act_log;
669 n = lc_create("act_log", drbd_al_ext_cache,
670 mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
671
672 if (n == NULL) {
673 dev_err(DEV, "Cannot allocate act_log lru!\n");
674 return -ENOMEM;
675 }
676 spin_lock_irq(&mdev->al_lock);
677 if (t) {
678 for (i = 0; i < t->nr_elements; i++) {
679 e = lc_element_by_index(t, i);
680 if (e->refcnt)
681 dev_err(DEV, "refcnt(%d)==%d\n",
682 e->lc_number, e->refcnt);
683 in_use += e->refcnt;
684 }
685 }
686 if (!in_use)
687 mdev->act_log = n;
688 spin_unlock_irq(&mdev->al_lock);
689 if (in_use) {
690 dev_err(DEV, "Activity log still in use!\n");
691 lc_destroy(n);
692 return -EBUSY;
693 } else {
694 if (t)
695 lc_destroy(t);
696 }
 697	drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elements */
698 return 0;
699}
700
701void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
702{
703 struct request_queue * const q = mdev->rq_queue;
704 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
705 int max_segments = mdev->ldev->dc.max_bio_bvecs;
706
707 if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
708 max_seg_s = PAGE_SIZE;
709
710 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
711
712 blk_queue_max_hw_sectors(q, max_seg_s >> 9);
713 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
714 blk_queue_max_segment_size(q, max_seg_s);
715 blk_queue_logical_block_size(q, 512);
716 blk_queue_segment_boundary(q, PAGE_SIZE-1);
717 blk_stack_limits(&q->limits, &b->limits, 0);
718
719 if (b->merge_bvec_fn)
720 dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
721 b->merge_bvec_fn);
722 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
723
724 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
725 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
726 q->backing_dev_info.ra_pages,
727 b->backing_dev_info.ra_pages);
728 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
729 }
730}
731
732/* serialize deconfig (worker exiting, doing cleanup)
733 * and reconfig (drbdsetup disk, drbdsetup net)
734 *
735 * wait for a potentially exiting worker, then restart it,
736 * or start a new one.
737 */
738static void drbd_reconfig_start(struct drbd_conf *mdev)
739{
740 wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
741 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
742 drbd_thread_start(&mdev->worker);
743}
744
745/* if still unconfigured, stops worker again.
746 * if configured now, clears CONFIG_PENDING.
747 * wakes potential waiters */
748static void drbd_reconfig_done(struct drbd_conf *mdev)
749{
750 spin_lock_irq(&mdev->req_lock);
751 if (mdev->state.disk == D_DISKLESS &&
752 mdev->state.conn == C_STANDALONE &&
753 mdev->state.role == R_SECONDARY) {
754 set_bit(DEVICE_DYING, &mdev->flags);
755 drbd_thread_stop_nowait(&mdev->worker);
756 } else
757 clear_bit(CONFIG_PENDING, &mdev->flags);
758 spin_unlock_irq(&mdev->req_lock);
759 wake_up(&mdev->state_wait);
760}
761
 762/* always returns 0;
763 * interesting return code is in reply->ret_code */
764static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
765 struct drbd_nl_cfg_reply *reply)
766{
767 enum drbd_ret_codes retcode;
768 enum determine_dev_size dd;
769 sector_t max_possible_sectors;
770 sector_t min_md_device_sectors;
771 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
772 struct inode *inode, *inode2;
773 struct lru_cache *resync_lru = NULL;
774 union drbd_state ns, os;
775 int rv;
776 int cp_discovered = 0;
777 int logical_block_size;
778
779 drbd_reconfig_start(mdev);
780
781 /* if you want to reconfigure, please tear down first */
782 if (mdev->state.disk > D_DISKLESS) {
783 retcode = ERR_DISK_CONFIGURED;
784 goto fail;
785 }
786
787 /* allocation not in the IO path, cqueue thread context */
788 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
789 if (!nbc) {
790 retcode = ERR_NOMEM;
791 goto fail;
792 }
793
794 nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
795 nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
796 nbc->dc.fencing = DRBD_FENCING_DEF;
797 nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
798
799 if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
800 retcode = ERR_MANDATORY_TAG;
801 goto fail;
802 }
803
804 if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
805 retcode = ERR_MD_IDX_INVALID;
806 goto fail;
807 }
808
809 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
810 if (IS_ERR(nbc->lo_file)) {
811 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
812 PTR_ERR(nbc->lo_file));
813 nbc->lo_file = NULL;
814 retcode = ERR_OPEN_DISK;
815 goto fail;
816 }
817
818 inode = nbc->lo_file->f_dentry->d_inode;
819
820 if (!S_ISBLK(inode->i_mode)) {
821 retcode = ERR_DISK_NOT_BDEV;
822 goto fail;
823 }
824
825 nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
826 if (IS_ERR(nbc->md_file)) {
827 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
828 PTR_ERR(nbc->md_file));
829 nbc->md_file = NULL;
830 retcode = ERR_OPEN_MD_DISK;
831 goto fail;
832 }
833
834 inode2 = nbc->md_file->f_dentry->d_inode;
835
836 if (!S_ISBLK(inode2->i_mode)) {
837 retcode = ERR_MD_NOT_BDEV;
838 goto fail;
839 }
840
841 nbc->backing_bdev = inode->i_bdev;
842 if (bd_claim(nbc->backing_bdev, mdev)) {
843 printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
844 nbc->backing_bdev, mdev,
845 nbc->backing_bdev->bd_holder,
846 nbc->backing_bdev->bd_contains->bd_holder,
847 nbc->backing_bdev->bd_holders);
848 retcode = ERR_BDCLAIM_DISK;
849 goto fail;
850 }
851
852 resync_lru = lc_create("resync", drbd_bm_ext_cache,
853 61, sizeof(struct bm_extent),
854 offsetof(struct bm_extent, lce));
855 if (!resync_lru) {
856 retcode = ERR_NOMEM;
857 goto release_bdev_fail;
858 }
859
860 /* meta_dev_idx >= 0: external fixed size,
861 * possibly multiple drbd sharing one meta device.
862 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
863 * not yet used by some other drbd minor!
864 * (if you use drbd.conf + drbdadm,
865 * that should check it for you already; but if you don't, or someone
866 * fooled it, we need to double check here) */
867 nbc->md_bdev = inode2->i_bdev;
868 if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
869 : (void *) drbd_m_holder)) {
870 retcode = ERR_BDCLAIM_MD_DISK;
871 goto release_bdev_fail;
872 }
873
874 if ((nbc->backing_bdev == nbc->md_bdev) !=
875 (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
876 nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
877 retcode = ERR_MD_IDX_INVALID;
878 goto release_bdev2_fail;
879 }
880
881 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
882 drbd_md_set_sector_offsets(mdev, nbc);
883
884 if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
885 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
886 (unsigned long long) drbd_get_max_capacity(nbc),
887 (unsigned long long) nbc->dc.disk_size);
888 retcode = ERR_DISK_TO_SMALL;
889 goto release_bdev2_fail;
890 }
891
892 if (nbc->dc.meta_dev_idx < 0) {
893 max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
894 /* at least one MB, otherwise it does not make sense */
895 min_md_device_sectors = (2<<10);
896 } else {
897 max_possible_sectors = DRBD_MAX_SECTORS;
898 min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
899 }
900
901 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
902 retcode = ERR_MD_DISK_TO_SMALL;
903 dev_warn(DEV, "refusing attach: md-device too small, "
904 "at least %llu sectors needed for this meta-disk type\n",
905 (unsigned long long) min_md_device_sectors);
906 goto release_bdev2_fail;
907 }
908
909 /* Make sure the new disk is big enough
910 * (we may currently be R_PRIMARY with no local disk...) */
911 if (drbd_get_max_capacity(nbc) <
912 drbd_get_capacity(mdev->this_bdev)) {
913 retcode = ERR_DISK_TO_SMALL;
914 goto release_bdev2_fail;
915 }
916
917 nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
918
919 if (nbc->known_size > max_possible_sectors) {
920 dev_warn(DEV, "==> truncating very big lower level device "
921 "to currently maximum possible %llu sectors <==\n",
922 (unsigned long long) max_possible_sectors);
923 if (nbc->dc.meta_dev_idx >= 0)
924 dev_warn(DEV, "==>> using internal or flexible "
925 "meta data may help <<==\n");
926 }
927
928 drbd_suspend_io(mdev);
929 /* also wait for the last barrier ack. */
930 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
931 /* and for any other previously queued work */
932 drbd_flush_workqueue(mdev);
933
934 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
935 drbd_resume_io(mdev);
936 if (retcode < SS_SUCCESS)
937 goto release_bdev2_fail;
938
939 if (!get_ldev_if_state(mdev, D_ATTACHING))
940 goto force_diskless;
941
942 drbd_md_set_sector_offsets(mdev, nbc);
943
944 /* allocate a second IO page if logical_block_size != 512 */
945 logical_block_size = bdev_logical_block_size(nbc->md_bdev);
946 if (logical_block_size == 0)
947 logical_block_size = MD_SECTOR_SIZE;
948
949 if (logical_block_size != MD_SECTOR_SIZE) {
950 if (!mdev->md_io_tmpp) {
951 struct page *page = alloc_page(GFP_NOIO);
952 if (!page)
953 goto force_diskless_dec;
954
955 dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
956 logical_block_size, MD_SECTOR_SIZE);
957 dev_warn(DEV, "Workaround engaged (has performance impact).\n");
958
959 mdev->md_io_tmpp = page;
960 }
961 }
962
963 if (!mdev->bitmap) {
964 if (drbd_bm_init(mdev)) {
965 retcode = ERR_NOMEM;
966 goto force_diskless_dec;
967 }
968 }
969
970 retcode = drbd_md_read(mdev, nbc);
971 if (retcode != NO_ERROR)
972 goto force_diskless_dec;
973
974 if (mdev->state.conn < C_CONNECTED &&
975 mdev->state.role == R_PRIMARY &&
976 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
977 dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
978 (unsigned long long)mdev->ed_uuid);
979 retcode = ERR_DATA_NOT_CURRENT;
980 goto force_diskless_dec;
981 }
982
983 /* Since we are diskless, fix the activity log first... */
984 if (drbd_check_al_size(mdev)) {
985 retcode = ERR_NOMEM;
986 goto force_diskless_dec;
987 }
988
989 /* Prevent shrinking of consistent devices ! */
990 if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
991 drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
992 dev_warn(DEV, "refusing to truncate a consistent device\n");
993 retcode = ERR_DISK_TO_SMALL;
994 goto force_diskless_dec;
995 }
996
997 if (!drbd_al_read_log(mdev, nbc)) {
998 retcode = ERR_IO_MD_DISK;
999 goto force_diskless_dec;
1000 }
1001
1002 /* Reset the "barriers don't work" bits here, then force meta data to
1003 * be written, to ensure we determine if barriers are supported. */
1004 if (nbc->dc.no_md_flush)
1005 set_bit(MD_NO_BARRIER, &mdev->flags);
1006 else
1007 clear_bit(MD_NO_BARRIER, &mdev->flags);
1008
1009 /* Point of no return reached.
1010 * Devices and memory are no longer released by error cleanup below.
1011 * now mdev takes over responsibility, and the state engine should
1012 * clean it up somewhere. */
1013 D_ASSERT(mdev->ldev == NULL);
1014 mdev->ldev = nbc;
1015 mdev->resync = resync_lru;
1016 nbc = NULL;
1017 resync_lru = NULL;
1018
1019 mdev->write_ordering = WO_bio_barrier;
1020 drbd_bump_write_ordering(mdev, WO_bio_barrier);
1021
1022 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1023 set_bit(CRASHED_PRIMARY, &mdev->flags);
1024 else
1025 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1026
1027 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
1028 set_bit(CRASHED_PRIMARY, &mdev->flags);
1029 cp_discovered = 1;
1030 }
1031
1032 mdev->send_cnt = 0;
1033 mdev->recv_cnt = 0;
1034 mdev->read_cnt = 0;
1035 mdev->writ_cnt = 0;
1036
1037 drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
1038
1039 /* If I am currently not R_PRIMARY,
1040 * but meta data primary indicator is set,
1041 * I just now recover from a hard crash,
1042 * and have been R_PRIMARY before that crash.
1043 *
1044 * Now, if I had no connection before that crash
1045 * (have been degraded R_PRIMARY), chances are that
1046 * I won't find my peer now either.
1047 *
1048 * In that case, and _only_ in that case,
1049 * we use the degr-wfc-timeout instead of the default,
1050 * so we can automatically recover from a crash of a
1051 * degraded but active "cluster" after a certain timeout.
1052 */
1053 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
1054 if (mdev->state.role != R_PRIMARY &&
1055 drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1056 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1057 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1058
1059 dd = drbd_determin_dev_size(mdev, 0);
1060 if (dd == dev_size_error) {
1061 retcode = ERR_NOMEM_BITMAP;
1062 goto force_diskless_dec;
1063 } else if (dd == grew)
1064 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1065
1066 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1067 dev_info(DEV, "Assuming that all blocks are out of sync "
1068 "(aka FullSync)\n");
1069 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
1070 retcode = ERR_IO_MD_DISK;
1071 goto force_diskless_dec;
1072 }
1073 } else {
1074 if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
1075 retcode = ERR_IO_MD_DISK;
1076 goto force_diskless_dec;
1077 }
1078 }
1079
1080 if (cp_discovered) {
1081 drbd_al_apply_to_bm(mdev);
1082 drbd_al_to_on_disk_bm(mdev);
1083 }
1084
1085 spin_lock_irq(&mdev->req_lock);
1086 os = mdev->state;
1087 ns.i = os.i;
1088 /* If MDF_CONSISTENT is not set go into inconsistent state,
1089 otherwise investigate MDF_WasUpToDate...
1090 If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1091 otherwise into D_CONSISTENT state.
1092 */
1093 if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
1094 if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
1095 ns.disk = D_CONSISTENT;
1096 else
1097 ns.disk = D_OUTDATED;
1098 } else {
1099 ns.disk = D_INCONSISTENT;
1100 }
1101
1102 if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
1103 ns.pdsk = D_OUTDATED;
1104
1105 if ( ns.disk == D_CONSISTENT &&
1106 (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
1107 ns.disk = D_UP_TO_DATE;
1108
1109 /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1110 MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1111 this point, because drbd_request_state() modifies these
1112 flags. */
1113
1114 /* In case we are C_CONNECTED postpone any decision on the new disk
1115 state after the negotiation phase. */
1116 if (mdev->state.conn == C_CONNECTED) {
1117 mdev->new_state_tmp.i = ns.i;
1118 ns.i = os.i;
1119 ns.disk = D_NEGOTIATING;
1120 }
1121
1122 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1123 ns = mdev->state;
1124 spin_unlock_irq(&mdev->req_lock);
1125
1126 if (rv < SS_SUCCESS)
1127 goto force_diskless_dec;
1128
1129 if (mdev->state.role == R_PRIMARY)
1130 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
1131 else
1132 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1133
1134 drbd_md_mark_dirty(mdev);
1135 drbd_md_sync(mdev);
1136
1137 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1138 put_ldev(mdev);
1139 reply->ret_code = retcode;
1140 drbd_reconfig_done(mdev);
1141 return 0;
1142
1143 force_diskless_dec:
1144 put_ldev(mdev);
1145 force_diskless:
1146 drbd_force_state(mdev, NS(disk, D_DISKLESS));
1147 drbd_md_sync(mdev);
1148 release_bdev2_fail:
1149 if (nbc)
1150 bd_release(nbc->md_bdev);
1151 release_bdev_fail:
1152 if (nbc)
1153 bd_release(nbc->backing_bdev);
1154 fail:
1155 if (nbc) {
1156 if (nbc->lo_file)
1157 fput(nbc->lo_file);
1158 if (nbc->md_file)
1159 fput(nbc->md_file);
1160 kfree(nbc);
1161 }
1162 lc_destroy(resync_lru);
1163
1164 reply->ret_code = retcode;
1165 drbd_reconfig_done(mdev);
1166 return 0;
1167}
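/* The error unwinding above is a staged goto ladder: force_diskless_dec drops
 * the reference taken with get_ldev_if_state(), force_diskless resets the
 * disk state, the release_bdev*_fail labels give back the claimed block
 * devices, and "fail" releases the files, nbc and resync_lru that mdev never
 * took ownership of. A minimal self-contained sketch of the same idiom, with
 * purely hypothetical resources standing in for the claims and allocations:
 */
#include <stdlib.h>

static void *grab(void)   { return malloc(16); }	/* stand-in for bd_claim()/lc_create() */
static void drop(void *p) { free(p); }			/* stand-in for bd_release()/lc_destroy() */

static int attach_sketch(void)
{
	void *backing, *lru, *meta;

	backing = grab();
	if (!backing)
		goto fail;
	lru = grab();
	if (!lru)
		goto release_backing;
	meta = grab();
	if (!meta)
		goto release_lru;
	/* "point of no return": ownership is handed over here, so cleanup
	 * from now on happens through the normal teardown path, not the
	 * labels below */
	return 0;

release_lru:
	drop(lru);
release_backing:
	drop(backing);
fail:
	return -1;
}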
1168
1169static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1170 struct drbd_nl_cfg_reply *reply)
1171{
1172 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1173 return 0;
1174}
1175
1176static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1177 struct drbd_nl_cfg_reply *reply)
1178{
1179 int i, ns;
1180 enum drbd_ret_codes retcode;
1181 struct net_conf *new_conf = NULL;
1182 struct crypto_hash *tfm = NULL;
1183 struct crypto_hash *integrity_w_tfm = NULL;
1184 struct crypto_hash *integrity_r_tfm = NULL;
1185 struct hlist_head *new_tl_hash = NULL;
1186 struct hlist_head *new_ee_hash = NULL;
1187 struct drbd_conf *odev;
1188 char hmac_name[CRYPTO_MAX_ALG_NAME];
1189 void *int_dig_out = NULL;
1190 void *int_dig_in = NULL;
1191 void *int_dig_vv = NULL;
1192 struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
1193
1194 drbd_reconfig_start(mdev);
1195
1196 if (mdev->state.conn > C_STANDALONE) {
1197 retcode = ERR_NET_CONFIGURED;
1198 goto fail;
1199 }
1200
1201 /* allocation not in the IO path, cqueue thread context */
1202 new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
1203 if (!new_conf) {
1204 retcode = ERR_NOMEM;
1205 goto fail;
1206 }
1207
1208 memset(new_conf, 0, sizeof(struct net_conf));
1209 new_conf->timeout = DRBD_TIMEOUT_DEF;
1210 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
1211 new_conf->ping_int = DRBD_PING_INT_DEF;
1212 new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
1213 new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
1214 new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
1215 new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
1216 new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
1217 new_conf->ko_count = DRBD_KO_COUNT_DEF;
1218 new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
1219 new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
1220 new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
1221 new_conf->want_lose = 0;
1222 new_conf->two_primaries = 0;
1223 new_conf->wire_protocol = DRBD_PROT_C;
1224 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
1225 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
1226
1227 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
1228 retcode = ERR_MANDATORY_TAG;
1229 goto fail;
1230 }
1231
1232 if (new_conf->two_primaries
1233 && (new_conf->wire_protocol != DRBD_PROT_C)) {
1234 retcode = ERR_NOT_PROTO_C;
1235 goto fail;
1236 	}
1237
1238 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1239 retcode = ERR_DISCARD;
1240 goto fail;
1241 }
1242
1243 retcode = NO_ERROR;
1244
1245 new_my_addr = (struct sockaddr *)&new_conf->my_addr;
1246 new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
1247 for (i = 0; i < minor_count; i++) {
1248 odev = minor_to_mdev(i);
1249 if (!odev || odev == mdev)
1250 continue;
1251 if (get_net_conf(odev)) {
1252 taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
1253 if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
1254 !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
1255 retcode = ERR_LOCAL_ADDR;
1256
1257 taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
1258 if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
1259 !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
1260 retcode = ERR_PEER_ADDR;
1261
1262 put_net_conf(odev);
1263 if (retcode != NO_ERROR)
1264 goto fail;
1265 }
1266 }
1267
1268 if (new_conf->cram_hmac_alg[0] != 0) {
1269 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
1270 new_conf->cram_hmac_alg);
1271 tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
1272 if (IS_ERR(tfm)) {
1273 tfm = NULL;
1274 retcode = ERR_AUTH_ALG;
1275 goto fail;
1276 }
1277
1278 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
1279 retcode = ERR_AUTH_ALG_ND;
1280 goto fail;
1281 }
1282 }
1283
1284 if (new_conf->integrity_alg[0]) {
1285 integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1286 if (IS_ERR(integrity_w_tfm)) {
1287 integrity_w_tfm = NULL;
1288 			retcode = ERR_INTEGRITY_ALG;
1289 goto fail;
1290 }
1291
1292 if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
1293 			retcode = ERR_INTEGRITY_ALG_ND;
1294 goto fail;
1295 }
1296
1297 integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1298 if (IS_ERR(integrity_r_tfm)) {
1299 integrity_r_tfm = NULL;
1300 			retcode = ERR_INTEGRITY_ALG;
1301 goto fail;
1302 }
1303 }
1304
1305 ns = new_conf->max_epoch_size/8;
1306 if (mdev->tl_hash_s != ns) {
1307 new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1308 if (!new_tl_hash) {
1309 retcode = ERR_NOMEM;
1310 goto fail;
1311 }
1312 }
1313
1314 ns = new_conf->max_buffers/8;
1315 if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
1316 new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1317 if (!new_ee_hash) {
1318 retcode = ERR_NOMEM;
1319 goto fail;
1320 }
1321 }
1322
1323 ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
1324
1325 if (integrity_w_tfm) {
1326 i = crypto_hash_digestsize(integrity_w_tfm);
1327 int_dig_out = kmalloc(i, GFP_KERNEL);
1328 if (!int_dig_out) {
1329 retcode = ERR_NOMEM;
1330 goto fail;
1331 }
1332 int_dig_in = kmalloc(i, GFP_KERNEL);
1333 if (!int_dig_in) {
1334 retcode = ERR_NOMEM;
1335 goto fail;
1336 }
1337 int_dig_vv = kmalloc(i, GFP_KERNEL);
1338 if (!int_dig_vv) {
1339 retcode = ERR_NOMEM;
1340 goto fail;
1341 }
1342 }
1343
1344 if (!mdev->bitmap) {
1345 		if (drbd_bm_init(mdev)) {
1346 retcode = ERR_NOMEM;
1347 goto fail;
1348 }
1349 }
1350
1351 spin_lock_irq(&mdev->req_lock);
1352 if (mdev->net_conf != NULL) {
1353 retcode = ERR_NET_CONFIGURED;
1354 spin_unlock_irq(&mdev->req_lock);
1355 goto fail;
1356 }
1357 mdev->net_conf = new_conf;
1358
1359 mdev->send_cnt = 0;
1360 mdev->recv_cnt = 0;
1361
1362 if (new_tl_hash) {
1363 kfree(mdev->tl_hash);
1364 mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
1365 mdev->tl_hash = new_tl_hash;
1366 }
1367
1368 if (new_ee_hash) {
1369 kfree(mdev->ee_hash);
1370 mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
1371 mdev->ee_hash = new_ee_hash;
1372 }
1373
1374 crypto_free_hash(mdev->cram_hmac_tfm);
1375 mdev->cram_hmac_tfm = tfm;
1376
1377 crypto_free_hash(mdev->integrity_w_tfm);
1378 mdev->integrity_w_tfm = integrity_w_tfm;
1379
1380 crypto_free_hash(mdev->integrity_r_tfm);
1381 mdev->integrity_r_tfm = integrity_r_tfm;
1382
1383 kfree(mdev->int_dig_out);
1384 kfree(mdev->int_dig_in);
1385 kfree(mdev->int_dig_vv);
1386 	mdev->int_dig_out = int_dig_out;
1387 	mdev->int_dig_in = int_dig_in;
1388 	mdev->int_dig_vv = int_dig_vv;
1389 spin_unlock_irq(&mdev->req_lock);
1390
1391 retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
1392
1393 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1394 reply->ret_code = retcode;
1395 drbd_reconfig_done(mdev);
1396 return 0;
1397
1398fail:
1399 kfree(int_dig_out);
1400 kfree(int_dig_in);
1401 kfree(int_dig_vv);
1402 crypto_free_hash(tfm);
1403 crypto_free_hash(integrity_w_tfm);
1404 crypto_free_hash(integrity_r_tfm);
1405 kfree(new_tl_hash);
1406 kfree(new_ee_hash);
1407 kfree(new_conf);
1408
1409 reply->ret_code = retcode;
1410 drbd_reconfig_done(mdev);
1411 return 0;
1412}
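/* For a sense of scale of the allocations above, assuming a hypothetical
 * integrity-alg of "md5": crypto_hash_digestsize() returns 16, so the three
 * digest buffers (out/in/vv) take 48 bytes in total, while tl_hash and
 * ee_hash (the latter only with two primaries) hold max_epoch_size/8 and
 * max_buffers/8 hlist heads, i.e. a few kilobytes of pointers per device for
 * settings in the low thousands. */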
1413
1414static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1415 struct drbd_nl_cfg_reply *reply)
1416{
1417 int retcode;
1418
1419 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1420
1421 if (retcode == SS_NOTHING_TO_DO)
1422 goto done;
1423 else if (retcode == SS_ALREADY_STANDALONE)
1424 goto done;
1425 else if (retcode == SS_PRIMARY_NOP) {
1426 		/* Our state checking code wants to see the peer outdated. */
1427 retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1428 pdsk, D_OUTDATED));
1429 } else if (retcode == SS_CW_FAILED_BY_PEER) {
1430 /* The peer probably wants to see us outdated. */
1431 retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1432 disk, D_OUTDATED),
1433 CS_ORDERED);
1434 if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
1435 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1436 retcode = SS_SUCCESS;
1437 }
1438 }
1439
1440 if (retcode < SS_SUCCESS)
1441 goto fail;
1442
1443 if (wait_event_interruptible(mdev->state_wait,
1444 mdev->state.conn != C_DISCONNECTING)) {
1445 /* Do not test for mdev->state.conn == C_STANDALONE, since
1446 someone else might connect us in the mean time! */
1447 retcode = ERR_INTR;
1448 goto fail;
1449 }
1450
1451 done:
1452 retcode = NO_ERROR;
1453 fail:
1454 drbd_md_sync(mdev);
1455 reply->ret_code = retcode;
1456 return 0;
1457}
1458
1459void resync_after_online_grow(struct drbd_conf *mdev)
1460{
1461 int iass; /* I am sync source */
1462
1463 dev_info(DEV, "Resync of new storage after online grow\n");
1464 if (mdev->state.role != mdev->state.peer)
1465 iass = (mdev->state.role == R_PRIMARY);
1466 else
1467 iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1468
1469 if (iass)
1470 drbd_start_resync(mdev, C_SYNC_SOURCE);
1471 else
1472 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
1473}
1474
1475static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1476 struct drbd_nl_cfg_reply *reply)
1477{
1478 struct resize rs;
1479 int retcode = NO_ERROR;
1480 int ldsc = 0; /* local disk size changed */
1481 enum determine_dev_size dd;
1482
1483 memset(&rs, 0, sizeof(struct resize));
1484 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
1485 retcode = ERR_MANDATORY_TAG;
1486 goto fail;
1487 }
1488
1489 if (mdev->state.conn > C_CONNECTED) {
1490 retcode = ERR_RESIZE_RESYNC;
1491 goto fail;
1492 }
1493
1494 if (mdev->state.role == R_SECONDARY &&
1495 mdev->state.peer == R_SECONDARY) {
1496 retcode = ERR_NO_PRIMARY;
1497 goto fail;
1498 }
1499
1500 if (!get_ldev(mdev)) {
1501 retcode = ERR_NO_DISK;
1502 goto fail;
1503 }
1504
1505 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
1506 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1507 ldsc = 1;
1508 }
1509
1510 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1511 dd = drbd_determin_dev_size(mdev, rs.resize_force);
1512 drbd_md_sync(mdev);
1513 put_ldev(mdev);
1514 if (dd == dev_size_error) {
1515 retcode = ERR_NOMEM_BITMAP;
1516 goto fail;
1517 }
1518
1519 if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
1520 if (dd == grew)
1521 set_bit(RESIZE_PENDING, &mdev->flags);
1522
1523 drbd_send_uuids(mdev);
1524 drbd_send_sizes(mdev, 1);
1525 }
1526
1527 fail:
1528 reply->ret_code = retcode;
1529 return 0;
1530}
1531
1532static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1533 struct drbd_nl_cfg_reply *reply)
1534{
1535 int retcode = NO_ERROR;
1536 int err;
1537 int ovr; /* online verify running */
1538 int rsr; /* re-sync running */
1539 struct crypto_hash *verify_tfm = NULL;
1540 struct crypto_hash *csums_tfm = NULL;
1541 struct syncer_conf sc;
1542 cpumask_var_t new_cpu_mask;
1543
1544 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1545 retcode = ERR_NOMEM;
1546 goto fail;
1547 }
1548
1549 if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
1550 memset(&sc, 0, sizeof(struct syncer_conf));
1551 sc.rate = DRBD_RATE_DEF;
1552 sc.after = DRBD_AFTER_DEF;
1553 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1554 } else
1555 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1556
1557 if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
1558 retcode = ERR_MANDATORY_TAG;
1559 goto fail;
1560 }
1561
1562 /* re-sync running */
1563 rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
1564 mdev->state.conn == C_SYNC_TARGET ||
1565 mdev->state.conn == C_PAUSED_SYNC_S ||
1566 mdev->state.conn == C_PAUSED_SYNC_T );
1567
1568 if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
1569 retcode = ERR_CSUMS_RESYNC_RUNNING;
1570 goto fail;
1571 }
1572
1573 if (!rsr && sc.csums_alg[0]) {
1574 csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
1575 if (IS_ERR(csums_tfm)) {
1576 csums_tfm = NULL;
1577 retcode = ERR_CSUMS_ALG;
1578 goto fail;
1579 }
1580
1581 if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
1582 retcode = ERR_CSUMS_ALG_ND;
1583 goto fail;
1584 }
1585 }
1586
1587 /* online verify running */
1588 ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
1589
1590 if (ovr) {
1591 if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
1592 retcode = ERR_VERIFY_RUNNING;
1593 goto fail;
1594 }
1595 }
1596
1597 if (!ovr && sc.verify_alg[0]) {
1598 verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
1599 if (IS_ERR(verify_tfm)) {
1600 verify_tfm = NULL;
1601 retcode = ERR_VERIFY_ALG;
1602 goto fail;
1603 }
1604
1605 if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
1606 retcode = ERR_VERIFY_ALG_ND;
1607 goto fail;
1608 }
1609 }
1610
1611 /* silently ignore cpu mask on UP kernel */
1612 if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
1613 err = __bitmap_parse(sc.cpu_mask, 32, 0,
1614 cpumask_bits(new_cpu_mask), nr_cpu_ids);
1615 if (err) {
1616 dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
1617 retcode = ERR_CPU_MASK_PARSE;
1618 goto fail;
1619 }
1620 }
1621
1622 ERR_IF (sc.rate < 1) sc.rate = 1;
1623 ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
1624#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
1625 if (sc.al_extents > AL_MAX) {
1626 dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
1627 sc.al_extents = AL_MAX;
1628 }
1629#undef AL_MAX
1630
1631 /* most sanity checks done, try to assign the new sync-after
1632 * dependency. need to hold the global lock in there,
1633 * to avoid a race in the dependency loop check. */
1634 retcode = drbd_alter_sa(mdev, sc.after);
1635 if (retcode != NO_ERROR)
1636 goto fail;
1637
1638 /* ok, assign the rest of it as well.
1639 * lock against receive_SyncParam() */
1640 spin_lock(&mdev->peer_seq_lock);
1641 mdev->sync_conf = sc;
1642
1643 if (!rsr) {
1644 crypto_free_hash(mdev->csums_tfm);
1645 mdev->csums_tfm = csums_tfm;
1646 csums_tfm = NULL;
1647 }
1648
1649 if (!ovr) {
1650 crypto_free_hash(mdev->verify_tfm);
1651 mdev->verify_tfm = verify_tfm;
1652 verify_tfm = NULL;
1653 }
1654 spin_unlock(&mdev->peer_seq_lock);
1655
1656 if (get_ldev(mdev)) {
1657 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
1658 drbd_al_shrink(mdev);
1659 err = drbd_check_al_size(mdev);
1660 lc_unlock(mdev->act_log);
1661 wake_up(&mdev->al_wait);
1662
1663 put_ldev(mdev);
1664 drbd_md_sync(mdev);
1665
1666 if (err) {
1667 retcode = ERR_NOMEM;
1668 goto fail;
1669 }
1670 }
1671
1672 if (mdev->state.conn >= C_CONNECTED)
1673 drbd_send_sync_param(mdev, &sc);
1674
1675 if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
1676 cpumask_copy(mdev->cpu_mask, new_cpu_mask);
1677 drbd_calc_cpu_mask(mdev);
1678 mdev->receiver.reset_cpu_mask = 1;
1679 mdev->asender.reset_cpu_mask = 1;
1680 mdev->worker.reset_cpu_mask = 1;
1681 }
1682
1683 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1684fail:
1685 free_cpumask_var(new_cpu_mask);
1686 crypto_free_hash(csums_tfm);
1687 crypto_free_hash(verify_tfm);
1688 reply->ret_code = retcode;
1689 return 0;
1690}
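/* A worked example of the cpu_mask handling above, assuming the string comes
 * from the syncer section via drbdsetup: __bitmap_parse() expects the same
 * comma-separated hex format as /proc/irq/<n>/smp_affinity, so a hypothetical
 * cpu-mask of "3" confines the worker, receiver and asender threads to
 * CPUs 0-1, and "f0" to CPUs 4-7; on a UP kernel the mask is silently
 * ignored, as noted in the comment above. */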
1691
1692static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1693 struct drbd_nl_cfg_reply *reply)
1694{
1695 int retcode;
1696
1697 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1698
1699 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
1700 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1701
1702 while (retcode == SS_NEED_CONNECTION) {
1703 spin_lock_irq(&mdev->req_lock);
1704 if (mdev->state.conn < C_CONNECTED)
1705 retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
1706 spin_unlock_irq(&mdev->req_lock);
1707
1708 if (retcode != SS_NEED_CONNECTION)
1709 break;
1710
1711 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1712 }
1713
1714 reply->ret_code = retcode;
1715 return 0;
1716}
1717
1718static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1719 struct drbd_nl_cfg_reply *reply)
1720{
1721
1722 reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
1723
1724 return 0;
1725}
1726
1727static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1728 struct drbd_nl_cfg_reply *reply)
1729{
1730 int retcode = NO_ERROR;
1731
1732 if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
1733 retcode = ERR_PAUSE_IS_SET;
1734
1735 reply->ret_code = retcode;
1736 return 0;
1737}
1738
1739static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1740 struct drbd_nl_cfg_reply *reply)
1741{
1742 int retcode = NO_ERROR;
1743
1744 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
1745 retcode = ERR_PAUSE_IS_CLEAR;
1746
1747 reply->ret_code = retcode;
1748 return 0;
1749}
1750
1751static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1752 struct drbd_nl_cfg_reply *reply)
1753{
1754 reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
1755
1756 return 0;
1757}
1758
1759static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1760 struct drbd_nl_cfg_reply *reply)
1761{
1762 reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
1763 return 0;
1764}
1765
1766static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1767 struct drbd_nl_cfg_reply *reply)
1768{
1769 reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
1770 return 0;
1771}
1772
1773static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1774 struct drbd_nl_cfg_reply *reply)
1775{
1776 unsigned short *tl;
1777
1778 tl = reply->tag_list;
1779
1780 if (get_ldev(mdev)) {
1781 tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
1782 put_ldev(mdev);
1783 }
1784
1785 if (get_net_conf(mdev)) {
1786 tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
1787 put_net_conf(mdev);
1788 }
1789 tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
1790
1791 put_unaligned(TT_END, tl++); /* Close the tag list */
1792
1793 return (int)((char *)tl - (char *)reply->tag_list);
1794}
1795
1796static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1797 struct drbd_nl_cfg_reply *reply)
1798{
1799 unsigned short *tl = reply->tag_list;
1800 union drbd_state s = mdev->state;
1801 unsigned long rs_left;
1802 unsigned int res;
1803
1804 tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
1805
1806 /* no local ref, no bitmap, no syncer progress. */
1807 if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
1808 if (get_ldev(mdev)) {
1809 drbd_get_syncer_progress(mdev, &rs_left, &res);
1810 tl = tl_add_int(tl, T_sync_progress, &res);
1811 put_ldev(mdev);
1812 }
1813 }
1814 put_unaligned(TT_END, tl++); /* Close the tag list */
1815
1816 return (int)((char *)tl - (char *)reply->tag_list);
1817}
1818
1819static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1820 struct drbd_nl_cfg_reply *reply)
1821{
1822 unsigned short *tl;
1823
1824 tl = reply->tag_list;
1825
1826 if (get_ldev(mdev)) {
1827 tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
1828 tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
1829 put_ldev(mdev);
1830 }
1831 put_unaligned(TT_END, tl++); /* Close the tag list */
1832
1833 return (int)((char *)tl - (char *)reply->tag_list);
1834}
1835
1836/**
1837 * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
1838 * @mdev: DRBD device.
1839 * @nlp: Netlink/connector packet from drbdsetup
1840 * @reply: Reply packet for drbdsetup
1841 */
1842static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1843 struct drbd_nl_cfg_reply *reply)
1844{
1845 unsigned short *tl;
1846 char rv;
1847
1848 tl = reply->tag_list;
1849
1850 rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
1851 test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
1852
1853 tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
1854 put_unaligned(TT_END, tl++); /* Close the tag list */
1855
1856 return (int)((char *)tl - (char *)reply->tag_list);
1857}
1858
1859static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1860 struct drbd_nl_cfg_reply *reply)
1861{
1862 /* default to resume from last known position, if possible */
1863 struct start_ov args =
1864 { .start_sector = mdev->ov_start_sector };
1865
1866 if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
1867 reply->ret_code = ERR_MANDATORY_TAG;
1868 return 0;
1869 }
1870 /* w_make_ov_request expects position to be aligned */
1871 	mdev->ov_start_sector = args.start_sector & ~(BM_SECT_PER_BIT-1);
1872 	reply->ret_code = drbd_request_state(mdev, NS(conn, C_VERIFY_S));
1873 return 0;
1874}
1875
1876
1877static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1878 struct drbd_nl_cfg_reply *reply)
1879{
1880 int retcode = NO_ERROR;
1881 int skip_initial_sync = 0;
1882 int err;
1883
1884 struct new_c_uuid args;
1885
1886 memset(&args, 0, sizeof(struct new_c_uuid));
1887 if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
1888 reply->ret_code = ERR_MANDATORY_TAG;
1889 return 0;
1890 }
1891
1892 mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
1893
1894 if (!get_ldev(mdev)) {
1895 retcode = ERR_NO_DISK;
1896 goto out;
1897 }
1898
1899 	/* this is "skip initial sync", assumed to be clean */
1900 if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
1901 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
1902 dev_info(DEV, "Preparing to skip initial sync\n");
1903 skip_initial_sync = 1;
1904 } else if (mdev->state.conn != C_STANDALONE) {
1905 retcode = ERR_CONNECTED;
1906 goto out_dec;
1907 }
1908
1909 drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
1910 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
1911
1912 if (args.clear_bm) {
1913 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
1914 if (err) {
1915 dev_err(DEV, "Writing bitmap failed with %d\n",err);
1916 retcode = ERR_IO_MD_DISK;
1917 }
1918 if (skip_initial_sync) {
1919 drbd_send_uuids_skip_initial_sync(mdev);
1920 _drbd_uuid_set(mdev, UI_BITMAP, 0);
1921 spin_lock_irq(&mdev->req_lock);
1922 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
1923 CS_VERBOSE, NULL);
1924 spin_unlock_irq(&mdev->req_lock);
1925 }
1926 }
1927
1928 drbd_md_sync(mdev);
1929out_dec:
1930 put_ldev(mdev);
1931out:
1932 mutex_unlock(&mdev->state_mutex);
1933
1934 reply->ret_code = retcode;
1935 return 0;
1936}
1937
1938static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
1939{
1940 struct drbd_conf *mdev;
1941
1942 if (nlp->drbd_minor >= minor_count)
1943 return NULL;
1944
1945 mdev = minor_to_mdev(nlp->drbd_minor);
1946
1947 if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
1948 struct gendisk *disk = NULL;
1949 mdev = drbd_new_device(nlp->drbd_minor);
1950
1951 spin_lock_irq(&drbd_pp_lock);
1952 if (minor_table[nlp->drbd_minor] == NULL) {
1953 minor_table[nlp->drbd_minor] = mdev;
1954 disk = mdev->vdisk;
1955 mdev = NULL;
1956 } /* else: we lost the race */
1957 spin_unlock_irq(&drbd_pp_lock);
1958
1959 if (disk) /* we won the race above */
1960 /* in case we ever add a drbd_delete_device(),
1961 * don't forget the del_gendisk! */
1962 add_disk(disk);
1963 else /* we lost the race above */
1964 drbd_free_mdev(mdev);
1965
1966 mdev = minor_to_mdev(nlp->drbd_minor);
1967 }
1968
1969 return mdev;
1970}
1971
1972struct cn_handler_struct {
1973 int (*function)(struct drbd_conf *,
1974 struct drbd_nl_cfg_req *,
1975 struct drbd_nl_cfg_reply *);
1976 int reply_body_size;
1977};
1978
1979static struct cn_handler_struct cnd_table[] = {
1980 [ P_primary ] = { &drbd_nl_primary, 0 },
1981 [ P_secondary ] = { &drbd_nl_secondary, 0 },
1982 [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
1983 [ P_detach ] = { &drbd_nl_detach, 0 },
1984 [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
1985 [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
1986 [ P_resize ] = { &drbd_nl_resize, 0 },
1987 [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
1988 [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
1989 [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
1990 [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
1991 [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
1992 [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
1993 [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
1994 [ P_outdate ] = { &drbd_nl_outdate, 0 },
1995 [ P_get_config ] = { &drbd_nl_get_config,
1996 sizeof(struct syncer_conf_tag_len_struct) +
1997 sizeof(struct disk_conf_tag_len_struct) +
1998 sizeof(struct net_conf_tag_len_struct) },
1999 [ P_get_state ] = { &drbd_nl_get_state,
2000 sizeof(struct get_state_tag_len_struct) +
2001 sizeof(struct sync_progress_tag_len_struct) },
2002 [ P_get_uuids ] = { &drbd_nl_get_uuids,
2003 sizeof(struct get_uuids_tag_len_struct) },
2004 [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
2005 sizeof(struct get_timeout_flag_tag_len_struct)},
2006 [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
2007 [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
2008};
2009
2010static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
2011{
2012 struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
2013 struct cn_handler_struct *cm;
2014 struct cn_msg *cn_reply;
2015 struct drbd_nl_cfg_reply *reply;
2016 struct drbd_conf *mdev;
2017 int retcode, rr;
2018 int reply_size = sizeof(struct cn_msg)
2019 + sizeof(struct drbd_nl_cfg_reply)
2020 + sizeof(short int);
2021
2022 if (!try_module_get(THIS_MODULE)) {
2023 printk(KERN_ERR "drbd: try_module_get() failed!\n");
2024 return;
2025 }
2026
2027 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
2028 retcode = ERR_PERM;
2029 goto fail;
2030 }
2031
2032 mdev = ensure_mdev(nlp);
2033 if (!mdev) {
2034 retcode = ERR_MINOR_INVALID;
2035 goto fail;
2036 }
2037
2038 if (nlp->packet_type >= P_nl_after_last_packet) {
2039 retcode = ERR_PACKET_NR;
2040 goto fail;
2041 }
2042
2043 cm = cnd_table + nlp->packet_type;
2044
2045 /* This may happen if packet number is 0: */
2046 if (cm->function == NULL) {
2047 retcode = ERR_PACKET_NR;
2048 goto fail;
2049 }
2050
2051 reply_size += cm->reply_body_size;
2052
2053 /* allocation not in the IO path, cqueue thread context */
2054 cn_reply = kmalloc(reply_size, GFP_KERNEL);
2055 if (!cn_reply) {
2056 retcode = ERR_NOMEM;
2057 goto fail;
2058 }
2059 reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
2060
2061 reply->packet_type =
2062 cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
2063 reply->minor = nlp->drbd_minor;
2064 	reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */
2065 /* reply->tag_list; might be modified by cm->function. */
2066
2067 rr = cm->function(mdev, nlp, reply);
2068
2069 cn_reply->id = req->id;
2070 cn_reply->seq = req->seq;
2071 cn_reply->ack = req->ack + 1;
2072 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
2073 cn_reply->flags = 0;
2074
2075 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
2076 if (rr && rr != -ESRCH)
2077 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2078
2079 kfree(cn_reply);
2080 module_put(THIS_MODULE);
2081 return;
2082 fail:
2083 drbd_nl_send_reply(req, retcode);
2084 module_put(THIS_MODULE);
2085}
2086
2087static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
2088
2089static unsigned short *
2090__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
2091 unsigned short len, int nul_terminated)
2092{
2093 unsigned short l = tag_descriptions[tag_number(tag)].max_len;
2094 len = (len < l) ? len : l;
2095 put_unaligned(tag, tl++);
2096 put_unaligned(len, tl++);
2097 memcpy(tl, data, len);
2098 tl = (unsigned short*)((char*)tl + len);
2099 if (nul_terminated)
2100 *((char*)tl - 1) = 0;
2101 return tl;
2102}
2103
2104static unsigned short *
2105tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
2106{
2107 return __tl_add_blob(tl, tag, data, len, 0);
2108}
2109
2110static unsigned short *
2111tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
2112{
2113 return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
2114}
2115
2116static unsigned short *
2117tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
2118{
2119 put_unaligned(tag, tl++);
2120 	switch (tag_type(tag)) {
2121 case TT_INTEGER:
2122 put_unaligned(sizeof(int), tl++);
2123 put_unaligned(*(int *)val, (int *)tl);
2124 tl = (unsigned short*)((char*)tl+sizeof(int));
2125 break;
2126 case TT_INT64:
2127 put_unaligned(sizeof(u64), tl++);
2128 put_unaligned(*(u64 *)val, (u64 *)tl);
2129 tl = (unsigned short*)((char*)tl+sizeof(u64));
2130 break;
2131 default:
2132 /* someone did something stupid. */
2133 ;
2134 }
2135 return tl;
2136}
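/* The helpers above emit a flat tag/length/value stream: a 16-bit tag, a
 * 16-bit length, then the payload, with the whole list closed by TT_END,
 * which is how every reply and broadcast in this file is assembled. A
 * minimal stand-alone sketch of the same layout, using made-up tag numbers
 * and plain memcpy() instead of put_unaligned(), with no relation to the
 * real enum drbd_tags values:
 */
#include <stdint.h>
#include <string.h>

static unsigned char *pack_tlv(unsigned char *p, uint16_t tag,
			       const void *val, uint16_t len)
{
	memcpy(p, &tag, 2); p += 2;	/* 16-bit tag    */
	memcpy(p, &len, 2); p += 2;	/* 16-bit length */
	memcpy(p, val, len); p += len;	/* payload       */
	return p;
}

static size_t build_example_reply(unsigned char *buf)
{
	unsigned char *p = buf;
	int progress = 335;			/* value for a made-up "sync progress" tag */
	const char name[] = "example";		/* value for a made-up string tag */
	const uint16_t end_marker = 0;		/* stands in for TT_END */

	p = pack_tlv(p, 1, &progress, sizeof(progress));
	p = pack_tlv(p, 2, name, sizeof(name));
	memcpy(p, &end_marker, 2);
	p += 2;					/* close the tag list */
	return (size_t)(p - buf);		/* analogous to (char *)tl - (char *)reply->tag_list */
}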
2137
2138void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
2139{
2140 char buffer[sizeof(struct cn_msg)+
2141 sizeof(struct drbd_nl_cfg_reply)+
2142 sizeof(struct get_state_tag_len_struct)+
2143 sizeof(short int)];
2144 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2145 struct drbd_nl_cfg_reply *reply =
2146 (struct drbd_nl_cfg_reply *)cn_reply->data;
2147 unsigned short *tl = reply->tag_list;
2148
2149 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2150
2151 tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
2152
2153 put_unaligned(TT_END, tl++); /* Close the tag list */
2154
2155 cn_reply->id.idx = CN_IDX_DRBD;
2156 cn_reply->id.val = CN_VAL_DRBD;
2157
2158 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2159 cn_reply->ack = 0; /* not used here. */
2160 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2161 (int)((char *)tl - (char *)reply->tag_list);
2162 cn_reply->flags = 0;
2163
2164 reply->packet_type = P_get_state;
2165 reply->minor = mdev_to_minor(mdev);
2166 reply->ret_code = NO_ERROR;
2167
2168 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2169}
2170
2171void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
2172{
2173 char buffer[sizeof(struct cn_msg)+
2174 sizeof(struct drbd_nl_cfg_reply)+
2175 sizeof(struct call_helper_tag_len_struct)+
2176 sizeof(short int)];
2177 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2178 struct drbd_nl_cfg_reply *reply =
2179 (struct drbd_nl_cfg_reply *)cn_reply->data;
2180 unsigned short *tl = reply->tag_list;
2181
2182 	/* dev_warn(DEV, "drbd_bcast_ev_helper() got called\n"); */
2183
2184 tl = tl_add_str(tl, T_helper, helper_name);
2185 put_unaligned(TT_END, tl++); /* Close the tag list */
2186
2187 cn_reply->id.idx = CN_IDX_DRBD;
2188 cn_reply->id.val = CN_VAL_DRBD;
2189
2190 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2191 cn_reply->ack = 0; /* not used here. */
2192 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2193 (int)((char *)tl - (char *)reply->tag_list);
2194 cn_reply->flags = 0;
2195
2196 reply->packet_type = P_call_helper;
2197 reply->minor = mdev_to_minor(mdev);
2198 reply->ret_code = NO_ERROR;
2199
2200 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2201}
2202
2203void drbd_bcast_ee(struct drbd_conf *mdev,
2204 const char *reason, const int dgs,
2205 const char* seen_hash, const char* calc_hash,
2206 const struct drbd_epoch_entry* e)
2207{
2208 struct cn_msg *cn_reply;
2209 struct drbd_nl_cfg_reply *reply;
2210 struct bio_vec *bvec;
2211 unsigned short *tl;
2212 int i;
2213
2214 if (!e)
2215 return;
2216 if (!reason || !reason[0])
2217 return;
2218
2219 /* apparently we have to memcpy twice, first to prepare the data for the
2220 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
2221 * netlink skb. */
2222 /* receiver thread context, which is not in the writeout path (of this node),
2223 * but may be in the writeout path of the _other_ node.
2224 * GFP_NOIO to avoid potential "distributed deadlock". */
2225 cn_reply = kmalloc(
2226 sizeof(struct cn_msg)+
2227 sizeof(struct drbd_nl_cfg_reply)+
2228 sizeof(struct dump_ee_tag_len_struct)+
2229 sizeof(short int),
2230 GFP_NOIO);
2231
2232 if (!cn_reply) {
2233 dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
2234 (unsigned long long)e->sector, e->size);
2235 return;
2236 }
2237
2238 reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
2239 tl = reply->tag_list;
2240
2241 tl = tl_add_str(tl, T_dump_ee_reason, reason);
2242 tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
2243 tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
2244 tl = tl_add_int(tl, T_ee_sector, &e->sector);
2245 tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
2246
2247 put_unaligned(T_ee_data, tl++);
2248 put_unaligned(e->size, tl++);
2249
2250 __bio_for_each_segment(bvec, e->private_bio, i, 0) {
2251 void *d = kmap(bvec->bv_page);
2252 memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
2253 kunmap(bvec->bv_page);
2254 		tl = (unsigned short*)((char*)tl + bvec->bv_len);
2255 }
2256 put_unaligned(TT_END, tl++); /* Close the tag list */
2257
2258 cn_reply->id.idx = CN_IDX_DRBD;
2259 cn_reply->id.val = CN_VAL_DRBD;
2260
2261 	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2262 	cn_reply->ack = 0; /* not used here. */
2263 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2264 (int)((char*)tl - (char*)reply->tag_list);
2265 cn_reply->flags = 0;
2266
2267 reply->packet_type = P_dump_ee;
2268 reply->minor = mdev_to_minor(mdev);
2269 reply->ret_code = NO_ERROR;
2270
2271 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2272 kfree(cn_reply);
2273}
2274
2275void drbd_bcast_sync_progress(struct drbd_conf *mdev)
2276{
2277 char buffer[sizeof(struct cn_msg)+
2278 sizeof(struct drbd_nl_cfg_reply)+
2279 sizeof(struct sync_progress_tag_len_struct)+
2280 sizeof(short int)];
2281 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2282 struct drbd_nl_cfg_reply *reply =
2283 (struct drbd_nl_cfg_reply *)cn_reply->data;
2284 unsigned short *tl = reply->tag_list;
2285 unsigned long rs_left;
2286 unsigned int res;
2287
2288 /* no local ref, no bitmap, no syncer progress, no broadcast. */
2289 if (!get_ldev(mdev))
2290 return;
2291 drbd_get_syncer_progress(mdev, &rs_left, &res);
2292 put_ldev(mdev);
2293
2294 tl = tl_add_int(tl, T_sync_progress, &res);
2295 put_unaligned(TT_END, tl++); /* Close the tag list */
2296
2297 cn_reply->id.idx = CN_IDX_DRBD;
2298 cn_reply->id.val = CN_VAL_DRBD;
2299
2300 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2301 cn_reply->ack = 0; /* not used here. */
2302 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2303 (int)((char *)tl - (char *)reply->tag_list);
2304 cn_reply->flags = 0;
2305
2306 reply->packet_type = P_sync_progress;
2307 reply->minor = mdev_to_minor(mdev);
2308 reply->ret_code = NO_ERROR;
2309
2310 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2311}
2312
2313int __init drbd_nl_init(void)
2314{
2315 static struct cb_id cn_id_drbd;
2316 int err, try=10;
2317
2318 cn_id_drbd.val = CN_VAL_DRBD;
2319 do {
2320 cn_id_drbd.idx = cn_idx;
2321 err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
2322 if (!err)
2323 break;
2324 cn_idx = (cn_idx + CN_IDX_STEP);
2325 } while (try--);
2326
2327 if (err) {
2328 printk(KERN_ERR "drbd: cn_drbd failed to register\n");
2329 return err;
2330 }
2331
2332 return 0;
2333}
2334
2335void drbd_nl_cleanup(void)
2336{
2337 static struct cb_id cn_id_drbd;
2338
2339 cn_id_drbd.idx = cn_idx;
2340 cn_id_drbd.val = CN_VAL_DRBD;
2341
2342 cn_del_callback(&cn_id_drbd);
2343}
2344
2345void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2346{
2347 char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
2348 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2349 struct drbd_nl_cfg_reply *reply =
2350 (struct drbd_nl_cfg_reply *)cn_reply->data;
2351 int rr;
2352
2353 cn_reply->id = req->id;
2354
2355 cn_reply->seq = req->seq;
2356 cn_reply->ack = req->ack + 1;
2357 cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2358 cn_reply->flags = 0;
2359
2360 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2361 reply->ret_code = ret_code;
2362
2363 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2364 if (rr && rr != -ESRCH)
2365 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2366}
2367
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000000..be3374b68460
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,264 @@
1/*
2 drbd_proc.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33#include <linux/drbd.h>
34#include "drbd_int.h"
35
36static int drbd_proc_open(struct inode *inode, struct file *file);
37
38
39struct proc_dir_entry *drbd_proc;
40const struct file_operations drbd_proc_fops = {
41 .owner = THIS_MODULE,
42 .open = drbd_proc_open,
43 .read = seq_read,
44 .llseek = seq_lseek,
45 .release = single_release,
46};
47
48
49/*lge
50 * progress bars shamelessly adapted from driver/md/md.c
51 * output looks like
52 * [=====>..............] 33.5% (23456/123456)
53 * finish: 2:20:20 speed: 6,345 (6,456) K/sec
54 */
55static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
56{
57 unsigned long db, dt, dbdt, rt, rs_left;
58 unsigned int res;
59 int i, x, y;
60
61 drbd_get_syncer_progress(mdev, &rs_left, &res);
62
63 x = res/50;
64 y = 20-x;
65 seq_printf(seq, "\t[");
66 for (i = 1; i < x; i++)
67 seq_printf(seq, "=");
68 seq_printf(seq, ">");
69 for (i = 0; i < y; i++)
70 seq_printf(seq, ".");
71 seq_printf(seq, "] ");
72
73 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
74 /* if more than 1 GB display in MB */
75 if (mdev->rs_total > 0x100000L)
76 seq_printf(seq, "(%lu/%lu)M\n\t",
77 (unsigned long) Bit2KB(rs_left >> 10),
78 (unsigned long) Bit2KB(mdev->rs_total >> 10));
79 else
80 seq_printf(seq, "(%lu/%lu)K\n\t",
81 (unsigned long) Bit2KB(rs_left),
82 (unsigned long) Bit2KB(mdev->rs_total));
83
84 /* see drivers/md/md.c
85 * We do not want to overflow, so the order of operands and
86 * the * 100 / 100 trick are important. We do a +1 to be
87 * safe against division by zero. We only estimate anyway.
88 *
89 * dt: time from mark until now
90 * db: blocks written from mark until now
91 * rt: remaining time
92 */
93 dt = (jiffies - mdev->rs_mark_time) / HZ;
94
95 if (dt > 20) {
96 /* if we made no update to rs_mark_time for too long,
97 * we are stalled. show that. */
98 seq_printf(seq, "stalled\n");
99 return;
100 }
101
102 if (!dt)
103 dt++;
104 db = mdev->rs_mark_left - rs_left;
105 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
106
107 seq_printf(seq, "finish: %lu:%02lu:%02lu",
108 rt / 3600, (rt % 3600) / 60, rt % 60);
109
110 /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
111 dbdt = Bit2KB(db/dt);
112 if (dbdt > 1000)
113 seq_printf(seq, " speed: %ld,%03ld",
114 dbdt/1000, dbdt % 1000);
115 else
116 seq_printf(seq, " speed: %ld", dbdt);
117
118 /* mean speed since syncer started
119 * we do account for PausedSync periods */
120 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
121 if (dt <= 0)
122 dt = 1;
123 db = mdev->rs_total - rs_left;
124 dbdt = Bit2KB(db/dt);
125 if (dbdt > 1000)
126 seq_printf(seq, " (%ld,%03ld)",
127 dbdt/1000, dbdt % 1000);
128 else
129 seq_printf(seq, " (%ld)", dbdt);
130
131 seq_printf(seq, " K/sec\n");
132}
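/* A worked example of the estimate above, with made-up numbers: if 5000 bits
 * were cleared since the last mark (db), the mark is 10 seconds old (dt) and
 * rs_left is 100000 bits, then db/100+1 = 51, rs_left/51 = 1960 and
 * rt = (10 * 1960) / 100 = 196 seconds, close to the exact
 * 100000 / (5000/10) = 200 s, while the reordered * 100 / 100 keeps the
 * intermediate products from overflowing for very large devices, as the
 * comment above points out. */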
133
134static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
135{
136 struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
137
138 seq_printf(seq, "%5d %s %s\n", bme->rs_left,
139 bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
140 bme->flags & BME_LOCKED ? "LOCKED" : "------"
141 );
142}
143
144static int drbd_seq_show(struct seq_file *seq, void *v)
145{
146 int i, hole = 0;
147 const char *sn;
148 struct drbd_conf *mdev;
149
150 static char write_ordering_chars[] = {
151 [WO_none] = 'n',
152 [WO_drain_io] = 'd',
153 [WO_bdev_flush] = 'f',
154 [WO_bio_barrier] = 'b',
155 };
156
157 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
158 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
159
160 /*
161 cs .. connection state
162 ro .. node role (local/remote)
163 ds .. disk state (local/remote)
164 protocol
165 various flags
166 ns .. network send
167 nr .. network receive
168 dw .. disk write
169 dr .. disk read
170 al .. activity log write count
171 bm .. bitmap update write count
172 pe .. pending (waiting for ack or data reply)
173 ua .. unack'd (still need to send ack or data reply)
174 ap .. application requests accepted, but not yet completed
175 ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
176 wo .. write ordering mode currently in use
177 oos .. known out-of-sync kB
178 */
179
180 for (i = 0; i < minor_count; i++) {
181 mdev = minor_to_mdev(i);
182 if (!mdev) {
183 hole = 1;
184 continue;
185 }
186 if (hole) {
187 hole = 0;
188 seq_printf(seq, "\n");
189 }
190
191 sn = drbd_conn_str(mdev->state.conn);
192
193 if (mdev->state.conn == C_STANDALONE &&
194 mdev->state.disk == D_DISKLESS &&
195 mdev->state.role == R_SECONDARY) {
196 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
197 } else {
198 seq_printf(seq,
199 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
200 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
201 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
202 i, sn,
203 drbd_role_str(mdev->state.role),
204 drbd_role_str(mdev->state.peer),
205 drbd_disk_str(mdev->state.disk),
206 drbd_disk_str(mdev->state.pdsk),
207 (mdev->net_conf == NULL ? ' ' :
208 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
209 mdev->state.susp ? 's' : 'r',
210 mdev->state.aftr_isp ? 'a' : '-',
211 mdev->state.peer_isp ? 'p' : '-',
212 mdev->state.user_isp ? 'u' : '-',
213 mdev->congestion_reason ?: '-',
214 mdev->send_cnt/2,
215 mdev->recv_cnt/2,
216 mdev->writ_cnt/2,
217 mdev->read_cnt/2,
218 mdev->al_writ_cnt,
219 mdev->bm_writ_cnt,
220 atomic_read(&mdev->local_cnt),
221 atomic_read(&mdev->ap_pending_cnt) +
222 atomic_read(&mdev->rs_pending_cnt),
223 atomic_read(&mdev->unacked_cnt),
224 atomic_read(&mdev->ap_bio_cnt),
225 mdev->epochs,
226 write_ordering_chars[mdev->write_ordering]
227 );
228 seq_printf(seq, " oos:%lu\n",
229 Bit2KB(drbd_bm_total_weight(mdev)));
230 }
231 if (mdev->state.conn == C_SYNC_SOURCE ||
232 mdev->state.conn == C_SYNC_TARGET)
233 drbd_syncer_progress(mdev, seq);
234
235 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
236 seq_printf(seq, "\t%3d%% %lu/%lu\n",
237 (int)((mdev->rs_total-mdev->ov_left) /
238 (mdev->rs_total/100+1)),
239 mdev->rs_total - mdev->ov_left,
240 mdev->rs_total);
241
242 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
243 lc_seq_printf_stats(seq, mdev->resync);
244 lc_seq_printf_stats(seq, mdev->act_log);
245 put_ldev(mdev);
246 }
247
248 if (proc_details >= 2) {
249 if (mdev->resync) {
250 lc_seq_dump_details(seq, mdev->resync, "rs_left",
251 resync_dump_detail);
252 }
253 }
254 }
255
256 return 0;
257}
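/* With the format strings above, a configured and connected resource shows up
 * in /proc/drbd roughly as follows (counter values invented for illustration):
 *
 *   0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----
 *      ns:1048576 nr:0 dw:1048576 dr:4096 al:12 bm:64 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
 *
 * while an unconfigured minor prints just " 1: cs:Unconfigured". */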
258
259static int drbd_proc_open(struct inode *inode, struct file *file)
260{
261 return single_open(file, drbd_seq_show, PDE(inode)->data);
262}
263
264/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000000..3f096e7959b4
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4462 @@
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
31#include <linux/drbd.h>
32#include <linux/fs.h>
33#include <linux/file.h>
34#include <linux/in.h>
35#include <linux/mm.h>
36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h>
38#include <linux/slab.h>
39#include <linux/smp_lock.h>
40#include <linux/pkt_sched.h>
41#define __KERNEL_SYSCALLS__
42#include <linux/unistd.h>
43#include <linux/vmalloc.h>
44#include <linux/random.h>
45#include <linux/mm.h>
46#include <linux/string.h>
47#include <linux/scatterlist.h>
48#include "drbd_int.h"
49#include "drbd_req.h"
50
51#include "drbd_vli.h"
52
53struct flush_work {
54 struct drbd_work w;
55 struct drbd_epoch *epoch;
56};
57
58enum finish_epoch {
59 FE_STILL_LIVE,
60 FE_DESTROYED,
61 FE_RECYCLED,
62};
63
64static int drbd_do_handshake(struct drbd_conf *mdev);
65static int drbd_do_auth(struct drbd_conf *mdev);
66
67static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
68static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
69
70static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
71{
72 struct drbd_epoch *prev;
73 spin_lock(&mdev->epoch_lock);
74 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
75 if (prev == epoch || prev == mdev->current_epoch)
76 prev = NULL;
77 spin_unlock(&mdev->epoch_lock);
78 return prev;
79}
80
81#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
82
83static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
84{
85 struct page *page = NULL;
86
87 /* Yes, testing drbd_pp_vacant outside the lock is racy.
88 * So what. It saves a spin_lock. */
89 if (drbd_pp_vacant > 0) {
90 spin_lock(&drbd_pp_lock);
91 page = drbd_pp_pool;
92 if (page) {
93 drbd_pp_pool = (struct page *)page_private(page);
94 set_page_private(page, 0); /* just to be polite */
95 drbd_pp_vacant--;
96 }
97 spin_unlock(&drbd_pp_lock);
98 }
99 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
100 * "criss-cross" setup, that might cause write-out on some other DRBD,
101 * which in turn might block on the other node at this very place. */
102 if (!page)
103 page = alloc_page(GFP_TRY);
104 if (page)
105 atomic_inc(&mdev->pp_in_use);
106 return page;
107}
108
109/* kick lower level device, if we have more than (arbitrary number)
110 * reference counts on it, which typically are locally submitted io
111 * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
112static void maybe_kick_lo(struct drbd_conf *mdev)
113{
114 if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
115 drbd_kick_lo(mdev);
116}
117
118static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
119{
120 struct drbd_epoch_entry *e;
121 struct list_head *le, *tle;
122
123 /* The EEs are always appended to the end of the list. Since
124 they are sent in order over the wire, they have to finish
125	   in order. As soon as we see the first one that is not finished we can
126	   stop examining the list... */
127
128 list_for_each_safe(le, tle, &mdev->net_ee) {
129 e = list_entry(le, struct drbd_epoch_entry, w.list);
130 if (drbd_bio_has_active_page(e->private_bio))
131 break;
132 list_move(le, to_be_freed);
133 }
134}
135
136static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
137{
138 LIST_HEAD(reclaimed);
139 struct drbd_epoch_entry *e, *t;
140
141 maybe_kick_lo(mdev);
142 spin_lock_irq(&mdev->req_lock);
143 reclaim_net_ee(mdev, &reclaimed);
144 spin_unlock_irq(&mdev->req_lock);
145
146 list_for_each_entry_safe(e, t, &reclaimed, w.list)
147 drbd_free_ee(mdev, e);
148}
149
150/**
151 * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
152 * @mdev: DRBD device.
153 * @retry: whether or not to retry allocation forever (or until signalled)
154 *
155 * Tries to allocate a page, first from our own page pool, then from the
156 * kernel, unless this allocation would exceed the max_buffers setting.
157 * If @retry is non-zero, retry until DRBD frees a page somewhere else.
158 */
159static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
160{
161 struct page *page = NULL;
162 DEFINE_WAIT(wait);
163
164 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
165 page = drbd_pp_first_page_or_try_alloc(mdev);
166 if (page)
167 return page;
168 }
169
170 for (;;) {
171 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
172
173 drbd_kick_lo_and_reclaim_net(mdev);
174
175 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
176 page = drbd_pp_first_page_or_try_alloc(mdev);
177 if (page)
178 break;
179 }
180
181 if (!retry)
182 break;
183
184 if (signal_pending(current)) {
185 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
186 break;
187 }
188
189 schedule();
190 }
191 finish_wait(&drbd_pp_wait, &wait);
192
193 return page;
194}
195
196/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
197 * Is also used from inside another spin_lock_irq(&mdev->req_lock) */
198static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
199{
200 int free_it;
201
202 spin_lock(&drbd_pp_lock);
203 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
204 free_it = 1;
205 } else {
206 set_page_private(page, (unsigned long)drbd_pp_pool);
207 drbd_pp_pool = page;
208 drbd_pp_vacant++;
209 free_it = 0;
210 }
211 spin_unlock(&drbd_pp_lock);
212
213 atomic_dec(&mdev->pp_in_use);
214
215 if (free_it)
216 __free_page(page);
217
218 wake_up(&drbd_pp_wait);
219}
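
/*
 * Editor's sketch (not driver code): typical pairing of the page pool
 * helpers above, modeled on drbd_drain_block() further down in this file.
 * With retry != 0, drbd_pp_alloc() returns NULL only if a signal comes in.
 */
static void example_use_pp_pool(struct drbd_conf *mdev)
{
	struct page *page = drbd_pp_alloc(mdev, 1 /* retry */);

	if (page) {
		void *data = kmap(page);
		memset(data, 0, PAGE_SIZE);	/* e.g. fill or consume one page */
		kunmap(page);
		drbd_pp_free(mdev, page);	/* back to drbd_pp_pool, or __free_page */
	}
}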
220
221static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
222{
223 struct page *p_to_be_freed = NULL;
224 struct page *page;
225 struct bio_vec *bvec;
226 int i;
227
228 spin_lock(&drbd_pp_lock);
229 __bio_for_each_segment(bvec, bio, i, 0) {
230 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
231 set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
232 p_to_be_freed = bvec->bv_page;
233 } else {
234 set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
235 drbd_pp_pool = bvec->bv_page;
236 drbd_pp_vacant++;
237 }
238 }
239 spin_unlock(&drbd_pp_lock);
240 atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
241
242 while (p_to_be_freed) {
243 page = p_to_be_freed;
244 p_to_be_freed = (struct page *)page_private(page);
245 set_page_private(page, 0); /* just to be polite */
246 put_page(page);
247 }
248
249 wake_up(&drbd_pp_wait);
250}
251
252/*
253You need to hold the req_lock:
254 _drbd_wait_ee_list_empty()
255
256You must not have the req_lock:
257 drbd_free_ee()
258 drbd_alloc_ee()
259 drbd_init_ee()
260 drbd_release_ee()
261 drbd_ee_fix_bhs()
262 drbd_process_done_ee()
263 drbd_clear_done_ee()
264 drbd_wait_ee_list_empty()
265*/
266
267struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
268 u64 id,
269 sector_t sector,
270 unsigned int data_size,
271 gfp_t gfp_mask) __must_hold(local)
272{
273 struct request_queue *q;
274 struct drbd_epoch_entry *e;
275 struct page *page;
276 struct bio *bio;
277 unsigned int ds;
278
279 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
280 return NULL;
281
282 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
283 if (!e) {
284 if (!(gfp_mask & __GFP_NOWARN))
285 dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
286 return NULL;
287 }
288
289 bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
290 if (!bio) {
291 if (!(gfp_mask & __GFP_NOWARN))
292 dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
293 goto fail1;
294 }
295
296 bio->bi_bdev = mdev->ldev->backing_bdev;
297 bio->bi_sector = sector;
298
299 ds = data_size;
300 while (ds) {
301 page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
302 if (!page) {
303 if (!(gfp_mask & __GFP_NOWARN))
304 dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
305 goto fail2;
306 }
307 if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
308 drbd_pp_free(mdev, page);
309 dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
310 "data_size=%u,ds=%u) failed\n",
311 (unsigned long long)sector, data_size, ds);
312
313 q = bdev_get_queue(bio->bi_bdev);
314 if (q->merge_bvec_fn) {
315 struct bvec_merge_data bvm = {
316 .bi_bdev = bio->bi_bdev,
317 .bi_sector = bio->bi_sector,
318 .bi_size = bio->bi_size,
319 .bi_rw = bio->bi_rw,
320 };
321 int l = q->merge_bvec_fn(q, &bvm,
322 &bio->bi_io_vec[bio->bi_vcnt]);
323 dev_err(DEV, "merge_bvec_fn() = %d\n", l);
324 }
325
326 /* dump more of the bio. */
327 dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
328 dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
329 dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
330 dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
331
332 goto fail2;
333 break;
334 }
335 ds -= min_t(int, ds, PAGE_SIZE);
336 }
337
338 D_ASSERT(data_size == bio->bi_size);
339
340 bio->bi_private = e;
341 e->mdev = mdev;
342 e->sector = sector;
343 e->size = bio->bi_size;
344
345 e->private_bio = bio;
346 e->block_id = id;
347 INIT_HLIST_NODE(&e->colision);
348 e->epoch = NULL;
349 e->flags = 0;
350
351 return e;
352
353 fail2:
354 drbd_pp_free_bio_pages(mdev, bio);
355 bio_put(bio);
356 fail1:
357 mempool_free(e, drbd_ee_mempool);
358
359 return NULL;
360}
361
362void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
363{
364 struct bio *bio = e->private_bio;
365 drbd_pp_free_bio_pages(mdev, bio);
366 bio_put(bio);
367 D_ASSERT(hlist_unhashed(&e->colision));
368 mempool_free(e, drbd_ee_mempool);
369}
370
371int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
372{
373 LIST_HEAD(work_list);
374 struct drbd_epoch_entry *e, *t;
375 int count = 0;
376
377 spin_lock_irq(&mdev->req_lock);
378 list_splice_init(list, &work_list);
379 spin_unlock_irq(&mdev->req_lock);
380
381 list_for_each_entry_safe(e, t, &work_list, w.list) {
382 drbd_free_ee(mdev, e);
383 count++;
384 }
385 return count;
386}
387
388
389/*
390 * This function is called from _asender only_
391 * but see also comments in _req_mod(,barrier_acked)
392 * and receive_Barrier.
393 *
394 * Move entries from net_ee to done_ee, if ready.
395 * Grab done_ee, call all callbacks, free the entries.
396 * The callbacks typically send out ACKs.
397 */
398static int drbd_process_done_ee(struct drbd_conf *mdev)
399{
400 LIST_HEAD(work_list);
401 LIST_HEAD(reclaimed);
402 struct drbd_epoch_entry *e, *t;
403 int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
404
405 spin_lock_irq(&mdev->req_lock);
406 reclaim_net_ee(mdev, &reclaimed);
407 list_splice_init(&mdev->done_ee, &work_list);
408 spin_unlock_irq(&mdev->req_lock);
409
410 list_for_each_entry_safe(e, t, &reclaimed, w.list)
411 drbd_free_ee(mdev, e);
412
413 /* possible callbacks here:
414 * e_end_block, and e_end_resync_block, e_send_discard_ack.
415 * all ignore the last argument.
416 */
417 list_for_each_entry_safe(e, t, &work_list, w.list) {
418 /* list_del not necessary, next/prev members not touched */
419 ok = e->w.cb(mdev, &e->w, !ok) && ok;
420 drbd_free_ee(mdev, e);
421 }
422 wake_up(&mdev->ee_wait);
423
424 return ok;
425}
426
427void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
428{
429 DEFINE_WAIT(wait);
430
431 /* avoids spin_lock/unlock
432 * and calling prepare_to_wait in the fast path */
433 while (!list_empty(head)) {
434 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
435 spin_unlock_irq(&mdev->req_lock);
436 drbd_kick_lo(mdev);
437 schedule();
438 finish_wait(&mdev->ee_wait, &wait);
439 spin_lock_irq(&mdev->req_lock);
440 }
441}
442
443void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
444{
445 spin_lock_irq(&mdev->req_lock);
446 _drbd_wait_ee_list_empty(mdev, head);
447 spin_unlock_irq(&mdev->req_lock);
448}
449
450/* see also kernel_accept(), which is only present since 2.6.18;
451 * we also want to log exactly which part of it failed */
452static int drbd_accept(struct drbd_conf *mdev, const char **what,
453 struct socket *sock, struct socket **newsock)
454{
455 struct sock *sk = sock->sk;
456 int err = 0;
457
458 *what = "listen";
459 err = sock->ops->listen(sock, 5);
460 if (err < 0)
461 goto out;
462
463 *what = "sock_create_lite";
464 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
465 newsock);
466 if (err < 0)
467 goto out;
468
469 *what = "accept";
470 err = sock->ops->accept(sock, *newsock, 0);
471 if (err < 0) {
472 sock_release(*newsock);
473 *newsock = NULL;
474 goto out;
475 }
476 (*newsock)->ops = sock->ops;
477
478out:
479 return err;
480}
481
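/* Receive up to @size bytes from an arbitrary socket, without the
 * "force C_BROKEN_PIPE on short read" handling of drbd_recv() below.
 * Used for the handshake probe packets, see drbd_recv_fp() and
 * drbd_socket_okay(). */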
482static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
483 void *buf, size_t size, int flags)
484{
485 mm_segment_t oldfs;
486 struct kvec iov = {
487 .iov_base = buf,
488 .iov_len = size,
489 };
490 struct msghdr msg = {
491 .msg_iovlen = 1,
492 .msg_iov = (struct iovec *)&iov,
493 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
494 };
495 int rv;
496
497 oldfs = get_fs();
498 set_fs(KERNEL_DS);
499 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
500 set_fs(oldfs);
501
502 return rv;
503}
504
505static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
506{
507 mm_segment_t oldfs;
508 struct kvec iov = {
509 .iov_base = buf,
510 .iov_len = size,
511 };
512 struct msghdr msg = {
513 .msg_iovlen = 1,
514 .msg_iov = (struct iovec *)&iov,
515 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
516 };
517 int rv;
518
519 oldfs = get_fs();
520 set_fs(KERNEL_DS);
521
522 for (;;) {
523 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
524 if (rv == size)
525 break;
526
527 /* Note:
528 * ECONNRESET other side closed the connection
529 * ERESTARTSYS (on sock) we got a signal
530 */
531
532 if (rv < 0) {
533 if (rv == -ECONNRESET)
534 dev_info(DEV, "sock was reset by peer\n");
535 else if (rv != -ERESTARTSYS)
536 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
537 break;
538 } else if (rv == 0) {
539 dev_info(DEV, "sock was shut down by peer\n");
540 break;
541 } else {
542 /* signal came in, or peer/link went down,
543 * after we read a partial message
544 */
545 /* D_ASSERT(signal_pending(current)); */
546 break;
547 }
548	}
549
550 set_fs(oldfs);
551
552 if (rv != size)
553 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
554
555 return rv;
556}
557
558static struct socket *drbd_try_connect(struct drbd_conf *mdev)
559{
560 const char *what;
561 struct socket *sock;
562 struct sockaddr_in6 src_in6;
563 int err;
564 int disconnect_on_error = 1;
565
566 if (!get_net_conf(mdev))
567 return NULL;
568
569 what = "sock_create_kern";
570 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
571 SOCK_STREAM, IPPROTO_TCP, &sock);
572 if (err < 0) {
573 sock = NULL;
574 goto out;
575 }
576
577 sock->sk->sk_rcvtimeo =
578 sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
579
580 /* explicitly bind to the configured IP as source IP
581 * for the outgoing connections.
582 * This is needed for multihomed hosts and to be
583 * able to use lo: interfaces for drbd.
584 * Make sure to use 0 as port number, so linux selects
585 * a free one dynamically.
586 */
587 memcpy(&src_in6, mdev->net_conf->my_addr,
588 min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
589 if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
590 src_in6.sin6_port = 0;
591 else
592 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
593
594 what = "bind before connect";
595 err = sock->ops->bind(sock,
596 (struct sockaddr *) &src_in6,
597 mdev->net_conf->my_addr_len);
598 if (err < 0)
599 goto out;
600
601 /* connect may fail, peer not yet available.
602 * stay C_WF_CONNECTION, don't go Disconnecting! */
603 disconnect_on_error = 0;
604 what = "connect";
605 err = sock->ops->connect(sock,
606 (struct sockaddr *)mdev->net_conf->peer_addr,
607 mdev->net_conf->peer_addr_len, 0);
608
609out:
610 if (err < 0) {
611 if (sock) {
612 sock_release(sock);
613 sock = NULL;
614 }
615 switch (-err) {
616 /* timeout, busy, signal pending */
617 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
618 case EINTR: case ERESTARTSYS:
619 /* peer not (yet) available, network problem */
620 case ECONNREFUSED: case ENETUNREACH:
621 case EHOSTDOWN: case EHOSTUNREACH:
622 disconnect_on_error = 0;
623 break;
624 default:
625 dev_err(DEV, "%s failed, err = %d\n", what, err);
626 }
627 if (disconnect_on_error)
628 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
629 }
630 put_net_conf(mdev);
631 return sock;
632}
633
634static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
635{
636 int timeo, err;
637 struct socket *s_estab = NULL, *s_listen;
638 const char *what;
639
640 if (!get_net_conf(mdev))
641 return NULL;
642
643 what = "sock_create_kern";
644 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
645 SOCK_STREAM, IPPROTO_TCP, &s_listen);
646 if (err) {
647 s_listen = NULL;
648 goto out;
649 }
650
651 timeo = mdev->net_conf->try_connect_int * HZ;
652 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
653
654 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
655 s_listen->sk->sk_rcvtimeo = timeo;
656 s_listen->sk->sk_sndtimeo = timeo;
657
658 what = "bind before listen";
659 err = s_listen->ops->bind(s_listen,
660 (struct sockaddr *) mdev->net_conf->my_addr,
661 mdev->net_conf->my_addr_len);
662 if (err < 0)
663 goto out;
664
665 err = drbd_accept(mdev, &what, s_listen, &s_estab);
666
667out:
668 if (s_listen)
669 sock_release(s_listen);
670 if (err < 0) {
671 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
672 dev_err(DEV, "%s failed, err = %d\n", what, err);
673 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
674 }
675 }
676 put_net_conf(mdev);
677
678 return s_estab;
679}
680
681static int drbd_send_fp(struct drbd_conf *mdev,
682 struct socket *sock, enum drbd_packets cmd)
683{
684 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
685
686 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
687}
688
689static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
690{
691 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
692 int rr;
693
694 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
695
696 if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
697 return be16_to_cpu(h->command);
698
699 return 0xffff;
700}
701
702/**
703 * drbd_socket_okay() - Free the socket if its connection is not okay
704 * @mdev: DRBD device.
705 * @sock: pointer to the pointer to the socket.
706 */
707static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
708{
709 int rr;
710 char tb[4];
711
712 if (!*sock)
713 return FALSE;
714
715 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
716
717 if (rr > 0 || rr == -EAGAIN) {
718 return TRUE;
719 } else {
720 sock_release(*sock);
721 *sock = NULL;
722 return FALSE;
723 }
724}
725
726/*
727 * return values:
728 * 1 yes, we have a valid connection
729 * 0 oops, did not work out, please try again
730 * -1 peer talks different language,
731 * no point in trying again, please go standalone.
732 * -2 We do not have a network config...
733 */
734static int drbd_connect(struct drbd_conf *mdev)
735{
736 struct socket *s, *sock, *msock;
737 int try, h, ok;
738
739 D_ASSERT(!mdev->data.socket);
740
741 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
742 dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
743
744 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
745 return -2;
746
747 clear_bit(DISCARD_CONCURRENT, &mdev->flags);
748
749 sock = NULL;
750 msock = NULL;
751
752 do {
753 for (try = 0;;) {
754 /* 3 tries, this should take less than a second! */
755 s = drbd_try_connect(mdev);
756 if (s || ++try >= 3)
757 break;
758 /* give the other side time to call bind() & listen() */
759 __set_current_state(TASK_INTERRUPTIBLE);
760 schedule_timeout(HZ / 10);
761 }
762
763 if (s) {
764 if (!sock) {
765 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
766 sock = s;
767 s = NULL;
768 } else if (!msock) {
769 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
770 msock = s;
771 s = NULL;
772 } else {
773 dev_err(DEV, "Logic error in drbd_connect()\n");
774 goto out_release_sockets;
775 }
776 }
777
778 if (sock && msock) {
779 __set_current_state(TASK_INTERRUPTIBLE);
780 schedule_timeout(HZ / 10);
781 ok = drbd_socket_okay(mdev, &sock);
782 ok = drbd_socket_okay(mdev, &msock) && ok;
783 if (ok)
784 break;
785 }
786
787retry:
788 s = drbd_wait_for_connect(mdev);
789 if (s) {
790 try = drbd_recv_fp(mdev, s);
791 drbd_socket_okay(mdev, &sock);
792 drbd_socket_okay(mdev, &msock);
793 switch (try) {
794 case P_HAND_SHAKE_S:
795 if (sock) {
796 dev_warn(DEV, "initial packet S crossed\n");
797 sock_release(sock);
798 }
799 sock = s;
800 break;
801 case P_HAND_SHAKE_M:
802 if (msock) {
803 dev_warn(DEV, "initial packet M crossed\n");
804 sock_release(msock);
805 }
806 msock = s;
807 set_bit(DISCARD_CONCURRENT, &mdev->flags);
808 break;
809 default:
810 dev_warn(DEV, "Error receiving initial packet\n");
811 sock_release(s);
812 if (random32() & 1)
813 goto retry;
814 }
815 }
816
817 if (mdev->state.conn <= C_DISCONNECTING)
818 goto out_release_sockets;
819 if (signal_pending(current)) {
820 flush_signals(current);
821 smp_rmb();
822 if (get_t_state(&mdev->receiver) == Exiting)
823 goto out_release_sockets;
824 }
825
826 if (sock && msock) {
827 ok = drbd_socket_okay(mdev, &sock);
828 ok = drbd_socket_okay(mdev, &msock) && ok;
829 if (ok)
830 break;
831 }
832 } while (1);
833
834 msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
835 sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
836
837 sock->sk->sk_allocation = GFP_NOIO;
838 msock->sk->sk_allocation = GFP_NOIO;
839
840 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
841 msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
842
843 if (mdev->net_conf->sndbuf_size) {
844 sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
845 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
846 }
847
848 if (mdev->net_conf->rcvbuf_size) {
849 sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
850 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
851 }
852
853 /* NOT YET ...
854 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
855 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
856 * first set it to the P_HAND_SHAKE timeout,
857 * which we set to 4x the configured ping_timeout. */
858 sock->sk->sk_sndtimeo =
859 sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
860
861 msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
862 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
863
864 /* we don't want delays.
865	 * we use TCP_CORK where appropriate, though */
866 drbd_tcp_nodelay(sock);
867 drbd_tcp_nodelay(msock);
868
869 mdev->data.socket = sock;
870 mdev->meta.socket = msock;
871 mdev->last_received = jiffies;
872
873 D_ASSERT(mdev->asender.task == NULL);
874
875 h = drbd_do_handshake(mdev);
876 if (h <= 0)
877 return h;
878
879 if (mdev->cram_hmac_tfm) {
880 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
881 switch (drbd_do_auth(mdev)) {
882 case -1:
883 dev_err(DEV, "Authentication of peer failed\n");
884 return -1;
885 case 0:
886 dev_err(DEV, "Authentication of peer failed, trying again.\n");
887 return 0;
888 }
889 }
890
891 if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
892 return 0;
893
894 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
895 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
896
897 atomic_set(&mdev->packet_seq, 0);
898 mdev->peer_seq = 0;
899
900 drbd_thread_start(&mdev->asender);
901
902 if (!drbd_send_protocol(mdev))
903 return -1;
904 drbd_send_sync_param(mdev, &mdev->sync_conf);
905 drbd_send_sizes(mdev, 0);
906 drbd_send_uuids(mdev);
907 drbd_send_state(mdev);
908 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
909 clear_bit(RESIZE_PENDING, &mdev->flags);
910
911 return 1;
912
913out_release_sockets:
914 if (sock)
915 sock_release(sock);
916 if (msock)
917 sock_release(msock);
918 return -1;
919}
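
/*
 * Editor's sketch (assumption; the real caller is the receiver thread
 * elsewhere in this file): dispatching on the return codes documented
 * above drbd_connect().
 */
static void example_connect_until_done(struct drbd_conf *mdev)
{
	int h;

	do {
		h = drbd_connect(mdev);	/* 0: transient failure, try again */
	} while (h == 0);

	if (h > 0) {
		/* valid connection established: start decoding packets */
	} else {
		/* -1 or -2: incompatible peer or no net config, go standalone */
	}
}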
920
921static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
922{
923 int r;
924
925 r = drbd_recv(mdev, h, sizeof(*h));
926
927 if (unlikely(r != sizeof(*h))) {
928 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
929 return FALSE;
930	}
931 h->command = be16_to_cpu(h->command);
932 h->length = be16_to_cpu(h->length);
933 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
934 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
935 (long)be32_to_cpu(h->magic),
936 h->command, h->length);
937 return FALSE;
938 }
939 mdev->last_received = jiffies;
940
941 return TRUE;
942}
943
944static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
945{
946 int rv;
947
948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
950 if (rv) {
951 dev_err(DEV, "local disk flush failed with status %d\n", rv);
952 /* would rather check on EOPNOTSUPP, but that is not reliable.
953 * don't try again for ANY return value != 0
954 * if (rv == -EOPNOTSUPP) */
955 drbd_bump_write_ordering(mdev, WO_drain_io);
956 }
957 put_ldev(mdev);
958 }
959
960 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
961}
962
963static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
964{
965 struct flush_work *fw = (struct flush_work *)w;
966 struct drbd_epoch *epoch = fw->epoch;
967
968 kfree(w);
969
970 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
971 drbd_flush_after_epoch(mdev, epoch);
972
973 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
974 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
975
976 return 1;
977}
978
979/**
980 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, possibly finishing it.
981 * @mdev: DRBD device.
982 * @epoch: Epoch object.
983 * @ev: Epoch event.
984 */
985static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
986 struct drbd_epoch *epoch,
987 enum epoch_event ev)
988{
989 int finish, epoch_size;
990 struct drbd_epoch *next_epoch;
991 int schedule_flush = 0;
992 enum finish_epoch rv = FE_STILL_LIVE;
993
994 spin_lock(&mdev->epoch_lock);
995 do {
996 next_epoch = NULL;
997 finish = 0;
998
999 epoch_size = atomic_read(&epoch->epoch_size);
1000
1001 switch (ev & ~EV_CLEANUP) {
1002 case EV_PUT:
1003 atomic_dec(&epoch->active);
1004 break;
1005 case EV_GOT_BARRIER_NR:
1006 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1007
1008 /* Special case: If we just switched from WO_bio_barrier to
1009 WO_bdev_flush we should not finish the current epoch */
1010 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1011 mdev->write_ordering != WO_bio_barrier &&
1012 epoch == mdev->current_epoch)
1013 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1014 break;
1015 case EV_BARRIER_DONE:
1016 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1017 break;
1018 case EV_BECAME_LAST:
1019 /* nothing to do*/
1020 break;
1021 }
1022
1023 if (epoch_size != 0 &&
1024 atomic_read(&epoch->active) == 0 &&
1025 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1026 epoch->list.prev == &mdev->current_epoch->list &&
1027 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1028 /* Nearly all conditions are met to finish that epoch... */
1029 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1030 mdev->write_ordering == WO_none ||
1031 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1032 ev & EV_CLEANUP) {
1033 finish = 1;
1034 set_bit(DE_IS_FINISHING, &epoch->flags);
1035 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1036 mdev->write_ordering == WO_bio_barrier) {
1037 atomic_inc(&epoch->active);
1038 schedule_flush = 1;
1039 }
1040 }
1041 if (finish) {
1042 if (!(ev & EV_CLEANUP)) {
1043 spin_unlock(&mdev->epoch_lock);
1044 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1045 spin_lock(&mdev->epoch_lock);
1046 }
1047 dec_unacked(mdev);
1048
1049 if (mdev->current_epoch != epoch) {
1050 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1051 list_del(&epoch->list);
1052 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1053 mdev->epochs--;
1054 kfree(epoch);
1055
1056 if (rv == FE_STILL_LIVE)
1057 rv = FE_DESTROYED;
1058 } else {
1059 epoch->flags = 0;
1060 atomic_set(&epoch->epoch_size, 0);
1061				/* atomic_set(&epoch->active, 0); is already zero */
1062 if (rv == FE_STILL_LIVE)
1063 rv = FE_RECYCLED;
1064 }
1065 }
1066
1067 if (!next_epoch)
1068 break;
1069
1070 epoch = next_epoch;
1071 } while (1);
1072
1073 spin_unlock(&mdev->epoch_lock);
1074
1075 if (schedule_flush) {
1076 struct flush_work *fw;
1077 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1078 if (fw) {
1079 fw->w.cb = w_flush;
1080 fw->epoch = epoch;
1081 drbd_queue_work(&mdev->data.work, &fw->w);
1082 } else {
1083 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1084 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1085 /* That is not a recursion, only one level */
1086 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1087 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1088 }
1089 }
1090
1091 return rv;
1092}
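
/*
 * Editor's note: in this file the epoch events handled above arrive as
 * follows. receive_Data() accounts every mirrored write into the current
 * epoch (epoch_size, active), e_end_block() and w_flush() deliver EV_PUT
 * when such a write completes, receive_Barrier() delivers
 * EV_GOT_BARRIER_NR when the peer closes an epoch, and
 * drbd_flush_after_epoch() delivers EV_BARRIER_DONE once the backing
 * device has been flushed.
 */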
1093
1094/**
1095 * drbd_bump_write_ordering() - Fall back to another write ordering method
1096 * @mdev: DRBD device.
1097 * @wo: Write ordering method to try.
1098 */
1099void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1100{
1101 enum write_ordering_e pwo;
1102 static char *write_ordering_str[] = {
1103 [WO_none] = "none",
1104 [WO_drain_io] = "drain",
1105 [WO_bdev_flush] = "flush",
1106 [WO_bio_barrier] = "barrier",
1107 };
1108
1109 pwo = mdev->write_ordering;
1110 wo = min(pwo, wo);
1111 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1112 wo = WO_bdev_flush;
1113 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1114 wo = WO_drain_io;
1115 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1116 wo = WO_none;
1117 mdev->write_ordering = wo;
1118 if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1119 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1120}
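
/*
 * Editor's note (assumption: write_ordering_e is declared in ascending
 * order WO_none < WO_drain_io < WO_bdev_flush < WO_bio_barrier): because
 * of "wo = min(pwo, wo)" above, the method can only ever be downgraded,
 * e.g. by drbd_flush_after_epoch() calling
 *	drbd_bump_write_ordering(mdev, WO_drain_io);
 * after a failed flush; it is never silently upgraded again.
 */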
1121
1122/**
1123 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1124 * @mdev: DRBD device.
1125 * @w: work object.
1126 * @cancel: The connection will be closed anyways (unused in this callback)
1127 */
1128int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1129{
1130 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1131 struct bio *bio = e->private_bio;
1132
1133 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1134 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1135 so that we can finish that epoch in drbd_may_finish_epoch().
1136 That is necessary if we already have a long chain of Epochs, before
1137 we realize that BIO_RW_BARRIER is actually not supported */
1138
1139 /* As long as the -ENOTSUPP on the barrier is reported immediately
1140 that will never trigger. If it is reported late, we will just
1141 print that warning and continue correctly for all future requests
1142 with WO_bdev_flush */
1143 if (previous_epoch(mdev, e->epoch))
1144 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1145
1146 /* prepare bio for re-submit,
1147 * re-init volatile members */
1148 /* we still have a local reference,
1149 * get_ldev was done in receive_Data. */
1150 bio->bi_bdev = mdev->ldev->backing_bdev;
1151 bio->bi_sector = e->sector;
1152 bio->bi_size = e->size;
1153 bio->bi_idx = 0;
1154
1155 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1156 bio->bi_flags |= 1 << BIO_UPTODATE;
1157
1158 /* don't know whether this is necessary: */
1159 bio->bi_phys_segments = 0;
1160 bio->bi_next = NULL;
1161
1162 /* these should be unchanged: */
1163 /* bio->bi_end_io = drbd_endio_write_sec; */
1164 /* bio->bi_vcnt = whatever; */
1165
1166 e->w.cb = e_end_block;
1167
1168 /* This is no longer a barrier request. */
1169 bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
1170
1171 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
1172
1173 return 1;
1174}
1175
1176static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1177{
1178 int rv, issue_flush;
1179 struct p_barrier *p = (struct p_barrier *)h;
1180 struct drbd_epoch *epoch;
1181
1182 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1183
1184 rv = drbd_recv(mdev, h->payload, h->length);
1185 ERR_IF(rv != h->length) return FALSE;
1186
1187 inc_unacked(mdev);
1188
1189 if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1190 drbd_kick_lo(mdev);
1191
1192 mdev->current_epoch->barrier_nr = p->barrier;
1193 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1194
1195 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1196 * the activity log, which means it would not be resynced in case the
1197 * R_PRIMARY crashes now.
1198 * Therefore we must send the barrier_ack after the barrier request was
1199 * completed. */
1200 switch (mdev->write_ordering) {
1201 case WO_bio_barrier:
1202 case WO_none:
1203 if (rv == FE_RECYCLED)
1204 return TRUE;
1205 break;
1206
1207 case WO_bdev_flush:
1208 case WO_drain_io:
1209 if (rv == FE_STILL_LIVE) {
1210 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1211 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1212 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1213 }
1214 if (rv == FE_RECYCLED)
1215 return TRUE;
1216
1217 /* The asender will send all the ACKs and barrier ACKs out, since
1218 all EEs moved from the active_ee to the done_ee. We need to
1219 provide a new epoch object for the EEs that come in soon */
1220 break;
1221 }
1222
1223 /* receiver context, in the writeout path of the other node.
1224 * avoid potential distributed deadlock */
1225 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1226 if (!epoch) {
1227 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1228 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1229 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1230 if (issue_flush) {
1231 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1232 if (rv == FE_RECYCLED)
1233 return TRUE;
1234 }
1235
1236 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1237
1238 return TRUE;
1239 }
1240
1241 epoch->flags = 0;
1242 atomic_set(&epoch->epoch_size, 0);
1243 atomic_set(&epoch->active, 0);
1244
1245 spin_lock(&mdev->epoch_lock);
1246 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1247 list_add(&epoch->list, &mdev->current_epoch->list);
1248 mdev->current_epoch = epoch;
1249 mdev->epochs++;
1250 } else {
1251 /* The current_epoch got recycled while we allocated this one... */
1252 kfree(epoch);
1253 }
1254 spin_unlock(&mdev->epoch_lock);
1255
1256 return TRUE;
1257}
1258
1259/* used from receive_RSDataReply (recv_resync_read)
1260 * and from receive_Data */
1261static struct drbd_epoch_entry *
1262read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1263{
1264 struct drbd_epoch_entry *e;
1265 struct bio_vec *bvec;
1266 struct page *page;
1267 struct bio *bio;
1268 int dgs, ds, i, rr;
1269 void *dig_in = mdev->int_dig_in;
1270 void *dig_vv = mdev->int_dig_vv;
1271
1272 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1273 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1274
1275 if (dgs) {
1276 rr = drbd_recv(mdev, dig_in, dgs);
1277 if (rr != dgs) {
1278 dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1279 rr, dgs);
1280 return NULL;
1281 }
1282 }
1283
1284 data_size -= dgs;
1285
1286 ERR_IF(data_size & 0x1ff) return NULL;
1287 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1288
1289 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1290 * "criss-cross" setup, that might cause write-out on some other DRBD,
1291 * which in turn might block on the other node at this very place. */
1292 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1293 if (!e)
1294 return NULL;
1295 bio = e->private_bio;
1296 ds = data_size;
1297 bio_for_each_segment(bvec, bio, i) {
1298 page = bvec->bv_page;
1299 rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
1300 kunmap(page);
1301 if (rr != min_t(int, ds, PAGE_SIZE)) {
1302 drbd_free_ee(mdev, e);
1303 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1304 rr, min_t(int, ds, PAGE_SIZE));
1305 return NULL;
1306 }
1307 ds -= rr;
1308 }
1309
1310 if (dgs) {
1311 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1312 if (memcmp(dig_in, dig_vv, dgs)) {
1313 dev_err(DEV, "Digest integrity check FAILED.\n");
1314 drbd_bcast_ee(mdev, "digest failed",
1315 dgs, dig_in, dig_vv, e);
1316 drbd_free_ee(mdev, e);
1317 return NULL;
1318 }
1319 }
1320 mdev->recv_cnt += data_size>>9;
1321 return e;
1322}
1323
1324/* drbd_drain_block() just takes a data block
1325 * out of the socket input buffer, and discards it.
1326 */
1327static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1328{
1329 struct page *page;
1330 int rr, rv = 1;
1331 void *data;
1332
1333 page = drbd_pp_alloc(mdev, 1);
1334
1335 data = kmap(page);
1336 while (data_size) {
1337 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1338 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1339 rv = 0;
1340 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1341 rr, min_t(int, data_size, PAGE_SIZE));
1342 break;
1343 }
1344 data_size -= rr;
1345 }
1346 kunmap(page);
1347 drbd_pp_free(mdev, page);
1348 return rv;
1349}
1350
1351static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1352 sector_t sector, int data_size)
1353{
1354 struct bio_vec *bvec;
1355 struct bio *bio;
1356 int dgs, rr, i, expect;
1357 void *dig_in = mdev->int_dig_in;
1358 void *dig_vv = mdev->int_dig_vv;
1359
1360 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1361 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1362
1363 if (dgs) {
1364 rr = drbd_recv(mdev, dig_in, dgs);
1365 if (rr != dgs) {
1366 dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1367 rr, dgs);
1368 return 0;
1369 }
1370 }
1371
1372 data_size -= dgs;
1373
1374 /* optimistically update recv_cnt. if receiving fails below,
1375 * we disconnect anyways, and counters will be reset. */
1376 mdev->recv_cnt += data_size>>9;
1377
1378 bio = req->master_bio;
1379 D_ASSERT(sector == bio->bi_sector);
1380
1381 bio_for_each_segment(bvec, bio, i) {
1382 expect = min_t(int, data_size, bvec->bv_len);
1383 rr = drbd_recv(mdev,
1384 kmap(bvec->bv_page)+bvec->bv_offset,
1385 expect);
1386 kunmap(bvec->bv_page);
1387 if (rr != expect) {
1388 dev_warn(DEV, "short read receiving data reply: "
1389 "read %d expected %d\n",
1390 rr, expect);
1391 return 0;
1392 }
1393 data_size -= rr;
1394 }
1395
1396 if (dgs) {
1397 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1398 if (memcmp(dig_in, dig_vv, dgs)) {
1399 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1400 return 0;
1401 }
1402 }
1403
1404 D_ASSERT(data_size == 0);
1405 return 1;
1406}
1407
1408/* e_end_resync_block() is called via
1409 * drbd_process_done_ee() by asender only */
1410static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1411{
1412 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1413 sector_t sector = e->sector;
1414 int ok;
1415
1416 D_ASSERT(hlist_unhashed(&e->colision));
1417
1418 if (likely(drbd_bio_uptodate(e->private_bio))) {
1419 drbd_set_in_sync(mdev, sector, e->size);
1420 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1421 } else {
1422 /* Record failure to sync */
1423 drbd_rs_failed_io(mdev, sector, e->size);
1424
1425 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1426 }
1427 dec_unacked(mdev);
1428
1429 return ok;
1430}
1431
1432static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1433{
1434 struct drbd_epoch_entry *e;
1435
1436 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1437 if (!e) {
1438 put_ldev(mdev);
1439 return FALSE;
1440 }
1441
1442 dec_rs_pending(mdev);
1443
1444 e->private_bio->bi_end_io = drbd_endio_write_sec;
1445 e->private_bio->bi_rw = WRITE;
1446 e->w.cb = e_end_resync_block;
1447
1448 inc_unacked(mdev);
1449 /* corresponding dec_unacked() in e_end_resync_block()
1450 * respective _drbd_clear_done_ee */
1451
1452 spin_lock_irq(&mdev->req_lock);
1453 list_add(&e->w.list, &mdev->sync_ee);
1454 spin_unlock_irq(&mdev->req_lock);
1455
1456 drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
1457 /* accounting done in endio */
1458
1459 maybe_kick_lo(mdev);
1460 return TRUE;
1461}
1462
1463static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1464{
1465 struct drbd_request *req;
1466 sector_t sector;
1467 unsigned int header_size, data_size;
1468 int ok;
1469 struct p_data *p = (struct p_data *)h;
1470
1471 header_size = sizeof(*p) - sizeof(*h);
1472 data_size = h->length - header_size;
1473
1474 ERR_IF(data_size == 0) return FALSE;
1475
1476 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1477 return FALSE;
1478
1479 sector = be64_to_cpu(p->sector);
1480
1481 spin_lock_irq(&mdev->req_lock);
1482 req = _ar_id_to_req(mdev, p->block_id, sector);
1483 spin_unlock_irq(&mdev->req_lock);
1484 if (unlikely(!req)) {
1485 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1486 return FALSE;
1487 }
1488
1489 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1490 * special casing it there for the various failure cases.
1491 * still no race with drbd_fail_pending_reads */
1492 ok = recv_dless_read(mdev, req, sector, data_size);
1493
1494 if (ok)
1495 req_mod(req, data_received);
1496 /* else: nothing. handled from drbd_disconnect...
1497 * I don't think we may complete this just yet
1498 * in case we are "on-disconnect: freeze" */
1499
1500 return ok;
1501}
1502
1503static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1504{
1505 sector_t sector;
1506 unsigned int header_size, data_size;
1507 int ok;
1508 struct p_data *p = (struct p_data *)h;
1509
1510 header_size = sizeof(*p) - sizeof(*h);
1511 data_size = h->length - header_size;
1512
1513 ERR_IF(data_size == 0) return FALSE;
1514
1515 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1516 return FALSE;
1517
1518 sector = be64_to_cpu(p->sector);
1519 D_ASSERT(p->block_id == ID_SYNCER);
1520
1521 if (get_ldev(mdev)) {
1522 /* data is submitted to disk within recv_resync_read.
1523 * corresponding put_ldev done below on error,
1524 * or in drbd_endio_write_sec. */
1525 ok = recv_resync_read(mdev, sector, data_size);
1526 } else {
1527 if (__ratelimit(&drbd_ratelimit_state))
1528 dev_err(DEV, "Can not write resync data to local disk.\n");
1529
1530 ok = drbd_drain_block(mdev, data_size);
1531
1532 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1533 }
1534
1535 return ok;
1536}
1537
1538/* e_end_block() is called via drbd_process_done_ee().
1539 * this means this function only runs in the asender thread
1540 */
1541static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1542{
1543 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1544 sector_t sector = e->sector;
1545 struct drbd_epoch *epoch;
1546 int ok = 1, pcmd;
1547
1548 if (e->flags & EE_IS_BARRIER) {
1549 epoch = previous_epoch(mdev, e->epoch);
1550 if (epoch)
1551 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1552 }
1553
1554 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1555 if (likely(drbd_bio_uptodate(e->private_bio))) {
1556 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1557 mdev->state.conn <= C_PAUSED_SYNC_T &&
1558 e->flags & EE_MAY_SET_IN_SYNC) ?
1559 P_RS_WRITE_ACK : P_WRITE_ACK;
1560 ok &= drbd_send_ack(mdev, pcmd, e);
1561 if (pcmd == P_RS_WRITE_ACK)
1562 drbd_set_in_sync(mdev, sector, e->size);
1563 } else {
1564 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1565 /* we expect it to be marked out of sync anyways...
1566 * maybe assert this? */
1567 }
1568 dec_unacked(mdev);
1569 }
1570 /* we delete from the conflict detection hash _after_ we sent out the
1571 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1572 if (mdev->net_conf->two_primaries) {
1573 spin_lock_irq(&mdev->req_lock);
1574 D_ASSERT(!hlist_unhashed(&e->colision));
1575 hlist_del_init(&e->colision);
1576 spin_unlock_irq(&mdev->req_lock);
1577 } else {
1578 D_ASSERT(hlist_unhashed(&e->colision));
1579 }
1580
1581 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1582
1583 return ok;
1584}
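
/*
 * Editor's note: together with receive_Data() below, this implements the
 * per-protocol acknowledgements. Protocol C acks from here (asender,
 * after local completion: P_WRITE_ACK / P_RS_WRITE_ACK, or P_NEG_ACK on
 * error); protocol B acks with P_RECV_ACK from the receiver as soon as
 * the data has been received; protocol A sends no ack at all.
 */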
1585
1586static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1587{
1588 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1589 int ok = 1;
1590
1591 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1592 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1593
1594 spin_lock_irq(&mdev->req_lock);
1595 D_ASSERT(!hlist_unhashed(&e->colision));
1596 hlist_del_init(&e->colision);
1597 spin_unlock_irq(&mdev->req_lock);
1598
1599 dec_unacked(mdev);
1600
1601 return ok;
1602}
1603
1604/* Called from receive_Data.
1605 * Synchronize packets on sock with packets on msock.
1606 *
1607 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1608 * packet traveling on msock, they are still processed in the order they have
1609 * been sent.
1610 *
1611 * Note: we don't care for Ack packets overtaking P_DATA packets.
1612 *
1613 * In case packet_seq is larger than mdev->peer_seq number, there are
1614 * outstanding packets on the msock. We wait for them to arrive.
1615 * In case we are the logically next packet, we update mdev->peer_seq
1616 * ourselves. Correctly handles 32bit wrap around.
1617 *
1618 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1619 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1620 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1621 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
1622 *
1623 * returns 0 if we may process the packet,
1624 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1625static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1626{
1627 DEFINE_WAIT(wait);
1628 unsigned int p_seq;
1629 long timeout;
1630 int ret = 0;
1631 spin_lock(&mdev->peer_seq_lock);
1632 for (;;) {
1633 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1634 if (seq_le(packet_seq, mdev->peer_seq+1))
1635 break;
1636 if (signal_pending(current)) {
1637 ret = -ERESTARTSYS;
1638 break;
1639 }
1640 p_seq = mdev->peer_seq;
1641 spin_unlock(&mdev->peer_seq_lock);
1642 timeout = schedule_timeout(30*HZ);
1643 spin_lock(&mdev->peer_seq_lock);
1644 if (timeout == 0 && p_seq == mdev->peer_seq) {
1645 ret = -ETIMEDOUT;
1646 dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1647 break;
1648 }
1649 }
1650 finish_wait(&mdev->seq_wait, &wait);
1651 if (mdev->peer_seq+1 == packet_seq)
1652 mdev->peer_seq++;
1653 spin_unlock(&mdev->peer_seq_lock);
1654 return ret;
1655}
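
/*
 * Editor's illustration (assumption: seq_le() in drbd_int.h is the usual
 * wrap-safe signed-difference comparison; the real helper is not shown
 * here):
 */
static inline int example_seq_le(u32 a, u32 b)
{
	return (s32)(a - b) <= 0;	/* "a is not newer than b", modulo 2^32 */
}
/*
 * Worked wrap-around case: if mdev->peer_seq has just reached 0xffffffff,
 * the next packet_seq is 0 and peer_seq + 1 wraps to 0 as well, so
 * example_seq_le(0, peer_seq + 1) == example_seq_le(0, 0) == 1 and the
 * receiver does not block. At 2^21 sequence steps per second (see the
 * comment above), a 32bit counter wraps only every 2^32 / 2^21 = 2^11 ==
 * 2048 seconds.
 */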
1656
1657/* mirrored write */
1658static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1659{
1660 sector_t sector;
1661 struct drbd_epoch_entry *e;
1662 struct p_data *p = (struct p_data *)h;
1663 int header_size, data_size;
1664 int rw = WRITE;
1665 u32 dp_flags;
1666
1667 header_size = sizeof(*p) - sizeof(*h);
1668 data_size = h->length - header_size;
1669
1670 ERR_IF(data_size == 0) return FALSE;
1671
1672 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1673 return FALSE;
1674
1675 if (!get_ldev(mdev)) {
1676 if (__ratelimit(&drbd_ratelimit_state))
1677 dev_err(DEV, "Can not write mirrored data block "
1678 "to local disk.\n");
1679 spin_lock(&mdev->peer_seq_lock);
1680 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1681 mdev->peer_seq++;
1682 spin_unlock(&mdev->peer_seq_lock);
1683
1684 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1685 atomic_inc(&mdev->current_epoch->epoch_size);
1686 return drbd_drain_block(mdev, data_size);
1687 }
1688
1689 /* get_ldev(mdev) successful.
1690 * Corresponding put_ldev done either below (on various errors),
1691 * or in drbd_endio_write_sec, if we successfully submit the data at
1692 * the end of this function. */
1693
1694 sector = be64_to_cpu(p->sector);
1695 e = read_in_block(mdev, p->block_id, sector, data_size);
1696 if (!e) {
1697 put_ldev(mdev);
1698 return FALSE;
1699 }
1700
1701 e->private_bio->bi_end_io = drbd_endio_write_sec;
1702 e->w.cb = e_end_block;
1703
1704 spin_lock(&mdev->epoch_lock);
1705 e->epoch = mdev->current_epoch;
1706 atomic_inc(&e->epoch->epoch_size);
1707 atomic_inc(&e->epoch->active);
1708
1709 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1710 struct drbd_epoch *epoch;
1711 /* Issue a barrier if we start a new epoch, and the previous epoch
1712		   was not an epoch containing a single request which already was
1713 a Barrier. */
1714 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1715 if (epoch == e->epoch) {
1716 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1717 rw |= (1<<BIO_RW_BARRIER);
1718 e->flags |= EE_IS_BARRIER;
1719 } else {
1720 if (atomic_read(&epoch->epoch_size) > 1 ||
1721 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1722 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1723 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1724 rw |= (1<<BIO_RW_BARRIER);
1725 e->flags |= EE_IS_BARRIER;
1726 }
1727 }
1728 }
1729 spin_unlock(&mdev->epoch_lock);
1730
1731 dp_flags = be32_to_cpu(p->dp_flags);
1732 if (dp_flags & DP_HARDBARRIER) {
1733 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1734 /* rw |= (1<<BIO_RW_BARRIER); */
1735 }
1736 if (dp_flags & DP_RW_SYNC)
1737 rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
1738 if (dp_flags & DP_MAY_SET_IN_SYNC)
1739 e->flags |= EE_MAY_SET_IN_SYNC;
1740
1741 /* I'm the receiver, I do hold a net_cnt reference. */
1742 if (!mdev->net_conf->two_primaries) {
1743 spin_lock_irq(&mdev->req_lock);
1744 } else {
1745 /* don't get the req_lock yet,
1746 * we may sleep in drbd_wait_peer_seq */
1747 const int size = e->size;
1748 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1749 DEFINE_WAIT(wait);
1750 struct drbd_request *i;
1751 struct hlist_node *n;
1752 struct hlist_head *slot;
1753 int first;
1754
1755 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1756 BUG_ON(mdev->ee_hash == NULL);
1757 BUG_ON(mdev->tl_hash == NULL);
1758
1759 /* conflict detection and handling:
1760 * 1. wait on the sequence number,
1761 * in case this data packet overtook ACK packets.
1762 * 2. check our hash tables for conflicting requests.
1763 * we only need to walk the tl_hash, since an ee can not
1764 * have a conflict with an other ee: on the submitting
1765		 * have a conflict with another ee: on the submitting
1766 * and a conflicting req is never sent.
1767 *
1768 * Note: for two_primaries, we are protocol C,
1769 * so there cannot be any request that is DONE
1770 * but still on the transfer log.
1771 *
1772 * unconditionally add to the ee_hash.
1773 *
1774 * if no conflicting request is found:
1775 * submit.
1776 *
1777 * if any conflicting request is found
1778 * that has not yet been acked,
1779 * AND I have the "discard concurrent writes" flag:
1780 * queue (via done_ee) the P_DISCARD_ACK; OUT.
1781 *
1782 * if any conflicting request is found:
1783 * block the receiver, waiting on misc_wait
1784 * until no more conflicting requests are there,
1785 * or we get interrupted (disconnect).
1786 *
1787 * we do not just write after local io completion of those
1788 * requests, but only after req is done completely, i.e.
1789 * we wait for the P_DISCARD_ACK to arrive!
1790 *
1791 * then proceed normally, i.e. submit.
1792 */
1793 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1794 goto out_interrupted;
1795
1796 spin_lock_irq(&mdev->req_lock);
1797
1798 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1799
1800#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1801 slot = tl_hash_slot(mdev, sector);
1802 first = 1;
1803 for (;;) {
1804 int have_unacked = 0;
1805 int have_conflict = 0;
1806 prepare_to_wait(&mdev->misc_wait, &wait,
1807 TASK_INTERRUPTIBLE);
1808 hlist_for_each_entry(i, n, slot, colision) {
1809 if (OVERLAPS) {
1810 /* only ALERT on first iteration,
1811 * we may be woken up early... */
1812 if (first)
1813 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1814 " new: %llus +%u; pending: %llus +%u\n",
1815 current->comm, current->pid,
1816 (unsigned long long)sector, size,
1817 (unsigned long long)i->sector, i->size);
1818 if (i->rq_state & RQ_NET_PENDING)
1819 ++have_unacked;
1820 ++have_conflict;
1821 }
1822 }
1823#undef OVERLAPS
1824 if (!have_conflict)
1825 break;
1826
1827 /* Discard Ack only for the _first_ iteration */
1828 if (first && discard && have_unacked) {
1829 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1830 (unsigned long long)sector);
1831 inc_unacked(mdev);
1832 e->w.cb = e_send_discard_ack;
1833 list_add_tail(&e->w.list, &mdev->done_ee);
1834
1835 spin_unlock_irq(&mdev->req_lock);
1836
1837 /* we could probably send that P_DISCARD_ACK ourselves,
1838 * but I don't like the receiver using the msock */
1839
1840 put_ldev(mdev);
1841 wake_asender(mdev);
1842 finish_wait(&mdev->misc_wait, &wait);
1843 return TRUE;
1844 }
1845
1846 if (signal_pending(current)) {
1847 hlist_del_init(&e->colision);
1848
1849 spin_unlock_irq(&mdev->req_lock);
1850
1851 finish_wait(&mdev->misc_wait, &wait);
1852 goto out_interrupted;
1853 }
1854
1855 spin_unlock_irq(&mdev->req_lock);
1856 if (first) {
1857 first = 0;
1858 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1859 "sec=%llus\n", (unsigned long long)sector);
1860 } else if (discard) {
1861 /* we had none on the first iteration.
1862 * there must be none now. */
1863 D_ASSERT(have_unacked == 0);
1864 }
1865 schedule();
1866 spin_lock_irq(&mdev->req_lock);
1867 }
1868 finish_wait(&mdev->misc_wait, &wait);
1869 }
1870
1871 list_add(&e->w.list, &mdev->active_ee);
1872 spin_unlock_irq(&mdev->req_lock);
1873
1874 switch (mdev->net_conf->wire_protocol) {
1875 case DRBD_PROT_C:
1876 inc_unacked(mdev);
1877 /* corresponding dec_unacked() in e_end_block()
1878 * respective _drbd_clear_done_ee */
1879 break;
1880 case DRBD_PROT_B:
1881 /* I really don't like it that the receiver thread
1882 * sends on the msock, but anyways */
1883 drbd_send_ack(mdev, P_RECV_ACK, e);
1884 break;
1885 case DRBD_PROT_A:
1886 /* nothing to do */
1887 break;
1888 }
1889
1890 if (mdev->state.pdsk == D_DISKLESS) {
1891 /* In case we have the only disk of the cluster, */
1892 drbd_set_out_of_sync(mdev, e->sector, e->size);
1893 e->flags |= EE_CALL_AL_COMPLETE_IO;
1894 drbd_al_begin_io(mdev, e->sector);
1895 }
1896
1897 e->private_bio->bi_rw = rw;
1898 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
1899 /* accounting done in endio */
1900
1901 maybe_kick_lo(mdev);
1902 return TRUE;
1903
1904out_interrupted:
1905 /* yes, the epoch_size now is imbalanced.
1906 * but we drop the connection anyways, so we don't have a chance to
1907 * receive a barrier... atomic_inc(&mdev->epoch_size); */
1908 put_ldev(mdev);
1909 drbd_free_ee(mdev, e);
1910 return FALSE;
1911}
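
/*
 * Editor's sketch (assumption: the OVERLAPS/overlaps() test used in the
 * conflict scan above is the usual half-open interval check on sector
 * ranges, with sizes given in bytes):
 */
static inline int example_overlaps(sector_t s1, int size1, sector_t s2, int size2)
{
	/* [s1, s1 + size1>>9) and [s2, s2 + size2>>9) share at least one sector */
	return !(s1 + (size1 >> 9) <= s2 || s2 + (size2 >> 9) <= s1);
}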
1912
1913static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1914{
1915 sector_t sector;
1916 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1917 struct drbd_epoch_entry *e;
1918 struct digest_info *di = NULL;
1919 int size, digest_size;
1920 unsigned int fault_type;
1921 struct p_block_req *p =
1922 (struct p_block_req *)h;
1923 const int brps = sizeof(*p)-sizeof(*h);
1924
1925 if (drbd_recv(mdev, h->payload, brps) != brps)
1926 return FALSE;
1927
1928 sector = be64_to_cpu(p->sector);
1929 size = be32_to_cpu(p->blksize);
1930
1931 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1932 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1933 (unsigned long long)sector, size);
1934 return FALSE;
1935 }
1936 if (sector + (size>>9) > capacity) {
1937 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1938 (unsigned long long)sector, size);
1939 return FALSE;
1940 }
1941
1942 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
1943 if (__ratelimit(&drbd_ratelimit_state))
1944 dev_err(DEV, "Can not satisfy peer's read request, "
1945 "no local data.\n");
1946 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
1947 P_NEG_RS_DREPLY , p);
1948 return TRUE;
1949 }
1950
1951 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1952 * "criss-cross" setup, that might cause write-out on some other DRBD,
1953 * which in turn might block on the other node at this very place. */
1954 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1955 if (!e) {
1956 put_ldev(mdev);
1957 return FALSE;
1958 }
1959
1960 e->private_bio->bi_rw = READ;
1961 e->private_bio->bi_end_io = drbd_endio_read_sec;
1962
1963 switch (h->command) {
1964 case P_DATA_REQUEST:
1965 e->w.cb = w_e_end_data_req;
1966 fault_type = DRBD_FAULT_DT_RD;
1967 break;
1968 case P_RS_DATA_REQUEST:
1969 e->w.cb = w_e_end_rsdata_req;
1970 fault_type = DRBD_FAULT_RS_RD;
1971		/* Eventually this should become asynchronous. Currently it
1972 * blocks the whole receiver just to delay the reading of a
1973 * resync data block.
1974 * the drbd_work_queue mechanism is made for this...
1975 */
1976 if (!drbd_rs_begin_io(mdev, sector)) {
1977 /* we have been interrupted,
1978 * probably connection lost! */
1979 D_ASSERT(signal_pending(current));
1980 goto out_free_e;
1981 }
1982 break;
1983
1984 case P_OV_REPLY:
1985 case P_CSUM_RS_REQUEST:
1986 fault_type = DRBD_FAULT_RS_RD;
1987 digest_size = h->length - brps ;
1988 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
1989 if (!di)
1990 goto out_free_e;
1991
1992 di->digest_size = digest_size;
1993 di->digest = (((char *)di)+sizeof(struct digest_info));
1994
1995 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
1996 goto out_free_e;
1997
1998 e->block_id = (u64)(unsigned long)di;
1999 if (h->command == P_CSUM_RS_REQUEST) {
2000 D_ASSERT(mdev->agreed_pro_version >= 89);
2001 e->w.cb = w_e_end_csum_rs_req;
2002 } else if (h->command == P_OV_REPLY) {
2003 e->w.cb = w_e_end_ov_reply;
2004 dec_rs_pending(mdev);
2005 break;
2006 }
2007
2008 if (!drbd_rs_begin_io(mdev, sector)) {
2009 /* we have been interrupted, probably connection lost! */
2010 D_ASSERT(signal_pending(current));
2011 goto out_free_e;
2012 }
2013 break;
2014
2015 case P_OV_REQUEST:
2016 if (mdev->state.conn >= C_CONNECTED &&
2017 mdev->state.conn != C_VERIFY_T)
2018 dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2019 drbd_conn_str(mdev->state.conn));
2020 if (mdev->ov_start_sector == ~(sector_t)0 &&
2021 mdev->agreed_pro_version >= 90) {
2022 mdev->ov_start_sector = sector;
2023 mdev->ov_position = sector;
2024 mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2025 dev_info(DEV, "Online Verify start sector: %llu\n",
2026 (unsigned long long)sector);
2027 }
2028 e->w.cb = w_e_end_ov_req;
2029 fault_type = DRBD_FAULT_RS_RD;
2030 /* Eventually this should become asynchronous. Currently it
2031 * blocks the whole receiver just to delay the reading of a
2032 * resync data block.
2033 * the drbd_work_queue mechanism is made for this...
2034 */
2035 if (!drbd_rs_begin_io(mdev, sector)) {
2036 /* we have been interrupted,
2037 * probably connection lost! */
2038 D_ASSERT(signal_pending(current));
2039 goto out_free_e;
2040 }
2041 break;
2042
2043
2044 default:
2045 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2046 cmdname(h->command));
2047 fault_type = DRBD_FAULT_MAX;
2048 }
2049
2050 spin_lock_irq(&mdev->req_lock);
2051 list_add(&e->w.list, &mdev->read_ee);
2052 spin_unlock_irq(&mdev->req_lock);
2053
2054 inc_unacked(mdev);
2055
2056 drbd_generic_make_request(mdev, fault_type, e->private_bio);
2057 maybe_kick_lo(mdev);
2058
2059 return TRUE;
2060
2061out_free_e:
2062 kfree(di);
2063 put_ldev(mdev);
2064 drbd_free_ee(mdev, e);
2065 return FALSE;
2066}
2067
2068static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2069{
2070 int self, peer, rv = -100;
2071 unsigned long ch_self, ch_peer;
2072
2073 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2074 peer = mdev->p_uuid[UI_BITMAP] & 1;
2075
2076 ch_peer = mdev->p_uuid[UI_SIZE];
2077 ch_self = mdev->comm_bm_set;
2078
2079 switch (mdev->net_conf->after_sb_0p) {
2080 case ASB_CONSENSUS:
2081 case ASB_DISCARD_SECONDARY:
2082 case ASB_CALL_HELPER:
2083 dev_err(DEV, "Configuration error.\n");
2084 break;
2085 case ASB_DISCONNECT:
2086 break;
2087 case ASB_DISCARD_YOUNGER_PRI:
2088 if (self == 0 && peer == 1) {
2089 rv = -1;
2090 break;
2091 }
2092 if (self == 1 && peer == 0) {
2093 rv = 1;
2094 break;
2095 }
2096 /* Else fall through to one of the other strategies... */
2097 case ASB_DISCARD_OLDER_PRI:
2098 if (self == 0 && peer == 1) {
2099 rv = 1;
2100 break;
2101 }
2102 if (self == 1 && peer == 0) {
2103 rv = -1;
2104 break;
2105 }
2106 /* Else fall through to one of the other strategies... */
2107 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
2108 "Using discard-least-changes instead\n");
2109 case ASB_DISCARD_ZERO_CHG:
2110 if (ch_peer == 0 && ch_self == 0) {
2111 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2112 ? -1 : 1;
2113 break;
2114 } else {
2115 if (ch_peer == 0) { rv = 1; break; }
2116 if (ch_self == 0) { rv = -1; break; }
2117 }
2118 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2119 break;
2120 case ASB_DISCARD_LEAST_CHG:
2121 if (ch_self < ch_peer)
2122 rv = -1;
2123 else if (ch_self > ch_peer)
2124 rv = 1;
2125 else /* ( ch_self == ch_peer ) */
2126 /* Well, then use something else. */
2127 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2128 ? -1 : 1;
2129 break;
2130 case ASB_DISCARD_LOCAL:
2131 rv = -1;
2132 break;
2133 case ASB_DISCARD_REMOTE:
2134 rv = 1;
2135 }
2136
2137 return rv;
2138}
2139
2140static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2141{
2142 int self, peer, hg, rv = -100;
2143
2144 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2145 peer = mdev->p_uuid[UI_BITMAP] & 1;
2146
2147 switch (mdev->net_conf->after_sb_1p) {
2148 case ASB_DISCARD_YOUNGER_PRI:
2149 case ASB_DISCARD_OLDER_PRI:
2150 case ASB_DISCARD_LEAST_CHG:
2151 case ASB_DISCARD_LOCAL:
2152 case ASB_DISCARD_REMOTE:
2153 dev_err(DEV, "Configuration error.\n");
2154 break;
2155 case ASB_DISCONNECT:
2156 break;
2157 case ASB_CONSENSUS:
2158 hg = drbd_asb_recover_0p(mdev);
2159 if (hg == -1 && mdev->state.role == R_SECONDARY)
2160 rv = hg;
2161 if (hg == 1 && mdev->state.role == R_PRIMARY)
2162 rv = hg;
2163 break;
2164 case ASB_VIOLENTLY:
2165 rv = drbd_asb_recover_0p(mdev);
2166 break;
2167 case ASB_DISCARD_SECONDARY:
2168 return mdev->state.role == R_PRIMARY ? 1 : -1;
2169 case ASB_CALL_HELPER:
2170 hg = drbd_asb_recover_0p(mdev);
2171 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2172 self = drbd_set_role(mdev, R_SECONDARY, 0);
2173 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2174 * we might be here in C_WF_REPORT_PARAMS which is transient.
2175 * we do not need to wait for the after state change work either. */
2176 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2177 if (self != SS_SUCCESS) {
2178 drbd_khelper(mdev, "pri-lost-after-sb");
2179 } else {
2180 dev_warn(DEV, "Successfully gave up primary role.\n");
2181 rv = hg;
2182 }
2183 } else
2184 rv = hg;
2185 }
2186
2187 return rv;
2188}
2189
2190static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2191{
2192 int self, peer, hg, rv = -100;
2193
2194 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2195 peer = mdev->p_uuid[UI_BITMAP] & 1;
2196
2197 switch (mdev->net_conf->after_sb_2p) {
2198 case ASB_DISCARD_YOUNGER_PRI:
2199 case ASB_DISCARD_OLDER_PRI:
2200 case ASB_DISCARD_LEAST_CHG:
2201 case ASB_DISCARD_LOCAL:
2202 case ASB_DISCARD_REMOTE:
2203 case ASB_CONSENSUS:
2204 case ASB_DISCARD_SECONDARY:
2205 dev_err(DEV, "Configuration error.\n");
2206 break;
2207 case ASB_VIOLENTLY:
2208 rv = drbd_asb_recover_0p(mdev);
2209 break;
2210 case ASB_DISCONNECT:
2211 break;
2212 case ASB_CALL_HELPER:
2213 hg = drbd_asb_recover_0p(mdev);
2214 if (hg == -1) {
2215 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2216 * we might be here in C_WF_REPORT_PARAMS which is transient.
2217 * we do not need to wait for the after state change work either. */
2218 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2219 if (self != SS_SUCCESS) {
2220 drbd_khelper(mdev, "pri-lost-after-sb");
2221 } else {
2222 dev_warn(DEV, "Successfully gave up primary role.\n");
2223 rv = hg;
2224 }
2225 } else
2226 rv = hg;
2227 }
2228
2229 return rv;
2230}
2231
2232static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2233 u64 bits, u64 flags)
2234{
2235 if (!uuid) {
2236 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2237 return;
2238 }
2239 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2240 text,
2241 (unsigned long long)uuid[UI_CURRENT],
2242 (unsigned long long)uuid[UI_BITMAP],
2243 (unsigned long long)uuid[UI_HISTORY_START],
2244 (unsigned long long)uuid[UI_HISTORY_END],
2245 (unsigned long long)bits,
2246 (unsigned long long)flags);
2247}
2248
2249/*
2250 100 after split brain try auto recover
2251 2 C_SYNC_SOURCE set BitMap
2252 1 C_SYNC_SOURCE use BitMap
2253 0 no Sync
2254 -1 C_SYNC_TARGET use BitMap
2255 -2 C_SYNC_TARGET set BitMap
2256 -100 after split brain, disconnect
2257-1000 unrelated data
2258 */
2259static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2260{
2261 u64 self, peer;
2262 int i, j;
2263
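	/* *rule_nr records which of the comparisons below decided the
	 * outcome; drbd_sync_handshake() logs it for diagnostics. */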
2264 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2265 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2266
2267 *rule_nr = 10;
2268 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2269 return 0;
2270
2271 *rule_nr = 20;
2272 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2273 peer != UUID_JUST_CREATED)
2274 return -2;
2275
2276 *rule_nr = 30;
2277 if (self != UUID_JUST_CREATED &&
2278 (peer == UUID_JUST_CREATED || peer == (u64)0))
2279 return 2;
2280
2281 if (self == peer) {
2282 int rct, dc; /* roles at crash time */
2283
2284 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2285
2286 if (mdev->agreed_pro_version < 91)
2287 return -1001;
2288
2289 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2290 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2291 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2292 drbd_uuid_set_bm(mdev, 0UL);
2293
2294 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2295 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2296 *rule_nr = 34;
2297 } else {
2298 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2299 *rule_nr = 36;
2300 }
2301
2302 return 1;
2303 }
2304
2305 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2306
2307 if (mdev->agreed_pro_version < 91)
2308 return -1001;
2309
2310 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2311 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2312 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2313
2314 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2315 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2316 mdev->p_uuid[UI_BITMAP] = 0UL;
2317
2318 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2319 *rule_nr = 35;
2320 } else {
2321 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2322 *rule_nr = 37;
2323 }
2324
2325 return -1;
2326 }
2327
2328 /* Common power [off|failure] */
2329 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2330 (mdev->p_uuid[UI_FLAGS] & 2);
2331 /* lowest bit is set when we were primary,
2332 * next bit (weight 2) is set when peer was primary */
2333 *rule_nr = 40;
2334
2335 switch (rct) {
2336 case 0: /* !self_pri && !peer_pri */ return 0;
2337 case 1: /* self_pri && !peer_pri */ return 1;
2338 case 2: /* !self_pri && peer_pri */ return -1;
2339 case 3: /* self_pri && peer_pri */
2340 dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2341 return dc ? -1 : 1;
2342 }
2343 }
2344
2345 *rule_nr = 50;
2346 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2347 if (self == peer)
2348 return -1;
2349
2350 *rule_nr = 51;
2351 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2352 if (self == peer) {
2353 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2354 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2355 if (self == peer) {
2356			/* The last P_SYNC_UUID did not get through. Undo the modifications
2357			   the peer made to its UUIDs when it last started a resync as sync source. */
2358
2359 if (mdev->agreed_pro_version < 91)
2360 return -1001;
2361
2362 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2363 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2364 return -1;
2365 }
2366 }
2367
2368 *rule_nr = 60;
2369 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2370 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2371 peer = mdev->p_uuid[i] & ~((u64)1);
2372 if (self == peer)
2373 return -2;
2374 }
2375
2376 *rule_nr = 70;
2377 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2378 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2379 if (self == peer)
2380 return 1;
2381
2382 *rule_nr = 71;
2383 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2384 if (self == peer) {
2385 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2386 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2387 if (self == peer) {
2388			/* The last P_SYNC_UUID did not get through. Undo the modifications
2389			   we made to our own UUIDs when we last started a resync as sync source. */
2390
2391 if (mdev->agreed_pro_version < 91)
2392 return -1001;
2393
2394 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2395 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2396
2397 dev_info(DEV, "Undid last start of resync:\n");
2398
2399 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2400 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2401
2402 return 1;
2403 }
2404 }
2405
2406
2407 *rule_nr = 80;
2408 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2409 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2410 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2411 if (self == peer)
2412 return 2;
2413 }
2414
2415 *rule_nr = 90;
2416 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2417 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2418 if (self == peer && self != ((u64)0))
2419 return 100;
2420
2421 *rule_nr = 100;
2422 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2423 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2424 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2425 peer = mdev->p_uuid[j] & ~((u64)1);
2426 if (self == peer)
2427 return -100;
2428 }
2429 }
2430
2431 return -1000;
2432}
2433
2434/* drbd_sync_handshake() returns the new conn state on success, or
2435   C_MASK (-1) on failure.
2436 */
2437static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2438 enum drbd_disk_state peer_disk) __must_hold(local)
2439{
2440 int hg, rule_nr;
2441 enum drbd_conns rv = C_MASK;
2442 enum drbd_disk_state mydisk;
2443
2444 mydisk = mdev->state.disk;
2445 if (mydisk == D_NEGOTIATING)
2446 mydisk = mdev->new_state_tmp.disk;
2447
2448 dev_info(DEV, "drbd_sync_handshake:\n");
2449 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2450 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2451 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2452
2453 hg = drbd_uuid_compare(mdev, &rule_nr);
2454
2455 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2456
2457 if (hg == -1000) {
2458 dev_alert(DEV, "Unrelated data, aborting!\n");
2459 return C_MASK;
2460 }
2461 if (hg == -1001) {
2462		dev_alert(DEV, "To resolve this both sides have to support at least protocol 91\n");
2463 return C_MASK;
2464 }
2465
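	/* Per drbd_uuid_compare(), |hg| == 1 means a bitmap-based resync is
	 * sufficient, |hg| == 2 requests a full sync (set the whole bitmap);
	 * doubling hg below therefore upgrades to a full sync. */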
2466 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2467 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2468 int f = (hg == -100) || abs(hg) == 2;
2469 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2470 if (f)
2471 hg = hg*2;
2472 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2473 hg > 0 ? "source" : "target");
2474 }
2475
2476 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2477 int pcount = (mdev->state.role == R_PRIMARY)
2478 + (peer_role == R_PRIMARY);
2479 int forced = (hg == -100);
2480
2481 switch (pcount) {
2482 case 0:
2483 hg = drbd_asb_recover_0p(mdev);
2484 break;
2485 case 1:
2486 hg = drbd_asb_recover_1p(mdev);
2487 break;
2488 case 2:
2489 hg = drbd_asb_recover_2p(mdev);
2490 break;
2491 }
2492 if (abs(hg) < 100) {
2493 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2494 "automatically solved. Sync from %s node\n",
2495 pcount, (hg < 0) ? "peer" : "this");
2496 if (forced) {
2497 dev_warn(DEV, "Doing a full sync, since"
2498			     " UUIDs were ambiguous.\n");
2499 hg = hg*2;
2500 }
2501 }
2502 }
2503
2504 if (hg == -100) {
2505 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2506 hg = -1;
2507 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2508 hg = 1;
2509
2510 if (abs(hg) < 100)
2511 dev_warn(DEV, "Split-Brain detected, manually solved. "
2512 "Sync from %s node\n",
2513 (hg < 0) ? "peer" : "this");
2514 }
2515
2516 if (hg == -100) {
2517 /* FIXME this log message is not correct if we end up here
2518 * after an attempted attach on a diskless node.
2519 * We just refuse to attach -- well, we drop the "connection"
2520 * to that disk, in a way... */
2521 dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
2522 drbd_khelper(mdev, "split-brain");
2523 return C_MASK;
2524 }
2525
2526 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2527 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2528 return C_MASK;
2529 }
2530
2531 if (hg < 0 && /* by intention we do not use mydisk here. */
2532 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2533 switch (mdev->net_conf->rr_conflict) {
2534 case ASB_CALL_HELPER:
2535 drbd_khelper(mdev, "pri-lost");
2536 /* fall through */
2537 case ASB_DISCONNECT:
2538 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2539 return C_MASK;
2540 case ASB_VIOLENTLY:
2541 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2542			     " assumption\n");
2543 }
2544 }
2545
2546 if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
2547 if (hg == 0)
2548 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2549 else
2550			dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.\n",
2551 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2552 abs(hg) >= 2 ? "full" : "bit-map based");
2553 return C_MASK;
2554 }
2555
2556 if (abs(hg) >= 2) {
2557 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2558 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2559 return C_MASK;
2560 }
2561
2562 if (hg > 0) { /* become sync source. */
2563 rv = C_WF_BITMAP_S;
2564 } else if (hg < 0) { /* become sync target */
2565 rv = C_WF_BITMAP_T;
2566 } else {
2567 rv = C_CONNECTED;
2568 if (drbd_bm_total_weight(mdev)) {
2569 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2570 drbd_bm_total_weight(mdev));
2571 }
2572 }
2573
2574 return rv;
2575}
2576
2577/* returns 1 if invalid */
2578static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2579{
2580 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2581 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2582 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2583 return 0;
2584
2585 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2586 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2587 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2588 return 1;
2589
2590 /* everything else is valid if they are equal on both sides. */
2591 if (peer == self)
2592 return 0;
2593
2594	 * everything else is invalid. */
2595 return 1;
2596}
2597
2598static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2599{
2600 struct p_protocol *p = (struct p_protocol *)h;
2601 int header_size, data_size;
2602 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2603 int p_want_lose, p_two_primaries, cf;
2604 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2605
2606 header_size = sizeof(*p) - sizeof(*h);
2607 data_size = h->length - header_size;
2608
2609 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2610 return FALSE;
2611
2612 p_proto = be32_to_cpu(p->protocol);
2613 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2614 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2615 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
2616 p_two_primaries = be32_to_cpu(p->two_primaries);
2617 cf = be32_to_cpu(p->conn_flags);
2618 p_want_lose = cf & CF_WANT_LOSE;
2619
2620 clear_bit(CONN_DRY_RUN, &mdev->flags);
2621
2622 if (cf & CF_DRY_RUN)
2623 set_bit(CONN_DRY_RUN, &mdev->flags);
2624
2625 if (p_proto != mdev->net_conf->wire_protocol) {
2626 dev_err(DEV, "incompatible communication protocols\n");
2627 goto disconnect;
2628 }
2629
2630 if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2631 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2632 goto disconnect;
2633 }
2634
2635 if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2636 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2637 goto disconnect;
2638 }
2639
2640 if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2641 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2642 goto disconnect;
2643 }
2644
2645 if (p_want_lose && mdev->net_conf->want_lose) {
2646 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2647 goto disconnect;
2648 }
2649
2650 if (p_two_primaries != mdev->net_conf->two_primaries) {
2651 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2652 goto disconnect;
2653 }
2654
2655 if (mdev->agreed_pro_version >= 87) {
2656 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2657
2658 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2659 return FALSE;
2660
2661 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2662 if (strcmp(p_integrity_alg, my_alg)) {
2663 dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2664 goto disconnect;
2665 }
2666 dev_info(DEV, "data-integrity-alg: %s\n",
2667 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2668 }
2669
2670 return TRUE;
2671
2672disconnect:
2673 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2674 return FALSE;
2675}
2676
2677/* helper function
2678 * input: alg name, feature name
2679 * return: NULL (alg name was "")
2680 * ERR_PTR(error) if something goes wrong
2681 * or the crypto hash ptr, if it worked out ok. */
2682struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2683 const char *alg, const char *name)
2684{
2685 struct crypto_hash *tfm;
2686
2687 if (!alg[0])
2688 return NULL;
2689
2690 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2691 if (IS_ERR(tfm)) {
2692 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2693 alg, name, PTR_ERR(tfm));
2694 return tfm;
2695 }
2696 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2697 crypto_free_hash(tfm);
2698 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2699 return ERR_PTR(-EINVAL);
2700 }
2701 return tfm;
2702}
2703
2704static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2705{
2706 int ok = TRUE;
2707 struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
2708 unsigned int header_size, data_size, exp_max_sz;
2709 struct crypto_hash *verify_tfm = NULL;
2710 struct crypto_hash *csums_tfm = NULL;
2711 const int apv = mdev->agreed_pro_version;
2712
2713 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2714 : apv == 88 ? sizeof(struct p_rs_param)
2715 + SHARED_SECRET_MAX
2716 : /* 89 */ sizeof(struct p_rs_param_89);
2717
2718 if (h->length > exp_max_sz) {
2719 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2720 h->length, exp_max_sz);
2721 return FALSE;
2722 }
2723
2724 if (apv <= 88) {
2725 header_size = sizeof(struct p_rs_param) - sizeof(*h);
2726 data_size = h->length - header_size;
2727 } else /* apv >= 89 */ {
2728 header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
2729 data_size = h->length - header_size;
2730 D_ASSERT(data_size == 0);
2731 }
2732
2733 /* initialize verify_alg and csums_alg */
2734 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2735
2736 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2737 return FALSE;
2738
2739 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2740
2741 if (apv >= 88) {
2742 if (apv == 88) {
2743 if (data_size > SHARED_SECRET_MAX) {
2744 dev_err(DEV, "verify-alg too long, "
2745				    "peer wants %u, accepting only %u bytes\n",
2746 data_size, SHARED_SECRET_MAX);
2747 return FALSE;
2748 }
2749
2750 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2751 return FALSE;
2752
2753 /* we expect NUL terminated string */
2754 /* but just in case someone tries to be evil */
2755 D_ASSERT(p->verify_alg[data_size-1] == 0);
2756 p->verify_alg[data_size-1] = 0;
2757
2758 } else /* apv >= 89 */ {
2759 /* we still expect NUL terminated strings */
2760 /* but just in case someone tries to be evil */
2761 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2762 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2763 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2764 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2765 }
2766
2767 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2768 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2769 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2770 mdev->sync_conf.verify_alg, p->verify_alg);
2771 goto disconnect;
2772 }
2773 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2774 p->verify_alg, "verify-alg");
2775 if (IS_ERR(verify_tfm)) {
2776 verify_tfm = NULL;
2777 goto disconnect;
2778 }
2779 }
2780
2781 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2782 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2783 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2784 mdev->sync_conf.csums_alg, p->csums_alg);
2785 goto disconnect;
2786 }
2787 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2788 p->csums_alg, "csums-alg");
2789 if (IS_ERR(csums_tfm)) {
2790 csums_tfm = NULL;
2791 goto disconnect;
2792 }
2793 }
2794
2795
2796 spin_lock(&mdev->peer_seq_lock);
2797 /* lock against drbd_nl_syncer_conf() */
2798 if (verify_tfm) {
2799 strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2800 mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2801 crypto_free_hash(mdev->verify_tfm);
2802 mdev->verify_tfm = verify_tfm;
2803 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2804 }
2805 if (csums_tfm) {
2806 strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2807 mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2808 crypto_free_hash(mdev->csums_tfm);
2809 mdev->csums_tfm = csums_tfm;
2810 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2811 }
2812 spin_unlock(&mdev->peer_seq_lock);
2813 }
2814
2815 return ok;
2816disconnect:
2817 /* just for completeness: actually not needed,
2818 * as this is not reached if csums_tfm was ok. */
2819 crypto_free_hash(csums_tfm);
2820 /* but free the verify_tfm again, if csums_tfm did not work out */
2821 crypto_free_hash(verify_tfm);
2822 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2823 return FALSE;
2824}
2825
2826static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
2827{
2828 /* sorry, we currently have no working implementation
2829 * of distributed TCQ */
2830}
2831
2832/* warn if the arguments differ by more than 12.5% */
2833static void warn_if_differ_considerably(struct drbd_conf *mdev,
2834 const char *s, sector_t a, sector_t b)
2835{
2836 sector_t d;
2837 if (a == 0 || b == 0)
2838 return;
2839 d = (a > b) ? (a - b) : (b - a);
2840 if (d > (a>>3) || d > (b>>3))
2841 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2842 (unsigned long long)a, (unsigned long long)b);
2843}
2844
2845static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2846{
2847 struct p_sizes *p = (struct p_sizes *)h;
2848 enum determine_dev_size dd = unchanged;
2849 unsigned int max_seg_s;
2850 sector_t p_size, p_usize, my_usize;
2851 int ldsc = 0; /* local disk size changed */
2852 enum drbd_conns nconn;
2853
2854 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2855 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2856 return FALSE;
2857
2858 p_size = be64_to_cpu(p->d_size);
2859 p_usize = be64_to_cpu(p->u_size);
2860
2861 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2862 dev_err(DEV, "some backing storage is needed\n");
2863 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2864 return FALSE;
2865 }
2866
2867 /* just store the peer's disk size for now.
2868 * we still need to figure out whether we accept that. */
2869 mdev->p_size = p_size;
2870
2871#define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r)))
2872 if (get_ldev(mdev)) {
2873 warn_if_differ_considerably(mdev, "lower level device sizes",
2874 p_size, drbd_get_max_capacity(mdev->ldev));
2875 warn_if_differ_considerably(mdev, "user requested size",
2876 p_usize, mdev->ldev->dc.disk_size);
2877
2878 /* if this is the first connect, or an otherwise expected
2879 * param exchange, choose the minimum */
2880 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2881 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
2882 p_usize);
2883
2884 my_usize = mdev->ldev->dc.disk_size;
2885
2886 if (mdev->ldev->dc.disk_size != p_usize) {
2887 mdev->ldev->dc.disk_size = p_usize;
2888 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
2889 (unsigned long)mdev->ldev->dc.disk_size);
2890 }
2891
2892 /* Never shrink a device with usable data during connect.
2893 But allow online shrinking if we are connected. */
2894 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
2895 drbd_get_capacity(mdev->this_bdev) &&
2896 mdev->state.disk >= D_OUTDATED &&
2897 mdev->state.conn < C_CONNECTED) {
2898 dev_err(DEV, "The peer's disk size is too small!\n");
2899 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2900 mdev->ldev->dc.disk_size = my_usize;
2901 put_ldev(mdev);
2902 return FALSE;
2903 }
2904 put_ldev(mdev);
2905 }
2906#undef min_not_zero
2907
2908 if (get_ldev(mdev)) {
2909 dd = drbd_determin_dev_size(mdev, 0);
2910 put_ldev(mdev);
2911 if (dd == dev_size_error)
2912 return FALSE;
2913 drbd_md_sync(mdev);
2914 } else {
2915 /* I am diskless, need to accept the peer's size. */
2916 drbd_set_my_capacity(mdev, p_size);
2917 }
2918
2919 if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
2920 nconn = drbd_sync_handshake(mdev,
2921 mdev->state.peer, mdev->state.pdsk);
2922 put_ldev(mdev);
2923
2924 if (nconn == C_MASK) {
2925 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2926 return FALSE;
2927 }
2928
2929 if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
2930 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2931 return FALSE;
2932 }
2933 }
2934
2935 if (get_ldev(mdev)) {
2936 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
2937 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2938 ldsc = 1;
2939 }
2940
2941 max_seg_s = be32_to_cpu(p->max_segment_size);
2942 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
2943 drbd_setup_queue_param(mdev, max_seg_s);
2944
2945 drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
2946 put_ldev(mdev);
2947 }
2948
2949 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
2950 if (be64_to_cpu(p->c_size) !=
2951 drbd_get_capacity(mdev->this_bdev) || ldsc) {
2952 /* we have different sizes, probably peer
2953 * needs to know my new size... */
2954 drbd_send_sizes(mdev, 0);
2955 }
2956 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
2957 (dd == grew && mdev->state.conn == C_CONNECTED)) {
2958 if (mdev->state.pdsk >= D_INCONSISTENT &&
2959 mdev->state.disk >= D_INCONSISTENT)
2960 resync_after_online_grow(mdev);
2961 else
2962 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
2963 }
2964 }
2965
2966 return TRUE;
2967}
2968
2969static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
2970{
2971 struct p_uuids *p = (struct p_uuids *)h;
2972 u64 *p_uuid;
2973 int i;
2974
2975 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2976 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2977 return FALSE;
2978
2979	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
	if (!p_uuid) /* allocation failed; drop the connection attempt */
		return FALSE;
2980
2981 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
2982 p_uuid[i] = be64_to_cpu(p->uuid[i]);
2983
2984 kfree(mdev->p_uuid);
2985 mdev->p_uuid = p_uuid;
2986
2987 if (mdev->state.conn < C_CONNECTED &&
2988 mdev->state.disk < D_INCONSISTENT &&
2989 mdev->state.role == R_PRIMARY &&
2990 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
2991 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
2992 (unsigned long long)mdev->ed_uuid);
2993 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2994 return FALSE;
2995 }
2996
2997 if (get_ldev(mdev)) {
2998 int skip_initial_sync =
2999 mdev->state.conn == C_CONNECTED &&
3000 mdev->agreed_pro_version >= 90 &&
3001 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3002 (p_uuid[UI_FLAGS] & 8);
3003 if (skip_initial_sync) {
3004 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3005 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3006 "clear_n_write from receive_uuids");
3007 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3008 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3009 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3010 CS_VERBOSE, NULL);
3011 drbd_md_sync(mdev);
3012 }
3013 put_ldev(mdev);
3014 }
3015
3016	/* Before we test the disk state, we should wait until a possibly
3017	   ongoing cluster-wide state change has finished. That is important if
3018 we are primary and are detaching from our disk. We need to see the
3019 new disk state... */
3020 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3021 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3022 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3023
3024 return TRUE;
3025}
3026
3027/**
3028 * convert_state() - Converts the peer's view of the cluster state to our point of view
3029 * @ps: The state as seen by the peer.
3030 */
3031static union drbd_state convert_state(union drbd_state ps)
3032{
3033 union drbd_state ms;
3034
3035 static enum drbd_conns c_tab[] = {
3036 [C_CONNECTED] = C_CONNECTED,
3037
3038 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3039 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3040 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3041 [C_VERIFY_S] = C_VERIFY_T,
3042 [C_MASK] = C_MASK,
3043 };
3044
3045 ms.i = ps.i;
3046
3047 ms.conn = c_tab[ps.conn];
3048 ms.peer = ps.role;
3049 ms.role = ps.peer;
3050 ms.pdsk = ps.disk;
3051 ms.disk = ps.pdsk;
3052 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3053
3054 return ms;
3055}
3056
3057static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3058{
3059 struct p_req_state *p = (struct p_req_state *)h;
3060 union drbd_state mask, val;
3061 int rv;
3062
3063 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3064 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3065 return FALSE;
3066
3067 mask.i = be32_to_cpu(p->mask);
3068 val.i = be32_to_cpu(p->val);
3069
3070 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3071 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3072 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3073 return TRUE;
3074 }
3075
3076 mask = convert_state(mask);
3077 val = convert_state(val);
3078
3079 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3080
3081 drbd_send_sr_reply(mdev, rv);
3082 drbd_md_sync(mdev);
3083
3084 return TRUE;
3085}
3086
3087static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3088{
3089 struct p_state *p = (struct p_state *)h;
3090 enum drbd_conns nconn, oconn;
3091 union drbd_state ns, peer_state;
3092 enum drbd_disk_state real_peer_disk;
3093 int rv;
3094
3095 ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3096 return FALSE;
3097
3098 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3099 return FALSE;
3100
3101 peer_state.i = be32_to_cpu(p->state);
3102
3103 real_peer_disk = peer_state.disk;
3104 if (peer_state.disk == D_NEGOTIATING) {
3105 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3106 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3107 }
3108
3109 spin_lock_irq(&mdev->req_lock);
3110 retry:
3111 oconn = nconn = mdev->state.conn;
3112 spin_unlock_irq(&mdev->req_lock);
3113
3114 if (nconn == C_WF_REPORT_PARAMS)
3115 nconn = C_CONNECTED;
3116
3117 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3118 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3119 int cr; /* consider resync */
3120
3121 /* if we established a new connection */
3122 cr = (oconn < C_CONNECTED);
3123 /* if we had an established connection
3124 * and one of the nodes newly attaches a disk */
3125 cr |= (oconn == C_CONNECTED &&
3126 (peer_state.disk == D_NEGOTIATING ||
3127 mdev->state.disk == D_NEGOTIATING));
3128 /* if we have both been inconsistent, and the peer has been
3129 * forced to be UpToDate with --overwrite-data */
3130 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3131 /* if we had been plain connected, and the admin requested to
3132 * start a sync by "invalidate" or "invalidate-remote" */
3133 cr |= (oconn == C_CONNECTED &&
3134 (peer_state.conn >= C_STARTING_SYNC_S &&
3135 peer_state.conn <= C_WF_BITMAP_T));
3136
3137 if (cr)
3138 nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3139
3140 put_ldev(mdev);
3141 if (nconn == C_MASK) {
3142 nconn = C_CONNECTED;
3143 if (mdev->state.disk == D_NEGOTIATING) {
3144 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3145 } else if (peer_state.disk == D_NEGOTIATING) {
3146 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3147 peer_state.disk = D_DISKLESS;
3148 real_peer_disk = D_DISKLESS;
3149 } else {
3150 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3151 return FALSE;
3152 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3153 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3154 return FALSE;
3155 }
3156 }
3157 }
3158
3159 spin_lock_irq(&mdev->req_lock);
3160 if (mdev->state.conn != oconn)
3161 goto retry;
3162 clear_bit(CONSIDER_RESYNC, &mdev->flags);
3163 ns.i = mdev->state.i;
3164 ns.conn = nconn;
3165 ns.peer = peer_state.role;
3166 ns.pdsk = real_peer_disk;
3167 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3168 if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3169 ns.disk = mdev->new_state_tmp.disk;
3170
3171 rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
3172 ns = mdev->state;
3173 spin_unlock_irq(&mdev->req_lock);
3174
3175 if (rv < SS_SUCCESS) {
3176 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3177 return FALSE;
3178 }
3179
3180 if (oconn > C_WF_REPORT_PARAMS) {
3181 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3182		    peer_state.disk != D_NEGOTIATING) {
3183 /* we want resync, peer has not yet decided to sync... */
3184 /* Nowadays only used when forcing a node into primary role and
3185 setting its disk to UpToDate with that */
3186 drbd_send_uuids(mdev);
3187 drbd_send_state(mdev);
3188 }
3189 }
3190
3191 mdev->net_conf->want_lose = 0;
3192
3193 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3194
3195 return TRUE;
3196}
3197
3198static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3199{
3200 struct p_rs_uuid *p = (struct p_rs_uuid *)h;
3201
3202 wait_event(mdev->misc_wait,
3203 mdev->state.conn == C_WF_SYNC_UUID ||
3204 mdev->state.conn < C_CONNECTED ||
3205 mdev->state.disk < D_NEGOTIATING);
3206
3207 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3208
3209 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3210 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3211 return FALSE;
3212
3213 /* Here the _drbd_uuid_ functions are right, current should
3214 _not_ be rotated into the history */
3215 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3216 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3217 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3218
3219 drbd_start_resync(mdev, C_SYNC_TARGET);
3220
3221 put_ldev(mdev);
3222 } else
3223 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3224
3225 return TRUE;
3226}
3227
3228enum receive_bitmap_ret { OK, DONE, FAILED };
3229
3230static enum receive_bitmap_ret
3231receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
3232 unsigned long *buffer, struct bm_xfer_ctx *c)
3233{
3234 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3235 unsigned want = num_words * sizeof(long);
3236
3237 if (want != h->length) {
3238 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
3239 return FAILED;
3240 }
3241 if (want == 0)
3242 return DONE;
3243 if (drbd_recv(mdev, buffer, want) != want)
3244 return FAILED;
3245
3246 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3247
3248 c->word_offset += num_words;
3249 c->bit_offset = c->word_offset * BITS_PER_LONG;
3250 if (c->bit_offset > c->bm_bits)
3251 c->bit_offset = c->bm_bits;
3252
3253 return OK;
3254}
3255
3256static enum receive_bitmap_ret
3257recv_bm_rle_bits(struct drbd_conf *mdev,
3258 struct p_compressed_bm *p,
3259 struct bm_xfer_ctx *c)
3260{
3261 struct bitstream bs;
3262 u64 look_ahead;
3263 u64 rl;
3264 u64 tmp;
3265 unsigned long s = c->bit_offset;
3266 unsigned long e;
3267 int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3268 int toggle = DCBP_get_start(p);
3269 int have;
3270 int bits;
3271
3272 bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3273
3274 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3275 if (bits < 0)
3276 return FAILED;
3277
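	/* The VLI-encoded run lengths describe alternating runs of clear and
	 * set bits; only the "set" runs (toggle != 0) touch the bitmap, the
	 * "clear" runs merely advance the bit offset s. */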
3278 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3279 bits = vli_decode_bits(&rl, look_ahead);
3280 if (bits <= 0)
3281 return FAILED;
3282
3283 if (toggle) {
3284 e = s + rl -1;
3285 if (e >= c->bm_bits) {
3286 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3287 return FAILED;
3288 }
3289 _drbd_bm_set_bits(mdev, s, e);
3290 }
3291
3292 if (have < bits) {
3293 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3294 have, bits, look_ahead,
3295 (unsigned int)(bs.cur.b - p->code),
3296 (unsigned int)bs.buf_len);
3297 return FAILED;
3298 }
3299 look_ahead >>= bits;
3300 have -= bits;
3301
3302 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3303 if (bits < 0)
3304 return FAILED;
3305 look_ahead |= tmp << have;
3306 have += bits;
3307 }
3308
3309 c->bit_offset = s;
3310 bm_xfer_ctx_bit_to_word_offset(c);
3311
3312 return (s == c->bm_bits) ? DONE : OK;
3313}
3314
3315static enum receive_bitmap_ret
3316decode_bitmap_c(struct drbd_conf *mdev,
3317 struct p_compressed_bm *p,
3318 struct bm_xfer_ctx *c)
3319{
3320 if (DCBP_get_code(p) == RLE_VLI_Bits)
3321 return recv_bm_rle_bits(mdev, p, c);
3322
3323 /* other variants had been implemented for evaluation,
3324 * but have been dropped as this one turned out to be "best"
3325 * during all our tests. */
3326
3327 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3328 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3329 return FAILED;
3330}
3331
3332void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3333 const char *direction, struct bm_xfer_ctx *c)
3334{
3335 /* what would it take to transfer it "plaintext" */
3336 unsigned plain = sizeof(struct p_header) *
3337 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3338 + c->bm_words * sizeof(long);
3339 unsigned total = c->bytes[0] + c->bytes[1];
3340 unsigned r;
3341
3342 /* total can not be zero. but just in case: */
3343 if (total == 0)
3344 return;
3345
3346 /* don't report if not compressed */
3347 if (total >= plain)
3348 return;
3349
3350 /* total < plain. check for overflow, still */
3351 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3352 : (1000 * total / plain);
3353
3354 if (r > 1000)
3355 r = 1000;
3356
3357 r = 1000 - r;
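	/* Worked example (hypothetical numbers): plain = 4096, total = 512
	 * => r = 1000*512/4096 = 125, then 1000 - 125 = 875,
	 * reported below as "compression: 87.5%". */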
3358 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3359 "total %u; compression: %u.%u%%\n",
3360 direction,
3361 c->bytes[1], c->packets[1],
3362 c->bytes[0], c->packets[0],
3363 total, r/10, r % 10);
3364}
3365
3366/* Since we are processing the bitfield from lower addresses to higher,
3367   it does not matter whether we process it in 32 bit or 64 bit
3368   chunks, as long as it is little endian. (Understand it as a byte stream,
3369   beginning with the lowest byte...) If we used big endian,
3370   we would need to process it from the highest address to the lowest
3371   in order to be agnostic to the 32 vs 64 bit issue.
3372
3373 returns 0 on failure, 1 if we successfully received it. */
3374static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3375{
3376 struct bm_xfer_ctx c;
3377 void *buffer;
3378 enum receive_bitmap_ret ret;
3379 int ok = FALSE;
3380
3381 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3382
3383 drbd_bm_lock(mdev, "receive bitmap");
3384
3385 /* maybe we should use some per thread scratch page,
3386 * and allocate that during initial device creation? */
3387 buffer = (unsigned long *) __get_free_page(GFP_NOIO);
3388 if (!buffer) {
3389 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3390 goto out;
3391 }
3392
3393 c = (struct bm_xfer_ctx) {
3394 .bm_bits = drbd_bm_bits(mdev),
3395 .bm_words = drbd_bm_words(mdev),
3396 };
3397
3398 do {
3399 if (h->command == P_BITMAP) {
3400 ret = receive_bitmap_plain(mdev, h, buffer, &c);
3401 } else if (h->command == P_COMPRESSED_BITMAP) {
3402 /* MAYBE: sanity check that we speak proto >= 90,
3403 * and the feature is enabled! */
3404 struct p_compressed_bm *p;
3405
3406 if (h->length > BM_PACKET_PAYLOAD_BYTES) {
3407 dev_err(DEV, "ReportCBitmap packet too large\n");
3408 goto out;
3409 }
3410			/* use the page buffer */
3411 p = buffer;
3412 memcpy(p, h, sizeof(*h));
3413 if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
3414 goto out;
3415 if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3416 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3417				goto out; /* returning FAILED (nonzero) here would look like success to the caller */
3418 }
3419 ret = decode_bitmap_c(mdev, p, &c);
3420 } else {
3421			dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", h->command);
3422 goto out;
3423 }
3424
3425 c.packets[h->command == P_BITMAP]++;
3426 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
3427
3428 if (ret != OK)
3429 break;
3430
3431 if (!drbd_recv_header(mdev, h))
3432 goto out;
3433 } while (ret == OK);
3434 if (ret == FAILED)
3435 goto out;
3436
3437 INFO_bm_xfer_stats(mdev, "receive", &c);
3438
3439 if (mdev->state.conn == C_WF_BITMAP_T) {
3440 ok = !drbd_send_bitmap(mdev);
3441 if (!ok)
3442 goto out;
3443 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3444 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3445 D_ASSERT(ok == SS_SUCCESS);
3446 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3447 /* admin may have requested C_DISCONNECTING,
3448 * other threads may have noticed network errors */
3449 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3450 drbd_conn_str(mdev->state.conn));
3451 }
3452
3453 ok = TRUE;
3454 out:
3455 drbd_bm_unlock(mdev);
3456 if (ok && mdev->state.conn == C_WF_BITMAP_S)
3457 drbd_start_resync(mdev, C_SYNC_SOURCE);
3458 free_page((unsigned long) buffer);
3459 return ok;
3460}
3461
3462static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
3463{
3464 /* TODO zero copy sink :) */
3465 static char sink[128];
3466 int size, want, r;
3467
3468 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3469 h->command, h->length);
3470
3471 size = h->length;
3472 while (size > 0) {
3473 want = min_t(int, size, sizeof(sink));
3474 r = drbd_recv(mdev, sink, want);
3475 ERR_IF(r <= 0) break;
3476 size -= r;
3477 }
3478 return size == 0;
3479}
3480
3481static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3482{
3483 if (mdev->state.disk >= D_INCONSISTENT)
3484 drbd_kick_lo(mdev);
3485
3486 /* Make sure we've acked all the TCP data associated
3487 * with the data requests being unplugged */
3488 drbd_tcp_quickack(mdev->data.socket);
3489
3490 return TRUE;
3491}
3492
3493typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
3494
3495static drbd_cmd_handler_f drbd_default_handler[] = {
3496 [P_DATA] = receive_Data,
3497 [P_DATA_REPLY] = receive_DataReply,
3498 [P_RS_DATA_REPLY] = receive_RSDataReply,
3499 [P_BARRIER] = receive_Barrier,
3500 [P_BITMAP] = receive_bitmap,
3501 [P_COMPRESSED_BITMAP] = receive_bitmap,
3502 [P_UNPLUG_REMOTE] = receive_UnplugRemote,
3503 [P_DATA_REQUEST] = receive_DataRequest,
3504 [P_RS_DATA_REQUEST] = receive_DataRequest,
3505 [P_SYNC_PARAM] = receive_SyncParam,
3506 [P_SYNC_PARAM89] = receive_SyncParam,
3507 [P_PROTOCOL] = receive_protocol,
3508 [P_UUIDS] = receive_uuids,
3509 [P_SIZES] = receive_sizes,
3510 [P_STATE] = receive_state,
3511 [P_STATE_CHG_REQ] = receive_req_state,
3512 [P_SYNC_UUID] = receive_sync_uuid,
3513 [P_OV_REQUEST] = receive_DataRequest,
3514 [P_OV_REPLY] = receive_DataRequest,
3515 [P_CSUM_RS_REQUEST] = receive_DataRequest,
3516 /* anything missing from this table is in
3517 * the asender_tbl, see get_asender_cmd */
3518 [P_MAX_CMD] = NULL,
3519};
3520
3521static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
3522static drbd_cmd_handler_f *drbd_opt_cmd_handler;
3523
3524static void drbdd(struct drbd_conf *mdev)
3525{
3526 drbd_cmd_handler_f handler;
3527 struct p_header *header = &mdev->data.rbuf.header;
3528
3529 while (get_t_state(&mdev->receiver) == Running) {
3530 drbd_thread_current_set_cpu(mdev);
3531 if (!drbd_recv_header(mdev, header)) {
3532 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3533 break;
3534 }
3535
3536 if (header->command < P_MAX_CMD)
3537 handler = drbd_cmd_handler[header->command];
3538 else if (P_MAY_IGNORE < header->command
3539 && header->command < P_MAX_OPT_CMD)
3540 handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3541 else if (header->command > P_MAX_OPT_CMD)
3542 handler = receive_skip;
3543 else
3544 handler = NULL;
3545
3546 if (unlikely(!handler)) {
3547 dev_err(DEV, "unknown packet type %d, l: %d!\n",
3548 header->command, header->length);
3549 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3550 break;
3551 }
3552 if (unlikely(!handler(mdev, header))) {
3553 dev_err(DEV, "error receiving %s, l: %d!\n",
3554 cmdname(header->command), header->length);
3555 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3556 break;
3557 }
3558 }
3559}
3560
3561static void drbd_fail_pending_reads(struct drbd_conf *mdev)
3562{
3563 struct hlist_head *slot;
3564 struct hlist_node *pos;
3565 struct hlist_node *tmp;
3566 struct drbd_request *req;
3567 int i;
3568
3569 /*
3570 * Application READ requests
3571 */
3572 spin_lock_irq(&mdev->req_lock);
3573 for (i = 0; i < APP_R_HSIZE; i++) {
3574 slot = mdev->app_reads_hash+i;
3575 hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
3576 /* it may (but should not any longer!)
3577 * be on the work queue; if that assert triggers,
3578 * we need to also grab the
3579 * spin_lock_irq(&mdev->data.work.q_lock);
3580 * and list_del_init here. */
3581 D_ASSERT(list_empty(&req->w.list));
3582 /* It would be nice to complete outside of spinlock.
3583 * But this is easier for now. */
3584 _req_mod(req, connection_lost_while_pending);
3585 }
3586 }
3587 for (i = 0; i < APP_R_HSIZE; i++)
3588 if (!hlist_empty(mdev->app_reads_hash+i))
3589 dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
3590 "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
3591
3592 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
3593 spin_unlock_irq(&mdev->req_lock);
3594}
3595
3596void drbd_flush_workqueue(struct drbd_conf *mdev)
3597{
3598 struct drbd_wq_barrier barr;
3599
3600 barr.w.cb = w_prev_work_done;
3601 init_completion(&barr.done);
3602 drbd_queue_work(&mdev->data.work, &barr.w);
3603 wait_for_completion(&barr.done);
3604}
3605
3606static void drbd_disconnect(struct drbd_conf *mdev)
3607{
3608 enum drbd_fencing_p fp;
3609 union drbd_state os, ns;
3610 int rv = SS_UNKNOWN_ERROR;
3611 unsigned int i;
3612
3613 if (mdev->state.conn == C_STANDALONE)
3614 return;
3615 if (mdev->state.conn >= C_WF_CONNECTION)
3616 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3617 drbd_conn_str(mdev->state.conn));
3618
3619 /* asender does not clean up anything. it must not interfere, either */
3620 drbd_thread_stop(&mdev->asender);
3621 drbd_free_sock(mdev);
3622
3623 spin_lock_irq(&mdev->req_lock);
3624 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3625 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3626 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3627 spin_unlock_irq(&mdev->req_lock);
3628
3629 /* We do not have data structures that would allow us to
3630 * get the rs_pending_cnt down to 0 again.
3631 * * On C_SYNC_TARGET we do not have any data structures describing
3632 * the pending RSDataRequest's we have sent.
3633 * * On C_SYNC_SOURCE there is no data structure that tracks
3634 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3635 * And no, it is not the sum of the reference counts in the
3636 * resync_LRU. The resync_LRU tracks the whole operation including
3637 * the disk-IO, while the rs_pending_cnt only tracks the blocks
3638 * on the fly. */
3639 drbd_rs_cancel_all(mdev);
3640 mdev->rs_total = 0;
3641 mdev->rs_failed = 0;
3642 atomic_set(&mdev->rs_pending_cnt, 0);
3643 wake_up(&mdev->misc_wait);
3644
3645 /* make sure syncer is stopped and w_resume_next_sg queued */
3646 del_timer_sync(&mdev->resync_timer);
3647 set_bit(STOP_SYNC_TIMER, &mdev->flags);
3648 resync_timer_fn((unsigned long)mdev);
3649
3650 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3651 * w_make_resync_request etc. which may still be on the worker queue
3652 * to be "canceled" */
3653 drbd_flush_workqueue(mdev);
3654
3655 /* This also does reclaim_net_ee(). If we do this too early, we might
3656 * miss some resync ee and pages.*/
3657 drbd_process_done_ee(mdev);
3658
3659 kfree(mdev->p_uuid);
3660 mdev->p_uuid = NULL;
3661
3662 if (!mdev->state.susp)
3663 tl_clear(mdev);
3664
3665 drbd_fail_pending_reads(mdev);
3666
3667 dev_info(DEV, "Connection closed\n");
3668
3669 drbd_md_sync(mdev);
3670
3671 fp = FP_DONT_CARE;
3672 if (get_ldev(mdev)) {
3673 fp = mdev->ldev->dc.fencing;
3674 put_ldev(mdev);
3675 }
3676
3677 if (mdev->state.role == R_PRIMARY) {
3678 if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
3679 enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
3680 drbd_request_state(mdev, NS(pdsk, nps));
3681 }
3682 }
3683
3684 spin_lock_irq(&mdev->req_lock);
3685 os = mdev->state;
3686 if (os.conn >= C_UNCONNECTED) {
3687 /* Do not restart in case we are C_DISCONNECTING */
3688 ns = os;
3689 ns.conn = C_UNCONNECTED;
3690 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3691 }
3692 spin_unlock_irq(&mdev->req_lock);
3693
3694 if (os.conn == C_DISCONNECTING) {
3695 struct hlist_head *h;
3696 wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
3697
3698 /* we must not free the tl_hash
3699 * while application io is still on the fly */
3700 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
3701
3702 spin_lock_irq(&mdev->req_lock);
3703 /* paranoia code */
3704 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3705 if (h->first)
3706 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3707 (int)(h - mdev->ee_hash), h->first);
3708 kfree(mdev->ee_hash);
3709 mdev->ee_hash = NULL;
3710 mdev->ee_hash_s = 0;
3711
3712 /* paranoia code */
3713 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3714 if (h->first)
3715 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3716 (int)(h - mdev->tl_hash), h->first);
3717 kfree(mdev->tl_hash);
3718 mdev->tl_hash = NULL;
3719 mdev->tl_hash_s = 0;
3720 spin_unlock_irq(&mdev->req_lock);
3721
3722 crypto_free_hash(mdev->cram_hmac_tfm);
3723 mdev->cram_hmac_tfm = NULL;
3724
3725 kfree(mdev->net_conf);
3726 mdev->net_conf = NULL;
3727 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3728 }
3729
3730 /* tcp_close and release of sendpage pages can be deferred. I don't
3731 * want to use SO_LINGER, because apparently it can be deferred for
3732 * more than 20 seconds (longest time I checked).
3733 *
3734	 * Actually we don't care exactly when the network stack does its
3735 * put_page(), but release our reference on these pages right here.
3736 */
3737 i = drbd_release_ee(mdev, &mdev->net_ee);
3738 if (i)
3739 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3740 i = atomic_read(&mdev->pp_in_use);
3741 if (i)
3742 dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
3743
3744 D_ASSERT(list_empty(&mdev->read_ee));
3745 D_ASSERT(list_empty(&mdev->active_ee));
3746 D_ASSERT(list_empty(&mdev->sync_ee));
3747 D_ASSERT(list_empty(&mdev->done_ee));
3748
3749 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3750 atomic_set(&mdev->current_epoch->epoch_size, 0);
3751 D_ASSERT(list_empty(&mdev->current_epoch->list));
3752}
3753
3754/*
3755 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3756 * we can agree on is stored in agreed_pro_version.
3757 *
3758 * Feature flags and the reserved array should leave enough room for future
3759 * enhancements of the handshake protocol, and possible plugins...
3760 *
3761 * For now, they are expected to be zero, but their values are ignored.
3762 */
3763static int drbd_send_handshake(struct drbd_conf *mdev)
3764{
3765 /* ASSERT current == mdev->receiver ... */
3766 struct p_handshake *p = &mdev->data.sbuf.handshake;
3767 int ok;
3768
3769 if (mutex_lock_interruptible(&mdev->data.mutex)) {
3770 dev_err(DEV, "interrupted during initial handshake\n");
3771 return 0; /* interrupted. not ok. */
3772 }
3773
3774 if (mdev->data.socket == NULL) {
3775 mutex_unlock(&mdev->data.mutex);
3776 return 0;
3777 }
3778
3779 memset(p, 0, sizeof(*p));
3780 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3781 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3782 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3783 (struct p_header *)p, sizeof(*p), 0 );
3784 mutex_unlock(&mdev->data.mutex);
3785 return ok;
3786}
3787
3788/*
3789 * return values:
3790 * 1 yes, we have a valid connection
3791 * 0 oops, did not work out, please try again
3792 * -1 peer talks different language,
3793 * no point in trying again, please go standalone.
3794 */
3795static int drbd_do_handshake(struct drbd_conf *mdev)
3796{
3797 /* ASSERT current == mdev->receiver ... */
3798 struct p_handshake *p = &mdev->data.rbuf.handshake;
3799 const int expect = sizeof(struct p_handshake)
3800 -sizeof(struct p_header);
3801 int rv;
3802
3803 rv = drbd_send_handshake(mdev);
3804 if (!rv)
3805 return 0;
3806
3807 rv = drbd_recv_header(mdev, &p->head);
3808 if (!rv)
3809 return 0;
3810
3811 if (p->head.command != P_HAND_SHAKE) {
3812 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3813 cmdname(p->head.command), p->head.command);
3814 return -1;
3815 }
3816
3817 if (p->head.length != expect) {
3818 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3819 expect, p->head.length);
3820 return -1;
3821 }
3822
3823 rv = drbd_recv(mdev, &p->head.payload, expect);
3824
3825 if (rv != expect) {
3826 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3827 return 0;
3828 }
3829
3830 p->protocol_min = be32_to_cpu(p->protocol_min);
3831 p->protocol_max = be32_to_cpu(p->protocol_max);
3832 if (p->protocol_max == 0)
3833 p->protocol_max = p->protocol_min;
3834
3835 if (PRO_VERSION_MAX < p->protocol_min ||
3836 PRO_VERSION_MIN > p->protocol_max)
3837 goto incompat;
3838
3839 mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3840
3841 dev_info(DEV, "Handshake successful: "
3842 "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3843
3844 return 1;
3845
3846 incompat:
3847 dev_err(DEV, "incompatible DRBD dialects: "
3848 "I support %d-%d, peer supports %d-%d\n",
3849 PRO_VERSION_MIN, PRO_VERSION_MAX,
3850 p->protocol_min, p->protocol_max);
3851 return -1;
3852}
3853
3854#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3855static int drbd_do_auth(struct drbd_conf *mdev)
3856{
3857	dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
3858 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
3859 return -1;
3860}
3861#else
3862#define CHALLENGE_LEN 64
3863
3864/* Return value:
3865 1 - auth succeeded,
3866 0 - failed, try again (network error),
3867 -1 - auth failed, don't try again.
3868*/
3869
3870static int drbd_do_auth(struct drbd_conf *mdev)
3871{
3872 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
3873 struct scatterlist sg;
3874 char *response = NULL;
3875 char *right_response = NULL;
3876 char *peers_ch = NULL;
3877 struct p_header p;
3878 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
3879 unsigned int resp_size;
3880 struct hash_desc desc;
3881 int rv;
3882
3883 desc.tfm = mdev->cram_hmac_tfm;
3884 desc.flags = 0;
3885
3886 rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
3887 (u8 *)mdev->net_conf->shared_secret, key_len);
3888 if (rv) {
3889 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
3890 rv = -1;
3891 goto fail;
3892 }
3893
3894 get_random_bytes(my_challenge, CHALLENGE_LEN);
3895
3896 rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
3897 if (!rv)
3898 goto fail;
3899
3900 rv = drbd_recv_header(mdev, &p);
3901 if (!rv)
3902 goto fail;
3903
3904 if (p.command != P_AUTH_CHALLENGE) {
3905 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
3906 cmdname(p.command), p.command);
3907 rv = 0;
3908 goto fail;
3909 }
3910
3911 if (p.length > CHALLENGE_LEN*2) {
3912		dev_err(DEV, "AuthChallenge payload bigger than expected.\n");
3913 rv = -1;
3914 goto fail;
3915 }
3916
3917 peers_ch = kmalloc(p.length, GFP_NOIO);
3918 if (peers_ch == NULL) {
3919 dev_err(DEV, "kmalloc of peers_ch failed\n");
3920 rv = -1;
3921 goto fail;
3922 }
3923
3924 rv = drbd_recv(mdev, peers_ch, p.length);
3925
3926 if (rv != p.length) {
3927 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
3928 rv = 0;
3929 goto fail;
3930 }
3931
3932 resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
3933 response = kmalloc(resp_size, GFP_NOIO);
3934 if (response == NULL) {
3935 dev_err(DEV, "kmalloc of response failed\n");
3936 rv = -1;
3937 goto fail;
3938 }
3939
3940 sg_init_table(&sg, 1);
3941 sg_set_buf(&sg, peers_ch, p.length);
3942
3943 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
3944 if (rv) {
3945 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3946 rv = -1;
3947 goto fail;
3948 }
3949
3950 rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
3951 if (!rv)
3952 goto fail;
3953
3954 rv = drbd_recv_header(mdev, &p);
3955 if (!rv)
3956 goto fail;
3957
3958 if (p.command != P_AUTH_RESPONSE) {
3959 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
3960 cmdname(p.command), p.command);
3961 rv = 0;
3962 goto fail;
3963 }
3964
3965 if (p.length != resp_size) {
3966		dev_err(DEV, "AuthResponse payload has unexpected size\n");
3967 rv = 0;
3968 goto fail;
3969 }
3970
3971	rv = drbd_recv(mdev, response, resp_size);
3972
3973 if (rv != resp_size) {
3974 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
3975 rv = 0;
3976 goto fail;
3977 }
3978
3979 right_response = kmalloc(resp_size, GFP_NOIO);
3980 if (right_response == NULL) {
3981 dev_err(DEV, "kmalloc of right_response failed\n");
3982 rv = -1;
3983 goto fail;
3984 }
3985
3986 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
3987
3988 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
3989 if (rv) {
3990 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3991 rv = -1;
3992 goto fail;
3993 }
3994
3995 rv = !memcmp(response, right_response, resp_size);
3996
3997 if (rv)
3998 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
3999 resp_size, mdev->net_conf->cram_hmac_alg);
4000 else
4001 rv = -1;
4002
4003 fail:
4004 kfree(peers_ch);
4005 kfree(response);
4006 kfree(right_response);
4007
4008 return rv;
4009}
4010#endif
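The exchange above is a symmetric challenge-response: each side sends a random challenge, receives HMAC(shared_secret, challenge) back, and recomputes that digest locally to verify it. A rough userspace-style sketch of the verification step only; hmac_fn is an abstract stand-in for whatever keyed digest was configured, not a real API:

#include <stddef.h>
#include <string.h>

/* abstract keyed digest: out must hold digest_len bytes */
typedef void (*hmac_fn)(const void *key, size_t key_len,
			const void *msg, size_t msg_len, unsigned char *out);

/* both peers share the secret, so we can recompute what the peer
 * should have answered to *our* challenge and compare */
static int peer_response_ok(hmac_fn hmac,
			    const void *secret, size_t secret_len,
			    const unsigned char *my_challenge, size_t chall_len,
			    const unsigned char *peer_response,
			    unsigned char *expected, size_t digest_len)
{
	hmac(secret, secret_len, my_challenge, chall_len, expected);
	return memcmp(expected, peer_response, digest_len) == 0;
}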
4011
4012int drbdd_init(struct drbd_thread *thi)
4013{
4014 struct drbd_conf *mdev = thi->mdev;
4015 unsigned int minor = mdev_to_minor(mdev);
4016 int h;
4017
4018 sprintf(current->comm, "drbd%d_receiver", minor);
4019
4020 dev_info(DEV, "receiver (re)started\n");
4021
4022 do {
4023 h = drbd_connect(mdev);
4024 if (h == 0) {
4025 drbd_disconnect(mdev);
4026 __set_current_state(TASK_INTERRUPTIBLE);
4027 schedule_timeout(HZ);
4028 }
4029 if (h == -1) {
4030 dev_warn(DEV, "Discarding network configuration.\n");
4031 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4032 }
4033 } while (h == 0);
4034
4035 if (h > 0) {
4036 if (get_net_conf(mdev)) {
4037 drbdd(mdev);
4038 put_net_conf(mdev);
4039 }
4040 }
4041
4042 drbd_disconnect(mdev);
4043
4044 dev_info(DEV, "receiver terminated\n");
4045 return 0;
4046}
4047
4048/* ********* acknowledge sender ******** */
4049
4050static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4051{
4052 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4053
4054 int retcode = be32_to_cpu(p->retcode);
4055
4056 if (retcode >= SS_SUCCESS) {
4057 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4058 } else {
4059 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4060 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4061 drbd_set_st_err_str(retcode), retcode);
4062 }
4063 wake_up(&mdev->state_wait);
4064
4065 return TRUE;
4066}
4067
4068static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
4069{
4070 return drbd_send_ping_ack(mdev);
4071
4072}
4073
4074static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
4075{
4076 /* restore idle timeout */
4077 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4078 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4079 wake_up(&mdev->misc_wait);
4080
4081 return TRUE;
4082}
4083
4084static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4085{
4086 struct p_block_ack *p = (struct p_block_ack *)h;
4087 sector_t sector = be64_to_cpu(p->sector);
4088 int blksize = be32_to_cpu(p->blksize);
4089
4090 D_ASSERT(mdev->agreed_pro_version >= 89);
4091
4092 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4093
4094 drbd_rs_complete_io(mdev, sector);
4095 drbd_set_in_sync(mdev, sector, blksize);
4096 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4097 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4098 dec_rs_pending(mdev);
4099
4100 return TRUE;
4101}
4102
4103/* when we receive the ACK for a write request,
4104 * verify that we actually know about it */
4105static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4106 u64 id, sector_t sector)
4107{
4108 struct hlist_head *slot = tl_hash_slot(mdev, sector);
4109 struct hlist_node *n;
4110 struct drbd_request *req;
4111
4112 hlist_for_each_entry(req, n, slot, colision) {
4113 if ((unsigned long)req == (unsigned long)id) {
4114 if (req->sector != sector) {
4115 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4116 "wrong sector (%llus versus %llus)\n", req,
4117 (unsigned long long)req->sector,
4118 (unsigned long long)sector);
4119 break;
4120 }
4121 return req;
4122 }
4123 }
4124 dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4125 (void *)(unsigned long)id, (unsigned long long)sector);
4126 return NULL;
4127}
4128
4129typedef struct drbd_request *(req_validator_fn)
4130 (struct drbd_conf *mdev, u64 id, sector_t sector);
4131
4132static int validate_req_change_req_state(struct drbd_conf *mdev,
4133 u64 id, sector_t sector, req_validator_fn validator,
4134 const char *func, enum drbd_req_event what)
4135{
4136 struct drbd_request *req;
4137 struct bio_and_error m;
4138
4139 spin_lock_irq(&mdev->req_lock);
4140 req = validator(mdev, id, sector);
4141 if (unlikely(!req)) {
4142 spin_unlock_irq(&mdev->req_lock);
4143 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4144 return FALSE;
4145 }
4146 __req_mod(req, what, &m);
4147 spin_unlock_irq(&mdev->req_lock);
4148
4149 if (m.bio)
4150 complete_master_bio(mdev, &m);
4151 return TRUE;
4152}
4153
4154static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4155{
4156 struct p_block_ack *p = (struct p_block_ack *)h;
4157 sector_t sector = be64_to_cpu(p->sector);
4158 int blksize = be32_to_cpu(p->blksize);
4159 enum drbd_req_event what;
4160
4161 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4162
4163 if (is_syncer_block_id(p->block_id)) {
4164 drbd_set_in_sync(mdev, sector, blksize);
4165 dec_rs_pending(mdev);
4166 return TRUE;
4167 }
4168 switch (be16_to_cpu(h->command)) {
4169 case P_RS_WRITE_ACK:
4170 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4171 what = write_acked_by_peer_and_sis;
4172 break;
4173 case P_WRITE_ACK:
4174 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4175 what = write_acked_by_peer;
4176 break;
4177 case P_RECV_ACK:
4178 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4179 what = recv_acked_by_peer;
4180 break;
4181 case P_DISCARD_ACK:
4182 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4183 what = conflict_discarded_by_peer;
4184 break;
4185 default:
4186 D_ASSERT(0);
4187 return FALSE;
4188 }
4189
4190 return validate_req_change_req_state(mdev, p->block_id, sector,
4191 _ack_id_to_req, __func__ , what);
4192}
4193
4194static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
4195{
4196 struct p_block_ack *p = (struct p_block_ack *)h;
4197 sector_t sector = be64_to_cpu(p->sector);
4198
4199 if (__ratelimit(&drbd_ratelimit_state))
4200		dev_warn(DEV, "Got NegAck packet. Peer is in trouble?\n");
4201
4202 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4203
4204 if (is_syncer_block_id(p->block_id)) {
4205 int size = be32_to_cpu(p->blksize);
4206 dec_rs_pending(mdev);
4207 drbd_rs_failed_io(mdev, sector, size);
4208 return TRUE;
4209 }
4210 return validate_req_change_req_state(mdev, p->block_id, sector,
4211 _ack_id_to_req, __func__ , neg_acked);
4212}
4213
4214static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4215{
4216 struct p_block_ack *p = (struct p_block_ack *)h;
4217 sector_t sector = be64_to_cpu(p->sector);
4218
4219 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4220 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4221 (unsigned long long)sector, be32_to_cpu(p->blksize));
4222
4223 return validate_req_change_req_state(mdev, p->block_id, sector,
4224 _ar_id_to_req, __func__ , neg_acked);
4225}
4226
4227static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4228{
4229 sector_t sector;
4230 int size;
4231 struct p_block_ack *p = (struct p_block_ack *)h;
4232
4233 sector = be64_to_cpu(p->sector);
4234 size = be32_to_cpu(p->blksize);
4235 D_ASSERT(p->block_id == ID_SYNCER);
4236
4237 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4238
4239 dec_rs_pending(mdev);
4240
4241 if (get_ldev_if_state(mdev, D_FAILED)) {
4242 drbd_rs_complete_io(mdev, sector);
4243 drbd_rs_failed_io(mdev, sector, size);
4244 put_ldev(mdev);
4245 }
4246
4247 return TRUE;
4248}
4249
4250static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
4251{
4252 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4253
4254 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4255
4256 return TRUE;
4257}
4258
4259static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4260{
4261 struct p_block_ack *p = (struct p_block_ack *)h;
4262 struct drbd_work *w;
4263 sector_t sector;
4264 int size;
4265
4266 sector = be64_to_cpu(p->sector);
4267 size = be32_to_cpu(p->blksize);
4268
4269 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4270
4271 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4272 drbd_ov_oos_found(mdev, sector, size);
4273 else
4274 ov_oos_print(mdev);
4275
4276 drbd_rs_complete_io(mdev, sector);
4277 dec_rs_pending(mdev);
4278
4279 if (--mdev->ov_left == 0) {
4280 w = kmalloc(sizeof(*w), GFP_NOIO);
4281 if (w) {
4282 w->cb = w_ov_finished;
4283 drbd_queue_work_front(&mdev->data.work, w);
4284 } else {
4285 dev_err(DEV, "kmalloc(w) failed.");
4286 ov_oos_print(mdev);
4287 drbd_resync_finished(mdev);
4288 }
4289 }
4290 return TRUE;
4291}
4292
4293struct asender_cmd {
4294 size_t pkt_size;
4295 int (*process)(struct drbd_conf *mdev, struct p_header *h);
4296};
4297
4298static struct asender_cmd *get_asender_cmd(int cmd)
4299{
4300 static struct asender_cmd asender_tbl[] = {
4301 /* anything missing from this table is in
4302 * the drbd_cmd_handler (drbd_default_handler) table,
4303 * see the beginning of drbdd() */
4304 [P_PING] = { sizeof(struct p_header), got_Ping },
4305 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
4306 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4307 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4308 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4309 [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4310 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4311 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4312 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
4313 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4314 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4315 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4316 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4317 [P_MAX_CMD] = { 0, NULL },
4318 };
4319 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4320 return NULL;
4321 return &asender_tbl[cmd];
4322}
4323
4324int drbd_asender(struct drbd_thread *thi)
4325{
4326 struct drbd_conf *mdev = thi->mdev;
4327 struct p_header *h = &mdev->meta.rbuf.header;
4328 struct asender_cmd *cmd = NULL;
4329
4330 int rv, len;
4331 void *buf = h;
4332 int received = 0;
4333 int expect = sizeof(struct p_header);
4334 int empty;
4335
4336 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4337
4338 current->policy = SCHED_RR; /* Make this a realtime task! */
4339 current->rt_priority = 2; /* more important than all other tasks */
4340
4341 while (get_t_state(thi) == Running) {
4342 drbd_thread_current_set_cpu(mdev);
4343 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4344 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4345 mdev->meta.socket->sk->sk_rcvtimeo =
4346 mdev->net_conf->ping_timeo*HZ/10;
4347 }
4348
4349 /* conditionally cork;
4350 * it may hurt latency if we cork without much to send */
4351 if (!mdev->net_conf->no_cork &&
4352 3 < atomic_read(&mdev->unacked_cnt))
4353 drbd_tcp_cork(mdev->meta.socket);
4354 while (1) {
4355 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4356 flush_signals(current);
4357 if (!drbd_process_done_ee(mdev)) {
4358 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4359 goto reconnect;
4360 }
4361 /* to avoid race with newly queued ACKs */
4362 set_bit(SIGNAL_ASENDER, &mdev->flags);
4363 spin_lock_irq(&mdev->req_lock);
4364 empty = list_empty(&mdev->done_ee);
4365 spin_unlock_irq(&mdev->req_lock);
4366 /* new ack may have been queued right here,
4367 * but then there is also a signal pending,
4368 * and we start over... */
4369 if (empty)
4370 break;
4371 }
4372 /* but unconditionally uncork unless disabled */
4373 if (!mdev->net_conf->no_cork)
4374 drbd_tcp_uncork(mdev->meta.socket);
4375
4376 /* short circuit, recv_msg would return EINTR anyways. */
4377 if (signal_pending(current))
4378 continue;
4379
4380 rv = drbd_recv_short(mdev, mdev->meta.socket,
4381 buf, expect-received, 0);
4382 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4383
4384 flush_signals(current);
4385
4386 /* Note:
4387 * -EINTR (on meta) we got a signal
4388 * -EAGAIN (on meta) rcvtimeo expired
4389 * -ECONNRESET other side closed the connection
4390 * -ERESTARTSYS (on data) we got a signal
4391 * rv < 0 other than above: unexpected error!
4392 * rv == expected: full header or command
4393 * rv < expected: "woken" by signal during receive
4394 * rv == 0 : "connection shut down by peer"
4395 */
4396 if (likely(rv > 0)) {
4397 received += rv;
4398 buf += rv;
4399 } else if (rv == 0) {
4400 dev_err(DEV, "meta connection shut down by peer.\n");
4401 goto reconnect;
4402 } else if (rv == -EAGAIN) {
4403 if (mdev->meta.socket->sk->sk_rcvtimeo ==
4404 mdev->net_conf->ping_timeo*HZ/10) {
4405 dev_err(DEV, "PingAck did not arrive in time.\n");
4406 goto reconnect;
4407 }
4408 set_bit(SEND_PING, &mdev->flags);
4409 continue;
4410 } else if (rv == -EINTR) {
4411 continue;
4412 } else {
4413 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4414 goto reconnect;
4415 }
4416
4417 if (received == expect && cmd == NULL) {
4418 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4419 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
4420 (long)be32_to_cpu(h->magic),
4421 h->command, h->length);
4422 goto reconnect;
4423 }
4424 cmd = get_asender_cmd(be16_to_cpu(h->command));
4425 len = be16_to_cpu(h->length);
4426 if (unlikely(cmd == NULL)) {
4427 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
4428 (long)be32_to_cpu(h->magic),
4429 h->command, h->length);
4430 goto disconnect;
4431 }
4432 expect = cmd->pkt_size;
4433 ERR_IF(len != expect-sizeof(struct p_header))
4434 goto reconnect;
4435 }
4436 if (received == expect) {
4437 D_ASSERT(cmd != NULL);
4438 if (!cmd->process(mdev, h))
4439 goto reconnect;
4440
4441 buf = h;
4442 received = 0;
4443 expect = sizeof(struct p_header);
4444 cmd = NULL;
4445 }
4446 }
4447
4448 if (0) {
4449reconnect:
4450 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4451 }
4452 if (0) {
4453disconnect:
4454 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4455 }
4456 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4457
4458 D_ASSERT(mdev->state.conn < C_CONNECTED);
4459 dev_info(DEV, "asender terminated\n");
4460
4461 return 0;
4462}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000000..de81ab7b4627
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1125 @@
1/*
2 drbd_req.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27
28#include <linux/slab.h>
29#include <linux/drbd.h>
30#include "drbd_int.h"
31#include "drbd_req.h"
32
33
34/* Update disk stats at start of I/O request */
35static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
36{
37 const int rw = bio_data_dir(bio);
38 int cpu;
39 cpu = part_stat_lock();
40 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
41 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
42 part_inc_in_flight(&mdev->vdisk->part0, rw);
43 part_stat_unlock();
44}
45
46/* Update disk stats when completing request upwards */
47static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
48{
49 int rw = bio_data_dir(req->master_bio);
50 unsigned long duration = jiffies - req->start_time;
51 int cpu;
52 cpu = part_stat_lock();
53 part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
54 part_round_stats(cpu, &mdev->vdisk->part0);
55 part_dec_in_flight(&mdev->vdisk->part0, rw);
56 part_stat_unlock();
57}
58
59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
60{
61 const unsigned long s = req->rq_state;
62 /* if it was a write, we may have to set the corresponding
63 * bit(s) out-of-sync first. If it had a local part, we need to
64 * release the reference to the activity log. */
65 if (rw == WRITE) {
66 /* remove it from the transfer log.
67 * well, only if it had been there in the first
68 * place... if it had not (local only or conflicting
69 * and never sent), it should still be "empty" as
70 * initialized in drbd_req_new(), so we can list_del() it
71 * here unconditionally */
72 list_del(&req->tl_requests);
73 /* Set out-of-sync unless both OK flags are set
74 * (local only or remote failed).
75 * Other places where we set out-of-sync:
76 * READ with local io-error */
77 if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
78 drbd_set_out_of_sync(mdev, req->sector, req->size);
79
80 if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
81 drbd_set_in_sync(mdev, req->sector, req->size);
82
83 /* one might be tempted to move the drbd_al_complete_io
84 * to the local io completion callback drbd_endio_pri.
85 * but, if this was a mirror write, we may only
86 * drbd_al_complete_io after this is RQ_NET_DONE,
87 * otherwise the extent could be dropped from the al
88 * before it has actually been written on the peer.
89 * if we crash before our peer knows about the request,
90 * but after the extent has been dropped from the al,
91 * we would forget to resync the corresponding extent.
92 */
93 if (s & RQ_LOCAL_MASK) {
94 if (get_ldev_if_state(mdev, D_FAILED)) {
95 drbd_al_complete_io(mdev, req->sector);
96 put_ldev(mdev);
97 } else if (__ratelimit(&drbd_ratelimit_state)) {
98 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
99 "but my Disk seems to have failed :(\n",
100 (unsigned long long) req->sector);
101 }
102 }
103 }
104
105 /* if it was a local io error, we want to notify our
106 * peer about that, and see if we need to
107 * detach the disk and stuff.
108 * to avoid allocating some special work
109 * struct, reuse the request. */
110
111 /* THINK
112 * why do we do this not when we detect the error,
113 * but delay it until it is "done", i.e. possibly
114 * until the next barrier ack? */
115
116 if (rw == WRITE &&
117 ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
118 if (!(req->w.list.next == LIST_POISON1 ||
119 list_empty(&req->w.list))) {
120 /* DEBUG ASSERT only; if this triggers, we
121 * probably corrupt the worker list here */
122 dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
123 dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
124 }
125 req->w.cb = w_io_error;
126 drbd_queue_work(&mdev->data.work, &req->w);
127 /* drbd_req_free() is done in w_io_error */
128 } else {
129 drbd_req_free(req);
130 }
131}
132
133static void queue_barrier(struct drbd_conf *mdev)
134{
135 struct drbd_tl_epoch *b;
136
137 /* We are within the req_lock. Once we queued the barrier for sending,
138 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
139 * barrier/epoch object is added. This is the only place this bit is
140 * set. It indicates that the barrier for this epoch is already queued,
141 * and no new epoch has been created yet. */
142 if (test_bit(CREATE_BARRIER, &mdev->flags))
143 return;
144
145 b = mdev->newest_tle;
146 b->w.cb = w_send_barrier;
147 /* inc_ap_pending done here, so we won't
148 * get imbalanced on connection loss.
149 * dec_ap_pending will be done in got_BarrierAck
150 * or (on connection loss) in tl_clear. */
151 inc_ap_pending(mdev);
152 drbd_queue_work(&mdev->data.work, &b->w);
153 set_bit(CREATE_BARRIER, &mdev->flags);
154}
155
156static void _about_to_complete_local_write(struct drbd_conf *mdev,
157 struct drbd_request *req)
158{
159 const unsigned long s = req->rq_state;
160 struct drbd_request *i;
161 struct drbd_epoch_entry *e;
162 struct hlist_node *n;
163 struct hlist_head *slot;
164
165 /* before we can signal completion to the upper layers,
166 * we may need to close the current epoch */
167 if (mdev->state.conn >= C_CONNECTED &&
168 req->epoch == mdev->newest_tle->br_number)
169 queue_barrier(mdev);
170
171 /* we need to do the conflict detection stuff,
172 * if we have the ee_hash (two_primaries) and
173 * this has been on the network */
174 if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
175 const sector_t sector = req->sector;
176 const int size = req->size;
177
178 /* ASSERT:
179 * there must be no conflicting requests, since
180 * they must have been failed on the spot */
181#define OVERLAPS overlaps(sector, size, i->sector, i->size)
182 slot = tl_hash_slot(mdev, sector);
183 hlist_for_each_entry(i, n, slot, colision) {
184 if (OVERLAPS) {
185 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
186 "other: %p %llus +%u\n",
187 req, (unsigned long long)sector, size,
188 i, (unsigned long long)i->sector, i->size);
189 }
190 }
191
192 /* maybe "wake" those conflicting epoch entries
193 * that wait for this request to finish.
194 *
195 * currently, there can be only _one_ such ee
196 * (well, or some more, which would be pending
197 * P_DISCARD_ACK not yet sent by the asender...),
198 * since we block the receiver thread upon the
199 * first conflict detection, which will wait on
200 * misc_wait. maybe we want to assert that?
201 *
202 * anyways, if we found one,
203 * we just have to do a wake_up. */
204#undef OVERLAPS
205#define OVERLAPS overlaps(sector, size, e->sector, e->size)
206 slot = ee_hash_slot(mdev, req->sector);
207 hlist_for_each_entry(e, n, slot, colision) {
208 if (OVERLAPS) {
209 wake_up(&mdev->misc_wait);
210 break;
211 }
212 }
213 }
214#undef OVERLAPS
215}
216
217void complete_master_bio(struct drbd_conf *mdev,
218 struct bio_and_error *m)
219{
220 bio_endio(m->bio, m->error);
221 dec_ap_bio(mdev);
222}
223
224/* Helper for __req_mod().
225 * Set m->bio to the master bio, if it is fit to be completed,
226 * or leave it alone (it is initialized to NULL in __req_mod),
227 * if it has already been completed, or cannot be completed yet.
228 * If m->bio is set, the error status to be returned is placed in m->error.
229 */
230void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
231{
232 const unsigned long s = req->rq_state;
233 struct drbd_conf *mdev = req->mdev;
234 /* only WRITES may end up here without a master bio (on barrier ack) */
235 int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
236
237 /* we must not complete the master bio, while it is
238 * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
239 * not yet acknowledged by the peer
240 * not yet completed by the local io subsystem
241 * these flags may get cleared in any order by
242 * the worker,
243 * the receiver,
244 * the bio_endio completion callbacks.
245 */
246 if (s & RQ_NET_QUEUED)
247 return;
248 if (s & RQ_NET_PENDING)
249 return;
250 if (s & RQ_LOCAL_PENDING)
251 return;
252
253 if (req->master_bio) {
254 /* this is data_received (remote read)
255 * or protocol C P_WRITE_ACK
256 * or protocol B P_RECV_ACK
257 * or protocol A "handed_over_to_network" (SendAck)
258 * or canceled or failed,
259 * or killed from the transfer log due to connection loss.
260 */
261
262 /*
263 * figure out whether to report success or failure.
264 *
265 * report success when at least one of the operations succeeded.
266 * or, to put the other way,
267 * only report failure, when both operations failed.
268 *
269 * what to do about the failures is handled elsewhere.
270 * what we need to do here is just: complete the master_bio.
271 *
272 * local completion error, if any, has been stored as ERR_PTR
273 * in private_bio within drbd_endio_pri.
274 */
275 int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
276 int error = PTR_ERR(req->private_bio);
277
278 /* remove the request from the conflict detection
279 * respective block_id verification hash */
280 if (!hlist_unhashed(&req->colision))
281 hlist_del(&req->colision);
282 else
283 D_ASSERT((s & RQ_NET_MASK) == 0);
284
285 /* for writes we need to do some extra housekeeping */
286 if (rw == WRITE)
287 _about_to_complete_local_write(mdev, req);
288
289 /* Update disk stats */
290 _drbd_end_io_acct(mdev, req);
291
292 m->error = ok ? 0 : (error ?: -EIO);
293 m->bio = req->master_bio;
294 req->master_bio = NULL;
295 }
296
297 if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
298 /* this is disconnected (local only) operation,
299 * or protocol C P_WRITE_ACK,
300 * or protocol A or B P_BARRIER_ACK,
301 * or killed from the transfer log due to connection loss. */
302 _req_is_done(mdev, req, rw);
303 }
304 /* else: network part and not DONE yet. that is
305 * protocol A or B, barrier ack still pending... */
306}
307
308/*
309 * checks whether there was an overlapping request
310 * or ee already registered.
311 *
312 * if so, return 1, in which case this request is completed on the spot,
313 * without ever being submitted or sent.
314 *
315 * return 0 if it is ok to submit this request.
316 *
317 * NOTE:
318 * paranoia: assume something above us is broken, and issues different write
319 * requests for the same block simultaneously...
320 *
321 * To ensure these won't be reordered differently on both nodes, resulting in
322 * diverging data sets, we discard the later one(s). Not that this is supposed
323 * to happen, but this is the rationale why we also have to check for
324 * conflicting requests with local origin, and why we have to do so regardless
325 * of whether we allowed multiple primaries.
326 *
327 * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
328 * second hlist_for_each_entry becomes a noop. This is even simpler than to
329 * grab a reference on the net_conf, and check for the two_primaries flag...
330 */
331static int _req_conflicts(struct drbd_request *req)
332{
333 struct drbd_conf *mdev = req->mdev;
334 const sector_t sector = req->sector;
335 const int size = req->size;
336 struct drbd_request *i;
337 struct drbd_epoch_entry *e;
338 struct hlist_node *n;
339 struct hlist_head *slot;
340
341 D_ASSERT(hlist_unhashed(&req->colision));
342
343 if (!get_net_conf(mdev))
344 return 0;
345
346 /* BUG_ON */
347 ERR_IF (mdev->tl_hash_s == 0)
348 goto out_no_conflict;
349 BUG_ON(mdev->tl_hash == NULL);
350
351#define OVERLAPS overlaps(i->sector, i->size, sector, size)
352 slot = tl_hash_slot(mdev, sector);
353 hlist_for_each_entry(i, n, slot, colision) {
354 if (OVERLAPS) {
355 dev_alert(DEV, "%s[%u] Concurrent local write detected! "
356 "[DISCARD L] new: %llus +%u; "
357 "pending: %llus +%u\n",
358 current->comm, current->pid,
359 (unsigned long long)sector, size,
360 (unsigned long long)i->sector, i->size);
361 goto out_conflict;
362 }
363 }
364
365 if (mdev->ee_hash_s) {
366 /* now, check for overlapping requests with remote origin */
367 BUG_ON(mdev->ee_hash == NULL);
368#undef OVERLAPS
369#define OVERLAPS overlaps(e->sector, e->size, sector, size)
370 slot = ee_hash_slot(mdev, sector);
371 hlist_for_each_entry(e, n, slot, colision) {
372 if (OVERLAPS) {
373 dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
374 " [DISCARD L] new: %llus +%u; "
375 "pending: %llus +%u\n",
376 current->comm, current->pid,
377 (unsigned long long)sector, size,
378 (unsigned long long)e->sector, e->size);
379 goto out_conflict;
380 }
381 }
382 }
383#undef OVERLAPS
384
385out_no_conflict:
386 /* this is like it should be, and what we expected.
387 * our users do behave after all... */
388 put_net_conf(mdev);
389 return 0;
390
391out_conflict:
392 put_net_conf(mdev);
393 return 1;
394}
395
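The OVERLAPS macros used above expand to an overlaps(sector, size, sector, size) helper defined elsewhere in the driver. Sketched here under the assumption that sizes are byte counts and sectors are 512 bytes, the test is the usual half-open interval intersection:

/* two requests [s1, s1 + l1 bytes) and [s2, s2 + l2 bytes), with s1/s2 in
 * 512-byte sectors, overlap iff each one starts before the other one ends */
static inline int overlaps_sketch(sector_t s1, int l1, sector_t s2, int l2)
{
	return s1 < s2 + (l2 >> 9) && s2 < s1 + (l1 >> 9);
}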
396/* obviously this could be coded as many single functions
397 * instead of one huge switch,
398 * or by putting the code directly in the respective locations
399 * (as it has been before).
400 *
401 * but having it this way
402 * enforces that it is all in this one place, where it is easier to audit,
403 * it makes it obvious that whatever "event" "happens" to a request should
404 * happen "atomically" within the req_lock,
405 * and it enforces that we have to think in a very structured manner
406 * about the "events" that may happen to a request during its life time ...
407 */
408void __req_mod(struct drbd_request *req, enum drbd_req_event what,
409 struct bio_and_error *m)
410{
411 struct drbd_conf *mdev = req->mdev;
412 m->bio = NULL;
413
414 switch (what) {
415 default:
416 dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
417 break;
418
419 /* does not happen...
420 * initialization done in drbd_req_new
421 case created:
422 break;
423 */
424
425 case to_be_send: /* via network */
426 /* reached via drbd_make_request_common
427 * and from w_read_retry_remote */
428 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
429 req->rq_state |= RQ_NET_PENDING;
430 inc_ap_pending(mdev);
431 break;
432
433 case to_be_submitted: /* locally */
434 /* reached via drbd_make_request_common */
435 D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
436 req->rq_state |= RQ_LOCAL_PENDING;
437 break;
438
439 case completed_ok:
440 if (bio_data_dir(req->master_bio) == WRITE)
441 mdev->writ_cnt += req->size>>9;
442 else
443 mdev->read_cnt += req->size>>9;
444
445 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
446 req->rq_state &= ~RQ_LOCAL_PENDING;
447
448 _req_may_be_done(req, m);
449 put_ldev(mdev);
450 break;
451
452 case write_completed_with_error:
453 req->rq_state |= RQ_LOCAL_COMPLETED;
454 req->rq_state &= ~RQ_LOCAL_PENDING;
455
456 dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
457 (unsigned long long)req->sector, req->size);
458 /* and now: check how to handle local io error. */
459 __drbd_chk_io_error(mdev, FALSE);
460 _req_may_be_done(req, m);
461 put_ldev(mdev);
462 break;
463
464 case read_ahead_completed_with_error:
465 /* it is legal to fail READA */
466 req->rq_state |= RQ_LOCAL_COMPLETED;
467 req->rq_state &= ~RQ_LOCAL_PENDING;
468 _req_may_be_done(req, m);
469 put_ldev(mdev);
470 break;
471
472 case read_completed_with_error:
473 drbd_set_out_of_sync(mdev, req->sector, req->size);
474
475 req->rq_state |= RQ_LOCAL_COMPLETED;
476 req->rq_state &= ~RQ_LOCAL_PENDING;
477
478 dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
479 (unsigned long long)req->sector, req->size);
480 /* _req_mod(req,to_be_send); oops, recursion... */
481 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
482 req->rq_state |= RQ_NET_PENDING;
483 inc_ap_pending(mdev);
484
485 __drbd_chk_io_error(mdev, FALSE);
486 put_ldev(mdev);
487 /* NOTE: if we have no connection,
488 * or know the peer has no good data either,
489 * then we don't actually need to "queue_for_net_read",
490 * but we do so anyways, since the drbd_io_error()
491 * and the potential state change to "Diskless"
492 * needs to be done from process context */
493
494 /* fall through: _req_mod(req,queue_for_net_read); */
495
496 case queue_for_net_read:
497 /* READ or READA, and
498 * no local disk,
499 * or target area marked as invalid,
500 * or just got an io-error. */
501 /* from drbd_make_request_common
502 * or from bio_endio during read io-error recovery */
503
504 /* so we can verify the handle in the answer packet
505 * corresponding hlist_del is in _req_may_be_done() */
506 hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
507
508 set_bit(UNPLUG_REMOTE, &mdev->flags);
509
510 D_ASSERT(req->rq_state & RQ_NET_PENDING);
511 req->rq_state |= RQ_NET_QUEUED;
512 req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
513 ? w_read_retry_remote
514 : w_send_read_req;
515 drbd_queue_work(&mdev->data.work, &req->w);
516 break;
517
518 case queue_for_net_write:
519 /* assert something? */
520 /* from drbd_make_request_common only */
521
522 hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
523 /* corresponding hlist_del is in _req_may_be_done() */
524
525 /* NOTE
526 * In case the req ended up on the transfer log before being
527 * queued on the worker, it could lead to this request being
528 * missed during cleanup after connection loss.
529 * So we have to do both operations here,
530 * within the same lock that protects the transfer log.
531 *
532 * _req_add_to_epoch(req); this has to be after the
533 * _maybe_start_new_epoch(req); which happened in
534 * drbd_make_request_common, because we now may set the bit
535 * again ourselves to close the current epoch.
536 *
537 * Add req to the (now) current epoch (barrier). */
538
539 /* otherwise we may lose an unplug, which may cause some remote
540 * io-scheduler timeout to expire, increasing maximum latency,
541 * hurting performance. */
542 set_bit(UNPLUG_REMOTE, &mdev->flags);
543
544 /* see drbd_make_request_common,
545 * just after it grabs the req_lock */
546 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
547
548 req->epoch = mdev->newest_tle->br_number;
549 list_add_tail(&req->tl_requests,
550 &mdev->newest_tle->requests);
551
552 /* increment size of current epoch */
553 mdev->newest_tle->n_req++;
554
555 /* queue work item to send data */
556 D_ASSERT(req->rq_state & RQ_NET_PENDING);
557 req->rq_state |= RQ_NET_QUEUED;
558 req->w.cb = w_send_dblock;
559 drbd_queue_work(&mdev->data.work, &req->w);
560
561 /* close the epoch, in case it outgrew the limit */
562 if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
563 queue_barrier(mdev);
564
565 break;
566
567 case send_canceled:
568 /* treat it the same */
569 case send_failed:
570 /* real cleanup will be done from tl_clear. just update flags
571 * so it is no longer marked as on the worker queue */
572 req->rq_state &= ~RQ_NET_QUEUED;
573 /* if we did it right, tl_clear should be scheduled only after
574 * this, so this should not be necessary! */
575 _req_may_be_done(req, m);
576 break;
577
578 case handed_over_to_network:
579 /* assert something? */
580 if (bio_data_dir(req->master_bio) == WRITE &&
581 mdev->net_conf->wire_protocol == DRBD_PROT_A) {
582 /* this is what is dangerous about protocol A:
583 * pretend it was successfully written on the peer. */
584 if (req->rq_state & RQ_NET_PENDING) {
585 dec_ap_pending(mdev);
586 req->rq_state &= ~RQ_NET_PENDING;
587 req->rq_state |= RQ_NET_OK;
588 } /* else: neg-ack was faster... */
589 /* it is still not yet RQ_NET_DONE until the
590 * corresponding epoch barrier got acked as well,
591 * so we know what to dirty on connection loss */
592 }
593 req->rq_state &= ~RQ_NET_QUEUED;
594 req->rq_state |= RQ_NET_SENT;
595 /* because _drbd_send_zc_bio could sleep, and may want to
596 * dereference the bio even after the "write_acked_by_peer" and
597 * "completed_ok" events came in, once we return from
598 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
599 * whether it is done already, and end it. */
600 _req_may_be_done(req, m);
601 break;
602
603 case connection_lost_while_pending:
604 /* transfer log cleanup after connection loss */
605 /* assert something? */
606 if (req->rq_state & RQ_NET_PENDING)
607 dec_ap_pending(mdev);
608 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
609 req->rq_state |= RQ_NET_DONE;
610 /* if it is still queued, we may not complete it here.
611 * it will be canceled soon. */
612 if (!(req->rq_state & RQ_NET_QUEUED))
613 _req_may_be_done(req, m);
614 break;
615
616 case write_acked_by_peer_and_sis:
617 req->rq_state |= RQ_NET_SIS;
618 case conflict_discarded_by_peer:
619 /* for discarded conflicting writes of multiple primaries,
620 * there is no need to keep anything in the tl, potential
621 * node crashes are covered by the activity log. */
622 if (what == conflict_discarded_by_peer)
623 dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
624 " DRBD is not a random data generator!\n",
625 (unsigned long long)req->sector, req->size);
626 req->rq_state |= RQ_NET_DONE;
627 /* fall through */
628 case write_acked_by_peer:
629 /* protocol C; successfully written on peer.
630 * Nothing to do here.
631 * We want to keep the tl in place for all protocols, to cater
632 * for volatile write-back caches on lower level devices.
633 *
634 * A barrier request is expected to have forced all prior
635 * requests onto stable storage, so completion of a barrier
636 * request could set NET_DONE right here, and not wait for the
637 * P_BARRIER_ACK, but that is an unnecessary optimization. */
638
639 /* this makes it effectively the same as for: */
640 case recv_acked_by_peer:
641 /* protocol B; pretends to be successfully written on peer.
642 * see also notes above in handed_over_to_network about
643 * protocol != C */
644 req->rq_state |= RQ_NET_OK;
645 D_ASSERT(req->rq_state & RQ_NET_PENDING);
646 dec_ap_pending(mdev);
647 req->rq_state &= ~RQ_NET_PENDING;
648 _req_may_be_done(req, m);
649 break;
650
651 case neg_acked:
652 /* assert something? */
653 if (req->rq_state & RQ_NET_PENDING)
654 dec_ap_pending(mdev);
655 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
656
657 req->rq_state |= RQ_NET_DONE;
658 _req_may_be_done(req, m);
659 /* else: done by handed_over_to_network */
660 break;
661
662 case barrier_acked:
663 if (req->rq_state & RQ_NET_PENDING) {
664 /* barrier came in before all requests have been acked.
665 * this is bad, because if the connection is lost now,
666 * we won't be able to clean them up... */
667 dev_err(DEV, "FIXME (barrier_acked but pending)\n");
668 list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
669 }
670 D_ASSERT(req->rq_state & RQ_NET_SENT);
671 req->rq_state |= RQ_NET_DONE;
672 _req_may_be_done(req, m);
673 break;
674
675 case data_received:
676 D_ASSERT(req->rq_state & RQ_NET_PENDING);
677 dec_ap_pending(mdev);
678 req->rq_state &= ~RQ_NET_PENDING;
679 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
680 _req_may_be_done(req, m);
681 break;
682 };
683}
684
685/* we may do a local read if:
686 * - we are consistent (of course),
687 * - or we are generally inconsistent,
688 * BUT we are still/already IN SYNC for this area.
689 * since size may be bigger than BM_BLOCK_SIZE,
690 * we may need to check several bits.
691 */
692static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
693{
694 unsigned long sbnr, ebnr;
695 sector_t esector, nr_sectors;
696
697 if (mdev->state.disk == D_UP_TO_DATE)
698 return 1;
699 if (mdev->state.disk >= D_OUTDATED)
700 return 0;
701 if (mdev->state.disk < D_INCONSISTENT)
702 return 0;
703 /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
704 nr_sectors = drbd_get_capacity(mdev->this_bdev);
705 esector = sector + (size >> 9) - 1;
706
707 D_ASSERT(sector < nr_sectors);
708 D_ASSERT(esector < nr_sectors);
709
710 sbnr = BM_SECT_TO_BIT(sector);
711 ebnr = BM_SECT_TO_BIT(esector);
712
713 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
714}
715
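As a worked instance of the bit-range check above, assuming the usual 4 KiB bitmap granularity (8 sectors per bit, so BM_SECT_TO_BIT is a shift by 3): a 16 KiB read at sector 1000 spans sectors 1000..1031, which map to bitmap bits 125..128, and may be served locally only if all four bits are clean. A throwaway check of that arithmetic:

#include <stdio.h>

int main(void)
{
	/* example values only; 4 KiB bitmap granularity assumed */
	unsigned long long sector = 1000;
	int size = 16384;				/* bytes */
	unsigned long long esector = sector + (size >> 9) - 1;	/* 1031 */

	printf("bits %llu..%llu must be clean\n", sector >> 3, esector >> 3);
	/* prints: bits 125..128 must be clean */
	return 0;
}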
716static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
717{
718 const int rw = bio_rw(bio);
719 const int size = bio->bi_size;
720 const sector_t sector = bio->bi_sector;
721 struct drbd_tl_epoch *b = NULL;
722 struct drbd_request *req;
723 int local, remote;
724 int err = -EIO;
725
726 /* allocate outside of all locks; */
727 req = drbd_req_new(mdev, bio);
728 if (!req) {
729 dec_ap_bio(mdev);
730 /* only pass the error to the upper layers.
731 * if user cannot handle io errors, that's not our business. */
732 dev_err(DEV, "could not kmalloc() req\n");
733 bio_endio(bio, -ENOMEM);
734 return 0;
735 }
736
737 local = get_ldev(mdev);
738 if (!local) {
739 bio_put(req->private_bio); /* or we get a bio leak */
740 req->private_bio = NULL;
741 }
742 if (rw == WRITE) {
743 remote = 1;
744 } else {
745 /* READ || READA */
746 if (local) {
747 if (!drbd_may_do_local_read(mdev, sector, size)) {
748 /* we could kick the syncer to
749 * sync this extent asap, wait for
750 * it, then continue locally.
751 * Or just issue the request remotely.
752 */
753 local = 0;
754 bio_put(req->private_bio);
755 req->private_bio = NULL;
756 put_ldev(mdev);
757 }
758 }
759 remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
760 }
761
762 /* If we have a disk, but a READA request is mapped to remote,
763 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
764 * Just fail that READA request right here.
765 *
766 * THINK: maybe fail all READA when not local?
767 * or make this configurable...
768 * if network is slow, READA won't do any good.
769 */
770 if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
771 err = -EWOULDBLOCK;
772 goto fail_and_free_req;
773 }
774
775 /* For WRITES going to the local disk, grab a reference on the target
776 * extent. This waits for any resync activity in the corresponding
777 * resync extent to finish, and, if necessary, pulls in the target
778 * extent into the activity log, which involves further disk io because
779 * of transactional on-disk meta data updates. */
780 if (rw == WRITE && local)
781 drbd_al_begin_io(mdev, sector);
782
783 remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
784 (mdev->state.pdsk == D_INCONSISTENT &&
785 mdev->state.conn >= C_CONNECTED));
786
787 if (!(local || remote)) {
788 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
789 goto fail_free_complete;
790 }
791
792 /* For WRITE request, we have to make sure that we have an
793 * unused_spare_tle, in case we need to start a new epoch.
794	 * I try to be smart and avoid always pre-allocating "just in case",
795 * but there is a race between testing the bit and pointer outside the
796 * spinlock, and grabbing the spinlock.
797 * if we lost that race, we retry. */
798 if (rw == WRITE && remote &&
799 mdev->unused_spare_tle == NULL &&
800 test_bit(CREATE_BARRIER, &mdev->flags)) {
801allocate_barrier:
802 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
803 if (!b) {
804 dev_err(DEV, "Failed to alloc barrier.\n");
805 err = -ENOMEM;
806 goto fail_free_complete;
807 }
808 }
809
810 /* GOOD, everything prepared, grab the spin_lock */
811 spin_lock_irq(&mdev->req_lock);
812
813 if (remote) {
814 remote = (mdev->state.pdsk == D_UP_TO_DATE ||
815 (mdev->state.pdsk == D_INCONSISTENT &&
816 mdev->state.conn >= C_CONNECTED));
817 if (!remote)
818 dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
819 if (!(local || remote)) {
820 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
821 spin_unlock_irq(&mdev->req_lock);
822 goto fail_free_complete;
823 }
824 }
825
826 if (b && mdev->unused_spare_tle == NULL) {
827 mdev->unused_spare_tle = b;
828 b = NULL;
829 }
830 if (rw == WRITE && remote &&
831 mdev->unused_spare_tle == NULL &&
832 test_bit(CREATE_BARRIER, &mdev->flags)) {
833 /* someone closed the current epoch
834 * while we were grabbing the spinlock */
835 spin_unlock_irq(&mdev->req_lock);
836 goto allocate_barrier;
837 }
838
839
840 /* Update disk stats */
841 _drbd_start_io_acct(mdev, req, bio);
842
843 /* _maybe_start_new_epoch(mdev);
844 * If we need to generate a write barrier packet, we have to add the
845 * new epoch (barrier) object, and queue the barrier packet for sending,
846 * and queue the req's data after it _within the same lock_, otherwise
848	 * we have race conditions where the reorder domains could be mixed up.
848 *
849 * Even read requests may start a new epoch and queue the corresponding
850 * barrier packet. To get the write ordering right, we only have to
851 * make sure that, if this is a write request and it triggered a
852 * barrier packet, this request is queued within the same spinlock. */
853 if (remote && mdev->unused_spare_tle &&
854 test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
855 _tl_add_barrier(mdev, mdev->unused_spare_tle);
856 mdev->unused_spare_tle = NULL;
857 } else {
858 D_ASSERT(!(remote && rw == WRITE &&
859 test_bit(CREATE_BARRIER, &mdev->flags)));
860 }
861
862 /* NOTE
863 * Actually, 'local' may be wrong here already, since we may have failed
864 * to write to the meta data, and may become wrong anytime because of
865 * local io-error for some other request, which would lead to us
866 * "detaching" the local disk.
867 *
868 * 'remote' may become wrong any time because the network could fail.
869 *
870 * This is a harmless race condition, though, since it is handled
871 * correctly at the appropriate places; so it just defers the failure
872 * of the respective operation.
873 */
874
875 /* mark them early for readability.
876 * this just sets some state flags. */
877 if (remote)
878 _req_mod(req, to_be_send);
879 if (local)
880 _req_mod(req, to_be_submitted);
881
882 /* check this request on the collision detection hash tables.
883 * if we have a conflict, just complete it here.
884 * THINK do we want to check reads, too? (I don't think so...) */
885 if (rw == WRITE && _req_conflicts(req)) {
886 /* this is a conflicting request.
887 * even though it may have been only _partially_
888 * overlapping with one of the currently pending requests,
889 * without even submitting or sending it, we will
890 * pretend that it was successfully served right now.
891 */
892 if (local) {
893 bio_put(req->private_bio);
894 req->private_bio = NULL;
895 drbd_al_complete_io(mdev, req->sector);
896 put_ldev(mdev);
897 local = 0;
898 }
899 if (remote)
900 dec_ap_pending(mdev);
901 _drbd_end_io_acct(mdev, req);
902 /* THINK: do we want to fail it (-EIO), or pretend success? */
903 bio_endio(req->master_bio, 0);
904 req->master_bio = NULL;
905 dec_ap_bio(mdev);
906 drbd_req_free(req);
907 remote = 0;
908 }
909
910 /* NOTE remote first: to get the concurrent write detection right,
911 * we must register the request before start of local IO. */
912 if (remote) {
913 /* either WRITE and C_CONNECTED,
914 * or READ, and no local disk,
915 * or READ, but not in sync.
916 */
917 _req_mod(req, (rw == WRITE)
918 ? queue_for_net_write
919 : queue_for_net_read);
920 }
921 spin_unlock_irq(&mdev->req_lock);
922 kfree(b); /* if someone else has beaten us to it... */
923
924 if (local) {
925 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
926
927 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
928 : rw == READ ? DRBD_FAULT_DT_RD
929 : DRBD_FAULT_DT_RA))
930 bio_endio(req->private_bio, -EIO);
931 else
932 generic_make_request(req->private_bio);
933 }
934
935 /* we need to plug ALWAYS since we possibly need to kick lo_dev.
936 * we plug after submit, so we won't miss an unplug event */
937 drbd_plug_device(mdev);
938
939 return 0;
940
941fail_free_complete:
942 if (rw == WRITE && local)
943 drbd_al_complete_io(mdev, sector);
944fail_and_free_req:
945 if (local) {
946 bio_put(req->private_bio);
947 req->private_bio = NULL;
948 put_ldev(mdev);
949 }
950 bio_endio(bio, err);
951 drbd_req_free(req);
952 dec_ap_bio(mdev);
953 kfree(b);
954
955 return 0;
956}
957
958/* helper function for drbd_make_request
959 * if we can determine just by the mdev (state) that this request will fail,
960 * return 1
961 * otherwise return 0
962 */
963static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
964{
965 /* Unconfigured */
966 if (mdev->state.conn == C_DISCONNECTING &&
967 mdev->state.disk == D_DISKLESS)
968 return 1;
969
970 if (mdev->state.role != R_PRIMARY &&
971 (!allow_oos || is_write)) {
972 if (__ratelimit(&drbd_ratelimit_state)) {
973 dev_err(DEV, "Process %s[%u] tried to %s; "
974 "since we are not in Primary state, "
975 "we cannot allow this\n",
976 current->comm, current->pid,
977 is_write ? "WRITE" : "READ");
978 }
979 return 1;
980 }
981
982 /*
983 * Paranoia: we might have been primary, but sync target, or
984 * even diskless, then lost the connection.
985 * This should have been handled (panic? suspend?) somewhere
986 * else. But maybe it was not, so check again here.
987 * Caution: as long as we do not have a read/write lock on mdev,
988 * to serialize state changes, this is racy, since we may lose
989 * the connection *after* we test for the cstate.
990 */
991 if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
992 if (__ratelimit(&drbd_ratelimit_state))
993 dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
994 return 1;
995 }
996
997 return 0;
998}
999
1000int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1001{
1002 unsigned int s_enr, e_enr;
1003 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1004
1005 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
1006 bio_endio(bio, -EPERM);
1007 return 0;
1008 }
1009
1010 /* Reject barrier requests if we know the underlying device does
1011 * not support them.
1012 * XXX: Need to get this info from peer as well some how so we
1013 * XXX: reject if EITHER side/data/metadata area does not support them.
1014 *
1015 * because of those XXX, this is not yet enabled,
1016 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
1017 */
1018 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
1019 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1020 bio_endio(bio, -EOPNOTSUPP);
1021 return 0;
1022 }
1023
1024 /*
1025 * what we "blindly" assume:
1026 */
1027 D_ASSERT(bio->bi_size > 0);
1028 D_ASSERT((bio->bi_size & 0x1ff) == 0);
1029 D_ASSERT(bio->bi_idx == 0);
1030
1031 /* to make some things easier, force alignment of requests within the
1032 * granularity of our hash tables */
1033 s_enr = bio->bi_sector >> HT_SHIFT;
1034 e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
1035
1036 if (likely(s_enr == e_enr)) {
1037 inc_ap_bio(mdev, 1);
1038 return drbd_make_request_common(mdev, bio);
1039 }
1040
1041 /* can this bio be split generically?
1042 * Maybe add our own split-arbitrary-bios function. */
1043 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
1044 /* rather error out here than BUG in bio_split */
1045 dev_err(DEV, "bio would need to, but cannot, be split: "
1046 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
1047 bio->bi_vcnt, bio->bi_idx, bio->bi_size,
1048 (unsigned long long)bio->bi_sector);
1049 bio_endio(bio, -EINVAL);
1050 } else {
1051 /* This bio crosses some boundary, so we have to split it. */
1052 struct bio_pair *bp;
1053 /* works for the "do not cross hash slot boundaries" case
1054 * e.g. sector 262269, size 4096
1055 * s_enr = 262269 >> 6 = 4097
1056 * e_enr = (262269+8-1) >> 6 = 4098
1057 * HT_SHIFT = 6
1058 * sps = 64, mask = 63
1059 * first_sectors = 64 - (262269 & 63) = 3
1060 */
1061 const sector_t sect = bio->bi_sector;
1062 const int sps = 1 << HT_SHIFT; /* sectors per slot */
1063 const int mask = sps - 1;
1064 const sector_t first_sectors = sps - (sect & mask);
1065 bp = bio_split(bio,
1066#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
1067 bio_split_pool,
1068#endif
1069 first_sectors);
1070
1071 /* we need to get a "reference count" (ap_bio_cnt)
1072 * to avoid races with the disconnect/reconnect/suspend code.
1073 * In case we need to split the bio here, we need to get two references
1074 * atomically, otherwise we might deadlock when trying to submit the
1075 * second one! */
1076 inc_ap_bio(mdev, 2);
1077
1078 D_ASSERT(e_enr == s_enr + 1);
1079
1080 drbd_make_request_common(mdev, &bp->bio1);
1081 drbd_make_request_common(mdev, &bp->bio2);
1082 bio_pair_release(bp);
1083 }
1084 return 0;
1085}
1086
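The worked split example in the comment inside drbd_make_request_26 above checks out; here is a throwaway assertion of the same arithmetic, with HT_SHIFT taken as 6 (64 sectors per hash slot), as the comment assumes:

#include <assert.h>

int main(void)
{
	const unsigned long long sect = 262269;
	const int sps = 1 << 6, mask = sps - 1;		/* 64 sectors per slot */

	assert((sect >> 6) == 4097);			/* s_enr */
	assert(((sect + 8 - 1) >> 6) == 4098);		/* e_enr, 4096 bytes = 8 sectors */
	assert(sps - (sect & mask) == 3);		/* first_sectors */
	return 0;
}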
1087/* This is called by bio_add_page(). With this function we reduce
1088 * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZE
1089 * units (was AL_EXTENTs).
1090 *
1091 * we do the calculation within the lower 32bit of the byte offsets,
1092 * since we don't care for actual offset, but only check whether it
1093 * would cross "activity log extent" boundaries.
1094 *
1095 * As long as the BIO is empty we have to allow at least one bvec,
1096 * regardless of size and offset. so the resulting bio may still
1097 * cross extent boundaries. those are dealt with (bio_split) in
1098 * drbd_make_request_26.
1099 */
1100int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1101{
1102 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1103 unsigned int bio_offset =
1104 (unsigned int)bvm->bi_sector << 9; /* 32 bit */
1105 unsigned int bio_size = bvm->bi_size;
1106 int limit, backing_limit;
1107
1108 limit = DRBD_MAX_SEGMENT_SIZE
1109 - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
1110 if (limit < 0)
1111 limit = 0;
1112 if (bio_size == 0) {
1113 if (limit <= bvec->bv_len)
1114 limit = bvec->bv_len;
1115 } else if (limit && get_ldev(mdev)) {
1116 struct request_queue * const b =
1117 mdev->ldev->backing_bdev->bd_disk->queue;
1118 if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
1119 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1120 limit = min(limit, backing_limit);
1121 }
1122 put_ldev(mdev);
1123 }
1124 return limit;
1125}
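To make the limit computation in drbd_merge_bvec concrete (the 32 KiB value for DRBD_MAX_SEGMENT_SIZE is assumed purely for illustration): a bio whose data currently ends 30 KiB into an extent may only grow by the remaining 2 KiB before it would cross the extent boundary. A standalone check:

#include <stdio.h>

int main(void)
{
	const unsigned max_seg = 32 * 1024;	/* illustrative value only */
	unsigned bio_offset = 28 * 1024;	/* byte offset of the bio */
	unsigned bio_size = 2 * 1024;		/* bytes already in the bio */
	int limit = max_seg - ((bio_offset & (max_seg - 1)) + bio_size);

	if (limit < 0)
		limit = 0;
	printf("bio_add_page may add at most %d bytes\n", limit);	/* 2048 */
	return 0;
}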
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000000..16119d7056cc
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,326 @@
1/*
2 drbd_req.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
8 Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9
10 DRBD is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 DRBD is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_REQ_H
26#define _DRBD_REQ_H
27
28#include <linux/module.h>
29
30#include <linux/slab.h>
31#include <linux/drbd.h>
32#include "drbd_int.h"
33#include "drbd_wrappers.h"
34
35/* The request callbacks will be called in irq context by the IDE drivers,
36 and in Softirqs/Tasklets/BH context by the SCSI drivers,
37 and by the receiver and worker in kernel-thread context.
38 Try to get the locking right :) */
39
40/*
41 * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
42 * associated with IO requests originating from the block layer above us.
43 *
44 * There are quite a few things that may happen to a drbd request
45 * during its lifetime.
46 *
47 * It will be created.
48 * It will be marked with the intention to be
49 * submitted to local disk and/or
50 * send via the network.
51 *
52 * It has to be placed on the transfer log and other housekeeping lists,
 53 * in case we have a network connection.
54 *
55 * It may be identified as a concurrent (write) request
56 * and be handled accordingly.
57 *
 58 * It may be handed over to the local disk subsystem.
59 * It may be completed by the local disk subsystem,
60 * either successfully or with io-error.
61 * In case it is a READ request, and it failed locally,
62 * it may be retried remotely.
63 *
64 * It may be queued for sending.
65 * It may be handed over to the network stack,
66 * which may fail.
67 * It may be acknowledged by the "peer" according to the wire_protocol in use.
68 * this may be a negative ack.
69 * It may receive a faked ack when the network connection is lost and the
70 * transfer log is cleaned up.
71 * Sending may be canceled due to network connection loss.
72 * When it finally has outlived its time,
73 * corresponding dirty bits in the resync-bitmap may be cleared or set,
74 * it will be destroyed,
75 * and completion will be signalled to the originator,
76 * with or without "success".
77 */
78
79enum drbd_req_event {
80 created,
81 to_be_send,
82 to_be_submitted,
83
84 /* XXX yes, now I am inconsistent...
85 * these two are not "events" but "actions"
86 * oh, well... */
87 queue_for_net_write,
88 queue_for_net_read,
89
90 send_canceled,
91 send_failed,
92 handed_over_to_network,
93 connection_lost_while_pending,
94 recv_acked_by_peer,
95 write_acked_by_peer,
96 write_acked_by_peer_and_sis, /* and set_in_sync */
97 conflict_discarded_by_peer,
98 neg_acked,
99 barrier_acked, /* in protocol A and B */
100 data_received, /* (remote read) */
101
102 read_completed_with_error,
103 read_ahead_completed_with_error,
104 write_completed_with_error,
105 completed_ok,
106 nothing, /* for tracing only */
107};
108
109/* encoding of request states for now. we don't actually need that many bits.
110 * we don't need to do atomic bit operations either, since most of the time we
111 * need to look at the connection state and/or manipulate some lists at the
 112 * same time, so we should hold the request lock anyway.
113 */
114enum drbd_req_state_bits {
115 /* 210
116 * 000: no local possible
117 * 001: to be submitted
118 * UNUSED, we could map: 011: submitted, completion still pending
119 * 110: completed ok
120 * 010: completed with error
121 */
122 __RQ_LOCAL_PENDING,
123 __RQ_LOCAL_COMPLETED,
124 __RQ_LOCAL_OK,
125
126 /* 76543
127 * 00000: no network possible
 128 * 00001: to be sent
 129 * 00011: to be sent, on worker queue
130 * 00101: sent, expecting recv_ack (B) or write_ack (C)
131 * 11101: sent,
132 * recv_ack (B) or implicit "ack" (A),
133 * still waiting for the barrier ack.
134 * master_bio may already be completed and invalidated.
135 * 11100: write_acked (C),
136 * data_received (for remote read, any protocol)
137 * or finally the barrier ack has arrived (B,A)...
138 * request can be freed
139 * 01100: neg-acked (write, protocol C)
140 * or neg-d-acked (read, any protocol)
141 * or killed from the transfer log
142 * during cleanup after connection loss
143 * request can be freed
144 * 01000: canceled or send failed...
145 * request can be freed
146 */
147
 148 /* if "SENT" is not yet set, this can still fail or be canceled.
149 * if "SENT" is set already, we still wait for an Ack packet.
150 * when cleared, the master_bio may be completed.
151 * in (B,A) the request object may still linger on the transaction log
152 * until the corresponding barrier ack comes in */
153 __RQ_NET_PENDING,
154
155 /* If it is QUEUED, and it is a WRITE, it is also registered in the
156 * transfer log. Currently we need this flag to avoid conflicts between
157 * worker canceling the request and tl_clear_barrier killing it from
158 * transfer log. We should restructure the code so this conflict does
159 * no longer occur. */
160 __RQ_NET_QUEUED,
161
162 /* well, actually only "handed over to the network stack".
163 *
164 * TODO can potentially be dropped because of the similar meaning
165 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
166 * however it is not exactly the same. before we drop it
167 * we must ensure that we can tell a request with network part
168 * from a request without, regardless of what happens to it. */
169 __RQ_NET_SENT,
170
171 /* when set, the request may be freed (if RQ_NET_QUEUED is clear).
172 * basically this means the corresponding P_BARRIER_ACK was received */
173 __RQ_NET_DONE,
174
175 /* whether or not we know (C) or pretend (B,A) that the write
176 * was successfully written on the peer.
177 */
178 __RQ_NET_OK,
179
180 /* peer called drbd_set_in_sync() for this write */
181 __RQ_NET_SIS,
182
 183 /* keep this last, it's for the RQ_NET_MASK */
184 __RQ_NET_MAX,
185};
186
187#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
188#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
189#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
190
191#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
192
193#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
194#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
195#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
196#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
197#define RQ_NET_OK (1UL << __RQ_NET_OK)
198#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
199
200/* 0x1f8 */
201#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
202
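
The mask comments above (0x07 and 0x1f8) can be spot-checked with a stand-alone sketch; the enum below merely reproduces the bit ordering of the state bits above and is not part of the patch:

/* Stand-alone sketch (not part of the patch) reproducing the bit layout
 * above, to spot-check the mask comments. */
#include <stdio.h>

enum { LP, LC, LO, NP, NQ, NS, ND, NO, NSIS, NMAX };	/* bits 0..9 */

int main(void)
{
	unsigned long local_mask = ((1UL << LO) << 1) - 1;	     /* 0x007 */
	unsigned long net_mask = ((1UL << NMAX) - 1) & ~local_mask; /* 0x1f8 */

	printf("local 0x%03lx net 0x%03lx\n", local_mask, net_mask);
	return 0;
}
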
203/* epoch entries */
204static inline
205struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
206{
207 BUG_ON(mdev->ee_hash_s == 0);
208 return mdev->ee_hash +
209 ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
210}
211
212/* transfer log (drbd_request objects) */
213static inline
214struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
215{
216 BUG_ON(mdev->tl_hash_s == 0);
217 return mdev->tl_hash +
218 ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
219}
220
221/* application reads (drbd_request objects) */
222static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
223{
224 return mdev->app_reads_hash
225 + ((unsigned int)(sector) % APP_R_HSIZE);
226}
227
228/* when we receive the answer for a read request,
229 * verify that we actually know about it */
230static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
231 u64 id, sector_t sector)
232{
233 struct hlist_head *slot = ar_hash_slot(mdev, sector);
234 struct hlist_node *n;
235 struct drbd_request *req;
236
237 hlist_for_each_entry(req, n, slot, colision) {
238 if ((unsigned long)req == (unsigned long)id) {
239 D_ASSERT(req->sector == sector);
240 return req;
241 }
242 }
243 return NULL;
244}
245
246static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
247 struct bio *bio_src)
248{
249 struct bio *bio;
250 struct drbd_request *req =
251 mempool_alloc(drbd_request_mempool, GFP_NOIO);
252 if (likely(req)) {
253 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
254
255 req->rq_state = 0;
256 req->mdev = mdev;
257 req->master_bio = bio_src;
258 req->private_bio = bio;
259 req->epoch = 0;
260 req->sector = bio->bi_sector;
261 req->size = bio->bi_size;
262 req->start_time = jiffies;
263 INIT_HLIST_NODE(&req->colision);
264 INIT_LIST_HEAD(&req->tl_requests);
265 INIT_LIST_HEAD(&req->w.list);
266
267 bio->bi_private = req;
268 bio->bi_end_io = drbd_endio_pri;
269 bio->bi_next = NULL;
270 }
271 return req;
272}
273
274static inline void drbd_req_free(struct drbd_request *req)
275{
276 mempool_free(req, drbd_request_mempool);
277}
278
279static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
280{
281 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
282}
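
As a quick illustration of the overlaps() helper above, which mixes sector start addresses with byte lengths (512-byte sectors, hence the >>9), here is a user-space sketch that is not part of the patch:

/* User-space sketch (not part of the patch): same test as overlaps()
 * above, sector starts vs. byte lengths. */
#include <stdio.h>

typedef unsigned long long sector_t;

static int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
	return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
}

int main(void)
{
	printf("%d %d\n",
	       overlaps(100, 4096, 108, 4096),	/* adjacent 4 KiB requests: 0 */
	       overlaps(100, 4096, 104, 4096));	/* 2 KiB shared: 1 */
	return 0;
}
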
283
284/* Short lived temporary struct on the stack.
285 * We could squirrel the error to be returned into
286 * bio->bi_size, or similar. But that would be too ugly. */
287struct bio_and_error {
288 struct bio *bio;
289 int error;
290};
291
292extern void _req_may_be_done(struct drbd_request *req,
293 struct bio_and_error *m);
294extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
295 struct bio_and_error *m);
296extern void complete_master_bio(struct drbd_conf *mdev,
297 struct bio_and_error *m);
298
299/* use this if you don't want to deal with calling complete_master_bio()
300 * outside the spinlock, e.g. when walking some list on cleanup. */
301static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
302{
303 struct drbd_conf *mdev = req->mdev;
304 struct bio_and_error m;
305
306 /* __req_mod possibly frees req, do not touch req after that! */
307 __req_mod(req, what, &m);
308 if (m.bio)
309 complete_master_bio(mdev, &m);
310}
311
312/* completion of master bio is outside of spinlock.
 313 * If you need it irqsave, do it yourself! */
314static inline void req_mod(struct drbd_request *req,
315 enum drbd_req_event what)
316{
317 struct drbd_conf *mdev = req->mdev;
318 struct bio_and_error m;
319 spin_lock_irq(&mdev->req_lock);
320 __req_mod(req, what, &m);
321 spin_unlock_irq(&mdev->req_lock);
322
323 if (m.bio)
324 complete_master_bio(mdev, &m);
325}
326#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000000..76863e3f05be
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
1/*
 2 drbd_strings.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#include <linux/drbd.h>
27
28static const char *drbd_conn_s_names[] = {
29 [C_STANDALONE] = "StandAlone",
30 [C_DISCONNECTING] = "Disconnecting",
31 [C_UNCONNECTED] = "Unconnected",
32 [C_TIMEOUT] = "Timeout",
33 [C_BROKEN_PIPE] = "BrokenPipe",
34 [C_NETWORK_FAILURE] = "NetworkFailure",
35 [C_PROTOCOL_ERROR] = "ProtocolError",
36 [C_WF_CONNECTION] = "WFConnection",
37 [C_WF_REPORT_PARAMS] = "WFReportParams",
38 [C_TEAR_DOWN] = "TearDown",
39 [C_CONNECTED] = "Connected",
40 [C_STARTING_SYNC_S] = "StartingSyncS",
41 [C_STARTING_SYNC_T] = "StartingSyncT",
42 [C_WF_BITMAP_S] = "WFBitMapS",
43 [C_WF_BITMAP_T] = "WFBitMapT",
44 [C_WF_SYNC_UUID] = "WFSyncUUID",
45 [C_SYNC_SOURCE] = "SyncSource",
46 [C_SYNC_TARGET] = "SyncTarget",
47 [C_PAUSED_SYNC_S] = "PausedSyncS",
48 [C_PAUSED_SYNC_T] = "PausedSyncT",
49 [C_VERIFY_S] = "VerifyS",
50 [C_VERIFY_T] = "VerifyT",
51};
52
53static const char *drbd_role_s_names[] = {
54 [R_PRIMARY] = "Primary",
55 [R_SECONDARY] = "Secondary",
56 [R_UNKNOWN] = "Unknown"
57};
58
59static const char *drbd_disk_s_names[] = {
60 [D_DISKLESS] = "Diskless",
61 [D_ATTACHING] = "Attaching",
62 [D_FAILED] = "Failed",
63 [D_NEGOTIATING] = "Negotiating",
64 [D_INCONSISTENT] = "Inconsistent",
65 [D_OUTDATED] = "Outdated",
66 [D_UNKNOWN] = "DUnknown",
67 [D_CONSISTENT] = "Consistent",
68 [D_UP_TO_DATE] = "UpToDate",
69};
70
71static const char *drbd_state_sw_errors[] = {
72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
73 [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
77 [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
78 [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
79 [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
80 [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
81 [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
82 [-SS_DEVICE_IN_USE] = "Device is held open by someone",
83 [-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
84 [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
85 [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
86 [-SS_NOT_SUPPORTED] = "Peer does not support protocol",
87 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
88 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
89 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
90};
91
92const char *drbd_conn_str(enum drbd_conns s)
93{
94 /* enums are unsigned... */
95 return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
96}
97
98const char *drbd_role_str(enum drbd_role s)
99{
100 return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
101}
102
103const char *drbd_disk_str(enum drbd_disk_state s)
104{
105 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
106}
107
108const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
109{
110 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
111 err > SS_TWO_PRIMARIES ? "TOO_LARGE"
112 : drbd_state_sw_errors[-err];
113}
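
The lookup in drbd_set_st_err_str() above indexes the table with the negated (negative) error code after two range checks. The stand-alone sketch below is not part of the patch and uses made-up error codes purely to illustrate the idiom:

/* Stand-alone sketch (not part of the patch) of the "negative enum as
 * array index" idiom used by drbd_set_st_err_str() above.  The error
 * codes here are invented for illustration only. */
#include <stdio.h>

enum demo_err { DE_AFTER_LAST = -4, DE_BUSY = -3, DE_NO_DISK = -2, DE_TWO_PRI = -1 };

static const char *demo_err_names[] = {
	[-DE_TWO_PRI] = "Multiple primaries not allowed",
	[-DE_NO_DISK] = "No disk",
	[-DE_BUSY]    = "Device busy",
};

static const char *demo_err_str(enum demo_err err)
{
	return err <= DE_AFTER_LAST ? "TOO_SMALL" :
	       err > DE_TWO_PRI     ? "TOO_LARGE" : demo_err_names[-err];
}

int main(void)
{
	printf("%s\n", demo_err_str(DE_NO_DISK));	/* prints "No disk" */
	return 0;
}
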
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000000..fc824006e721
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
1/*
2-*- linux-c -*-
 3 drbd_vli.h
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_VLI_H
26#define _DRBD_VLI_H
27
28/*
29 * At a granularity of 4KiB storage represented per bit,
 30 * and storage sizes of several TiB,
31 * and possibly small-bandwidth replication,
32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text.
34 *
 35 * We try to reduce the transferred bitmap information
36 * by encoding runlengths of bit polarity.
37 *
38 * We never actually need to encode a "zero" (runlengths are positive).
39 * But then we have to store the value of the first bit.
40 * The first bit of information thus shall encode if the first runlength
41 * gives the number of set or unset bits.
42 *
43 * We assume that large areas are either completely set or unset,
44 * which gives good compression with any runlength method,
45 * even when encoding the runlength as fixed size 32bit/64bit integers.
46 *
47 * Still, there may be areas where the polarity flips every few bits,
 48 * and encoding the runlength sequence of those areas with fixed size
49 * integers would be much worse than plaintext.
50 *
51 * We want to encode small runlength values with minimum code length,
52 * while still being able to encode a Huge run of all zeros.
53 *
54 * Thus we need a Variable Length Integer encoding, VLI.
55 *
56 * For some cases, we produce more code bits than plaintext input.
57 * We need to send incompressible chunks as plaintext, skip over them
58 * and then see if the next chunk compresses better.
59 *
60 * We don't care too much about "excellent" compression ratio for large
61 * runlengths (all set/all clear): whether we achieve a factor of 100
62 * or 1000 is not that much of an issue.
63 * We do not want to waste too much on short runlengths in the "noisy"
64 * parts of the bitmap, though.
65 *
66 * There are endless variants of VLI, we experimented with:
67 * * simple byte-based
68 * * various bit based with different code word length.
69 *
 70 * To avoid yet another configuration parameter (choice of bitmap compression
71 * algorithm) which was difficult to explain and tune, we just chose the one
72 * variant that turned out best in all test cases.
73 * Based on real world usage patterns, with device sizes ranging from a few GiB
 74 * to several TiB, file server/mailserver/webserver/mysql/postgres,
75 * mostly idle to really busy, the all time winner (though sometimes only
76 * marginally better) is:
77 */
78
79/*
80 * encoding is "visualised" as
81 * __little endian__ bitstream, least significant bit first (left most)
82 *
83 * this particular encoding is chosen so that the prefix code
84 * starts as unary encoding the level, then modified so that
85 * 10 levels can be described in 8bit, with minimal overhead
86 * for the smaller levels.
87 *
88 * Number of data bits follow fibonacci sequence, with the exception of the
89 * last level (+1 data bit, so it makes 64bit total). The only worse code when
90 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
91prefix data bits max val Nº data bits
920 x 0x2 1
9310 x 0x4 1
94110 xx 0x8 2
951110 xxx 0x10 3
9611110 xxx xx 0x30 5
97111110 xx xxxxxx 0x130 8
9811111100 xxxxxxxx xxxxx 0x2130 13
9911111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
10011111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
10111111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
102 * maximum encodable value: 0x100000400202130 == 2**56 + some */
103
104/* compression "table":
105 transmitted x 0.29
106 as plaintext x ........................
107 x ........................
108 x ........................
109 x 0.59 0.21........................
110 x ........................................................
111 x .. c ...................................................
112 x 0.44.. o ...................................................
113 x .......... d ...................................................
114 x .......... e ...................................................
115 X............. ...................................................
116 x.............. b ...................................................
1172.0x............... i ...................................................
118 #X................ t ...................................................
119 #................. s ........................... plain bits ..........
120-+-----------------------------------------------------------------------
121 1 16 32 64
122*/
123
124/* LEVEL: (total bits, prefix bits, prefix value),
125 * sorted ascending by number of total bits.
126 * The rest of the code table is calculated at compiletime from this. */
127
128/* fibonacci data 1, 1, ... */
129#define VLI_L_1_1() do { \
130 LEVEL( 2, 1, 0x00); \
131 LEVEL( 3, 2, 0x01); \
132 LEVEL( 5, 3, 0x03); \
133 LEVEL( 7, 4, 0x07); \
134 LEVEL(10, 5, 0x0f); \
135 LEVEL(14, 6, 0x1f); \
136 LEVEL(21, 8, 0x3f); \
137 LEVEL(29, 8, 0x7f); \
138 LEVEL(42, 8, 0xbf); \
139 LEVEL(64, 8, 0xff); \
140 } while (0)
141
142/* finds a suitable level to decode the least significant part of in.
143 * returns number of bits consumed.
144 *
145 * BUG() for bad input, as that would mean a buggy code table. */
146static inline int vli_decode_bits(u64 *out, const u64 in)
147{
148 u64 adj = 1;
149
150#define LEVEL(t,b,v) \
151 do { \
152 if ((in & ((1 << b) -1)) == v) { \
153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
154 return t; \
155 } \
156 adj += 1ULL << (t - b); \
157 } while (0)
158
159 VLI_L_1_1();
160
161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
162 BUG();
163#undef LEVEL
164}
165
166/* return number of code bits needed,
167 * or negative error number */
168static inline int __vli_encode_bits(u64 *out, const u64 in)
169{
170 u64 max = 0;
171 u64 adj = 1;
172
173 if (in == 0)
174 return -EINVAL;
175
176#define LEVEL(t,b,v) do { \
177 max += 1ULL << (t - b); \
178 if (in <= max) { \
179 if (out) \
180 *out = ((in - adj) << b) | v; \
181 return t; \
182 } \
183 adj = max + 1; \
184 } while (0)
185
186 VLI_L_1_1();
187
188 return -EOVERFLOW;
189#undef LEVEL
190}
191
192#undef VLI_L_1_1
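
To see how the level table translates a small run length into prefix plus data bits, the following user-space sketch (not part of the patch) replays the same walk that __vli_encode_bits() performs, with the first four LEVEL triples copied from VLI_L_1_1():

/* User-space sketch (not part of the patch): walk the first levels of the
 * code table above and print how a small run length is encoded.
 * Level triples are (total bits, prefix bits, prefix value). */
#include <stdio.h>

int main(void)
{
	const struct { int t, b; unsigned v; } lvl[] = {
		{ 2, 1, 0x00 }, { 3, 2, 0x01 }, { 5, 3, 0x03 }, { 7, 4, 0x07 },
	};
	unsigned long long in = 5, adj = 1, max = 0, code;
	int i;

	for (i = 0; i < 4; i++) {
		max += 1ULL << (lvl[i].t - lvl[i].b);
		if (in <= max) {
			/* value 5 lands in level (5,3,0x03):
			 * 5 code bits, prefix "110", data bits "00" */
			code = ((in - adj) << lvl[i].b) | lvl[i].v;
			printf("value %llu: %d code bits, code 0x%llx\n",
			       in, lvl[i].t, code);
			return 0;
		}
		adj = max + 1;
	}
	printf("value %llu needs a larger level\n", in);
	return 0;
}
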
193
 194/* code from here down is independent of the actually used bit code */
195
196/*
197 * Code length is determined by some unique (e.g. unary) prefix.
198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
199 * not a byte stream.
200 */
201
202/* for the bitstream, we need a cursor */
203struct bitstream_cursor {
204 /* the current byte */
205 u8 *b;
 206 /* the current bit within *b, normalized: 0..7 */
207 unsigned int bit;
208};
209
210/* initialize cursor to point to first bit of stream */
211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
212{
213 cur->b = s;
214 cur->bit = 0;
215}
216
217/* advance cursor by that many bits; maximum expected input value: 64,
218 * but depending on VLI implementation, it may be more. */
219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
220{
221 bits += cur->bit;
222 cur->b = cur->b + (bits >> 3);
223 cur->bit = bits & 7;
224}
225
226/* the bitstream itself knows its length */
227struct bitstream {
228 struct bitstream_cursor cur;
229 unsigned char *buf;
230 size_t buf_len; /* in bytes */
231
232 /* for input stream:
233 * number of trailing 0 bits for padding
234 * total number of valid bits in stream: buf_len * 8 - pad_bits */
235 unsigned int pad_bits;
236};
237
238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
239{
240 bs->buf = s;
241 bs->buf_len = len;
242 bs->pad_bits = pad_bits;
243 bitstream_cursor_reset(&bs->cur, bs->buf);
244}
245
246static inline void bitstream_rewind(struct bitstream *bs)
247{
248 bitstream_cursor_reset(&bs->cur, bs->buf);
249 memset(bs->buf, 0, bs->buf_len);
250}
251
252/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
253 * Ignores "pad_bits".
254 * Returns zero if bits == 0 (nothing to do).
255 * Returns number of bits used if successful.
256 *
257 * If there is not enough room left in bitstream,
258 * leaves bitstream unchanged and returns -ENOBUFS.
259 */
260static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
261{
262 unsigned char *b = bs->cur.b;
263 unsigned int tmp;
264
265 if (bits == 0)
266 return 0;
267
268 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
269 return -ENOBUFS;
270
271 /* paranoia: strip off hi bits; they should not be set anyways. */
272 if (bits < 64)
273 val &= ~0ULL >> (64 - bits);
274
275 *b++ |= (val & 0xff) << bs->cur.bit;
276
277 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
278 *b++ |= (val >> tmp) & 0xff;
279
280 bitstream_cursor_advance(&bs->cur, bits);
281 return bits;
282}
283
284/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
285 *
 286 * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged.
287 *
 288 * If there are fewer than the requested number of valid bits left in the
289 * bitstream, still fetches all available bits.
290 *
291 * Returns number of actually fetched bits.
292 */
293static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
294{
295 u64 val;
296 unsigned int n;
297
298 if (bits > 64)
299 return -EINVAL;
300
301 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
302 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
303 - bs->cur.bit - bs->pad_bits;
304
305 if (bits == 0) {
306 *out = 0;
307 return 0;
308 }
309
310 /* get the high bits */
311 val = 0;
312 n = (bs->cur.bit + bits + 7) >> 3;
313 /* n may be at most 9, if cur.bit + bits > 64 */
 314 /* which means this copies at most 8 bytes */
315 if (n) {
316 memcpy(&val, bs->cur.b+1, n - 1);
317 val = le64_to_cpu(val) << (8 - bs->cur.bit);
318 }
319
320 /* we still need the low bits */
321 val |= bs->cur.b[0] >> bs->cur.bit;
322
323 /* and mask out bits we don't want */
324 val &= ~0ULL >> (64 - bits);
325
326 bitstream_cursor_advance(&bs->cur, bits);
327 *out = val;
328
329 return bits;
330}
331
332/* encodes @in as vli into @bs;
333
334 * return values
335 * > 0: number of bits successfully stored in bitstream
336 * -ENOBUFS @bs is full
337 * -EINVAL input zero (invalid)
338 * -EOVERFLOW input too large for this vli code (invalid)
339 */
340static inline int vli_encode_bits(struct bitstream *bs, u64 in)
341{
342 u64 code = code;
343 int bits = __vli_encode_bits(&code, in);
344
345 if (bits <= 0)
346 return bits;
347
348 return bitstream_put_bits(bs, code, bits);
349}
350
351#endif
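
The cursor arithmetic used throughout the bitstream helpers reduces to a shift and a mask; the sketch below is not part of the patch and just demonstrates that bitstream_cursor_advance() moves (cur.bit + n) >> 3 whole bytes and keeps (cur.bit + n) & 7 as the new bit offset:

/* Minimal user-space sketch (not part of the patch) of the cursor
 * arithmetic in bitstream_cursor_advance() above. */
#include <stdio.h>

int main(void)
{
	unsigned int byte = 0, bit = 6, n = 13;	/* arbitrary example */
	unsigned int total = bit + n;

	byte += total >> 3;	/* 19 bits -> 2 whole bytes */
	bit = total & 7;	/* 3 bits into the next byte */

	printf("byte +%u, bit %u\n", byte, bit);	/* "byte +2, bit 3" */
	return 0;
}
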
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000000..d48a1dfd7b24
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1516 @@
1/*
2 drbd_worker.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/sched.h>
29#include <linux/smp_lock.h>
30#include <linux/wait.h>
31#include <linux/mm.h>
32#include <linux/memcontrol.h>
33#include <linux/mm_inline.h>
34#include <linux/slab.h>
35#include <linux/random.h>
36#include <linux/string.h>
37#include <linux/scatterlist.h>
38
39#include "drbd_int.h"
40#include "drbd_req.h"
41
42#define SLEEP_TIME (HZ/10)
43
44static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
45
46
47
48/* defined here:
49 drbd_md_io_complete
50 drbd_endio_write_sec
51 drbd_endio_read_sec
52 drbd_endio_pri
53
54 * more endio handlers:
55 atodb_endio in drbd_actlog.c
56 drbd_bm_async_io_complete in drbd_bitmap.c
57
58 * For all these callbacks, note the following:
59 * The callbacks will be called in irq context by the IDE drivers,
60 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
61 * Try to get the locking right :)
62 *
63 */
64
65
66/* About the global_state_lock
 67 Each state transition on a device holds a read lock. In case we have
68 to evaluate the sync after dependencies, we grab a write lock, because
69 we need stable states on all devices for that. */
70rwlock_t global_state_lock;
71
72/* used for synchronous meta data and bitmap IO
73 * submitted by drbd_md_sync_page_io()
74 */
75void drbd_md_io_complete(struct bio *bio, int error)
76{
77 struct drbd_md_io *md_io;
78
79 md_io = (struct drbd_md_io *)bio->bi_private;
80 md_io->error = error;
81
82 complete(&md_io->event);
83}
84
85/* reads on behalf of the partner,
86 * "submitted" by the receiver
87 */
88void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
89{
90 unsigned long flags = 0;
91 struct drbd_epoch_entry *e = NULL;
92 struct drbd_conf *mdev;
93 int uptodate = bio_flagged(bio, BIO_UPTODATE);
94
95 e = bio->bi_private;
96 mdev = e->mdev;
97
98 if (error)
99 dev_warn(DEV, "read: error=%d s=%llus\n", error,
100 (unsigned long long)e->sector);
101 if (!error && !uptodate) {
102 dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
103 (unsigned long long)e->sector);
104 /* strange behavior of some lower level drivers...
105 * fail the request by clearing the uptodate flag,
106 * but do not return any error?! */
107 error = -EIO;
108 }
109
110 D_ASSERT(e->block_id != ID_VACANT);
111
112 spin_lock_irqsave(&mdev->req_lock, flags);
113 mdev->read_cnt += e->size >> 9;
114 list_del(&e->w.list);
115 if (list_empty(&mdev->read_ee))
116 wake_up(&mdev->ee_wait);
117 spin_unlock_irqrestore(&mdev->req_lock, flags);
118
119 drbd_chk_io_error(mdev, error, FALSE);
120 drbd_queue_work(&mdev->data.work, &e->w);
121 put_ldev(mdev);
122}
123
124/* writes on behalf of the partner, or resync writes,
125 * "submitted" by the receiver.
126 */
127void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
128{
129 unsigned long flags = 0;
130 struct drbd_epoch_entry *e = NULL;
131 struct drbd_conf *mdev;
132 sector_t e_sector;
133 int do_wake;
134 int is_syncer_req;
135 int do_al_complete_io;
136 int uptodate = bio_flagged(bio, BIO_UPTODATE);
137 int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
138
139 e = bio->bi_private;
140 mdev = e->mdev;
141
142 if (error)
143 dev_warn(DEV, "write: error=%d s=%llus\n", error,
144 (unsigned long long)e->sector);
145 if (!error && !uptodate) {
146 dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
147 (unsigned long long)e->sector);
148 /* strange behavior of some lower level drivers...
149 * fail the request by clearing the uptodate flag,
150 * but do not return any error?! */
151 error = -EIO;
152 }
153
154 /* error == -ENOTSUPP would be a better test,
155 * alas it is not reliable */
156 if (error && is_barrier && e->flags & EE_IS_BARRIER) {
157 drbd_bump_write_ordering(mdev, WO_bdev_flush);
158 spin_lock_irqsave(&mdev->req_lock, flags);
159 list_del(&e->w.list);
160 e->w.cb = w_e_reissue;
161 /* put_ldev actually happens below, once we come here again. */
162 __release(local);
163 spin_unlock_irqrestore(&mdev->req_lock, flags);
164 drbd_queue_work(&mdev->data.work, &e->w);
165 return;
166 }
167
168 D_ASSERT(e->block_id != ID_VACANT);
169
170 spin_lock_irqsave(&mdev->req_lock, flags);
171 mdev->writ_cnt += e->size >> 9;
172 is_syncer_req = is_syncer_block_id(e->block_id);
173
174 /* after we moved e to done_ee,
175 * we may no longer access it,
176 * it may be freed/reused already!
177 * (as soon as we release the req_lock) */
178 e_sector = e->sector;
179 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
180
181 list_del(&e->w.list); /* has been on active_ee or sync_ee */
182 list_add_tail(&e->w.list, &mdev->done_ee);
183
184 /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
185 * neither did we wake possibly waiting conflicting requests.
186 * done from "drbd_process_done_ee" within the appropriate w.cb
187 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
188
189 do_wake = is_syncer_req
190 ? list_empty(&mdev->sync_ee)
191 : list_empty(&mdev->active_ee);
192
193 if (error)
194 __drbd_chk_io_error(mdev, FALSE);
195 spin_unlock_irqrestore(&mdev->req_lock, flags);
196
197 if (is_syncer_req)
198 drbd_rs_complete_io(mdev, e_sector);
199
200 if (do_wake)
201 wake_up(&mdev->ee_wait);
202
203 if (do_al_complete_io)
204 drbd_al_complete_io(mdev, e_sector);
205
206 wake_asender(mdev);
207 put_ldev(mdev);
208
209}
210
211/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
212 */
213void drbd_endio_pri(struct bio *bio, int error)
214{
215 unsigned long flags;
216 struct drbd_request *req = bio->bi_private;
217 struct drbd_conf *mdev = req->mdev;
218 struct bio_and_error m;
219 enum drbd_req_event what;
220 int uptodate = bio_flagged(bio, BIO_UPTODATE);
221
222 if (error)
223 dev_warn(DEV, "p %s: error=%d\n",
224 bio_data_dir(bio) == WRITE ? "write" : "read", error);
225 if (!error && !uptodate) {
226 dev_warn(DEV, "p %s: setting error to -EIO\n",
227 bio_data_dir(bio) == WRITE ? "write" : "read");
228 /* strange behavior of some lower level drivers...
229 * fail the request by clearing the uptodate flag,
230 * but do not return any error?! */
231 error = -EIO;
232 }
233
234 /* to avoid recursion in __req_mod */
235 if (unlikely(error)) {
236 what = (bio_data_dir(bio) == WRITE)
237 ? write_completed_with_error
238 : (bio_rw(bio) == READ)
239 ? read_completed_with_error
240 : read_ahead_completed_with_error;
241 } else
242 what = completed_ok;
243
244 bio_put(req->private_bio);
245 req->private_bio = ERR_PTR(error);
246
247 spin_lock_irqsave(&mdev->req_lock, flags);
248 __req_mod(req, what, &m);
249 spin_unlock_irqrestore(&mdev->req_lock, flags);
250
251 if (m.bio)
252 complete_master_bio(mdev, &m);
253}
254
255int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
256{
257 struct drbd_request *req = container_of(w, struct drbd_request, w);
258
259 /* NOTE: mdev->ldev can be NULL by the time we get here! */
260 /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
261
262 /* the only way this callback is scheduled is from _req_may_be_done,
263 * when it is done and had a local write error, see comments there */
264 drbd_req_free(req);
265
266 return TRUE;
267}
268
269int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
270{
271 struct drbd_request *req = container_of(w, struct drbd_request, w);
272
273 /* We should not detach for read io-error,
274 * but try to WRITE the P_DATA_REPLY to the failed location,
275 * to give the disk the chance to relocate that block */
276
277 spin_lock_irq(&mdev->req_lock);
278 if (cancel ||
279 mdev->state.conn < C_CONNECTED ||
280 mdev->state.pdsk <= D_INCONSISTENT) {
281 _req_mod(req, send_canceled);
282 spin_unlock_irq(&mdev->req_lock);
283 dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
284 return 1;
285 }
286 spin_unlock_irq(&mdev->req_lock);
287
288 return w_send_read_req(mdev, w, 0);
289}
290
291int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
292{
293 ERR_IF(cancel) return 1;
294 dev_err(DEV, "resync inactive, but callback triggered??\n");
295 return 1; /* Simply ignore this! */
296}
297
298void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
299{
300 struct hash_desc desc;
301 struct scatterlist sg;
302 struct bio_vec *bvec;
303 int i;
304
305 desc.tfm = tfm;
306 desc.flags = 0;
307
308 sg_init_table(&sg, 1);
309 crypto_hash_init(&desc);
310
311 __bio_for_each_segment(bvec, bio, i, 0) {
312 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
313 crypto_hash_update(&desc, &sg, sg.length);
314 }
315 crypto_hash_final(&desc, digest);
316}
317
318static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
319{
320 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
321 int digest_size;
322 void *digest;
323 int ok;
324
325 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
326
327 if (unlikely(cancel)) {
328 drbd_free_ee(mdev, e);
329 return 1;
330 }
331
332 if (likely(drbd_bio_uptodate(e->private_bio))) {
333 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
334 digest = kmalloc(digest_size, GFP_NOIO);
335 if (digest) {
336 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
337
338 inc_rs_pending(mdev);
339 ok = drbd_send_drequest_csum(mdev,
340 e->sector,
341 e->size,
342 digest,
343 digest_size,
344 P_CSUM_RS_REQUEST);
345 kfree(digest);
346 } else {
347 dev_err(DEV, "kmalloc() of digest failed.\n");
348 ok = 0;
349 }
350 } else
351 ok = 1;
352
353 drbd_free_ee(mdev, e);
354
355 if (unlikely(!ok))
356 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
357 return ok;
358}
359
360#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
361
362static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
363{
364 struct drbd_epoch_entry *e;
365
366 if (!get_ldev(mdev))
367 return 0;
368
369 /* GFP_TRY, because if there is no memory available right now, this may
370 * be rescheduled for later. It is "only" background resync, after all. */
371 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
372 if (!e) {
373 put_ldev(mdev);
374 return 2;
375 }
376
377 spin_lock_irq(&mdev->req_lock);
378 list_add(&e->w.list, &mdev->read_ee);
379 spin_unlock_irq(&mdev->req_lock);
380
381 e->private_bio->bi_end_io = drbd_endio_read_sec;
382 e->private_bio->bi_rw = READ;
383 e->w.cb = w_e_send_csum;
384
385 mdev->read_cnt += size >> 9;
386 drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
387
388 return 1;
389}
390
391void resync_timer_fn(unsigned long data)
392{
393 unsigned long flags;
394 struct drbd_conf *mdev = (struct drbd_conf *) data;
395 int queue;
396
397 spin_lock_irqsave(&mdev->req_lock, flags);
398
399 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
400 queue = 1;
401 if (mdev->state.conn == C_VERIFY_S)
402 mdev->resync_work.cb = w_make_ov_request;
403 else
404 mdev->resync_work.cb = w_make_resync_request;
405 } else {
406 queue = 0;
407 mdev->resync_work.cb = w_resync_inactive;
408 }
409
410 spin_unlock_irqrestore(&mdev->req_lock, flags);
411
412 /* harmless race: list_empty outside data.work.q_lock */
413 if (list_empty(&mdev->resync_work.list) && queue)
414 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
415}
416
417int w_make_resync_request(struct drbd_conf *mdev,
418 struct drbd_work *w, int cancel)
419{
420 unsigned long bit;
421 sector_t sector;
422 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
423 int max_segment_size = queue_max_segment_size(mdev->rq_queue);
424 int number, i, size, pe, mx;
425 int align, queued, sndbuf;
426
427 if (unlikely(cancel))
428 return 1;
429
430 if (unlikely(mdev->state.conn < C_CONNECTED)) {
431 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
432 return 0;
433 }
434
435 if (mdev->state.conn != C_SYNC_TARGET)
436 dev_err(DEV, "%s in w_make_resync_request\n",
437 drbd_conn_str(mdev->state.conn));
438
439 if (!get_ldev(mdev)) {
 440 /* Since we only need to access mdev->rsync, a
441 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
442 to continue resync with a broken disk makes no sense at
443 all */
444 dev_err(DEV, "Disk broke down during resync!\n");
445 mdev->resync_work.cb = w_resync_inactive;
446 return 1;
447 }
448
449 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
450 pe = atomic_read(&mdev->rs_pending_cnt);
451
452 mutex_lock(&mdev->data.mutex);
453 if (mdev->data.socket)
454 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
455 else
456 mx = 1;
457 mutex_unlock(&mdev->data.mutex);
458
459 /* For resync rates >160MB/sec, allow more pending RS requests */
460 if (number > mx)
461 mx = number;
462
463 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
464 if ((pe + number) > mx) {
465 number = mx - pe;
466 }
467
468 for (i = 0; i < number; i++) {
 469 /* Stop generating RS requests when half of the send buffer is filled */
470 mutex_lock(&mdev->data.mutex);
471 if (mdev->data.socket) {
472 queued = mdev->data.socket->sk->sk_wmem_queued;
473 sndbuf = mdev->data.socket->sk->sk_sndbuf;
474 } else {
475 queued = 1;
476 sndbuf = 0;
477 }
478 mutex_unlock(&mdev->data.mutex);
479 if (queued > sndbuf / 2)
480 goto requeue;
481
482next_sector:
483 size = BM_BLOCK_SIZE;
484 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
485
486 if (bit == -1UL) {
487 mdev->bm_resync_fo = drbd_bm_bits(mdev);
488 mdev->resync_work.cb = w_resync_inactive;
489 put_ldev(mdev);
490 return 1;
491 }
492
493 sector = BM_BIT_TO_SECT(bit);
494
495 if (drbd_try_rs_begin_io(mdev, sector)) {
496 mdev->bm_resync_fo = bit;
497 goto requeue;
498 }
499 mdev->bm_resync_fo = bit + 1;
500
501 if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
502 drbd_rs_complete_io(mdev, sector);
503 goto next_sector;
504 }
505
506#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
507 /* try to find some adjacent bits.
508 * we stop if we have already the maximum req size.
509 *
510 * Additionally always align bigger requests, in order to
511 * be prepared for all stripe sizes of software RAIDs.
512 *
513 * we _do_ care about the agreed-upon q->max_segment_size
514 * here, as splitting up the requests on the other side is more
 515 * difficult. The consequence is that on lvm and md and other
516 * "indirect" devices, this is dead code, since
517 * q->max_segment_size will be PAGE_SIZE.
518 */
519 align = 1;
520 for (;;) {
521 if (size + BM_BLOCK_SIZE > max_segment_size)
522 break;
523
524 /* Be always aligned */
525 if (sector & ((1<<(align+3))-1))
526 break;
527
528 /* do not cross extent boundaries */
529 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
530 break;
531 /* now, is it actually dirty, after all?
532 * caution, drbd_bm_test_bit is tri-state for some
533 * obscure reason; ( b == 0 ) would get the out-of-band
534 * only accidentally right because of the "oddly sized"
535 * adjustment below */
536 if (drbd_bm_test_bit(mdev, bit+1) != 1)
537 break;
538 bit++;
539 size += BM_BLOCK_SIZE;
540 if ((BM_BLOCK_SIZE << align) <= size)
541 align++;
542 i++;
543 }
544 /* if we merged some,
545 * reset the offset to start the next drbd_bm_find_next from */
546 if (size > BM_BLOCK_SIZE)
547 mdev->bm_resync_fo = bit + 1;
548#endif
549
550 /* adjust very last sectors, in case we are oddly sized */
551 if (sector + (size>>9) > capacity)
552 size = (capacity-sector)<<9;
553 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
554 switch (read_for_csum(mdev, sector, size)) {
555 case 0: /* Disk failure*/
556 put_ldev(mdev);
557 return 0;
558 case 2: /* Allocation failed */
559 drbd_rs_complete_io(mdev, sector);
560 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
561 goto requeue;
562 /* case 1: everything ok */
563 }
564 } else {
565 inc_rs_pending(mdev);
566 if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
567 sector, size, ID_SYNCER)) {
568 dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
569 dec_rs_pending(mdev);
570 put_ldev(mdev);
571 return 0;
572 }
573 }
574 }
575
576 if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
577 /* last syncer _request_ was sent,
578 * but the P_RS_DATA_REPLY not yet received. sync will end (and
579 * next sync group will resume), as soon as we receive the last
580 * resync data block, and the last bit is cleared.
581 * until then resync "work" is "inactive" ...
582 */
583 mdev->resync_work.cb = w_resync_inactive;
584 put_ldev(mdev);
585 return 1;
586 }
587
588 requeue:
589 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
590 put_ldev(mdev);
591 return 1;
592}
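
The request budget computed at the top of w_make_resync_request() works out to roughly rate/40 requests per 100 ms timer tick. The sketch below is not part of the patch; BM_BLOCK_SIZE, SLEEP_TIME and HZ are stated as assumptions (4 KiB per bitmap bit, HZ/10, and 250 respectively) purely to show the arithmetic:

/* Back-of-the-envelope sketch (not part of the patch) of the request
 * budget computed in w_make_resync_request().  Constants are assumptions
 * for illustration. */
#include <stdio.h>

#define HZ		250		/* assumed tick rate */
#define SLEEP_TIME	(HZ / 10)	/* 100 ms between resync timer runs */
#define BM_BLOCK_SIZE	4096		/* assumed: 4 KiB per bitmap bit */

int main(void)
{
	int rate = 10240;	/* KiB/s, i.e. 10 MiB/s, as in sync_conf.rate */
	int number = SLEEP_TIME * rate / ((BM_BLOCK_SIZE / 1024) * HZ);

	/* 256 requests of 4 KiB per 100 ms tick == 10 MiB/s */
	printf("%d requests per tick\n", number);
	return 0;
}
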
593
594static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
595{
596 int number, i, size;
597 sector_t sector;
598 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
599
600 if (unlikely(cancel))
601 return 1;
602
603 if (unlikely(mdev->state.conn < C_CONNECTED)) {
604 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
605 return 0;
606 }
607
608 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
609 if (atomic_read(&mdev->rs_pending_cnt) > number)
610 goto requeue;
611
612 number -= atomic_read(&mdev->rs_pending_cnt);
613
614 sector = mdev->ov_position;
615 for (i = 0; i < number; i++) {
616 if (sector >= capacity) {
617 mdev->resync_work.cb = w_resync_inactive;
618 return 1;
619 }
620
621 size = BM_BLOCK_SIZE;
622
623 if (drbd_try_rs_begin_io(mdev, sector)) {
624 mdev->ov_position = sector;
625 goto requeue;
626 }
627
628 if (sector + (size>>9) > capacity)
629 size = (capacity-sector)<<9;
630
631 inc_rs_pending(mdev);
632 if (!drbd_send_ov_request(mdev, sector, size)) {
633 dec_rs_pending(mdev);
634 return 0;
635 }
636 sector += BM_SECT_PER_BIT;
637 }
638 mdev->ov_position = sector;
639
640 requeue:
641 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
642 return 1;
643}
644
645
646int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
647{
648 kfree(w);
649 ov_oos_print(mdev);
650 drbd_resync_finished(mdev);
651
652 return 1;
653}
654
655static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
656{
657 kfree(w);
658
659 drbd_resync_finished(mdev);
660
661 return 1;
662}
663
664int drbd_resync_finished(struct drbd_conf *mdev)
665{
666 unsigned long db, dt, dbdt;
667 unsigned long n_oos;
668 union drbd_state os, ns;
669 struct drbd_work *w;
670 char *khelper_cmd = NULL;
671
672 /* Remove all elements from the resync LRU. Since future actions
 673 * might set bits in the (main) bitmap, the entries in the
 674 * resync LRU would otherwise be wrong. */
675 if (drbd_rs_del_all(mdev)) {
676 /* In case this is not possible now, most probably because
677 * there are P_RS_DATA_REPLY Packets lingering on the worker's
678 * queue (or even the read operations for those packets
 679 * are not finished by now). Retry in 100ms. */
680
681 drbd_kick_lo(mdev);
682 __set_current_state(TASK_INTERRUPTIBLE);
683 schedule_timeout(HZ / 10);
684 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
685 if (w) {
686 w->cb = w_resync_finished;
687 drbd_queue_work(&mdev->data.work, w);
688 return 1;
689 }
690 dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
691 }
692
693 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
694 if (dt <= 0)
695 dt = 1;
696 db = mdev->rs_total;
697 dbdt = Bit2KB(db/dt);
698 mdev->rs_paused /= HZ;
699
700 if (!get_ldev(mdev))
701 goto out;
702
703 spin_lock_irq(&mdev->req_lock);
704 os = mdev->state;
705
706 /* This protects us against multiple calls (that can happen in the presence
707 of application IO), and against connectivity loss just before we arrive here. */
708 if (os.conn <= C_CONNECTED)
709 goto out_unlock;
710
711 ns = os;
712 ns.conn = C_CONNECTED;
713
714 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
715 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
716 "Online verify " : "Resync",
717 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
718
719 n_oos = drbd_bm_total_weight(mdev);
720
721 if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
722 if (n_oos) {
723 dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
724 n_oos, Bit2KB(1));
725 khelper_cmd = "out-of-sync";
726 }
727 } else {
728 D_ASSERT((n_oos - mdev->rs_failed) == 0);
729
730 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
731 khelper_cmd = "after-resync-target";
732
733 if (mdev->csums_tfm && mdev->rs_total) {
734 const unsigned long s = mdev->rs_same_csum;
735 const unsigned long t = mdev->rs_total;
736 const int ratio =
737 (t == 0) ? 0 :
738 (t < 100000) ? ((s*100)/t) : (s/(t/100));
739 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
740 "transferred %luK total %luK\n",
741 ratio,
742 Bit2KB(mdev->rs_same_csum),
743 Bit2KB(mdev->rs_total - mdev->rs_same_csum),
744 Bit2KB(mdev->rs_total));
745 }
746 }
747
748 if (mdev->rs_failed) {
749 dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
750
751 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
752 ns.disk = D_INCONSISTENT;
753 ns.pdsk = D_UP_TO_DATE;
754 } else {
755 ns.disk = D_UP_TO_DATE;
756 ns.pdsk = D_INCONSISTENT;
757 }
758 } else {
759 ns.disk = D_UP_TO_DATE;
760 ns.pdsk = D_UP_TO_DATE;
761
762 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
763 if (mdev->p_uuid) {
764 int i;
765 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
766 _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
767 drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
768 _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
769 } else {
770 dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
771 }
772 }
773
774 drbd_uuid_set_bm(mdev, 0UL);
775
776 if (mdev->p_uuid) {
777 /* Now the two UUID sets are equal, update what we
778 * know of the peer. */
779 int i;
780 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
781 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
782 }
783 }
784
785 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
786out_unlock:
787 spin_unlock_irq(&mdev->req_lock);
788 put_ldev(mdev);
789out:
790 mdev->rs_total = 0;
791 mdev->rs_failed = 0;
792 mdev->rs_paused = 0;
793 mdev->ov_start_sector = 0;
794
795 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
796 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
797 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
798 }
799
800 if (khelper_cmd)
801 drbd_khelper(mdev, khelper_cmd);
802
803 return 1;
804}
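
The percentage printed for equal checksums above switches between multiply-first and divide-first depending on the total, so that the intermediate product cannot overflow on 32-bit builds while small totals keep their precision. A stand-alone illustration (not part of the patch):

/* Illustration (not part of the patch) of the overflow-safe percentage
 * used for the "equal check sums" message above. */
#include <stdio.h>

static unsigned long ratio(unsigned long s, unsigned long t)
{
	return t == 0      ? 0 :
	       t < 100000  ? (s * 100) / t	/* multiply first: precise */
	                   : s / (t / 100);	/* divide first: no overflow */
}

int main(void)
{
	/* both print 75 */
	printf("%lu%% %lu%%\n", ratio(750, 1000), ratio(30000000, 40000000));
	return 0;
}
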
805
806/* helper */
807static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
808{
809 if (drbd_bio_has_active_page(e->private_bio)) {
810 /* This might happen if sendpage() has not finished */
811 spin_lock_irq(&mdev->req_lock);
812 list_add_tail(&e->w.list, &mdev->net_ee);
813 spin_unlock_irq(&mdev->req_lock);
814 } else
815 drbd_free_ee(mdev, e);
816}
817
818/**
819 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
820 * @mdev: DRBD device.
821 * @w: work object.
 822 * @cancel: The connection will be closed anyway
823 */
824int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
825{
826 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
827 int ok;
828
829 if (unlikely(cancel)) {
830 drbd_free_ee(mdev, e);
831 dec_unacked(mdev);
832 return 1;
833 }
834
835 if (likely(drbd_bio_uptodate(e->private_bio))) {
836 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
837 } else {
838 if (__ratelimit(&drbd_ratelimit_state))
839 dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
840 (unsigned long long)e->sector);
841
842 ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
843 }
844
845 dec_unacked(mdev);
846
847 move_to_net_ee_or_free(mdev, e);
848
849 if (unlikely(!ok))
850 dev_err(DEV, "drbd_send_block() failed\n");
851 return ok;
852}
853
854/**
 855 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
856 * @mdev: DRBD device.
857 * @w: work object.
 858 * @cancel: The connection will be closed anyway
859 */
860int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
861{
862 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
863 int ok;
864
865 if (unlikely(cancel)) {
866 drbd_free_ee(mdev, e);
867 dec_unacked(mdev);
868 return 1;
869 }
870
871 if (get_ldev_if_state(mdev, D_FAILED)) {
872 drbd_rs_complete_io(mdev, e->sector);
873 put_ldev(mdev);
874 }
875
876 if (likely(drbd_bio_uptodate(e->private_bio))) {
877 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
878 inc_rs_pending(mdev);
879 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
880 } else {
881 if (__ratelimit(&drbd_ratelimit_state))
882 dev_err(DEV, "Not sending RSDataReply, "
883 "partner DISKLESS!\n");
884 ok = 1;
885 }
886 } else {
887 if (__ratelimit(&drbd_ratelimit_state))
888 dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
889 (unsigned long long)e->sector);
890
891 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
892
893 /* update resync data with failure */
894 drbd_rs_failed_io(mdev, e->sector, e->size);
895 }
896
897 dec_unacked(mdev);
898
899 move_to_net_ee_or_free(mdev, e);
900
901 if (unlikely(!ok))
902 dev_err(DEV, "drbd_send_block() failed\n");
903 return ok;
904}
905
906int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
907{
908 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
909 struct digest_info *di;
910 int digest_size;
911 void *digest = NULL;
912 int ok, eq = 0;
913
914 if (unlikely(cancel)) {
915 drbd_free_ee(mdev, e);
916 dec_unacked(mdev);
917 return 1;
918 }
919
920 drbd_rs_complete_io(mdev, e->sector);
921
922 di = (struct digest_info *)(unsigned long)e->block_id;
923
924 if (likely(drbd_bio_uptodate(e->private_bio))) {
925 /* quick hack to try to avoid a race against reconfiguration.
926 * a real fix would be much more involved,
927 * introducing more locking mechanisms */
928 if (mdev->csums_tfm) {
929 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
930 D_ASSERT(digest_size == di->digest_size);
931 digest = kmalloc(digest_size, GFP_NOIO);
932 }
933 if (digest) {
934 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
935 eq = !memcmp(digest, di->digest, digest_size);
936 kfree(digest);
937 }
938
939 if (eq) {
940 drbd_set_in_sync(mdev, e->sector, e->size);
941 /* rs_same_csums unit is BM_BLOCK_SIZE */
942 mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
943 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
944 } else {
945 inc_rs_pending(mdev);
946 e->block_id = ID_SYNCER;
947 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
948 }
949 } else {
950 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
951 if (__ratelimit(&drbd_ratelimit_state))
952 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
953 }
954
955 dec_unacked(mdev);
956
957 kfree(di);
958
959 move_to_net_ee_or_free(mdev, e);
960
961 if (unlikely(!ok))
962 dev_err(DEV, "drbd_send_block/ack() failed\n");
963 return ok;
964}
965
966int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
967{
968 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
969 int digest_size;
970 void *digest;
971 int ok = 1;
972
973 if (unlikely(cancel))
974 goto out;
975
976 if (unlikely(!drbd_bio_uptodate(e->private_bio)))
977 goto out;
978
979 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
980 /* FIXME if this allocation fails, online verify will not terminate! */
981 digest = kmalloc(digest_size, GFP_NOIO);
982 if (digest) {
983 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
984 inc_rs_pending(mdev);
985 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
986 digest, digest_size, P_OV_REPLY);
987 if (!ok)
988 dec_rs_pending(mdev);
989 kfree(digest);
990 }
991
992out:
993 drbd_free_ee(mdev, e);
994
995 dec_unacked(mdev);
996
997 return ok;
998}
999
1000void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1001{
1002 if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
1003 mdev->ov_last_oos_size += size>>9;
1004 } else {
1005 mdev->ov_last_oos_start = sector;
1006 mdev->ov_last_oos_size = size>>9;
1007 }
1008 drbd_set_out_of_sync(mdev, sector, size);
1009 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1010}
1011
1012int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1013{
1014 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1015 struct digest_info *di;
1016 int digest_size;
1017 void *digest;
1018 int ok, eq = 0;
1019
1020 if (unlikely(cancel)) {
1021 drbd_free_ee(mdev, e);
1022 dec_unacked(mdev);
1023 return 1;
1024 }
1025
1026 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1027 * the resync lru has been cleaned up already */
1028 drbd_rs_complete_io(mdev, e->sector);
1029
1030 di = (struct digest_info *)(unsigned long)e->block_id;
1031
1032 if (likely(drbd_bio_uptodate(e->private_bio))) {
1033 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1034 digest = kmalloc(digest_size, GFP_NOIO);
1035 if (digest) {
1036 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
1037
1038 D_ASSERT(digest_size == di->digest_size);
1039 eq = !memcmp(digest, di->digest, digest_size);
1040 kfree(digest);
1041 }
1042 } else {
1043 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1044 if (__ratelimit(&drbd_ratelimit_state))
1045 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1046 }
1047
1048 dec_unacked(mdev);
1049
1050 kfree(di);
1051
1052 if (!eq)
1053 drbd_ov_oos_found(mdev, e->sector, e->size);
1054 else
1055 ov_oos_print(mdev);
1056
1057 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
1058 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1059
1060 drbd_free_ee(mdev, e);
1061
1062 if (--mdev->ov_left == 0) {
1063 ov_oos_print(mdev);
1064 drbd_resync_finished(mdev);
1065 }
1066
1067 return ok;
1068}
1069
1070int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1071{
1072 struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1073 complete(&b->done);
1074 return 1;
1075}
1076
1077int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1078{
1079 struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
1080 struct p_barrier *p = &mdev->data.sbuf.barrier;
1081 int ok = 1;
1082
1083 /* really avoid racing with tl_clear. w.cb may have been referenced
1084 * just before it was reassigned and re-queued, so double check that.
1085 * actually, this race was harmless, since we only try to send the
1086 * barrier packet here, and otherwise do nothing with the object.
1087 * but compare with the head of w_clear_epoch */
1088 spin_lock_irq(&mdev->req_lock);
1089 if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
1090 cancel = 1;
1091 spin_unlock_irq(&mdev->req_lock);
1092 if (cancel)
1093 return 1;
1094
1095 if (!drbd_get_data_sock(mdev))
1096 return 0;
1097 p->barrier = b->br_number;
1098 /* inc_ap_pending was done where this was queued.
1099 * dec_ap_pending will be done in got_BarrierAck
1100 * or (on connection loss) in w_clear_epoch. */
1101 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1102 (struct p_header *)p, sizeof(*p), 0);
1103 drbd_put_data_sock(mdev);
1104
1105 return ok;
1106}
1107
1108int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1109{
1110 if (cancel)
1111 return 1;
1112 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1113}
1114
1115/**
1116 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1117 * @mdev: DRBD device.
1118 * @w: work object.
1119 * @cancel: The connection will be closed anyway
1120 */
1121int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1122{
1123 struct drbd_request *req = container_of(w, struct drbd_request, w);
1124 int ok;
1125
1126 if (unlikely(cancel)) {
1127 req_mod(req, send_canceled);
1128 return 1;
1129 }
1130
1131 ok = drbd_send_dblock(mdev, req);
1132 req_mod(req, ok ? handed_over_to_network : send_failed);
1133
1134 return ok;
1135}
1136
1137/**
1138 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1139 * @mdev: DRBD device.
1140 * @w: work object.
1141 * @cancel: The connection will be closed anyway
1142 */
1143int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1144{
1145 struct drbd_request *req = container_of(w, struct drbd_request, w);
1146 int ok;
1147
1148 if (unlikely(cancel)) {
1149 req_mod(req, send_canceled);
1150 return 1;
1151 }
1152
1153 ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
1154 (unsigned long)req);
1155
1156 if (!ok) {
1157 /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
1158 * so this is probably redundant */
1159 if (mdev->state.conn >= C_CONNECTED)
1160 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
1161 }
1162 req_mod(req, ok ? handed_over_to_network : send_failed);
1163
1164 return ok;
1165}
1166
1167static int _drbd_may_sync_now(struct drbd_conf *mdev)
1168{
1169 struct drbd_conf *odev = mdev;
1170
1171 while (1) {
1172 if (odev->sync_conf.after == -1)
1173 return 1;
1174 odev = minor_to_mdev(odev->sync_conf.after);
1175 ERR_IF(!odev) return 1;
1176 if ((odev->state.conn >= C_SYNC_SOURCE &&
1177 odev->state.conn <= C_PAUSED_SYNC_T) ||
1178 odev->state.aftr_isp || odev->state.peer_isp ||
1179 odev->state.user_isp)
1180 return 0;
1181 }
1182}
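
The helper above climbs the resync-after chain: each device may name another minor it must sync after, and a device may only resync if no ancestor in that chain is itself syncing or paused. A simplified sketch of the walk, with a plain array standing in for minor_to_mdev() (all names hypothetical):

	#include <stdbool.h>
	#include <stddef.h>

	struct dev {
		int after;     /* minor to sync after, or -1 for "no dependency" */
		bool busy;     /* simplified: syncing, paused, or an isp flag set */
	};

	/* Return true if devs[minor] may start a resync now. */
	static bool may_sync_now(const struct dev *devs, size_t ndevs, int minor)
	{
		const struct dev *odev = &devs[minor];

		for (;;) {
			int after = odev->after;

			if (after == -1)
				return true;        /* top of the chain: nothing blocks us */
			if (after < 0 || (size_t)after >= ndevs)
				return true;        /* dangling reference: treat as chain end */
			odev = &devs[after];
			if (odev->busy)
				return false;       /* an ancestor is syncing or paused */
		}
	}
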
1183
1184/**
1185 * _drbd_pause_after() - Pause resync on all devices that may not resync now
1186 * @mdev: DRBD device.
1187 *
1188 * Called from process context only (admin command and after_state_ch).
1189 */
1190static int _drbd_pause_after(struct drbd_conf *mdev)
1191{
1192 struct drbd_conf *odev;
1193 int i, rv = 0;
1194
1195 for (i = 0; i < minor_count; i++) {
1196 odev = minor_to_mdev(i);
1197 if (!odev)
1198 continue;
1199 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1200 continue;
1201 if (!_drbd_may_sync_now(odev))
1202 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1203 != SS_NOTHING_TO_DO);
1204 }
1205
1206 return rv;
1207}
1208
1209/**
1210 * _drbd_resume_next() - Resume resync on all devices that may resync now
1211 * @mdev: DRBD device.
1212 *
1213 * Called from process context only (admin command and worker).
1214 */
1215static int _drbd_resume_next(struct drbd_conf *mdev)
1216{
1217 struct drbd_conf *odev;
1218 int i, rv = 0;
1219
1220 for (i = 0; i < minor_count; i++) {
1221 odev = minor_to_mdev(i);
1222 if (!odev)
1223 continue;
1224 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1225 continue;
1226 if (odev->state.aftr_isp) {
1227 if (_drbd_may_sync_now(odev))
1228 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1229 CS_HARD, NULL)
1230 != SS_NOTHING_TO_DO) ;
1231 }
1232 }
1233 return rv;
1234}
1235
1236void resume_next_sg(struct drbd_conf *mdev)
1237{
1238 write_lock_irq(&global_state_lock);
1239 _drbd_resume_next(mdev);
1240 write_unlock_irq(&global_state_lock);
1241}
1242
1243void suspend_other_sg(struct drbd_conf *mdev)
1244{
1245 write_lock_irq(&global_state_lock);
1246 _drbd_pause_after(mdev);
1247 write_unlock_irq(&global_state_lock);
1248}
1249
1250static int sync_after_error(struct drbd_conf *mdev, int o_minor)
1251{
1252 struct drbd_conf *odev;
1253
1254 if (o_minor == -1)
1255 return NO_ERROR;
1256 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
1257 return ERR_SYNC_AFTER;
1258
1259 /* check for loops */
1260 odev = minor_to_mdev(o_minor);
1261 while (1) {
1262 if (odev == mdev)
1263 return ERR_SYNC_AFTER_CYCLE;
1264
1265 /* dependency chain ends here, no cycles. */
1266 if (odev->sync_conf.after == -1)
1267 return NO_ERROR;
1268
1269 /* follow the dependency chain */
1270 odev = minor_to_mdev(odev->sync_conf.after);
1271 }
1272}
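
sync_after_error() accepts a new resync-after target only if following the chain from that target never leads back to the device being configured, which would create a dependency cycle. The same check reduced to an array walk (hypothetical names; the kernel returns ERR_SYNC_AFTER_CYCLE rather than a bool):

	#include <stdbool.h>
	#include <stddef.h>

	/* after[i] is the minor that device i syncs after, or -1. Return true if
	 * making 'self' sync after 'target' would introduce a dependency cycle. */
	static bool creates_cycle(const int *after, size_t n, int self, int target)
	{
		int cur = target;

		while (cur != -1) {
			if (cur == self)
				return true;        /* chain leads back to us: cycle */
			if (cur < 0 || (size_t)cur >= n)
				return false;       /* dangling reference, treated as chain end */
			cur = after[cur];
		}
		return false;                       /* chain ends cleanly: no cycle */
	}
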
1273
1274int drbd_alter_sa(struct drbd_conf *mdev, int na)
1275{
1276 int changes;
1277 int retcode;
1278
1279 write_lock_irq(&global_state_lock);
1280 retcode = sync_after_error(mdev, na);
1281 if (retcode == NO_ERROR) {
1282 mdev->sync_conf.after = na;
1283 do {
1284 changes = _drbd_pause_after(mdev);
1285 changes |= _drbd_resume_next(mdev);
1286 } while (changes);
1287 }
1288 write_unlock_irq(&global_state_lock);
1289 return retcode;
1290}
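
drbd_alter_sa() installs the new dependency and then alternates _drbd_pause_after() and _drbd_resume_next() until neither reports a change, because resuming one device can in turn require pausing another further down the chain. A tiny sketch of that iterate-to-a-fixed-point idiom (hypothetical step functions, not the DRBD types):

	#include <stdbool.h>

	/* Each step returns true if it changed any device state. Re-run both
	 * until a full pass makes no change, i.e. a fixed point is reached. */
	static void settle(bool (*pause_step)(void), bool (*resume_step)(void))
	{
		bool changed;

		do {
			changed  = pause_step();
			changed |= resume_step();
		} while (changed);
	}
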
1291
1292static void ping_peer(struct drbd_conf *mdev)
1293{
1294 clear_bit(GOT_PING_ACK, &mdev->flags);
1295 request_ping(mdev);
1296 wait_event(mdev->misc_wait,
1297 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
1298}
1299
1300/**
1301 * drbd_start_resync() - Start the resync process
1302 * @mdev: DRBD device.
1303 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1304 *
1305 * This function might bring you directly into one of the
1306 * C_PAUSED_SYNC_* states.
1307 */
1308void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1309{
1310 union drbd_state ns;
1311 int r;
1312
1313 if (mdev->state.conn >= C_SYNC_SOURCE) {
1314 dev_err(DEV, "Resync already running!\n");
1315 return;
1316 }
1317
1318 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1319 drbd_rs_cancel_all(mdev);
1320
1321 if (side == C_SYNC_TARGET) {
1322 /* Since application IO was locked out during C_WF_BITMAP_T and
1323 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1324 we check whether we may make the data inconsistent. */
1325 r = drbd_khelper(mdev, "before-resync-target");
1326 r = (r >> 8) & 0xff;
1327 if (r > 0) {
1328 dev_info(DEV, "before-resync-target handler returned %d, "
1329 "dropping connection.\n", r);
1330 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1331 return;
1332 }
1333 }
1334
1335 drbd_state_lock(mdev);
1336
1337 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1338 drbd_state_unlock(mdev);
1339 return;
1340 }
1341
1342 if (side == C_SYNC_TARGET) {
1343 mdev->bm_resync_fo = 0;
1344 } else /* side == C_SYNC_SOURCE */ {
1345 u64 uuid;
1346
1347 get_random_bytes(&uuid, sizeof(u64));
1348 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1349 drbd_send_sync_uuid(mdev, uuid);
1350
1351 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1352 }
1353
1354 write_lock_irq(&global_state_lock);
1355 ns = mdev->state;
1356
1357 ns.aftr_isp = !_drbd_may_sync_now(mdev);
1358
1359 ns.conn = side;
1360
1361 if (side == C_SYNC_TARGET)
1362 ns.disk = D_INCONSISTENT;
1363 else /* side == C_SYNC_SOURCE */
1364 ns.pdsk = D_INCONSISTENT;
1365
1366 r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1367 ns = mdev->state;
1368
1369 if (ns.conn < C_CONNECTED)
1370 r = SS_UNKNOWN_ERROR;
1371
1372 if (r == SS_SUCCESS) {
1373 mdev->rs_total =
1374 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
1375 mdev->rs_failed = 0;
1376 mdev->rs_paused = 0;
1377 mdev->rs_start =
1378 mdev->rs_mark_time = jiffies;
1379 mdev->rs_same_csum = 0;
1380 _drbd_pause_after(mdev);
1381 }
1382 write_unlock_irq(&global_state_lock);
1383 put_ldev(mdev);
1384
1385 if (r == SS_SUCCESS) {
1386 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1387 drbd_conn_str(ns.conn),
1388 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1389 (unsigned long) mdev->rs_total);
1390
1391 if (mdev->rs_total == 0) {
1392 /* Peer still reachable? Beware of failing before-resync-target handlers! */
1393 ping_peer(mdev);
1394 drbd_resync_finished(mdev);
1395 }
1396
1397 /* ns.conn may already be != mdev->state.conn,
1398 * we may have been paused in between, or become paused until
1399 * the timer triggers.
1400 * No matter, that is handled in resync_timer_fn() */
1401 if (ns.conn == C_SYNC_TARGET)
1402 mod_timer(&mdev->resync_timer, jiffies);
1403
1404 drbd_md_sync(mdev);
1405 }
1406 drbd_state_unlock(mdev);
1407}
1408
1409int drbd_worker(struct drbd_thread *thi)
1410{
1411 struct drbd_conf *mdev = thi->mdev;
1412 struct drbd_work *w = NULL;
1413 LIST_HEAD(work_list);
1414 int intr = 0, i;
1415
1416 sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
1417
1418 while (get_t_state(thi) == Running) {
1419 drbd_thread_current_set_cpu(mdev);
1420
1421 if (down_trylock(&mdev->data.work.s)) {
1422 mutex_lock(&mdev->data.mutex);
1423 if (mdev->data.socket && !mdev->net_conf->no_cork)
1424 drbd_tcp_uncork(mdev->data.socket);
1425 mutex_unlock(&mdev->data.mutex);
1426
1427 intr = down_interruptible(&mdev->data.work.s);
1428
1429 mutex_lock(&mdev->data.mutex);
1430 if (mdev->data.socket && !mdev->net_conf->no_cork)
1431 drbd_tcp_cork(mdev->data.socket);
1432 mutex_unlock(&mdev->data.mutex);
1433 }
1434
1435 if (intr) {
1436 D_ASSERT(intr == -EINTR);
1437 flush_signals(current);
1438 ERR_IF (get_t_state(thi) == Running)
1439 continue;
1440 break;
1441 }
1442
1443 if (get_t_state(thi) != Running)
1444 break;
1445 /* With this break, we have done a down() but not consumed
1446 the entry from the list. The cleanup code takes care of
1447 this... */
1448
1449 w = NULL;
1450 spin_lock_irq(&mdev->data.work.q_lock);
1451 ERR_IF(list_empty(&mdev->data.work.q)) {
1452 /* something terribly wrong in our logic.
1453 * we were able to down() the semaphore,
1454 * but the list is empty... doh.
1455 *
1456 * what is the best thing to do now?
1457 * try again from scratch, restarting the receiver,
1458 * asender, whatnot? that could break even more badly,
1459 * e.g. when we are primary but have no good local data.
1460 *
1461 * I'll try to get away just starting over this loop.
1462 */
1463 spin_unlock_irq(&mdev->data.work.q_lock);
1464 continue;
1465 }
1466 w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
1467 list_del_init(&w->list);
1468 spin_unlock_irq(&mdev->data.work.q_lock);
1469
1470 if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
1471 /* dev_warn(DEV, "worker: a callback failed! \n"); */
1472 if (mdev->state.conn >= C_CONNECTED)
1473 drbd_force_state(mdev,
1474 NS(conn, C_NETWORK_FAILURE));
1475 }
1476 }
1477 D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
1478 D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
1479
1480 spin_lock_irq(&mdev->data.work.q_lock);
1481 i = 0;
1482 while (!list_empty(&mdev->data.work.q)) {
1483 list_splice_init(&mdev->data.work.q, &work_list);
1484 spin_unlock_irq(&mdev->data.work.q_lock);
1485
1486 while (!list_empty(&work_list)) {
1487 w = list_entry(work_list.next, struct drbd_work, list);
1488 list_del_init(&w->list);
1489 w->cb(mdev, w, 1);
1490 i++; /* dead debugging code */
1491 }
1492
1493 spin_lock_irq(&mdev->data.work.q_lock);
1494 }
1495 sema_init(&mdev->data.work.s, 0);
1496 /* DANGEROUS race: if someone queued work while holding the spinlock,
1497 * but called up() outside of it, we could get an up() on the
1498 * semaphore without a corresponding list entry.
1499 * So don't do that.
1500 */
1501 spin_unlock_irq(&mdev->data.work.q_lock);
1502
1503 D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
1504 /* _drbd_set_state only uses stop_nowait.
1505 * wait here for the Exiting receiver. */
1506 drbd_thread_stop(&mdev->receiver);
1507 drbd_mdev_cleanup(mdev);
1508
1509 dev_info(DEV, "worker terminated\n");
1510
1511 clear_bit(DEVICE_DYING, &mdev->flags);
1512 clear_bit(CONFIG_PENDING, &mdev->flags);
1513 wake_up(&mdev->state_wait);
1514
1515 return 0;
1516}
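
The worker above only uncorks the data socket when it is about to block on an empty work queue, so batched packets are flushed exactly when no further work is imminent, and it re-corks before processing the next batch. A user-space sketch of that pattern around a counting semaphore (TCP_CORK is the real Linux socket option; the helper names and the overall shape are an illustrative reduction, not the DRBD code):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <semaphore.h>
	#include <sys/socket.h>

	static void set_cork(int fd, int on)
	{
		/* TCP_CORK batches small writes until uncorked (Linux-specific). */
		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	}

	/* Wait for the next work item; flush pending data only while we block. */
	static void wait_for_work(sem_t *work, int sock_fd)
	{
		if (sem_trywait(work) == 0)
			return;             /* work already queued: stay corked */

		set_cork(sock_fd, 0);       /* nothing to do: push out batched packets */
		sem_wait(work);             /* block until new work arrives */
		set_cork(sock_fd, 1);       /* batch again while we process it */
	}
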
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
new file mode 100644
index 000000000000..f93fa111ce50
--- /dev/null
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -0,0 +1,91 @@
1#ifndef _DRBD_WRAPPERS_H
2#define _DRBD_WRAPPERS_H
3
4#include <linux/ctype.h>
5#include <linux/mm.h>
6
7/* see get_sb_bdev and bd_claim */
8extern char *drbd_sec_holder;
9
10/* sets the number of 512 byte sectors of our virtual device */
11static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
12 sector_t size)
13{
14 /* set_capacity(mdev->this_bdev->bd_disk, size); */
15 set_capacity(mdev->vdisk, size);
16 mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
17}
18
19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20
21static inline int drbd_bio_has_active_page(struct bio *bio)
22{
23 struct bio_vec *bvec;
24 int i;
25
26 __bio_for_each_segment(bvec, bio, i, 0) {
27 if (page_count(bvec->bv_page) > 1)
28 return 1;
29 }
30
31 return 0;
32}
33
34/* bi_end_io handlers */
35extern void drbd_md_io_complete(struct bio *bio, int error);
36extern void drbd_endio_read_sec(struct bio *bio, int error);
37extern void drbd_endio_write_sec(struct bio *bio, int error);
38extern void drbd_endio_pri(struct bio *bio, int error);
39
40/*
41 * used to submit our private bio
42 */
43static inline void drbd_generic_make_request(struct drbd_conf *mdev,
44 int fault_type, struct bio *bio)
45{
46 __release(local);
47 if (!bio->bi_bdev) {
48 printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
49 "bio->bi_bdev == NULL\n",
50 mdev_to_minor(mdev));
51 dump_stack();
52 bio_endio(bio, -ENODEV);
53 return;
54 }
55
56 if (FAULT_ACTIVE(mdev, fault_type))
57 bio_endio(bio, -EIO);
58 else
59 generic_make_request(bio);
60}
61
62static inline void drbd_plug_device(struct drbd_conf *mdev)
63{
64 struct request_queue *q;
65 q = bdev_get_queue(mdev->this_bdev);
66
67 spin_lock_irq(q->queue_lock);
68
69/* XXX the check on !blk_queue_plugged is redundant,
70 * implicitly checked in blk_plug_device */
71
72 if (!blk_queue_plugged(q)) {
73 blk_plug_device(q);
74 del_timer(&q->unplug_timer);
75 /* unplugging should not happen automatically... */
76 }
77 spin_unlock_irq(q->queue_lock);
78}
79
80static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
81{
82 return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
83 == CRYPTO_ALG_TYPE_HASH;
84}
85
86#ifndef __CHECKER__
87# undef __cond_lock
88# define __cond_lock(x,c) (c)
89#endif
90
91#endif
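
drbd_generic_make_request() above is a thin gate in front of generic_make_request(): it rejects a bio with no target device and, when the configured fault type is armed, completes the bio with -EIO instead of submitting it, so error paths can be exercised without real hardware faults. A simplified user-space sketch of such a fault-injection gate (all types and the random trigger here are hypothetical):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdlib.h>

	struct io_req {
		int target_fd;                              /* -1 means "no device attached" */
		void (*complete)(struct io_req *req, int error);
	};

	/* Simulate a failure for roughly 'rate' out of 1000 submissions. */
	static bool fault_active(int rate)
	{
		return rate > 0 && (rand() % 1000) < rate;
	}

	static void submit_or_fail(struct io_req *req, int fault_rate,
				   void (*submit)(struct io_req *req))
	{
		if (req->target_fd < 0) {                   /* no backing device: fail right away */
			req->complete(req, -ENODEV);
			return;
		}
		if (fault_active(fault_rate))               /* armed: complete with an injected error */
			req->complete(req, -EIO);
		else
			submit(req);                        /* normal path: hand off for real I/O */
	}
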
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5c01f747571b..90c4038702da 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -144,13 +144,23 @@
144 * Better audit of register_blkdev. 144 * Better audit of register_blkdev.
145 */ 145 */
146 146
147#define FLOPPY_SANITY_CHECK
148#undef FLOPPY_SILENT_DCL_CLEAR 147#undef FLOPPY_SILENT_DCL_CLEAR
149 148
150#define REALLY_SLOW_IO 149#define REALLY_SLOW_IO
151 150
152#define DEBUGT 2 151#define DEBUGT 2
153#define DCL_DEBUG /* debug disk change line */ 152
153#define DPRINT(format, args...) \
154 pr_info("floppy%d: " format, current_drive, ##args)
155
156#define DCL_DEBUG /* debug disk change line */
157#ifdef DCL_DEBUG
158#define debug_dcl(test, fmt, args...) \
159 do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0)
160#else
161#define debug_dcl(test, fmt, args...) \
162 do { if (0) DPRINT(fmt, ##args); } while (0)
163#endif
154 164
155/* do print messages for unexpected interrupts */ 165/* do print messages for unexpected interrupts */
156static int print_unex = 1; 166static int print_unex = 1;
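
The new debug_dcl() keeps its call sites compiled even when DCL_DEBUG is off: the do { if (0) ... } while (0) form lets the compiler type-check the format string and arguments and then discard the call, instead of hiding it behind an empty macro. The same pattern in miniature (hypothetical DEBUG switch and log target):

	#include <stdio.h>

	#define MY_DEBUG 0   /* flip to 1 to enable the messages */

	#if MY_DEBUG
	#define dbg(fmt, ...) \
		do { fprintf(stderr, "dbg: " fmt, ##__VA_ARGS__); } while (0)
	#else
	/* if (0) keeps the arguments visible to the compiler but emits no code */
	#define dbg(fmt, ...) \
		do { if (0) fprintf(stderr, "dbg: " fmt, ##__VA_ARGS__); } while (0)
	#endif
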
@@ -180,6 +190,8 @@ static int print_unex = 1;
180#include <linux/mod_devicetable.h> 190#include <linux/mod_devicetable.h>
181#include <linux/buffer_head.h> /* for invalidate_buffers() */ 191#include <linux/buffer_head.h> /* for invalidate_buffers() */
182#include <linux/mutex.h> 192#include <linux/mutex.h>
193#include <linux/io.h>
194#include <linux/uaccess.h>
183 195
184/* 196/*
185 * PS/2 floppies have much slower step rates than regular floppies. 197 * PS/2 floppies have much slower step rates than regular floppies.
@@ -191,8 +203,6 @@ static int slow_floppy;
191#include <asm/dma.h> 203#include <asm/dma.h>
192#include <asm/irq.h> 204#include <asm/irq.h>
193#include <asm/system.h> 205#include <asm/system.h>
194#include <asm/io.h>
195#include <asm/uaccess.h>
196 206
197static int FLOPPY_IRQ = 6; 207static int FLOPPY_IRQ = 6;
198static int FLOPPY_DMA = 2; 208static int FLOPPY_DMA = 2;
@@ -241,8 +251,6 @@ static int allowed_drive_mask = 0x33;
241 251
242static int irqdma_allocated; 252static int irqdma_allocated;
243 253
244#define DEVICE_NAME "floppy"
245
246#include <linux/blkdev.h> 254#include <linux/blkdev.h>
247#include <linux/blkpg.h> 255#include <linux/blkpg.h>
248#include <linux/cdrom.h> /* for the compatibility eject ioctl */ 256#include <linux/cdrom.h> /* for the compatibility eject ioctl */
@@ -250,7 +258,7 @@ static int irqdma_allocated;
250 258
251static struct request *current_req; 259static struct request *current_req;
252static struct request_queue *floppy_queue; 260static struct request_queue *floppy_queue;
253static void do_fd_request(struct request_queue * q); 261static void do_fd_request(struct request_queue *q);
254 262
255#ifndef fd_get_dma_residue 263#ifndef fd_get_dma_residue
256#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) 264#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -263,7 +271,7 @@ static void do_fd_request(struct request_queue * q);
263#endif 271#endif
264 272
265#ifndef fd_dma_mem_alloc 273#ifndef fd_dma_mem_alloc
266#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size)) 274#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL, get_order(size))
267#endif 275#endif
268 276
269static inline void fallback_on_nodma_alloc(char **addr, size_t l) 277static inline void fallback_on_nodma_alloc(char **addr, size_t l)
@@ -273,7 +281,7 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l)
273 return; /* we have the memory */ 281 return; /* we have the memory */
274 if (can_use_virtual_dma != 2) 282 if (can_use_virtual_dma != 2)
275 return; /* no fallback allowed */ 283 return; /* no fallback allowed */
276 printk("DMA memory shortage. Temporarily falling back on virtual DMA\n"); 284 pr_info("DMA memory shortage. Temporarily falling back on virtual DMA\n");
277 *addr = (char *)nodma_mem_alloc(l); 285 *addr = (char *)nodma_mem_alloc(l);
278#else 286#else
279 return; 287 return;
@@ -283,59 +291,50 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l)
283/* End dma memory related stuff */ 291/* End dma memory related stuff */
284 292
285static unsigned long fake_change; 293static unsigned long fake_change;
286static int initialising = 1; 294static bool initialized;
287 295
288#define ITYPE(x) (((x)>>2) & 0x1f) 296#define ITYPE(x) (((x) >> 2) & 0x1f)
289#define TOMINOR(x) ((x & 3) | ((x & 4) << 5)) 297#define TOMINOR(x) ((x & 3) | ((x & 4) << 5))
290#define UNIT(x) ((x) & 0x03) /* drive on fdc */ 298#define UNIT(x) ((x) & 0x03) /* drive on fdc */
291#define FDC(x) (((x) & 0x04) >> 2) /* fdc of drive */ 299#define FDC(x) (((x) & 0x04) >> 2) /* fdc of drive */
292 /* reverse mapping from unit and fdc to drive */ 300 /* reverse mapping from unit and fdc to drive */
293#define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2)) 301#define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2))
294#define DP (&drive_params[current_drive])
295#define DRS (&drive_state[current_drive])
296#define DRWE (&write_errors[current_drive])
297#define FDCS (&fdc_state[fdc])
298#define CLEARF(x) clear_bit(x##_BIT, &DRS->flags)
299#define SETF(x) set_bit(x##_BIT, &DRS->flags)
300#define TESTF(x) test_bit(x##_BIT, &DRS->flags)
301 302
302#define UDP (&drive_params[drive]) 303#define DP (&drive_params[current_drive])
303#define UDRS (&drive_state[drive]) 304#define DRS (&drive_state[current_drive])
304#define UDRWE (&write_errors[drive]) 305#define DRWE (&write_errors[current_drive])
305#define UFDCS (&fdc_state[FDC(drive)]) 306#define FDCS (&fdc_state[fdc])
306#define UCLEARF(x) clear_bit(x##_BIT, &UDRS->flags)
307#define USETF(x) set_bit(x##_BIT, &UDRS->flags)
308#define UTESTF(x) test_bit(x##_BIT, &UDRS->flags)
309 307
310#define DPRINT(format, args...) printk(DEVICE_NAME "%d: " format, current_drive , ## args) 308#define UDP (&drive_params[drive])
309#define UDRS (&drive_state[drive])
310#define UDRWE (&write_errors[drive])
311#define UFDCS (&fdc_state[FDC(drive)])
311 312
312#define PH_HEAD(floppy,head) (((((floppy)->stretch & 2) >>1) ^ head) << 2) 313#define PH_HEAD(floppy, head) (((((floppy)->stretch & 2) >> 1) ^ head) << 2)
313#define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH) 314#define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH)
314
315#define CLEARSTRUCT(x) memset((x), 0, sizeof(*(x)))
316 315
317/* read/write */ 316/* read/write */
318#define COMMAND raw_cmd->cmd[0] 317#define COMMAND (raw_cmd->cmd[0])
319#define DR_SELECT raw_cmd->cmd[1] 318#define DR_SELECT (raw_cmd->cmd[1])
320#define TRACK raw_cmd->cmd[2] 319#define TRACK (raw_cmd->cmd[2])
321#define HEAD raw_cmd->cmd[3] 320#define HEAD (raw_cmd->cmd[3])
322#define SECTOR raw_cmd->cmd[4] 321#define SECTOR (raw_cmd->cmd[4])
323#define SIZECODE raw_cmd->cmd[5] 322#define SIZECODE (raw_cmd->cmd[5])
324#define SECT_PER_TRACK raw_cmd->cmd[6] 323#define SECT_PER_TRACK (raw_cmd->cmd[6])
325#define GAP raw_cmd->cmd[7] 324#define GAP (raw_cmd->cmd[7])
326#define SIZECODE2 raw_cmd->cmd[8] 325#define SIZECODE2 (raw_cmd->cmd[8])
327#define NR_RW 9 326#define NR_RW 9
328 327
329/* format */ 328/* format */
330#define F_SIZECODE raw_cmd->cmd[2] 329#define F_SIZECODE (raw_cmd->cmd[2])
331#define F_SECT_PER_TRACK raw_cmd->cmd[3] 330#define F_SECT_PER_TRACK (raw_cmd->cmd[3])
332#define F_GAP raw_cmd->cmd[4] 331#define F_GAP (raw_cmd->cmd[4])
333#define F_FILL raw_cmd->cmd[5] 332#define F_FILL (raw_cmd->cmd[5])
334#define NR_F 6 333#define NR_F 6
335 334
336/* 335/*
337 * Maximum disk size (in kilobytes). This default is used whenever the 336 * Maximum disk size (in kilobytes).
338 * current disk size is unknown. 337 * This default is used whenever the current disk size is unknown.
339 * [Now it is rather a minimum] 338 * [Now it is rather a minimum]
340 */ 339 */
341#define MAX_DISK_SIZE 4 /* 3984 */ 340#define MAX_DISK_SIZE 4 /* 3984 */
@@ -345,16 +344,17 @@ static int initialising = 1;
345 */ 344 */
346#define MAX_REPLIES 16 345#define MAX_REPLIES 16
347static unsigned char reply_buffer[MAX_REPLIES]; 346static unsigned char reply_buffer[MAX_REPLIES];
348static int inr; /* size of reply buffer, when called from interrupt */ 347static int inr; /* size of reply buffer, when called from interrupt */
349#define ST0 (reply_buffer[0]) 348#define ST0 (reply_buffer[0])
350#define ST1 (reply_buffer[1]) 349#define ST1 (reply_buffer[1])
351#define ST2 (reply_buffer[2]) 350#define ST2 (reply_buffer[2])
352#define ST3 (reply_buffer[0]) /* result of GETSTATUS */ 351#define ST3 (reply_buffer[0]) /* result of GETSTATUS */
353#define R_TRACK (reply_buffer[3]) 352#define R_TRACK (reply_buffer[3])
354#define R_HEAD (reply_buffer[4]) 353#define R_HEAD (reply_buffer[4])
355#define R_SECTOR (reply_buffer[5]) 354#define R_SECTOR (reply_buffer[5])
356#define R_SIZECODE (reply_buffer[6]) 355#define R_SIZECODE (reply_buffer[6])
357#define SEL_DLY (2*HZ/100) 356
357#define SEL_DLY (2 * HZ / 100)
358 358
359/* 359/*
360 * this struct defines the different floppy drive types. 360 * this struct defines the different floppy drive types.
@@ -505,9 +505,9 @@ static char floppy_device_name[] = "floppy";
505static int probing; 505static int probing;
506 506
507/* Synchronization of FDC access. */ 507/* Synchronization of FDC access. */
508#define FD_COMMAND_NONE -1 508#define FD_COMMAND_NONE -1
509#define FD_COMMAND_ERROR 2 509#define FD_COMMAND_ERROR 2
510#define FD_COMMAND_OKAY 3 510#define FD_COMMAND_OKAY 3
511 511
512static volatile int command_status = FD_COMMAND_NONE; 512static volatile int command_status = FD_COMMAND_NONE;
513static unsigned long fdc_busy; 513static unsigned long fdc_busy;
@@ -515,11 +515,6 @@ static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
515static DECLARE_WAIT_QUEUE_HEAD(command_done); 515static DECLARE_WAIT_QUEUE_HEAD(command_done);
516 516
517#define NO_SIGNAL (!interruptible || !signal_pending(current)) 517#define NO_SIGNAL (!interruptible || !signal_pending(current))
518#define CALL(x) if ((x) == -EINTR) return -EINTR
519#define ECALL(x) if ((ret = (x))) return ret;
520#define _WAIT(x,i) CALL(ret=wait_til_done((x),i))
521#define WAIT(x) _WAIT((x),interruptible)
522#define IWAIT(x) _WAIT((x),1)
523 518
524/* Errors during formatting are counted here. */ 519/* Errors during formatting are counted here. */
525static int format_errors; 520static int format_errors;
@@ -545,8 +540,9 @@ static int max_buffer_sectors;
545static int *errors; 540static int *errors;
546typedef void (*done_f)(int); 541typedef void (*done_f)(int);
547static struct cont_t { 542static struct cont_t {
548 void (*interrupt)(void); /* this is called after the interrupt of the 543 void (*interrupt)(void);
549 * main command */ 544 /* this is called after the interrupt of the
545 * main command */
550 void (*redo)(void); /* this is called to retry the operation */ 546 void (*redo)(void); /* this is called to retry the operation */
551 void (*error)(void); /* this is called to tally an error */ 547 void (*error)(void); /* this is called to tally an error */
552 done_f done; /* this is called to say if the operation has 548 done_f done; /* this is called to say if the operation has
@@ -571,7 +567,6 @@ static void floppy_release_irq_and_dma(void);
571 * reset doesn't need to be tested before sending commands, because 567 * reset doesn't need to be tested before sending commands, because
572 * output_byte is automatically disabled when reset is set. 568 * output_byte is automatically disabled when reset is set.
573 */ 569 */
574#define CHECK_RESET { if (FDCS->reset){ reset_fdc(); return; } }
575static void reset_fdc(void); 570static void reset_fdc(void);
576 571
577/* 572/*
@@ -579,9 +574,9 @@ static void reset_fdc(void);
579 * information to interrupts. They are the data used for the current 574 * information to interrupts. They are the data used for the current
580 * request. 575 * request.
581 */ 576 */
582#define NO_TRACK -1 577#define NO_TRACK -1
583#define NEED_1_RECAL -2 578#define NEED_1_RECAL -2
584#define NEED_2_RECAL -3 579#define NEED_2_RECAL -3
585 580
586static int usage_count; 581static int usage_count;
587 582
@@ -621,39 +616,35 @@ static inline void set_debugt(void)
621 debugtimer = jiffies; 616 debugtimer = jiffies;
622} 617}
623 618
624static inline void debugt(const char *message) 619static inline void debugt(const char *func, const char *msg)
625{ 620{
626 if (DP->flags & DEBUGT) 621 if (DP->flags & DEBUGT)
627 printk("%s dtime=%lu\n", message, jiffies - debugtimer); 622 pr_info("%s:%s dtime=%lu\n", func, msg, jiffies - debugtimer);
628} 623}
629#else 624#else
630static inline void set_debugt(void) { } 625static inline void set_debugt(void) { }
631static inline void debugt(const char *message) { } 626static inline void debugt(const char *func, const char *msg) { }
632#endif /* DEBUGT */ 627#endif /* DEBUGT */
633 628
634typedef void (*timeout_fn) (unsigned long); 629typedef void (*timeout_fn)(unsigned long);
635static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); 630static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0);
636 631
637static const char *timeout_message; 632static const char *timeout_message;
638 633
639#ifdef FLOPPY_SANITY_CHECK 634static void is_alive(const char *func, const char *message)
640static void is_alive(const char *message)
641{ 635{
642 /* this routine checks whether the floppy driver is "alive" */ 636 /* this routine checks whether the floppy driver is "alive" */
643 if (test_bit(0, &fdc_busy) && command_status < 2 637 if (test_bit(0, &fdc_busy) && command_status < 2 &&
644 && !timer_pending(&fd_timeout)) { 638 !timer_pending(&fd_timeout)) {
645 DPRINT("timeout handler died: %s\n", message); 639 DPRINT("%s: timeout handler died. %s\n", func, message);
646 } 640 }
647} 641}
648#endif
649 642
650static void (*do_floppy) (void) = NULL; 643static void (*do_floppy)(void) = NULL;
651
652#ifdef FLOPPY_SANITY_CHECK
653 644
654#define OLOGSIZE 20 645#define OLOGSIZE 20
655 646
656static void (*lasthandler) (void); 647static void (*lasthandler)(void);
657static unsigned long interruptjiffies; 648static unsigned long interruptjiffies;
658static unsigned long resultjiffies; 649static unsigned long resultjiffies;
659static int resultsize; 650static int resultsize;
@@ -666,12 +657,11 @@ static struct output_log {
666} output_log[OLOGSIZE]; 657} output_log[OLOGSIZE];
667 658
668static int output_log_pos; 659static int output_log_pos;
669#endif
670 660
671#define current_reqD -1 661#define current_reqD -1
672#define MAXTIMEOUT -2 662#define MAXTIMEOUT -2
673 663
674static void __reschedule_timeout(int drive, const char *message, int marg) 664static void __reschedule_timeout(int drive, const char *message)
675{ 665{
676 if (drive == current_reqD) 666 if (drive == current_reqD)
677 drive = current_drive; 667 drive = current_drive;
@@ -682,25 +672,22 @@ static void __reschedule_timeout(int drive, const char *message, int marg)
682 } else 672 } else
683 fd_timeout.expires = jiffies + UDP->timeout; 673 fd_timeout.expires = jiffies + UDP->timeout;
684 add_timer(&fd_timeout); 674 add_timer(&fd_timeout);
685 if (UDP->flags & FD_DEBUG) { 675 if (UDP->flags & FD_DEBUG)
686 DPRINT("reschedule timeout "); 676 DPRINT("reschedule timeout %s\n", message);
687 printk(message, marg);
688 printk("\n");
689 }
690 timeout_message = message; 677 timeout_message = message;
691} 678}
692 679
693static void reschedule_timeout(int drive, const char *message, int marg) 680static void reschedule_timeout(int drive, const char *message)
694{ 681{
695 unsigned long flags; 682 unsigned long flags;
696 683
697 spin_lock_irqsave(&floppy_lock, flags); 684 spin_lock_irqsave(&floppy_lock, flags);
698 __reschedule_timeout(drive, message, marg); 685 __reschedule_timeout(drive, message);
699 spin_unlock_irqrestore(&floppy_lock, flags); 686 spin_unlock_irqrestore(&floppy_lock, flags);
700} 687}
701 688
702#define INFBOUND(a,b) (a)=max_t(int, a, b) 689#define INFBOUND(a, b) (a) = max_t(int, a, b)
703#define SUPBOUND(a,b) (a)=min_t(int, a, b) 690#define SUPBOUND(a, b) (a) = min_t(int, a, b)
704 691
705/* 692/*
706 * Bottom half floppy driver. 693 * Bottom half floppy driver.
@@ -739,7 +726,6 @@ static int disk_change(int drive)
739{ 726{
740 int fdc = FDC(drive); 727 int fdc = FDC(drive);
741 728
742#ifdef FLOPPY_SANITY_CHECK
743 if (time_before(jiffies, UDRS->select_date + UDP->select_delay)) 729 if (time_before(jiffies, UDRS->select_date + UDP->select_delay))
744 DPRINT("WARNING disk change called early\n"); 730 DPRINT("WARNING disk change called early\n");
745 if (!(FDCS->dor & (0x10 << UNIT(drive))) || 731 if (!(FDCS->dor & (0x10 << UNIT(drive))) ||
@@ -748,31 +734,27 @@ static int disk_change(int drive)
748 DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive), 734 DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive),
749 (unsigned int)FDCS->dor); 735 (unsigned int)FDCS->dor);
750 } 736 }
751#endif
752 737
753#ifdef DCL_DEBUG 738 debug_dcl(UDP->flags,
754 if (UDP->flags & FD_DEBUG) { 739 "checking disk change line for drive %d\n", drive);
755 DPRINT("checking disk change line for drive %d\n", drive); 740 debug_dcl(UDP->flags, "jiffies=%lu\n", jiffies);
756 DPRINT("jiffies=%lu\n", jiffies); 741 debug_dcl(UDP->flags, "disk change line=%x\n", fd_inb(FD_DIR) & 0x80);
757 DPRINT("disk change line=%x\n", fd_inb(FD_DIR) & 0x80); 742 debug_dcl(UDP->flags, "flags=%lx\n", UDRS->flags);
758 DPRINT("flags=%lx\n", UDRS->flags); 743
759 }
760#endif
761 if (UDP->flags & FD_BROKEN_DCL) 744 if (UDP->flags & FD_BROKEN_DCL)
762 return UTESTF(FD_DISK_CHANGED); 745 return test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
763 if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) { 746 if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) {
764 USETF(FD_VERIFY); /* verify write protection */ 747 set_bit(FD_VERIFY_BIT, &UDRS->flags);
765 if (UDRS->maxblock) { 748 /* verify write protection */
766 /* mark it changed */ 749
767 USETF(FD_DISK_CHANGED); 750 if (UDRS->maxblock) /* mark it changed */
768 } 751 set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
769 752
770 /* invalidate its geometry */ 753 /* invalidate its geometry */
771 if (UDRS->keep_data >= 0) { 754 if (UDRS->keep_data >= 0) {
772 if ((UDP->flags & FTD_MSG) && 755 if ((UDP->flags & FTD_MSG) &&
773 current_type[drive] != NULL) 756 current_type[drive] != NULL)
774 DPRINT("Disk type is undefined after " 757 DPRINT("Disk type is undefined after disk change\n");
775 "disk change\n");
776 current_type[drive] = NULL; 758 current_type[drive] = NULL;
777 floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1; 759 floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1;
778 } 760 }
@@ -780,7 +762,7 @@ static int disk_change(int drive)
780 return 1; 762 return 1;
781 } else { 763 } else {
782 UDRS->last_checked = jiffies; 764 UDRS->last_checked = jiffies;
783 UCLEARF(FD_DISK_NEWCHANGE); 765 clear_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags);
784 } 766 }
785 return 0; 767 return 0;
786} 768}
@@ -790,6 +772,12 @@ static inline int is_selected(int dor, int unit)
790 return ((dor & (0x10 << unit)) && (dor & 3) == unit); 772 return ((dor & (0x10 << unit)) && (dor & 3) == unit);
791} 773}
792 774
775static bool is_ready_state(int status)
776{
777 int state = status & (STATUS_READY | STATUS_DIR | STATUS_DMA);
778 return state == STATUS_READY;
779}
780
793static int set_dor(int fdc, char mask, char data) 781static int set_dor(int fdc, char mask, char data)
794{ 782{
795 unsigned char unit; 783 unsigned char unit;
@@ -806,11 +794,8 @@ static int set_dor(int fdc, char mask, char data)
806 unit = olddor & 0x3; 794 unit = olddor & 0x3;
807 if (is_selected(olddor, unit) && !is_selected(newdor, unit)) { 795 if (is_selected(olddor, unit) && !is_selected(newdor, unit)) {
808 drive = REVDRIVE(fdc, unit); 796 drive = REVDRIVE(fdc, unit);
809#ifdef DCL_DEBUG 797 debug_dcl(UDP->flags,
810 if (UDP->flags & FD_DEBUG) { 798 "calling disk change from set_dor\n");
811 DPRINT("calling disk change from set_dor\n");
812 }
813#endif
814 disk_change(drive); 799 disk_change(drive);
815 } 800 }
816 FDCS->dor = newdor; 801 FDCS->dor = newdor;
@@ -834,8 +819,10 @@ static void twaddle(void)
834 DRS->select_date = jiffies; 819 DRS->select_date = jiffies;
835} 820}
836 821
837/* reset all driver information about the current fdc. This is needed after 822/*
838 * a reset, and after a raw command. */ 823 * Reset all driver information about the current fdc.
824 * This is needed after a reset, and after a raw command.
825 */
839static void reset_fdc_info(int mode) 826static void reset_fdc_info(int mode)
840{ 827{
841 int drive; 828 int drive;
@@ -857,7 +844,7 @@ static void set_fdc(int drive)
857 current_drive = drive; 844 current_drive = drive;
858 } 845 }
859 if (fdc != 1 && fdc != 0) { 846 if (fdc != 1 && fdc != 0) {
860 printk("bad fdc value\n"); 847 pr_info("bad fdc value\n");
861 return; 848 return;
862 } 849 }
863 set_dor(fdc, ~0, 8); 850 set_dor(fdc, ~0, 8);
@@ -871,11 +858,10 @@ static void set_fdc(int drive)
871} 858}
872 859
873/* locks the driver */ 860/* locks the driver */
874static int _lock_fdc(int drive, int interruptible, int line) 861static int _lock_fdc(int drive, bool interruptible, int line)
875{ 862{
876 if (!usage_count) { 863 if (!usage_count) {
877 printk(KERN_ERR 864 pr_err("Trying to lock fdc while usage count=0 at line %d\n",
878 "Trying to lock fdc while usage count=0 at line %d\n",
879 line); 865 line);
880 return -1; 866 return -1;
881 } 867 }
@@ -904,15 +890,13 @@ static int _lock_fdc(int drive, int interruptible, int line)
904 } 890 }
905 command_status = FD_COMMAND_NONE; 891 command_status = FD_COMMAND_NONE;
906 892
907 __reschedule_timeout(drive, "lock fdc", 0); 893 __reschedule_timeout(drive, "lock fdc");
908 set_fdc(drive); 894 set_fdc(drive);
909 return 0; 895 return 0;
910} 896}
911 897
912#define lock_fdc(drive,interruptible) _lock_fdc(drive,interruptible, __LINE__) 898#define lock_fdc(drive, interruptible) \
913 899 _lock_fdc(drive, interruptible, __LINE__)
914#define LOCK_FDC(drive,interruptible) \
915if (lock_fdc(drive,interruptible)) return -EINTR;
916 900
917/* unlocks the driver */ 901/* unlocks the driver */
918static inline void unlock_fdc(void) 902static inline void unlock_fdc(void)
@@ -924,7 +908,7 @@ static inline void unlock_fdc(void)
924 DPRINT("FDC access conflict!\n"); 908 DPRINT("FDC access conflict!\n");
925 909
926 if (do_floppy) 910 if (do_floppy)
927 DPRINT("device interrupt still active at FDC release: %p!\n", 911 DPRINT("device interrupt still active at FDC release: %pf!\n",
928 do_floppy); 912 do_floppy);
929 command_status = FD_COMMAND_NONE; 913 command_status = FD_COMMAND_NONE;
930 spin_lock_irqsave(&floppy_lock, flags); 914 spin_lock_irqsave(&floppy_lock, flags);
@@ -1003,7 +987,7 @@ static void empty(void)
1003 987
1004static DECLARE_WORK(floppy_work, NULL); 988static DECLARE_WORK(floppy_work, NULL);
1005 989
1006static void schedule_bh(void (*handler) (void)) 990static void schedule_bh(void (*handler)(void))
1007{ 991{
1008 PREPARE_WORK(&floppy_work, (work_func_t)handler); 992 PREPARE_WORK(&floppy_work, (work_func_t)handler);
1009 schedule_work(&floppy_work); 993 schedule_work(&floppy_work);
@@ -1026,11 +1010,7 @@ static void cancel_activity(void)
1026 * transfer */ 1010 * transfer */
1027static void fd_watchdog(void) 1011static void fd_watchdog(void)
1028{ 1012{
1029#ifdef DCL_DEBUG 1013 debug_dcl(DP->flags, "calling disk change from watchdog\n");
1030 if (DP->flags & FD_DEBUG) {
1031 DPRINT("calling disk change from watchdog\n");
1032 }
1033#endif
1034 1014
1035 if (disk_change(current_drive)) { 1015 if (disk_change(current_drive)) {
1036 DPRINT("disk removed during i/o\n"); 1016 DPRINT("disk removed during i/o\n");
@@ -1039,7 +1019,7 @@ static void fd_watchdog(void)
1039 reset_fdc(); 1019 reset_fdc();
1040 } else { 1020 } else {
1041 del_timer(&fd_timer); 1021 del_timer(&fd_timer);
1042 fd_timer.function = (timeout_fn) fd_watchdog; 1022 fd_timer.function = (timeout_fn)fd_watchdog;
1043 fd_timer.expires = jiffies + HZ / 10; 1023 fd_timer.expires = jiffies + HZ / 10;
1044 add_timer(&fd_timer); 1024 add_timer(&fd_timer);
1045 } 1025 }
@@ -1105,25 +1085,23 @@ static void setup_DMA(void)
1105{ 1085{
1106 unsigned long f; 1086 unsigned long f;
1107 1087
1108#ifdef FLOPPY_SANITY_CHECK
1109 if (raw_cmd->length == 0) { 1088 if (raw_cmd->length == 0) {
1110 int i; 1089 int i;
1111 1090
1112 printk("zero dma transfer size:"); 1091 pr_info("zero dma transfer size:");
1113 for (i = 0; i < raw_cmd->cmd_count; i++) 1092 for (i = 0; i < raw_cmd->cmd_count; i++)
1114 printk("%x,", raw_cmd->cmd[i]); 1093 pr_cont("%x,", raw_cmd->cmd[i]);
1115 printk("\n"); 1094 pr_cont("\n");
1116 cont->done(0); 1095 cont->done(0);
1117 FDCS->reset = 1; 1096 FDCS->reset = 1;
1118 return; 1097 return;
1119 } 1098 }
1120 if (((unsigned long)raw_cmd->kernel_data) % 512) { 1099 if (((unsigned long)raw_cmd->kernel_data) % 512) {
1121 printk("non aligned address: %p\n", raw_cmd->kernel_data); 1100 pr_info("non aligned address: %p\n", raw_cmd->kernel_data);
1122 cont->done(0); 1101 cont->done(0);
1123 FDCS->reset = 1; 1102 FDCS->reset = 1;
1124 return; 1103 return;
1125 } 1104 }
1126#endif
1127 f = claim_dma_lock(); 1105 f = claim_dma_lock();
1128 fd_disable_dma(); 1106 fd_disable_dma();
1129#ifdef fd_dma_setup 1107#ifdef fd_dma_setup
@@ -1165,7 +1143,7 @@ static int wait_til_ready(void)
1165 if (status & STATUS_READY) 1143 if (status & STATUS_READY)
1166 return status; 1144 return status;
1167 } 1145 }
1168 if (!initialising) { 1146 if (initialized) {
1169 DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc); 1147 DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc);
1170 show_floppy(); 1148 show_floppy();
1171 } 1149 }
@@ -1176,22 +1154,21 @@ static int wait_til_ready(void)
1176/* sends a command byte to the fdc */ 1154/* sends a command byte to the fdc */
1177static int output_byte(char byte) 1155static int output_byte(char byte)
1178{ 1156{
1179 int status; 1157 int status = wait_til_ready();
1180 1158
1181 if ((status = wait_til_ready()) < 0) 1159 if (status < 0)
1182 return -1; 1160 return -1;
1183 if ((status & (STATUS_READY | STATUS_DIR | STATUS_DMA)) == STATUS_READY) { 1161
1162 if (is_ready_state(status)) {
1184 fd_outb(byte, FD_DATA); 1163 fd_outb(byte, FD_DATA);
1185#ifdef FLOPPY_SANITY_CHECK
1186 output_log[output_log_pos].data = byte; 1164 output_log[output_log_pos].data = byte;
1187 output_log[output_log_pos].status = status; 1165 output_log[output_log_pos].status = status;
1188 output_log[output_log_pos].jiffies = jiffies; 1166 output_log[output_log_pos].jiffies = jiffies;
1189 output_log_pos = (output_log_pos + 1) % OLOGSIZE; 1167 output_log_pos = (output_log_pos + 1) % OLOGSIZE;
1190#endif
1191 return 0; 1168 return 0;
1192 } 1169 }
1193 FDCS->reset = 1; 1170 FDCS->reset = 1;
1194 if (!initialising) { 1171 if (initialized) {
1195 DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n", 1172 DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n",
1196 byte, fdc, status); 1173 byte, fdc, status);
1197 show_floppy(); 1174 show_floppy();
@@ -1199,8 +1176,6 @@ static int output_byte(char byte)
1199 return -1; 1176 return -1;
1200} 1177}
1201 1178
1202#define LAST_OUT(x) if (output_byte(x)<0){ reset_fdc();return;}
1203
1204/* gets the response from the fdc */ 1179/* gets the response from the fdc */
1205static int result(void) 1180static int result(void)
1206{ 1181{
@@ -1208,14 +1183,13 @@ static int result(void)
1208 int status = 0; 1183 int status = 0;
1209 1184
1210 for (i = 0; i < MAX_REPLIES; i++) { 1185 for (i = 0; i < MAX_REPLIES; i++) {
1211 if ((status = wait_til_ready()) < 0) 1186 status = wait_til_ready();
1187 if (status < 0)
1212 break; 1188 break;
1213 status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA; 1189 status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA;
1214 if ((status & ~STATUS_BUSY) == STATUS_READY) { 1190 if ((status & ~STATUS_BUSY) == STATUS_READY) {
1215#ifdef FLOPPY_SANITY_CHECK
1216 resultjiffies = jiffies; 1191 resultjiffies = jiffies;
1217 resultsize = i; 1192 resultsize = i;
1218#endif
1219 return i; 1193 return i;
1220 } 1194 }
1221 if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY)) 1195 if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY))
@@ -1223,10 +1197,9 @@ static int result(void)
1223 else 1197 else
1224 break; 1198 break;
1225 } 1199 }
1226 if (!initialising) { 1200 if (initialized) {
1227 DPRINT 1201 DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n",
1228 ("get result error. Fdc=%d Last status=%x Read bytes=%d\n", 1202 fdc, status, i);
1229 fdc, status, i);
1230 show_floppy(); 1203 show_floppy();
1231 } 1204 }
1232 FDCS->reset = 1; 1205 FDCS->reset = 1;
@@ -1237,12 +1210,14 @@ static int result(void)
1237/* does the fdc need more output? */ 1210/* does the fdc need more output? */
1238static int need_more_output(void) 1211static int need_more_output(void)
1239{ 1212{
1240 int status; 1213 int status = wait_til_ready();
1241 1214
1242 if ((status = wait_til_ready()) < 0) 1215 if (status < 0)
1243 return -1; 1216 return -1;
1244 if ((status & (STATUS_READY | STATUS_DIR | STATUS_DMA)) == STATUS_READY) 1217
1218 if (is_ready_state(status))
1245 return MORE_OUTPUT; 1219 return MORE_OUTPUT;
1220
1246 return result(); 1221 return result();
1247} 1222}
1248 1223
@@ -1264,9 +1239,12 @@ static inline void perpendicular_mode(void)
1264 default: 1239 default:
1265 DPRINT("Invalid data rate for perpendicular mode!\n"); 1240 DPRINT("Invalid data rate for perpendicular mode!\n");
1266 cont->done(0); 1241 cont->done(0);
1267 FDCS->reset = 1; /* convenient way to return to 1242 FDCS->reset = 1;
1268 * redo without to much hassle (deep 1243 /*
1269 * stack et al. */ 1244 * convenient way to return to
1245 * redo without too much hassle
1246 * (deep stack et al.)
1247 */
1270 return; 1248 return;
1271 } 1249 }
1272 } else 1250 } else
@@ -1366,9 +1344,9 @@ static void fdc_specify(void)
1366 1344
1367 /* Convert step rate from microseconds to milliseconds and 4 bits */ 1345 /* Convert step rate from microseconds to milliseconds and 4 bits */
1368 srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR); 1346 srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR);
1369 if (slow_floppy) { 1347 if (slow_floppy)
1370 srt = srt / 4; 1348 srt = srt / 4;
1371 } 1349
1372 SUPBOUND(srt, 0xf); 1350 SUPBOUND(srt, 0xf);
1373 INFBOUND(srt, 0); 1351 INFBOUND(srt, 0);
1374 1352
@@ -1415,16 +1393,46 @@ static int fdc_dtr(void)
1415 * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies) 1393 * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies)
1416 */ 1394 */
1417 FDCS->dtr = raw_cmd->rate & 3; 1395 FDCS->dtr = raw_cmd->rate & 3;
1418 return (fd_wait_for_completion(jiffies + 2UL * HZ / 100, 1396 return fd_wait_for_completion(jiffies + 2UL * HZ / 100,
1419 (timeout_fn) floppy_ready)); 1397 (timeout_fn)floppy_ready);
1420} /* fdc_dtr */ 1398} /* fdc_dtr */
1421 1399
1422static void tell_sector(void) 1400static void tell_sector(void)
1423{ 1401{
1424 printk(": track %d, head %d, sector %d, size %d", 1402 pr_cont(": track %d, head %d, sector %d, size %d",
1425 R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE); 1403 R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE);
1426} /* tell_sector */ 1404} /* tell_sector */
1427 1405
1406static void print_errors(void)
1407{
1408 DPRINT("");
1409 if (ST0 & ST0_ECE) {
1410 pr_cont("Recalibrate failed!");
1411 } else if (ST2 & ST2_CRC) {
1412 pr_cont("data CRC error");
1413 tell_sector();
1414 } else if (ST1 & ST1_CRC) {
1415 pr_cont("CRC error");
1416 tell_sector();
1417 } else if ((ST1 & (ST1_MAM | ST1_ND)) ||
1418 (ST2 & ST2_MAM)) {
1419 if (!probing) {
1420 pr_cont("sector not found");
1421 tell_sector();
1422 } else
1423 pr_cont("probe failed...");
1424 } else if (ST2 & ST2_WC) { /* seek error */
1425 pr_cont("wrong cylinder");
1426 } else if (ST2 & ST2_BC) { /* cylinder marked as bad */
1427 pr_cont("bad cylinder");
1428 } else {
1429 pr_cont("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x",
1430 ST0, ST1, ST2);
1431 tell_sector();
1432 }
1433 pr_cont("\n");
1434}
1435
1428/* 1436/*
1429 * OK, this error interpreting routine is called after a 1437 * OK, this error interpreting routine is called after a
1430 * DMA read/write has succeeded 1438 * DMA read/write has succeeded
@@ -1437,7 +1445,7 @@ static int interpret_errors(void)
1437 char bad; 1445 char bad;
1438 1446
1439 if (inr != 7) { 1447 if (inr != 7) {
1440 DPRINT("-- FDC reply error"); 1448 DPRINT("-- FDC reply error\n");
1441 FDCS->reset = 1; 1449 FDCS->reset = 1;
1442 return 1; 1450 return 1;
1443 } 1451 }
@@ -1450,43 +1458,17 @@ static int interpret_errors(void)
1450 bad = 1; 1458 bad = 1;
1451 if (ST1 & ST1_WP) { 1459 if (ST1 & ST1_WP) {
1452 DPRINT("Drive is write protected\n"); 1460 DPRINT("Drive is write protected\n");
1453 CLEARF(FD_DISK_WRITABLE); 1461 clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
1454 cont->done(0); 1462 cont->done(0);
1455 bad = 2; 1463 bad = 2;
1456 } else if (ST1 & ST1_ND) { 1464 } else if (ST1 & ST1_ND) {
1457 SETF(FD_NEED_TWADDLE); 1465 set_bit(FD_NEED_TWADDLE_BIT, &DRS->flags);
1458 } else if (ST1 & ST1_OR) { 1466 } else if (ST1 & ST1_OR) {
1459 if (DP->flags & FTD_MSG) 1467 if (DP->flags & FTD_MSG)
1460 DPRINT("Over/Underrun - retrying\n"); 1468 DPRINT("Over/Underrun - retrying\n");
1461 bad = 0; 1469 bad = 0;
1462 } else if (*errors >= DP->max_errors.reporting) { 1470 } else if (*errors >= DP->max_errors.reporting) {
1463 DPRINT(""); 1471 print_errors();
1464 if (ST0 & ST0_ECE) {
1465 printk("Recalibrate failed!");
1466 } else if (ST2 & ST2_CRC) {
1467 printk("data CRC error");
1468 tell_sector();
1469 } else if (ST1 & ST1_CRC) {
1470 printk("CRC error");
1471 tell_sector();
1472 } else if ((ST1 & (ST1_MAM | ST1_ND))
1473 || (ST2 & ST2_MAM)) {
1474 if (!probing) {
1475 printk("sector not found");
1476 tell_sector();
1477 } else
1478 printk("probe failed...");
1479 } else if (ST2 & ST2_WC) { /* seek error */
1480 printk("wrong cylinder");
1481 } else if (ST2 & ST2_BC) { /* cylinder marked as bad */
1482 printk("bad cylinder");
1483 } else {
1484 printk
1485 ("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x",
1486 ST0, ST1, ST2);
1487 tell_sector();
1488 }
1489 printk("\n");
1490 } 1472 }
1491 if (ST2 & ST2_WC || ST2 & ST2_BC) 1473 if (ST2 & ST2_WC || ST2 & ST2_BC)
1492 /* wrong cylinder => recal */ 1474 /* wrong cylinder => recal */
@@ -1531,9 +1513,9 @@ static void setup_rw_floppy(void)
1531 */ 1513 */
1532 if (time_after(ready_date, jiffies + DP->select_delay)) { 1514 if (time_after(ready_date, jiffies + DP->select_delay)) {
1533 ready_date -= DP->select_delay; 1515 ready_date -= DP->select_delay;
1534 function = (timeout_fn) floppy_start; 1516 function = (timeout_fn)floppy_start;
1535 } else 1517 } else
1536 function = (timeout_fn) setup_rw_floppy; 1518 function = (timeout_fn)setup_rw_floppy;
1537 1519
1538 /* wait until the floppy is spinning fast enough */ 1520 /* wait until the floppy is spinning fast enough */
1539 if (fd_wait_for_completion(ready_date, function)) 1521 if (fd_wait_for_completion(ready_date, function))
@@ -1551,7 +1533,7 @@ static void setup_rw_floppy(void)
1551 for (i = 0; i < raw_cmd->cmd_count; i++) 1533 for (i = 0; i < raw_cmd->cmd_count; i++)
1552 r |= output_byte(raw_cmd->cmd[i]); 1534 r |= output_byte(raw_cmd->cmd[i]);
1553 1535
1554 debugt("rw_command: "); 1536 debugt(__func__, "rw_command");
1555 1537
1556 if (r) { 1538 if (r) {
1557 cont->error(); 1539 cont->error();
@@ -1574,7 +1556,7 @@ static int blind_seek;
1574 */ 1556 */
1575static void seek_interrupt(void) 1557static void seek_interrupt(void)
1576{ 1558{
1577 debugt("seek interrupt:"); 1559 debugt(__func__, "");
1578 if (inr != 2 || (ST0 & 0xF8) != 0x20) { 1560 if (inr != 2 || (ST0 & 0xF8) != 0x20) {
1579 DPRINT("seek failed\n"); 1561 DPRINT("seek failed\n");
1580 DRS->track = NEED_2_RECAL; 1562 DRS->track = NEED_2_RECAL;
@@ -1583,14 +1565,11 @@ static void seek_interrupt(void)
1583 return; 1565 return;
1584 } 1566 }
1585 if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) { 1567 if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) {
1586#ifdef DCL_DEBUG 1568 debug_dcl(DP->flags,
1587 if (DP->flags & FD_DEBUG) { 1569 "clearing NEWCHANGE flag because of effective seek\n");
1588 DPRINT 1570 debug_dcl(DP->flags, "jiffies=%lu\n", jiffies);
1589 ("clearing NEWCHANGE flag because of effective seek\n"); 1571 clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
1590 DPRINT("jiffies=%lu\n", jiffies); 1572 /* effective seek */
1591 }
1592#endif
1593 CLEARF(FD_DISK_NEWCHANGE); /* effective seek */
1594 DRS->select_date = jiffies; 1573 DRS->select_date = jiffies;
1595 } 1574 }
1596 DRS->track = ST1; 1575 DRS->track = ST1;
@@ -1599,26 +1578,23 @@ static void seek_interrupt(void)
1599 1578
1600static void check_wp(void) 1579static void check_wp(void)
1601{ 1580{
1602 if (TESTF(FD_VERIFY)) { 1581 if (test_bit(FD_VERIFY_BIT, &DRS->flags)) {
1603 /* check write protection */ 1582 /* check write protection */
1604 output_byte(FD_GETSTATUS); 1583 output_byte(FD_GETSTATUS);
1605 output_byte(UNIT(current_drive)); 1584 output_byte(UNIT(current_drive));
1606 if (result() != 1) { 1585 if (result() != 1) {
1607 FDCS->reset = 1; 1586 FDCS->reset = 1;
1608 return; 1587 return;
1609 } 1588 }
1610 CLEARF(FD_VERIFY); 1589 clear_bit(FD_VERIFY_BIT, &DRS->flags);
1611 CLEARF(FD_NEED_TWADDLE); 1590 clear_bit(FD_NEED_TWADDLE_BIT, &DRS->flags);
1612#ifdef DCL_DEBUG 1591 debug_dcl(DP->flags,
1613 if (DP->flags & FD_DEBUG) { 1592 "checking whether disk is write protected\n");
1614 DPRINT("checking whether disk is write protected\n"); 1593 debug_dcl(DP->flags, "wp=%x\n", ST3 & 0x40);
1615 DPRINT("wp=%x\n", ST3 & 0x40);
1616 }
1617#endif
1618 if (!(ST3 & 0x40)) 1594 if (!(ST3 & 0x40))
1619 SETF(FD_DISK_WRITABLE); 1595 set_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
1620 else 1596 else
1621 CLEARF(FD_DISK_WRITABLE); 1597 clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
1622 } 1598 }
1623} 1599}
1624 1600
@@ -1628,19 +1604,15 @@ static void seek_floppy(void)
1628 1604
1629 blind_seek = 0; 1605 blind_seek = 0;
1630 1606
1631#ifdef DCL_DEBUG 1607 debug_dcl(DP->flags, "calling disk change from %s\n", __func__);
1632 if (DP->flags & FD_DEBUG) {
1633 DPRINT("calling disk change from seek\n");
1634 }
1635#endif
1636 1608
1637 if (!TESTF(FD_DISK_NEWCHANGE) && 1609 if (!test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) &&
1638 disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) { 1610 disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) {
1639 /* the media changed flag should be cleared after the seek. 1611 /* the media changed flag should be cleared after the seek.
1640 * If it isn't, this means that there is really no disk in 1612 * If it isn't, this means that there is really no disk in
1641 * the drive. 1613 * the drive.
1642 */ 1614 */
1643 SETF(FD_DISK_CHANGED); 1615 set_bit(FD_DISK_CHANGED_BIT, &DRS->flags);
1644 cont->done(0); 1616 cont->done(0);
1645 cont->redo(); 1617 cont->redo();
1646 return; 1618 return;
@@ -1648,7 +1620,7 @@ static void seek_floppy(void)
1648 if (DRS->track <= NEED_1_RECAL) { 1620 if (DRS->track <= NEED_1_RECAL) {
1649 recalibrate_floppy(); 1621 recalibrate_floppy();
1650 return; 1622 return;
1651 } else if (TESTF(FD_DISK_NEWCHANGE) && 1623 } else if (test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) &&
1652 (raw_cmd->flags & FD_RAW_NEED_DISK) && 1624 (raw_cmd->flags & FD_RAW_NEED_DISK) &&
1653 (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) { 1625 (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) {
1654 /* we seek to clear the media-changed condition. Does anybody 1626 /* we seek to clear the media-changed condition. Does anybody
@@ -1677,19 +1649,22 @@ static void seek_floppy(void)
1677 do_floppy = seek_interrupt; 1649 do_floppy = seek_interrupt;
1678 output_byte(FD_SEEK); 1650 output_byte(FD_SEEK);
1679 output_byte(UNIT(current_drive)); 1651 output_byte(UNIT(current_drive));
1680 LAST_OUT(track); 1652 if (output_byte(track) < 0) {
1681 debugt("seek command:"); 1653 reset_fdc();
1654 return;
1655 }
1656 debugt(__func__, "");
1682} 1657}
1683 1658
1684static void recal_interrupt(void) 1659static void recal_interrupt(void)
1685{ 1660{
1686 debugt("recal interrupt:"); 1661 debugt(__func__, "");
1687 if (inr != 2) 1662 if (inr != 2)
1688 FDCS->reset = 1; 1663 FDCS->reset = 1;
1689 else if (ST0 & ST0_ECE) { 1664 else if (ST0 & ST0_ECE) {
1690 switch (DRS->track) { 1665 switch (DRS->track) {
1691 case NEED_1_RECAL: 1666 case NEED_1_RECAL:
1692 debugt("recal interrupt need 1 recal:"); 1667 debugt(__func__, "need 1 recal");
1693 /* after a second recalibrate, we still haven't 1668 /* after a second recalibrate, we still haven't
1694 * reached track 0. Probably no drive. Raise an 1669 * reached track 0. Probably no drive. Raise an
1695 * error, as failing immediately might upset 1670 * error, as failing immediately might upset
@@ -1698,25 +1673,21 @@ static void recal_interrupt(void)
1698 cont->redo(); 1673 cont->redo();
1699 return; 1674 return;
1700 case NEED_2_RECAL: 1675 case NEED_2_RECAL:
1701 debugt("recal interrupt need 2 recal:"); 1676 debugt(__func__, "need 2 recal");
1702 /* If we already did a recalibrate, 1677 /* If we already did a recalibrate,
1703 * and we are not at track 0, this 1678 * and we are not at track 0, this
1704 * means we have moved. (The only way 1679 * means we have moved. (The only way
1705 * not to move at recalibration is to 1680 * not to move at recalibration is to
1706 * be already at track 0.) Clear the 1681 * be already at track 0.) Clear the
1707 * new change flag */ 1682 * new change flag */
1708#ifdef DCL_DEBUG 1683 debug_dcl(DP->flags,
1709 if (DP->flags & FD_DEBUG) { 1684 "clearing NEWCHANGE flag because of second recalibrate\n");
1710 DPRINT
1711 ("clearing NEWCHANGE flag because of second recalibrate\n");
1712 }
1713#endif
1714 1685
1715 CLEARF(FD_DISK_NEWCHANGE); 1686 clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
1716 DRS->select_date = jiffies; 1687 DRS->select_date = jiffies;
1717 /* fall through */ 1688 /* fall through */
1718 default: 1689 default:
1719 debugt("recal interrupt default:"); 1690 debugt(__func__, "default");
1720 /* Recalibrate moves the head by at 1691 /* Recalibrate moves the head by at
1721 * most 80 steps. If after one 1692 * most 80 steps. If after one
1722 * recalibrate we don't have reached 1693 * recalibrate we don't have reached
@@ -1738,8 +1709,8 @@ static void print_result(char *message, int inr)
1738 DPRINT("%s ", message); 1709 DPRINT("%s ", message);
1739 if (inr >= 0) 1710 if (inr >= 0)
1740 for (i = 0; i < inr; i++) 1711 for (i = 0; i < inr; i++)
1741 printk("repl[%d]=%x ", i, reply_buffer[i]); 1712 pr_cont("repl[%d]=%x ", i, reply_buffer[i]);
1742 printk("\n"); 1713 pr_cont("\n");
1743} 1714}
1744 1715
1745/* interrupt handler. Note that this can be called externally on the Sparc */ 1716/* interrupt handler. Note that this can be called externally on the Sparc */
@@ -1760,10 +1731,10 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
1760 do_floppy = NULL; 1731 do_floppy = NULL;
1761 if (fdc >= N_FDC || FDCS->address == -1) { 1732 if (fdc >= N_FDC || FDCS->address == -1) {
1762 /* we don't even know which FDC is the culprit */ 1733 /* we don't even know which FDC is the culprit */
1763 printk("DOR0=%x\n", fdc_state[0].dor); 1734 pr_info("DOR0=%x\n", fdc_state[0].dor);
1764 printk("floppy interrupt on bizarre fdc %d\n", fdc); 1735 pr_info("floppy interrupt on bizarre fdc %d\n", fdc);
1765 printk("handler=%p\n", handler); 1736 pr_info("handler=%pf\n", handler);
1766 is_alive("bizarre fdc"); 1737 is_alive(__func__, "bizarre fdc");
1767 return IRQ_NONE; 1738 return IRQ_NONE;
1768 } 1739 }
1769 1740
@@ -1777,7 +1748,7 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
1777 * activity. 1748 * activity.
1778 */ 1749 */
1779 1750
1780 do_print = !handler && print_unex && !initialising; 1751 do_print = !handler && print_unex && initialized;
1781 1752
1782 inr = result(); 1753 inr = result();
1783 if (do_print) 1754 if (do_print)
@@ -1790,15 +1761,15 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
1790 if (do_print) 1761 if (do_print)
1791 print_result("sensei", inr); 1762 print_result("sensei", inr);
1792 max_sensei--; 1763 max_sensei--;
1793 } while ((ST0 & 0x83) != UNIT(current_drive) && inr == 2 1764 } while ((ST0 & 0x83) != UNIT(current_drive) &&
1794 && max_sensei); 1765 inr == 2 && max_sensei);
1795 } 1766 }
1796 if (!handler) { 1767 if (!handler) {
1797 FDCS->reset = 1; 1768 FDCS->reset = 1;
1798 return IRQ_NONE; 1769 return IRQ_NONE;
1799 } 1770 }
1800 schedule_bh(handler); 1771 schedule_bh(handler);
1801 is_alive("normal interrupt end"); 1772 is_alive(__func__, "normal interrupt end");
1802 1773
1803 /* FIXME! Was it really for us? */ 1774 /* FIXME! Was it really for us? */
1804 return IRQ_HANDLED; 1775 return IRQ_HANDLED;
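
Two smaller conversions run through the interrupt-path hunks: bare printk() calls gain an explicit severity via pr_info()/pr_cont(), function pointers are printed with %pf (which makes vsprintf resolve the pointer to its symbol name, dereferencing function descriptors on architectures that use them, rather than printing a raw address), and debugt()/is_alive() now take the caller's name so their hard-coded strings like "recal interrupt:" can go away:

    /* Before: an opaque pointer and a hand-maintained caller string. */
    printk("handler=%p\n", handler);
    is_alive("bizarre fdc");

    /* After: a resolvable symbol name and the compiler-supplied __func__. */
    pr_info("handler=%pf\n", handler);
    is_alive(__func__, "bizarre fdc");
    debugt(__func__, "");            /* was debugt("recal interrupt:"); */
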
@@ -1806,10 +1777,11 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
1806 1777
1807static void recalibrate_floppy(void) 1778static void recalibrate_floppy(void)
1808{ 1779{
1809 debugt("recalibrate floppy:"); 1780 debugt(__func__, "");
1810 do_floppy = recal_interrupt; 1781 do_floppy = recal_interrupt;
1811 output_byte(FD_RECALIBRATE); 1782 output_byte(FD_RECALIBRATE);
1812 LAST_OUT(UNIT(current_drive)); 1783 if (output_byte(UNIT(current_drive)) < 0)
1784 reset_fdc();
1813} 1785}
1814 1786
1815/* 1787/*
@@ -1817,10 +1789,10 @@ static void recalibrate_floppy(void)
1817 */ 1789 */
1818static void reset_interrupt(void) 1790static void reset_interrupt(void)
1819{ 1791{
1820 debugt("reset interrupt:"); 1792 debugt(__func__, "");
1821 result(); /* get the status ready for set_fdc */ 1793 result(); /* get the status ready for set_fdc */
1822 if (FDCS->reset) { 1794 if (FDCS->reset) {
1823 printk("reset set in interrupt, calling %p\n", cont->error); 1795 pr_info("reset set in interrupt, calling %pf\n", cont->error);
1824 cont->error(); /* a reset just after a reset. BAD! */ 1796 cont->error(); /* a reset just after a reset. BAD! */
1825 } 1797 }
1826 cont->redo(); 1798 cont->redo();
@@ -1858,53 +1830,49 @@ static void show_floppy(void)
1858{ 1830{
1859 int i; 1831 int i;
1860 1832
1861 printk("\n"); 1833 pr_info("\n");
1862 printk("floppy driver state\n"); 1834 pr_info("floppy driver state\n");
1863 printk("-------------------\n"); 1835 pr_info("-------------------\n");
1864 printk("now=%lu last interrupt=%lu diff=%lu last called handler=%p\n", 1836 pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n",
1865 jiffies, interruptjiffies, jiffies - interruptjiffies, 1837 jiffies, interruptjiffies, jiffies - interruptjiffies,
1866 lasthandler); 1838 lasthandler);
1867 1839
1868#ifdef FLOPPY_SANITY_CHECK 1840 pr_info("timeout_message=%s\n", timeout_message);
1869 printk("timeout_message=%s\n", timeout_message); 1841 pr_info("last output bytes:\n");
1870 printk("last output bytes:\n");
1871 for (i = 0; i < OLOGSIZE; i++) 1842 for (i = 0; i < OLOGSIZE; i++)
1872 printk("%2x %2x %lu\n", 1843 pr_info("%2x %2x %lu\n",
1873 output_log[(i + output_log_pos) % OLOGSIZE].data, 1844 output_log[(i + output_log_pos) % OLOGSIZE].data,
1874 output_log[(i + output_log_pos) % OLOGSIZE].status, 1845 output_log[(i + output_log_pos) % OLOGSIZE].status,
1875 output_log[(i + output_log_pos) % OLOGSIZE].jiffies); 1846 output_log[(i + output_log_pos) % OLOGSIZE].jiffies);
1876 printk("last result at %lu\n", resultjiffies); 1847 pr_info("last result at %lu\n", resultjiffies);
1877 printk("last redo_fd_request at %lu\n", lastredo); 1848 pr_info("last redo_fd_request at %lu\n", lastredo);
1878 for (i = 0; i < resultsize; i++) { 1849 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1,
1879 printk("%2x ", reply_buffer[i]); 1850 reply_buffer, resultsize, true);
1880 } 1851
1881 printk("\n"); 1852 pr_info("status=%x\n", fd_inb(FD_STATUS));
1882#endif 1853 pr_info("fdc_busy=%lu\n", fdc_busy);
1883
1884 printk("status=%x\n", fd_inb(FD_STATUS));
1885 printk("fdc_busy=%lu\n", fdc_busy);
1886 if (do_floppy) 1854 if (do_floppy)
1887 printk("do_floppy=%p\n", do_floppy); 1855 pr_info("do_floppy=%pf\n", do_floppy);
1888 if (work_pending(&floppy_work)) 1856 if (work_pending(&floppy_work))
1889 printk("floppy_work.func=%p\n", floppy_work.func); 1857 pr_info("floppy_work.func=%pf\n", floppy_work.func);
1890 if (timer_pending(&fd_timer)) 1858 if (timer_pending(&fd_timer))
1891 printk("fd_timer.function=%p\n", fd_timer.function); 1859 pr_info("fd_timer.function=%pf\n", fd_timer.function);
1892 if (timer_pending(&fd_timeout)) { 1860 if (timer_pending(&fd_timeout)) {
1893 printk("timer_function=%p\n", fd_timeout.function); 1861 pr_info("timer_function=%pf\n", fd_timeout.function);
1894 printk("expires=%lu\n", fd_timeout.expires - jiffies); 1862 pr_info("expires=%lu\n", fd_timeout.expires - jiffies);
1895 printk("now=%lu\n", jiffies); 1863 pr_info("now=%lu\n", jiffies);
1896 } 1864 }
1897 printk("cont=%p\n", cont); 1865 pr_info("cont=%p\n", cont);
1898 printk("current_req=%p\n", current_req); 1866 pr_info("current_req=%p\n", current_req);
1899 printk("command_status=%d\n", command_status); 1867 pr_info("command_status=%d\n", command_status);
1900 printk("\n"); 1868 pr_info("\n");
1901} 1869}
1902 1870
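
With FLOPPY_SANITY_CHECK gone, show_floppy()'s state dump is always compiled in, and the hand-rolled hex loop over reply_buffer is replaced by a single print_hex_dump() call; its prototype (from <linux/printk.h> of this era) is:

    void print_hex_dump(const char *level, const char *prefix_str,
                        int prefix_type, int rowsize, int groupsize,
                        const void *buf, size_t len, bool ascii);

    /* As used above: print reply_buffer at KERN_INFO, 16 bytes per row in
     * 1-byte groups, no address/offset prefix, ASCII column appended. */
    print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1,
                   reply_buffer, resultsize, true);
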
1903static void floppy_shutdown(unsigned long data) 1871static void floppy_shutdown(unsigned long data)
1904{ 1872{
1905 unsigned long flags; 1873 unsigned long flags;
1906 1874
1907 if (!initialising) 1875 if (initialized)
1908 show_floppy(); 1876 show_floppy();
1909 cancel_activity(); 1877 cancel_activity();
1910 1878
@@ -1916,17 +1884,17 @@ static void floppy_shutdown(unsigned long data)
1916 1884
1917 /* avoid dma going to a random drive after shutdown */ 1885 /* avoid dma going to a random drive after shutdown */
1918 1886
1919 if (!initialising) 1887 if (initialized)
1920 DPRINT("floppy timeout called\n"); 1888 DPRINT("floppy timeout called\n");
1921 FDCS->reset = 1; 1889 FDCS->reset = 1;
1922 if (cont) { 1890 if (cont) {
1923 cont->done(0); 1891 cont->done(0);
1924 cont->redo(); /* this will recall reset when needed */ 1892 cont->redo(); /* this will recall reset when needed */
1925 } else { 1893 } else {
1926 printk("no cont in shutdown!\n"); 1894 pr_info("no cont in shutdown!\n");
1927 process_fd_request(); 1895 process_fd_request();
1928 } 1896 }
1929 is_alive("floppy shutdown"); 1897 is_alive(__func__, "");
1930} 1898}
1931 1899
1932/* start motor, check media-changed condition and write protection */ 1900/* start motor, check media-changed condition and write protection */
@@ -1954,27 +1922,26 @@ static int start_motor(void (*function)(void))
1954 set_dor(fdc, mask, data); 1922 set_dor(fdc, mask, data);
1955 1923
1956 /* wait_for_completion also schedules reset if needed. */ 1924 /* wait_for_completion also schedules reset if needed. */
1957 return (fd_wait_for_completion(DRS->select_date + DP->select_delay, 1925 return fd_wait_for_completion(DRS->select_date + DP->select_delay,
1958 (timeout_fn) function)); 1926 (timeout_fn)function);
1959} 1927}
1960 1928
1961static void floppy_ready(void) 1929static void floppy_ready(void)
1962{ 1930{
1963 CHECK_RESET; 1931 if (FDCS->reset) {
1932 reset_fdc();
1933 return;
1934 }
1964 if (start_motor(floppy_ready)) 1935 if (start_motor(floppy_ready))
1965 return; 1936 return;
1966 if (fdc_dtr()) 1937 if (fdc_dtr())
1967 return; 1938 return;
1968 1939
1969#ifdef DCL_DEBUG 1940 debug_dcl(DP->flags, "calling disk change from floppy_ready\n");
1970 if (DP->flags & FD_DEBUG) {
1971 DPRINT("calling disk change from floppy_ready\n");
1972 }
1973#endif
1974 if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) && 1941 if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) &&
1975 disk_change(current_drive) && !DP->select_delay) 1942 disk_change(current_drive) && !DP->select_delay)
1976 twaddle(); /* this clears the dcl on certain drive/controller 1943 twaddle(); /* this clears the dcl on certain
1977 * combinations */ 1944 * drive/controller combinations */
1978 1945
1979#ifdef fd_chose_dma_mode 1946#ifdef fd_chose_dma_mode
1980 if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) { 1947 if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) {
@@ -1998,15 +1965,11 @@ static void floppy_ready(void)
1998 1965
1999static void floppy_start(void) 1966static void floppy_start(void)
2000{ 1967{
2001 reschedule_timeout(current_reqD, "floppy start", 0); 1968 reschedule_timeout(current_reqD, "floppy start");
2002 1969
2003 scandrives(); 1970 scandrives();
2004#ifdef DCL_DEBUG 1971 debug_dcl(DP->flags, "setting NEWCHANGE in floppy_start\n");
2005 if (DP->flags & FD_DEBUG) { 1972 set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
2006 DPRINT("setting NEWCHANGE in floppy_start\n");
2007 }
2008#endif
2009 SETF(FD_DISK_NEWCHANGE);
2010 floppy_ready(); 1973 floppy_ready();
2011} 1974}
2012 1975
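
floppy_ready() and floppy_start() show the other recurring theme: macros that hid control flow are spelled out at the call site. CHECK_RESET becomes an explicit "if (FDCS->reset) { reset_fdc(); return; }", LAST_OUT() in the seek and recalibrate paths becomes a checked output_byte() call, and reschedule_timeout() loses its printf-style third argument. The removed macros presumably looked roughly like this (reconstructed from the expansions visible in these hunks, not quoted from the old source):

    /* Reconstruction only: each macro buried an early return. */
    #define CHECK_RESET                          \
            do {                                 \
                    if (FDCS->reset) {           \
                            reset_fdc();         \
                            return;              \
                    }                            \
            } while (0)

    #define LAST_OUT(x)                          \
            do {                                 \
                    if (output_byte(x) < 0) {    \
                            reset_fdc();         \
                            return;              \
                    }                            \
            } while (0)

Open-coding them costs a few lines per call site but lets a reader see every early exit without chasing macro definitions.
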
@@ -2026,7 +1989,7 @@ static void floppy_start(void)
2026 1989
2027static void do_wakeup(void) 1990static void do_wakeup(void)
2028{ 1991{
2029 reschedule_timeout(MAXTIMEOUT, "do wakeup", 0); 1992 reschedule_timeout(MAXTIMEOUT, "do wakeup");
2030 cont = NULL; 1993 cont = NULL;
2031 command_status += 2; 1994 command_status += 2;
2032 wake_up(&command_done); 1995 wake_up(&command_done);
@@ -2046,7 +2009,7 @@ static struct cont_t intr_cont = {
2046 .done = (done_f)empty 2009 .done = (done_f)empty
2047}; 2010};
2048 2011
2049static int wait_til_done(void (*handler)(void), int interruptible) 2012static int wait_til_done(void (*handler)(void), bool interruptible)
2050{ 2013{
2051 int ret; 2014 int ret;
2052 2015
@@ -2064,7 +2027,7 @@ static int wait_til_done(void (*handler)(void), int interruptible)
2064 if (command_status >= 2 || !NO_SIGNAL) 2027 if (command_status >= 2 || !NO_SIGNAL)
2065 break; 2028 break;
2066 2029
2067 is_alive("wait_til_done"); 2030 is_alive(__func__, "");
2068 schedule(); 2031 schedule();
2069 } 2032 }
2070 2033
@@ -2180,9 +2143,9 @@ static void format_interrupt(void)
2180 cont->redo(); 2143 cont->redo();
2181} 2144}
2182 2145
2183#define CODE2SIZE (ssize = ((1 << SIZECODE) + 3) >> 2) 2146#define FM_MODE(x, y) ((y) & ~(((x)->rate & 0x80) >> 1))
2184#define FM_MODE(x,y) ((y) & ~(((x)->rate & 0x80) >>1))
2185#define CT(x) ((x) | 0xc0) 2147#define CT(x) ((x) | 0xc0)
2148
2186static void setup_format_params(int track) 2149static void setup_format_params(int track)
2187{ 2150{
2188 int n; 2151 int n;
@@ -2197,8 +2160,8 @@ static void setup_format_params(int track)
2197 raw_cmd = &default_raw_cmd; 2160 raw_cmd = &default_raw_cmd;
2198 raw_cmd->track = track; 2161 raw_cmd->track = track;
2199 2162
2200 raw_cmd->flags = FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN | 2163 raw_cmd->flags = (FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN |
2201 FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; 2164 FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK);
2202 raw_cmd->rate = _floppy->rate & 0x43; 2165 raw_cmd->rate = _floppy->rate & 0x43;
2203 raw_cmd->cmd_count = NR_F; 2166 raw_cmd->cmd_count = NR_F;
2204 COMMAND = FM_MODE(_floppy, FD_FORMAT); 2167 COMMAND = FM_MODE(_floppy, FD_FORMAT);
@@ -2257,7 +2220,7 @@ static void redo_format(void)
2257 buffer_track = -1; 2220 buffer_track = -1;
2258 setup_format_params(format_req.track << STRETCH(_floppy)); 2221 setup_format_params(format_req.track << STRETCH(_floppy));
2259 floppy_start(); 2222 floppy_start();
2260 debugt("queue format request"); 2223 debugt(__func__, "queue format request");
2261} 2224}
2262 2225
2263static struct cont_t format_cont = { 2226static struct cont_t format_cont = {
@@ -2271,7 +2234,9 @@ static int do_format(int drive, struct format_descr *tmp_format_req)
2271{ 2234{
2272 int ret; 2235 int ret;
2273 2236
2274 LOCK_FDC(drive, 1); 2237 if (lock_fdc(drive, true))
2238 return -EINTR;
2239
2275 set_floppy(drive); 2240 set_floppy(drive);
2276 if (!_floppy || 2241 if (!_floppy ||
2277 _floppy->track > DP->tracks || 2242 _floppy->track > DP->tracks ||
@@ -2286,7 +2251,9 @@ static int do_format(int drive, struct format_descr *tmp_format_req)
2286 format_errors = 0; 2251 format_errors = 0;
2287 cont = &format_cont; 2252 cont = &format_cont;
2288 errors = &format_errors; 2253 errors = &format_errors;
2289 IWAIT(redo_format); 2254 ret = wait_til_done(redo_format, true);
2255 if (ret == -EINTR)
2256 return -EINTR;
2290 process_fd_request(); 2257 process_fd_request();
2291 return ret; 2258 return ret;
2292} 2259}
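
do_format() sets the template for the ioctl-path conversions that follow: LOCK_FDC() and IWAIT(), which hid "return -EINTR" statements inside macros, become explicit lock_fdc() and wait_til_done() calls, and the interruptible flag turns into a bool. The removed wrappers presumably read along these lines (reconstruction; only their expansions are visible here):

    #define LOCK_FDC(drive, interruptible)                  \
            if (lock_fdc(drive, interruptible))             \
                    return -EINTR

    #define WAIT(x)                                         \
            do {                                            \
                    ret = wait_til_done(x, interruptible);  \
                    if (ret == -EINTR)                      \
                            return -EINTR;                  \
            } while (0)

    #define IWAIT(x)                                        \
            do {                                            \
                    ret = wait_til_done(x, true);           \
                    if (ret == -EINTR)                      \
                            return -EINTR;                  \
            } while (0)

After the patch every -EINTR path is written where it happens, which is also why wait_til_done(), poll_drive() and user_reset_fdc() can switch their interruptible parameter from int to bool.
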
@@ -2320,12 +2287,14 @@ static void request_done(int uptodate)
2320 struct request *req = current_req; 2287 struct request *req = current_req;
2321 unsigned long flags; 2288 unsigned long flags;
2322 int block; 2289 int block;
2290 char msg[sizeof("request done ") + sizeof(int) * 3];
2323 2291
2324 probing = 0; 2292 probing = 0;
2325 reschedule_timeout(MAXTIMEOUT, "request done %d", uptodate); 2293 snprintf(msg, sizeof(msg), "request done %d", uptodate);
2294 reschedule_timeout(MAXTIMEOUT, msg);
2326 2295
2327 if (!req) { 2296 if (!req) {
2328 printk("floppy.c: no request in request_done\n"); 2297 pr_info("floppy.c: no request in request_done\n");
2329 return; 2298 return;
2330 } 2299 }
2331 2300
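
The new msg buffer in request_done() exists because reschedule_timeout() no longer takes printf-style arguments; the caller formats the message itself. The size is a standard worst-case bound: sizeof("request done ") already counts the terminating NUL, and sizeof(int) * 3 over-estimates the characters an int can print (three decimal digits per byte easily covers the ten-digit magnitude plus a sign):

    char msg[sizeof("request done ") + sizeof(int) * 3];  /* 14 + 12 = 26 */

    /* Worst case "request done -2147483648" is 24 characters, 25 with NUL. */
    snprintf(msg, sizeof(msg), "request done %d", uptodate);
    reschedule_timeout(MAXTIMEOUT, msg);
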
@@ -2377,7 +2346,7 @@ static void rw_interrupt(void)
2377 DRS->first_read_date = jiffies; 2346 DRS->first_read_date = jiffies;
2378 2347
2379 nr_sectors = 0; 2348 nr_sectors = 0;
2380 CODE2SIZE; 2349 ssize = DIV_ROUND_UP(1 << SIZECODE, 4);
2381 2350
2382 if (ST1 & ST1_EOC) 2351 if (ST1 & ST1_EOC)
2383 eoc = 1; 2352 eoc = 1;
@@ -2393,20 +2362,18 @@ static void rw_interrupt(void)
2393 R_HEAD - HEAD) * SECT_PER_TRACK + 2362 R_HEAD - HEAD) * SECT_PER_TRACK +
2394 R_SECTOR - SECTOR + eoc) << SIZECODE >> 2; 2363 R_SECTOR - SECTOR + eoc) << SIZECODE >> 2;
2395 2364
2396#ifdef FLOPPY_SANITY_CHECK
2397 if (nr_sectors / ssize > 2365 if (nr_sectors / ssize >
2398 DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) { 2366 DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) {
2399 DPRINT("long rw: %x instead of %lx\n", 2367 DPRINT("long rw: %x instead of %lx\n",
2400 nr_sectors, current_count_sectors); 2368 nr_sectors, current_count_sectors);
2401 printk("rs=%d s=%d\n", R_SECTOR, SECTOR); 2369 pr_info("rs=%d s=%d\n", R_SECTOR, SECTOR);
2402 printk("rh=%d h=%d\n", R_HEAD, HEAD); 2370 pr_info("rh=%d h=%d\n", R_HEAD, HEAD);
2403 printk("rt=%d t=%d\n", R_TRACK, TRACK); 2371 pr_info("rt=%d t=%d\n", R_TRACK, TRACK);
2404 printk("heads=%d eoc=%d\n", heads, eoc); 2372 pr_info("heads=%d eoc=%d\n", heads, eoc);
2405 printk("spt=%d st=%d ss=%d\n", SECT_PER_TRACK, 2373 pr_info("spt=%d st=%d ss=%d\n",
2406 fsector_t, ssize); 2374 SECT_PER_TRACK, fsector_t, ssize);
2407 printk("in_sector_offset=%d\n", in_sector_offset); 2375 pr_info("in_sector_offset=%d\n", in_sector_offset);
2408 } 2376 }
2409#endif
2410 2377
2411 nr_sectors -= in_sector_offset; 2378 nr_sectors -= in_sector_offset;
2412 INFBOUND(nr_sectors, 0); 2379 INFBOUND(nr_sectors, 0);
@@ -2511,19 +2478,17 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
2511 blk_rq_sectors(current_req)); 2478 blk_rq_sectors(current_req));
2512 2479
2513 remaining = current_count_sectors << 9; 2480 remaining = current_count_sectors << 9;
2514#ifdef FLOPPY_SANITY_CHECK
2515 if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) { 2481 if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) {
2516 DPRINT("in copy buffer\n"); 2482 DPRINT("in copy buffer\n");
2517 printk("current_count_sectors=%ld\n", current_count_sectors); 2483 pr_info("current_count_sectors=%ld\n", current_count_sectors);
2518 printk("remaining=%d\n", remaining >> 9); 2484 pr_info("remaining=%d\n", remaining >> 9);
2519 printk("current_req->nr_sectors=%u\n", 2485 pr_info("current_req->nr_sectors=%u\n",
2520 blk_rq_sectors(current_req)); 2486 blk_rq_sectors(current_req));
2521 printk("current_req->current_nr_sectors=%u\n", 2487 pr_info("current_req->current_nr_sectors=%u\n",
2522 blk_rq_cur_sectors(current_req)); 2488 blk_rq_cur_sectors(current_req));
2523 printk("max_sector=%d\n", max_sector); 2489 pr_info("max_sector=%d\n", max_sector);
2524 printk("ssize=%d\n", ssize); 2490 pr_info("ssize=%d\n", ssize);
2525 } 2491 }
2526#endif
2527 2492
2528 buffer_max = max(max_sector, buffer_max); 2493 buffer_max = max(max_sector, buffer_max);
2529 2494
@@ -2539,26 +2504,24 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
2539 SUPBOUND(size, remaining); 2504 SUPBOUND(size, remaining);
2540 2505
2541 buffer = page_address(bv->bv_page) + bv->bv_offset; 2506 buffer = page_address(bv->bv_page) + bv->bv_offset;
2542#ifdef FLOPPY_SANITY_CHECK
2543 if (dma_buffer + size > 2507 if (dma_buffer + size >
2544 floppy_track_buffer + (max_buffer_sectors << 10) || 2508 floppy_track_buffer + (max_buffer_sectors << 10) ||
2545 dma_buffer < floppy_track_buffer) { 2509 dma_buffer < floppy_track_buffer) {
2546 DPRINT("buffer overrun in copy buffer %d\n", 2510 DPRINT("buffer overrun in copy buffer %d\n",
2547 (int)((floppy_track_buffer - 2511 (int)((floppy_track_buffer - dma_buffer) >> 9));
2548 dma_buffer) >> 9)); 2512 pr_info("fsector_t=%d buffer_min=%d\n",
2549 printk("fsector_t=%d buffer_min=%d\n", 2513 fsector_t, buffer_min);
2550 fsector_t, buffer_min); 2514 pr_info("current_count_sectors=%ld\n",
2551 printk("current_count_sectors=%ld\n", 2515 current_count_sectors);
2552 current_count_sectors);
2553 if (CT(COMMAND) == FD_READ) 2516 if (CT(COMMAND) == FD_READ)
2554 printk("read\n"); 2517 pr_info("read\n");
2555 if (CT(COMMAND) == FD_WRITE) 2518 if (CT(COMMAND) == FD_WRITE)
2556 printk("write\n"); 2519 pr_info("write\n");
2557 break; 2520 break;
2558 } 2521 }
2559 if (((unsigned long)buffer) % 512) 2522 if (((unsigned long)buffer) % 512)
2560 DPRINT("%p buffer not aligned\n", buffer); 2523 DPRINT("%p buffer not aligned\n", buffer);
2561#endif 2524
2562 if (CT(COMMAND) == FD_READ) 2525 if (CT(COMMAND) == FD_READ)
2563 memcpy(buffer, dma_buffer, size); 2526 memcpy(buffer, dma_buffer, size);
2564 else 2527 else
@@ -2567,13 +2530,11 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
2567 remaining -= size; 2530 remaining -= size;
2568 dma_buffer += size; 2531 dma_buffer += size;
2569 } 2532 }
2570#ifdef FLOPPY_SANITY_CHECK
2571 if (remaining) { 2533 if (remaining) {
2572 if (remaining > 0) 2534 if (remaining > 0)
2573 max_sector -= remaining >> 9; 2535 max_sector -= remaining >> 9;
2574 DPRINT("weirdness: remaining %d\n", remaining >> 9); 2536 DPRINT("weirdness: remaining %d\n", remaining >> 9);
2575 } 2537 }
2576#endif
2577} 2538}
2578 2539
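
rw_interrupt() and copy_buffer() drop their #ifdef FLOPPY_SANITY_CHECK guards, so the overrun and "weirdness" diagnostics are now compiled unconditionally, and the CODE2SIZE macro (an assignment to ssize disguised as a constant, deleted a few hunks above) is written out using DIV_ROUND_UP. The two forms agree for every size code the driver can see, which a trivial userspace check confirms:

    /* Standalone sanity check of the rewrite, not driver code. */
    #include <assert.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))   /* as in kernel.h */

    int main(void)
    {
            for (int sizecode = 0; sizecode <= 7; sizecode++) {
                    int n = 1 << sizecode;
                    /* old: ssize = ((1 << SIZECODE) + 3) >> 2
                     * new: ssize = DIV_ROUND_UP(1 << SIZECODE, 4) */
                    assert(((n + 3) >> 2) == DIV_ROUND_UP(n, 4));
            }
            return 0;
    }
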
2579/* work around a bug in pseudo DMA 2540/* work around a bug in pseudo DMA
@@ -2593,15 +2554,14 @@ static void virtualdmabug_workaround(void)
2593 2554
2594 hard_sectors = raw_cmd->length >> (7 + SIZECODE); 2555 hard_sectors = raw_cmd->length >> (7 + SIZECODE);
2595 end_sector = SECTOR + hard_sectors - 1; 2556 end_sector = SECTOR + hard_sectors - 1;
2596#ifdef FLOPPY_SANITY_CHECK
2597 if (end_sector > SECT_PER_TRACK) { 2557 if (end_sector > SECT_PER_TRACK) {
2598 printk("too many sectors %d > %d\n", 2558 pr_info("too many sectors %d > %d\n",
2599 end_sector, SECT_PER_TRACK); 2559 end_sector, SECT_PER_TRACK);
2600 return; 2560 return;
2601 } 2561 }
2602#endif 2562 SECT_PER_TRACK = end_sector;
2603 SECT_PER_TRACK = end_sector; /* make sure SECT_PER_TRACK points 2563 /* make sure SECT_PER_TRACK
2604 * to end of transfer */ 2564 * points to end of transfer */
2605 } 2565 }
2606} 2566}
2607 2567
@@ -2624,7 +2584,7 @@ static int make_raw_rw_request(void)
2624 int ssize; 2584 int ssize;
2625 2585
2626 if (max_buffer_sectors == 0) { 2586 if (max_buffer_sectors == 0) {
2627 printk("VFS: Block I/O scheduled on unopened device\n"); 2587 pr_info("VFS: Block I/O scheduled on unopened device\n");
2628 return 0; 2588 return 0;
2629 } 2589 }
2630 2590
@@ -2641,7 +2601,7 @@ static int make_raw_rw_request(void)
2641 raw_cmd->flags |= FD_RAW_WRITE; 2601 raw_cmd->flags |= FD_RAW_WRITE;
2642 COMMAND = FM_MODE(_floppy, FD_WRITE); 2602 COMMAND = FM_MODE(_floppy, FD_WRITE);
2643 } else { 2603 } else {
2644 DPRINT("make_raw_rw_request: unknown command\n"); 2604 DPRINT("%s: unknown command\n", __func__);
2645 return 0; 2605 return 0;
2646 } 2606 }
2647 2607
@@ -2659,7 +2619,8 @@ static int make_raw_rw_request(void)
2659 HEAD = fsector_t / _floppy->sect; 2619 HEAD = fsector_t / _floppy->sect;
2660 2620
2661 if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) || 2621 if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) ||
2662 TESTF(FD_NEED_TWADDLE)) && fsector_t < _floppy->sect) 2622 test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) &&
2623 fsector_t < _floppy->sect)
2663 max_sector = _floppy->sect; 2624 max_sector = _floppy->sect;
2664 2625
2665 /* 2M disks have phantom sectors on the first track */ 2626 /* 2M disks have phantom sectors on the first track */
@@ -2685,7 +2646,7 @@ static int make_raw_rw_request(void)
2685 raw_cmd->track = TRACK << STRETCH(_floppy); 2646 raw_cmd->track = TRACK << STRETCH(_floppy);
2686 DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD); 2647 DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD);
2687 GAP = _floppy->gap; 2648 GAP = _floppy->gap;
2688 CODE2SIZE; 2649 ssize = DIV_ROUND_UP(1 << SIZECODE, 4);
2689 SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE; 2650 SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE;
2690 SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) + 2651 SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) +
2691 FD_SECTBASE(_floppy); 2652 FD_SECTBASE(_floppy);
@@ -2730,8 +2691,10 @@ static int make_raw_rw_request(void)
2730 } 2691 }
2731 } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) { 2692 } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) {
2732 if (CT(COMMAND) == FD_WRITE) { 2693 if (CT(COMMAND) == FD_WRITE) {
2733 if (fsector_t + blk_rq_sectors(current_req) > ssize && 2694 unsigned int sectors;
2734 fsector_t + blk_rq_sectors(current_req) < ssize + ssize) 2695
2696 sectors = fsector_t + blk_rq_sectors(current_req);
2697 if (sectors > ssize && sectors < ssize + ssize)
2735 max_size = ssize + ssize; 2698 max_size = ssize + ssize;
2736 else 2699 else
2737 max_size = ssize; 2700 max_size = ssize;
@@ -2752,12 +2715,10 @@ static int make_raw_rw_request(void)
2752 * on a 64 bit machine! 2715 * on a 64 bit machine!
2753 */ 2716 */
2754 max_size = buffer_chain_size(); 2717 max_size = buffer_chain_size();
2755 dma_limit = 2718 dma_limit = (MAX_DMA_ADDRESS -
2756 (MAX_DMA_ADDRESS - 2719 ((unsigned long)current_req->buffer)) >> 9;
2757 ((unsigned long)current_req->buffer)) >> 9; 2720 if ((unsigned long)max_size > dma_limit)
2758 if ((unsigned long)max_size > dma_limit) {
2759 max_size = dma_limit; 2721 max_size = dma_limit;
2760 }
2761 /* 64 kb boundaries */ 2722 /* 64 kb boundaries */
2762 if (CROSS_64KB(current_req->buffer, max_size << 9)) 2723 if (CROSS_64KB(current_req->buffer, max_size << 9))
2763 max_size = (K_64 - 2724 max_size = (K_64 -
@@ -2773,16 +2734,16 @@ static int make_raw_rw_request(void)
2773 */ 2734 */
2774 if (!direct || 2735 if (!direct ||
2775 (indirect * 2 > direct * 3 && 2736 (indirect * 2 > direct * 3 &&
2776 *errors < DP->max_errors.read_track && ((!probing 2737 *errors < DP->max_errors.read_track &&
2777 || (DP->read_track & (1 << DRS->probed_format)))))) { 2738 ((!probing ||
2739 (DP->read_track & (1 << DRS->probed_format)))))) {
2778 max_size = blk_rq_sectors(current_req); 2740 max_size = blk_rq_sectors(current_req);
2779 } else { 2741 } else {
2780 raw_cmd->kernel_data = current_req->buffer; 2742 raw_cmd->kernel_data = current_req->buffer;
2781 raw_cmd->length = current_count_sectors << 9; 2743 raw_cmd->length = current_count_sectors << 9;
2782 if (raw_cmd->length == 0) { 2744 if (raw_cmd->length == 0) {
2783 DPRINT 2745 DPRINT("%s: zero dma transfer attempted\n", __func__);
2784 ("zero dma transfer attempted from make_raw_request\n"); 2746 DPRINT("indirect=%d direct=%d fsector_t=%d\n",
2785 DPRINT("indirect=%d direct=%d fsector_t=%d",
2786 indirect, direct, fsector_t); 2747 indirect, direct, fsector_t);
2787 return 0; 2748 return 0;
2788 } 2749 }
@@ -2802,25 +2763,22 @@ static int make_raw_rw_request(void)
2802 ((CT(COMMAND) == FD_READ || 2763 ((CT(COMMAND) == FD_READ ||
2803 (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) && 2764 (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) &&
2804 max_sector > 2 * max_buffer_sectors + buffer_min && 2765 max_sector > 2 * max_buffer_sectors + buffer_min &&
2805 max_size + fsector_t > 2 * max_buffer_sectors + buffer_min) 2766 max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)) {
2806 /* not enough space */ 2767 /* not enough space */
2807 ) {
2808 buffer_track = -1; 2768 buffer_track = -1;
2809 buffer_drive = current_drive; 2769 buffer_drive = current_drive;
2810 buffer_max = buffer_min = aligned_sector_t; 2770 buffer_max = buffer_min = aligned_sector_t;
2811 } 2771 }
2812 raw_cmd->kernel_data = floppy_track_buffer + 2772 raw_cmd->kernel_data = floppy_track_buffer +
2813 ((aligned_sector_t - buffer_min) << 9); 2773 ((aligned_sector_t - buffer_min) << 9);
2814 2774
2815 if (CT(COMMAND) == FD_WRITE) { 2775 if (CT(COMMAND) == FD_WRITE) {
2816 /* copy write buffer to track buffer. 2776 /* copy write buffer to track buffer.
2817 * if we get here, we know that the write 2777 * if we get here, we know that the write
2818 * is either aligned or the data already in the buffer 2778 * is either aligned or the data already in the buffer
2819 * (buffer will be overwritten) */ 2779 * (buffer will be overwritten) */
2820#ifdef FLOPPY_SANITY_CHECK
2821 if (in_sector_offset && buffer_track == -1) 2780 if (in_sector_offset && buffer_track == -1)
2822 DPRINT("internal error offset !=0 on write\n"); 2781 DPRINT("internal error offset !=0 on write\n");
2823#endif
2824 buffer_track = raw_cmd->track; 2782 buffer_track = raw_cmd->track;
2825 buffer_drive = current_drive; 2783 buffer_drive = current_drive;
2826 copy_buffer(ssize, max_sector, 2784 copy_buffer(ssize, max_sector,
@@ -2834,7 +2792,6 @@ static int make_raw_rw_request(void)
2834 raw_cmd->length = in_sector_offset + current_count_sectors; 2792 raw_cmd->length = in_sector_offset + current_count_sectors;
2835 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; 2793 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
2836 raw_cmd->length <<= 9; 2794 raw_cmd->length <<= 9;
2837#ifdef FLOPPY_SANITY_CHECK
2838 if ((raw_cmd->length < current_count_sectors << 9) || 2795 if ((raw_cmd->length < current_count_sectors << 9) ||
2839 (raw_cmd->kernel_data != current_req->buffer && 2796 (raw_cmd->kernel_data != current_req->buffer &&
2840 CT(COMMAND) == FD_WRITE && 2797 CT(COMMAND) == FD_WRITE &&
@@ -2845,19 +2802,19 @@ static int make_raw_rw_request(void)
2845 DPRINT("fractionary current count b=%lx s=%lx\n", 2802 DPRINT("fractionary current count b=%lx s=%lx\n",
2846 raw_cmd->length, current_count_sectors); 2803 raw_cmd->length, current_count_sectors);
2847 if (raw_cmd->kernel_data != current_req->buffer) 2804 if (raw_cmd->kernel_data != current_req->buffer)
2848 printk("addr=%d, length=%ld\n", 2805 pr_info("addr=%d, length=%ld\n",
2849 (int)((raw_cmd->kernel_data - 2806 (int)((raw_cmd->kernel_data -
2850 floppy_track_buffer) >> 9), 2807 floppy_track_buffer) >> 9),
2851 current_count_sectors); 2808 current_count_sectors);
2852 printk("st=%d ast=%d mse=%d msi=%d\n", 2809 pr_info("st=%d ast=%d mse=%d msi=%d\n",
2853 fsector_t, aligned_sector_t, max_sector, max_size); 2810 fsector_t, aligned_sector_t, max_sector, max_size);
2854 printk("ssize=%x SIZECODE=%d\n", ssize, SIZECODE); 2811 pr_info("ssize=%x SIZECODE=%d\n", ssize, SIZECODE);
2855 printk("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n", 2812 pr_info("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n",
2856 COMMAND, SECTOR, HEAD, TRACK); 2813 COMMAND, SECTOR, HEAD, TRACK);
2857 printk("buffer drive=%d\n", buffer_drive); 2814 pr_info("buffer drive=%d\n", buffer_drive);
2858 printk("buffer track=%d\n", buffer_track); 2815 pr_info("buffer track=%d\n", buffer_track);
2859 printk("buffer_min=%d\n", buffer_min); 2816 pr_info("buffer_min=%d\n", buffer_min);
2860 printk("buffer_max=%d\n", buffer_max); 2817 pr_info("buffer_max=%d\n", buffer_max);
2861 return 0; 2818 return 0;
2862 } 2819 }
2863 2820
@@ -2868,14 +2825,14 @@ static int make_raw_rw_request(void)
2868 raw_cmd->kernel_data + raw_cmd->length > 2825 raw_cmd->kernel_data + raw_cmd->length >
2869 floppy_track_buffer + (max_buffer_sectors << 10)) { 2826 floppy_track_buffer + (max_buffer_sectors << 10)) {
2870 DPRINT("buffer overrun in schedule dma\n"); 2827 DPRINT("buffer overrun in schedule dma\n");
2871 printk("fsector_t=%d buffer_min=%d current_count=%ld\n", 2828 pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n",
2872 fsector_t, buffer_min, raw_cmd->length >> 9); 2829 fsector_t, buffer_min, raw_cmd->length >> 9);
2873 printk("current_count_sectors=%ld\n", 2830 pr_info("current_count_sectors=%ld\n",
2874 current_count_sectors); 2831 current_count_sectors);
2875 if (CT(COMMAND) == FD_READ) 2832 if (CT(COMMAND) == FD_READ)
2876 printk("read\n"); 2833 pr_info("read\n");
2877 if (CT(COMMAND) == FD_WRITE) 2834 if (CT(COMMAND) == FD_WRITE)
2878 printk("write\n"); 2835 pr_info("write\n");
2879 return 0; 2836 return 0;
2880 } 2837 }
2881 } else if (raw_cmd->length > blk_rq_bytes(current_req) || 2838 } else if (raw_cmd->length > blk_rq_bytes(current_req) ||
@@ -2884,14 +2841,13 @@ static int make_raw_rw_request(void)
2884 return 0; 2841 return 0;
2885 } else if (raw_cmd->length < current_count_sectors << 9) { 2842 } else if (raw_cmd->length < current_count_sectors << 9) {
2886 DPRINT("more sectors than bytes\n"); 2843 DPRINT("more sectors than bytes\n");
2887 printk("bytes=%ld\n", raw_cmd->length >> 9); 2844 pr_info("bytes=%ld\n", raw_cmd->length >> 9);
2888 printk("sectors=%ld\n", current_count_sectors); 2845 pr_info("sectors=%ld\n", current_count_sectors);
2889 } 2846 }
2890 if (raw_cmd->length == 0) { 2847 if (raw_cmd->length == 0) {
2891 DPRINT("zero dma transfer attempted from make_raw_request\n"); 2848 DPRINT("zero dma transfer attempted from make_raw_request\n");
2892 return 0; 2849 return 0;
2893 } 2850 }
2894#endif
2895 2851
2896 virtualdmabug_workaround(); 2852 virtualdmabug_workaround();
2897 return 2; 2853 return 2;
@@ -2899,7 +2855,6 @@ static int make_raw_rw_request(void)
2899 2855
2900static void redo_fd_request(void) 2856static void redo_fd_request(void)
2901{ 2857{
2902#define REPEAT {request_done(0); continue; }
2903 int drive; 2858 int drive;
2904 int tmp; 2859 int tmp;
2905 2860
@@ -2907,63 +2862,63 @@ static void redo_fd_request(void)
2907 if (current_drive < N_DRIVE) 2862 if (current_drive < N_DRIVE)
2908 floppy_off(current_drive); 2863 floppy_off(current_drive);
2909 2864
2910 for (;;) { 2865do_request:
2911 if (!current_req) { 2866 if (!current_req) {
2912 struct request *req; 2867 struct request *req;
2913
2914 spin_lock_irq(floppy_queue->queue_lock);
2915 req = blk_fetch_request(floppy_queue);
2916 spin_unlock_irq(floppy_queue->queue_lock);
2917 if (!req) {
2918 do_floppy = NULL;
2919 unlock_fdc();
2920 return;
2921 }
2922 current_req = req;
2923 }
2924 drive = (long)current_req->rq_disk->private_data;
2925 set_fdc(drive);
2926 reschedule_timeout(current_reqD, "redo fd request", 0);
2927 2868
2928 set_floppy(drive); 2869 spin_lock_irq(floppy_queue->queue_lock);
2929 raw_cmd = &default_raw_cmd; 2870 req = blk_fetch_request(floppy_queue);
2930 raw_cmd->flags = 0; 2871 spin_unlock_irq(floppy_queue->queue_lock);
2931 if (start_motor(redo_fd_request)) 2872 if (!req) {
2873 do_floppy = NULL;
2874 unlock_fdc();
2932 return; 2875 return;
2933 disk_change(current_drive);
2934 if (test_bit(current_drive, &fake_change) ||
2935 TESTF(FD_DISK_CHANGED)) {
2936 DPRINT("disk absent or changed during operation\n");
2937 REPEAT;
2938 }
2939 if (!_floppy) { /* Autodetection */
2940 if (!probing) {
2941 DRS->probed_format = 0;
2942 if (next_valid_format()) {
2943 DPRINT("no autodetectable formats\n");
2944 _floppy = NULL;
2945 REPEAT;
2946 }
2947 }
2948 probing = 1;
2949 _floppy =
2950 floppy_type + DP->autodetect[DRS->probed_format];
2951 } else
2952 probing = 0;
2953 errors = &(current_req->errors);
2954 tmp = make_raw_rw_request();
2955 if (tmp < 2) {
2956 request_done(tmp);
2957 continue;
2958 } 2876 }
2877 current_req = req;
2878 }
2879 drive = (long)current_req->rq_disk->private_data;
2880 set_fdc(drive);
2881 reschedule_timeout(current_reqD, "redo fd request");
2959 2882
2960 if (TESTF(FD_NEED_TWADDLE)) 2883 set_floppy(drive);
2961 twaddle(); 2884 raw_cmd = &default_raw_cmd;
2962 schedule_bh(floppy_start); 2885 raw_cmd->flags = 0;
2963 debugt("queue fd request"); 2886 if (start_motor(redo_fd_request))
2964 return; 2887 return;
2888
2889 disk_change(current_drive);
2890 if (test_bit(current_drive, &fake_change) ||
2891 test_bit(FD_DISK_CHANGED_BIT, &DRS->flags)) {
2892 DPRINT("disk absent or changed during operation\n");
2893 request_done(0);
2894 goto do_request;
2895 }
2896 if (!_floppy) { /* Autodetection */
2897 if (!probing) {
2898 DRS->probed_format = 0;
2899 if (next_valid_format()) {
2900 DPRINT("no autodetectable formats\n");
2901 _floppy = NULL;
2902 request_done(0);
2903 goto do_request;
2904 }
2905 }
2906 probing = 1;
2907 _floppy = floppy_type + DP->autodetect[DRS->probed_format];
2908 } else
2909 probing = 0;
2910 errors = &(current_req->errors);
2911 tmp = make_raw_rw_request();
2912 if (tmp < 2) {
2913 request_done(tmp);
2914 goto do_request;
2965 } 2915 }
2966#undef REPEAT 2916
2917 if (test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags))
2918 twaddle();
2919 schedule_bh(floppy_start);
2920 debugt(__func__, "queue fd request");
2921 return;
2967} 2922}
2968 2923
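
Most of the churn in redo_fd_request() is re-indentation: the for (;;) loop whose retry path hid inside the REPEAT macro becomes straight-line code with a do_request: label, so each retry now reads as an explicit call plus a jump. Behaviour is unchanged; only the control flow is visible:

    /* Removed helper: a "continue" hidden inside a macro. */
    #define REPEAT { request_done(0); continue; }

    /* New form at every retry site: */
            request_done(0);
            goto do_request;
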
2969static struct cont_t rw_cont = { 2924static struct cont_t rw_cont = {
@@ -2979,30 +2934,30 @@ static void process_fd_request(void)
2979 schedule_bh(redo_fd_request); 2934 schedule_bh(redo_fd_request);
2980} 2935}
2981 2936
2982static void do_fd_request(struct request_queue * q) 2937static void do_fd_request(struct request_queue *q)
2983{ 2938{
2984 if (max_buffer_sectors == 0) { 2939 if (max_buffer_sectors == 0) {
2985 printk("VFS: do_fd_request called on non-open device\n"); 2940 pr_info("VFS: %s called on non-open device\n", __func__);
2986 return; 2941 return;
2987 } 2942 }
2988 2943
2989 if (usage_count == 0) { 2944 if (usage_count == 0) {
2990 printk("warning: usage count=0, current_req=%p exiting\n", 2945 pr_info("warning: usage count=0, current_req=%p exiting\n",
2991 current_req); 2946 current_req);
2992 printk("sect=%ld type=%x flags=%x\n", 2947 pr_info("sect=%ld type=%x flags=%x\n",
2993 (long)blk_rq_pos(current_req), current_req->cmd_type, 2948 (long)blk_rq_pos(current_req), current_req->cmd_type,
2994 current_req->cmd_flags); 2949 current_req->cmd_flags);
2995 return; 2950 return;
2996 } 2951 }
2997 if (test_bit(0, &fdc_busy)) { 2952 if (test_bit(0, &fdc_busy)) {
2998 /* fdc busy, this new request will be treated when the 2953 /* fdc busy, this new request will be treated when the
2999 current one is done */ 2954 current one is done */
3000 is_alive("do fd request, old request running"); 2955 is_alive(__func__, "old request running");
3001 return; 2956 return;
3002 } 2957 }
3003 lock_fdc(MAXTIMEOUT, 0); 2958 lock_fdc(MAXTIMEOUT, false);
3004 process_fd_request(); 2959 process_fd_request();
3005 is_alive("do fd request"); 2960 is_alive(__func__, "");
3006} 2961}
3007 2962
3008static struct cont_t poll_cont = { 2963static struct cont_t poll_cont = {
@@ -3012,24 +2967,18 @@ static struct cont_t poll_cont = {
3012 .done = generic_done 2967 .done = generic_done
3013}; 2968};
3014 2969
3015static int poll_drive(int interruptible, int flag) 2970static int poll_drive(bool interruptible, int flag)
3016{ 2971{
3017 int ret;
3018
3019 /* no auto-sense, just clear dcl */ 2972 /* no auto-sense, just clear dcl */
3020 raw_cmd = &default_raw_cmd; 2973 raw_cmd = &default_raw_cmd;
3021 raw_cmd->flags = flag; 2974 raw_cmd->flags = flag;
3022 raw_cmd->track = 0; 2975 raw_cmd->track = 0;
3023 raw_cmd->cmd_count = 0; 2976 raw_cmd->cmd_count = 0;
3024 cont = &poll_cont; 2977 cont = &poll_cont;
3025#ifdef DCL_DEBUG 2978 debug_dcl(DP->flags, "setting NEWCHANGE in poll_drive\n");
3026 if (DP->flags & FD_DEBUG) { 2979 set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
3027 DPRINT("setting NEWCHANGE in poll_drive\n"); 2980
3028 } 2981 return wait_til_done(floppy_ready, interruptible);
3029#endif
3030 SETF(FD_DISK_NEWCHANGE);
3031 WAIT(floppy_ready);
3032 return ret;
3033} 2982}
3034 2983
3035/* 2984/*
@@ -3039,7 +2988,7 @@ static int poll_drive(int interruptible, int flag)
3039 2988
3040static void reset_intr(void) 2989static void reset_intr(void)
3041{ 2990{
3042 printk("weird, reset interrupt called\n"); 2991 pr_info("weird, reset interrupt called\n");
3043} 2992}
3044 2993
3045static struct cont_t reset_cont = { 2994static struct cont_t reset_cont = {
@@ -3049,20 +2998,23 @@ static struct cont_t reset_cont = {
3049 .done = generic_done 2998 .done = generic_done
3050}; 2999};
3051 3000
3052static int user_reset_fdc(int drive, int arg, int interruptible) 3001static int user_reset_fdc(int drive, int arg, bool interruptible)
3053{ 3002{
3054 int ret; 3003 int ret;
3055 3004
3056 ret = 0; 3005 if (lock_fdc(drive, interruptible))
3057 LOCK_FDC(drive, interruptible); 3006 return -EINTR;
3007
3058 if (arg == FD_RESET_ALWAYS) 3008 if (arg == FD_RESET_ALWAYS)
3059 FDCS->reset = 1; 3009 FDCS->reset = 1;
3060 if (FDCS->reset) { 3010 if (FDCS->reset) {
3061 cont = &reset_cont; 3011 cont = &reset_cont;
3062 WAIT(reset_fdc); 3012 ret = wait_til_done(reset_fdc, interruptible);
3013 if (ret == -EINTR)
3014 return -EINTR;
3063 } 3015 }
3064 process_fd_request(); 3016 process_fd_request();
3065 return ret; 3017 return 0;
3066} 3018}
3067 3019
3068/* 3020/*
@@ -3075,17 +3027,12 @@ static inline int fd_copyout(void __user *param, const void *address,
3075 return copy_to_user(param, address, size) ? -EFAULT : 0; 3027 return copy_to_user(param, address, size) ? -EFAULT : 0;
3076} 3028}
3077 3029
3078static inline int fd_copyin(void __user *param, void *address, unsigned long size) 3030static inline int fd_copyin(void __user *param, void *address,
3031 unsigned long size)
3079{ 3032{
3080 return copy_from_user(address, param, size) ? -EFAULT : 0; 3033 return copy_from_user(address, param, size) ? -EFAULT : 0;
3081} 3034}
3082 3035
3083#define _COPYOUT(x) (copy_to_user((void __user *)param, &(x), sizeof(x)) ? -EFAULT : 0)
3084#define _COPYIN(x) (copy_from_user(&(x), (void __user *)param, sizeof(x)) ? -EFAULT : 0)
3085
3086#define COPYOUT(x) ECALL(_COPYOUT(x))
3087#define COPYIN(x) ECALL(_COPYIN(x))
3088
3089static inline const char *drive_name(int type, int drive) 3036static inline const char *drive_name(int type, int drive)
3090{ 3037{
3091 struct floppy_struct *floppy; 3038 struct floppy_struct *floppy;
@@ -3156,23 +3103,29 @@ static struct cont_t raw_cmd_cont = {
3156 .done = raw_cmd_done 3103 .done = raw_cmd_done
3157}; 3104};
3158 3105
3159static inline int raw_cmd_copyout(int cmd, char __user *param, 3106static inline int raw_cmd_copyout(int cmd, void __user *param,
3160 struct floppy_raw_cmd *ptr) 3107 struct floppy_raw_cmd *ptr)
3161{ 3108{
3162 int ret; 3109 int ret;
3163 3110
3164 while (ptr) { 3111 while (ptr) {
3165 COPYOUT(*ptr); 3112 ret = copy_to_user(param, ptr, sizeof(*ptr));
3113 if (ret)
3114 return -EFAULT;
3166 param += sizeof(struct floppy_raw_cmd); 3115 param += sizeof(struct floppy_raw_cmd);
3167 if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) { 3116 if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) {
3168 if (ptr->length >= 0 3117 if (ptr->length >= 0 &&
3169 && ptr->length <= ptr->buffer_length) 3118 ptr->length <= ptr->buffer_length) {
3170 ECALL(fd_copyout 3119 long length = ptr->buffer_length - ptr->length;
3171 (ptr->data, ptr->kernel_data, 3120 ret = fd_copyout(ptr->data, ptr->kernel_data,
3172 ptr->buffer_length - ptr->length)); 3121 length);
3122 if (ret)
3123 return ret;
3124 }
3173 } 3125 }
3174 ptr = ptr->next; 3126 ptr = ptr->next;
3175 } 3127 }
3128
3176 return 0; 3129 return 0;
3177} 3130}
3178 3131
@@ -3195,7 +3148,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr)
3195 } 3148 }
3196} 3149}
3197 3150
3198static inline int raw_cmd_copyin(int cmd, char __user *param, 3151static inline int raw_cmd_copyin(int cmd, void __user *param,
3199 struct floppy_raw_cmd **rcmd) 3152 struct floppy_raw_cmd **rcmd)
3200{ 3153{
3201 struct floppy_raw_cmd *ptr; 3154 struct floppy_raw_cmd *ptr;
@@ -3203,17 +3156,19 @@ static inline int raw_cmd_copyin(int cmd, char __user *param,
3203 int i; 3156 int i;
3204 3157
3205 *rcmd = NULL; 3158 *rcmd = NULL;
3206 while (1) { 3159
3207 ptr = (struct floppy_raw_cmd *) 3160loop:
3208 kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER); 3161 ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER);
3209 if (!ptr) 3162 if (!ptr)
3210 return -ENOMEM; 3163 return -ENOMEM;
3211 *rcmd = ptr; 3164 *rcmd = ptr;
3212 COPYIN(*ptr); 3165 ret = copy_from_user(ptr, param, sizeof(*ptr));
3213 ptr->next = NULL; 3166 if (ret)
3214 ptr->buffer_length = 0; 3167 return -EFAULT;
3215 param += sizeof(struct floppy_raw_cmd); 3168 ptr->next = NULL;
3216 if (ptr->cmd_count > 33) 3169 ptr->buffer_length = 0;
3170 param += sizeof(struct floppy_raw_cmd);
3171 if (ptr->cmd_count > 33)
3217 /* the command may now also take up the space 3172 /* the command may now also take up the space
3218 * initially intended for the reply & the 3173 * initially intended for the reply & the
3219 * reply count. Needed for long 82078 commands 3174 * reply count. Needed for long 82078 commands
@@ -3222,31 +3177,35 @@ static inline int raw_cmd_copyin(int cmd, char __user *param,
3222 * 16 bytes for a structure, you'll one day 3177 * 16 bytes for a structure, you'll one day
3223 * discover that you really need 17... 3178 * discover that you really need 17...
3224 */ 3179 */
3180 return -EINVAL;
3181
3182 for (i = 0; i < 16; i++)
3183 ptr->reply[i] = 0;
3184 ptr->resultcode = 0;
3185 ptr->kernel_data = NULL;
3186
3187 if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
3188 if (ptr->length <= 0)
3225 return -EINVAL; 3189 return -EINVAL;
3190 ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length);
3191 fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
3192 if (!ptr->kernel_data)
3193 return -ENOMEM;
3194 ptr->buffer_length = ptr->length;
3195 }
3196 if (ptr->flags & FD_RAW_WRITE) {
3197 ret = fd_copyin(ptr->data, ptr->kernel_data, ptr->length);
3198 if (ret)
3199 return ret;
3200 }
3226 3201
3227 for (i = 0; i < 16; i++) 3202 if (ptr->flags & FD_RAW_MORE) {
3228 ptr->reply[i] = 0;
3229 ptr->resultcode = 0;
3230 ptr->kernel_data = NULL;
3231
3232 if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
3233 if (ptr->length <= 0)
3234 return -EINVAL;
3235 ptr->kernel_data =
3236 (char *)fd_dma_mem_alloc(ptr->length);
3237 fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
3238 if (!ptr->kernel_data)
3239 return -ENOMEM;
3240 ptr->buffer_length = ptr->length;
3241 }
3242 if (ptr->flags & FD_RAW_WRITE)
3243 ECALL(fd_copyin(ptr->data, ptr->kernel_data,
3244 ptr->length));
3245 rcmd = &(ptr->next); 3203 rcmd = &(ptr->next);
3246 if (!(ptr->flags & FD_RAW_MORE))
3247 return 0;
3248 ptr->rate &= 0x43; 3204 ptr->rate &= 0x43;
3205 goto loop;
3249 } 3206 }
3207
3208 return 0;
3250} 3209}
3251 3210
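
raw_cmd_copyin() keeps its job of building a linked chain of floppy_raw_cmd structures for as long as userspace sets FD_RAW_MORE, but the while (1) loop becomes a goto loop with every -EINVAL/-ENOMEM/-EFAULT exit at the top indentation level. The list-building step is the part worth reading twice: rcmd is advanced to &ptr->next before jumping back, so the next allocation is linked onto the tail. A condensed view of the shape (error handling elided; the full code is in the hunk above):

    loop:
            ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER);
            *rcmd = ptr;                    /* hook into the caller's pointer */
            /* ... copy_from_user(), buffer setup ... */
            if (ptr->flags & FD_RAW_MORE) {
                    rcmd = &(ptr->next);    /* next iteration appends here */
                    ptr->rate &= 0x43;
                    goto loop;
            }
            return 0;
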
3252static int raw_cmd_ioctl(int cmd, void __user *param) 3211static int raw_cmd_ioctl(int cmd, void __user *param)
@@ -3283,12 +3242,8 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
3283 3242
3284 raw_cmd = my_raw_cmd; 3243 raw_cmd = my_raw_cmd;
3285 cont = &raw_cmd_cont; 3244 cont = &raw_cmd_cont;
3286 ret = wait_til_done(floppy_start, 1); 3245 ret = wait_til_done(floppy_start, true);
3287#ifdef DCL_DEBUG 3246 debug_dcl(DP->flags, "calling disk change from raw_cmd ioctl\n");
3288 if (DP->flags & FD_DEBUG) {
3289 DPRINT("calling disk change from raw_cmd ioctl\n");
3290 }
3291#endif
3292 3247
3293 if (ret != -EINTR && FDCS->reset) 3248 if (ret != -EINTR && FDCS->reset)
3294 ret = -EIO; 3249 ret = -EIO;
@@ -3327,7 +3282,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
3327 if (!capable(CAP_SYS_ADMIN)) 3282 if (!capable(CAP_SYS_ADMIN))
3328 return -EPERM; 3283 return -EPERM;
3329 mutex_lock(&open_lock); 3284 mutex_lock(&open_lock);
3330 if (lock_fdc(drive, 1)) { 3285 if (lock_fdc(drive, true)) {
3331 mutex_unlock(&open_lock); 3286 mutex_unlock(&open_lock);
3332 return -EINTR; 3287 return -EINTR;
3333 } 3288 }
@@ -3346,11 +3301,15 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
3346 mutex_unlock(&open_lock); 3301 mutex_unlock(&open_lock);
3347 } else { 3302 } else {
3348 int oldStretch; 3303 int oldStretch;
3349 LOCK_FDC(drive, 1); 3304
3350 if (cmd != FDDEFPRM) 3305 if (lock_fdc(drive, true))
3306 return -EINTR;
3307 if (cmd != FDDEFPRM) {
3351 /* notice a disk change immediately, else 3308 /* notice a disk change immediately, else
3352 * we lose our settings immediately*/ 3309 * we lose our settings immediately*/
3353 CALL(poll_drive(1, FD_RAW_NEED_DISK)); 3310 if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
3311 return -EINTR;
3312 }
3354 oldStretch = g->stretch; 3313 oldStretch = g->stretch;
3355 user_params[drive] = *g; 3314 user_params[drive] = *g;
3356 if (buffer_drive == drive) 3315 if (buffer_drive == drive)
@@ -3415,7 +3374,7 @@ static inline int normalize_ioctl(int *cmd, int *size)
3415 *size = _IOC_SIZE(*cmd); 3374 *size = _IOC_SIZE(*cmd);
3416 *cmd = ioctl_table[i]; 3375 *cmd = ioctl_table[i];
3417 if (*size > _IOC_SIZE(*cmd)) { 3376 if (*size > _IOC_SIZE(*cmd)) {
3418 printk("ioctl not yet supported\n"); 3377 pr_info("ioctl not yet supported\n");
3419 return -EFAULT; 3378 return -EFAULT;
3420 } 3379 }
3421 return 0; 3380 return 0;
@@ -3429,8 +3388,10 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
3429 if (type) 3388 if (type)
3430 *g = &floppy_type[type]; 3389 *g = &floppy_type[type];
3431 else { 3390 else {
3432 LOCK_FDC(drive, 0); 3391 if (lock_fdc(drive, false))
3433 CALL(poll_drive(0, 0)); 3392 return -EINTR;
3393 if (poll_drive(false, 0) == -EINTR)
3394 return -EINTR;
3434 process_fd_request(); 3395 process_fd_request();
3435 *g = current_type[drive]; 3396 *g = current_type[drive];
3436 } 3397 }
@@ -3459,10 +3420,6 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
3459static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, 3420static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
3460 unsigned long param) 3421 unsigned long param)
3461{ 3422{
3462#define FD_IOCTL_ALLOWED (mode & (FMODE_WRITE|FMODE_WRITE_IOCTL))
3463#define OUT(c,x) case c: outparam = (const char *) (x); break
3464#define IN(c,x,tag) case c: *(x) = inparam. tag ; return 0
3465
3466 int drive = (long)bdev->bd_disk->private_data; 3423 int drive = (long)bdev->bd_disk->private_data;
3467 int type = ITYPE(UDRS->fd_device); 3424 int type = ITYPE(UDRS->fd_device);
3468 int i; 3425 int i;
@@ -3474,153 +3431,171 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
3474 struct floppy_max_errors max_errors; 3431 struct floppy_max_errors max_errors;
3475 struct floppy_drive_params dp; 3432 struct floppy_drive_params dp;
3476 } inparam; /* parameters coming from user space */ 3433 } inparam; /* parameters coming from user space */
3477 const char *outparam; /* parameters passed back to user space */ 3434 const void *outparam; /* parameters passed back to user space */
3478 3435
3479 /* convert compatibility eject ioctls into floppy eject ioctl. 3436 /* convert compatibility eject ioctls into floppy eject ioctl.
3480 * We do this in order to provide a means to eject floppy disks before 3437 * We do this in order to provide a means to eject floppy disks before
3481 * installing the new fdutils package */ 3438 * installing the new fdutils package */
3482 if (cmd == CDROMEJECT || /* CD-ROM eject */ 3439 if (cmd == CDROMEJECT || /* CD-ROM eject */
3483 cmd == 0x6470 /* SunOS floppy eject */ ) { 3440 cmd == 0x6470) { /* SunOS floppy eject */
3484 DPRINT("obsolete eject ioctl\n"); 3441 DPRINT("obsolete eject ioctl\n");
3485 DPRINT("please use floppycontrol --eject\n"); 3442 DPRINT("please use floppycontrol --eject\n");
3486 cmd = FDEJECT; 3443 cmd = FDEJECT;
3487 } 3444 }
3488 3445
3489 /* convert the old style command into a new style command */ 3446 if (!((cmd & 0xff00) == 0x0200))
3490 if ((cmd & 0xff00) == 0x0200) {
3491 ECALL(normalize_ioctl(&cmd, &size));
3492 } else
3493 return -EINVAL; 3447 return -EINVAL;
3494 3448
3449 /* convert the old style command into a new style command */
3450 ret = normalize_ioctl(&cmd, &size);
3451 if (ret)
3452 return ret;
3453
3495 /* permission checks */ 3454 /* permission checks */
3496 if (((cmd & 0x40) && !FD_IOCTL_ALLOWED) || 3455 if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) ||
3497 ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))) 3456 ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)))
3498 return -EPERM; 3457 return -EPERM;
3499 3458
3459 if (WARN_ON(size < 0 || size > sizeof(inparam)))
3460 return -EINVAL;
3461
3500 /* copyin */ 3462 /* copyin */
3501 CLEARSTRUCT(&inparam); 3463 memset(&inparam, 0, sizeof(inparam));
3502 if (_IOC_DIR(cmd) & _IOC_WRITE) 3464 if (_IOC_DIR(cmd) & _IOC_WRITE) {
3503 ECALL(fd_copyin((void __user *)param, &inparam, size)) 3465 ret = fd_copyin((void __user *)param, &inparam, size);
3504 3466 if (ret)
3505 switch (cmd) {
3506 case FDEJECT:
3507 if (UDRS->fd_ref != 1)
3508 /* somebody else has this drive open */
3509 return -EBUSY;
3510 LOCK_FDC(drive, 1);
3511
3512 /* do the actual eject. Fails on
3513 * non-Sparc architectures */
3514 ret = fd_eject(UNIT(drive));
3515
3516 USETF(FD_DISK_CHANGED);
3517 USETF(FD_VERIFY);
3518 process_fd_request();
3519 return ret; 3467 return ret;
3520 case FDCLRPRM: 3468 }
3521 LOCK_FDC(drive, 1);
3522 current_type[drive] = NULL;
3523 floppy_sizes[drive] = MAX_DISK_SIZE << 1;
3524 UDRS->keep_data = 0;
3525 return invalidate_drive(bdev);
3526 case FDSETPRM:
3527 case FDDEFPRM:
3528 return set_geometry(cmd, &inparam.g,
3529 drive, type, bdev);
3530 case FDGETPRM:
3531 ECALL(get_floppy_geometry(drive, type,
3532 (struct floppy_struct **)
3533 &outparam));
3534 break;
3535
3536 case FDMSGON:
3537 UDP->flags |= FTD_MSG;
3538 return 0;
3539 case FDMSGOFF:
3540 UDP->flags &= ~FTD_MSG;
3541 return 0;
3542
3543 case FDFMTBEG:
3544 LOCK_FDC(drive, 1);
3545 CALL(poll_drive(1, FD_RAW_NEED_DISK));
3546 ret = UDRS->flags;
3547 process_fd_request();
3548 if (ret & FD_VERIFY)
3549 return -ENODEV;
3550 if (!(ret & FD_DISK_WRITABLE))
3551 return -EROFS;
3552 return 0;
3553 case FDFMTTRK:
3554 if (UDRS->fd_ref != 1)
3555 return -EBUSY;
3556 return do_format(drive, &inparam.f);
3557 case FDFMTEND:
3558 case FDFLUSH:
3559 LOCK_FDC(drive, 1);
3560 return invalidate_drive(bdev);
3561
3562 case FDSETEMSGTRESH:
3563 UDP->max_errors.reporting =
3564 (unsigned short)(param & 0x0f);
3565 return 0;
3566 OUT(FDGETMAXERRS, &UDP->max_errors);
3567 IN(FDSETMAXERRS, &UDP->max_errors, max_errors);
3568
3569 case FDGETDRVTYP:
3570 outparam = drive_name(type, drive);
3571 SUPBOUND(size, strlen(outparam) + 1);
3572 break;
3573
3574 IN(FDSETDRVPRM, UDP, dp);
3575 OUT(FDGETDRVPRM, UDP);
3576
3577 case FDPOLLDRVSTAT:
3578 LOCK_FDC(drive, 1);
3579 CALL(poll_drive(1, FD_RAW_NEED_DISK));
3580 process_fd_request();
3581 /* fall through */
3582 OUT(FDGETDRVSTAT, UDRS);
3583
3584 case FDRESET:
3585 return user_reset_fdc(drive, (int)param, 1);
3586
3587 OUT(FDGETFDCSTAT, UFDCS);
3588 3469
3589 case FDWERRORCLR: 3470 switch (cmd) {
3590 CLEARSTRUCT(UDRWE); 3471 case FDEJECT:
3591 return 0; 3472 if (UDRS->fd_ref != 1)
3592 OUT(FDWERRORGET, UDRWE); 3473 /* somebody else has this drive open */
3593 3474 return -EBUSY;
3594 case FDRAWCMD: 3475 if (lock_fdc(drive, true))
3595 if (type) 3476 return -EINTR;
3596 return -EINVAL;
3597 LOCK_FDC(drive, 1);
3598 set_floppy(drive);
3599 CALL(i = raw_cmd_ioctl(cmd, (void __user *)param));
3600 process_fd_request();
3601 return i;
3602 3477
3603 case FDTWADDLE: 3478 /* do the actual eject. Fails on
3604 LOCK_FDC(drive, 1); 3479 * non-Sparc architectures */
3605 twaddle(); 3480 ret = fd_eject(UNIT(drive));
3606 process_fd_request();
3607 return 0;
3608 3481
3609 default: 3482 set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
3483 set_bit(FD_VERIFY_BIT, &UDRS->flags);
3484 process_fd_request();
3485 return ret;
3486 case FDCLRPRM:
3487 if (lock_fdc(drive, true))
3488 return -EINTR;
3489 current_type[drive] = NULL;
3490 floppy_sizes[drive] = MAX_DISK_SIZE << 1;
3491 UDRS->keep_data = 0;
3492 return invalidate_drive(bdev);
3493 case FDSETPRM:
3494 case FDDEFPRM:
3495 return set_geometry(cmd, &inparam.g, drive, type, bdev);
3496 case FDGETPRM:
3497 ret = get_floppy_geometry(drive, type,
3498 (struct floppy_struct **)&outparam);
3499 if (ret)
3500 return ret;
3501 break;
3502 case FDMSGON:
3503 UDP->flags |= FTD_MSG;
3504 return 0;
3505 case FDMSGOFF:
3506 UDP->flags &= ~FTD_MSG;
3507 return 0;
3508 case FDFMTBEG:
3509 if (lock_fdc(drive, true))
3510 return -EINTR;
3511 if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
3512 return -EINTR;
3513 ret = UDRS->flags;
3514 process_fd_request();
3515 if (ret & FD_VERIFY)
3516 return -ENODEV;
3517 if (!(ret & FD_DISK_WRITABLE))
3518 return -EROFS;
3519 return 0;
3520 case FDFMTTRK:
3521 if (UDRS->fd_ref != 1)
3522 return -EBUSY;
3523 return do_format(drive, &inparam.f);
3524 case FDFMTEND:
3525 case FDFLUSH:
3526 if (lock_fdc(drive, true))
3527 return -EINTR;
3528 return invalidate_drive(bdev);
3529 case FDSETEMSGTRESH:
3530 UDP->max_errors.reporting = (unsigned short)(param & 0x0f);
3531 return 0;
3532 case FDGETMAXERRS:
3533 outparam = &UDP->max_errors;
3534 break;
3535 case FDSETMAXERRS:
3536 UDP->max_errors = inparam.max_errors;
3537 break;
3538 case FDGETDRVTYP:
3539 outparam = drive_name(type, drive);
3540 SUPBOUND(size, strlen((const char *)outparam) + 1);
3541 break;
3542 case FDSETDRVPRM:
3543 *UDP = inparam.dp;
3544 break;
3545 case FDGETDRVPRM:
3546 outparam = UDP;
3547 break;
3548 case FDPOLLDRVSTAT:
3549 if (lock_fdc(drive, true))
3550 return -EINTR;
3551 if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
3552 return -EINTR;
3553 process_fd_request();
3554 /* fall through */
3555 case FDGETDRVSTAT:
3556 outparam = UDRS;
3557 break;
3558 case FDRESET:
3559 return user_reset_fdc(drive, (int)param, true);
3560 case FDGETFDCSTAT:
3561 outparam = UFDCS;
3562 break;
3563 case FDWERRORCLR:
3564 memset(UDRWE, 0, sizeof(*UDRWE));
3565 return 0;
3566 case FDWERRORGET:
3567 outparam = UDRWE;
3568 break;
3569 case FDRAWCMD:
3570 if (type)
3571 return -EINVAL;
3572 if (lock_fdc(drive, true))
3573 return -EINTR;
3574 set_floppy(drive);
3575 i = raw_cmd_ioctl(cmd, (void __user *)param);
3576 if (i == -EINTR)
3577 return -EINTR;
3578 process_fd_request();
3579 return i;
3580 case FDTWADDLE:
3581 if (lock_fdc(drive, true))
3582 return -EINTR;
3583 twaddle();
3584 process_fd_request();
3585 return 0;
3586 default:
3587 return -EINVAL;
3588 }
3612 3589
3613 if (_IOC_DIR(cmd) & _IOC_READ) 3590 if (_IOC_DIR(cmd) & _IOC_READ)
3614 return fd_copyout((void __user *)param, outparam, size); 3591 return fd_copyout((void __user *)param, outparam, size);
3615 else 3592
3616 return 0; 3593 return 0;
3617#undef OUT
3618#undef IN
3619} 3594}
3620 3595
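The new tail of floppy_ioctl() above performs a single fd_copyout() for every command whose ioctl number encodes a read direction, instead of one OUT() macro per case. A minimal userspace sketch of reading that direction bit back out of the command number (assumes a Linux host with <linux/fd.h>; an illustration, not driver code):

    #include <stdio.h>
    #include <linux/fd.h>      /* FDGETPRM, FDSETPRM, struct floppy_struct */
    #include <linux/ioctl.h>   /* _IOC_DIR, _IOC_SIZE, _IOC_READ */

    int main(void)
    {
        /* FDGETPRM is a "read" ioctl: the driver copies outparam back to user space. */
        printf("FDGETPRM: read=%d size=%u\n",
               !!(_IOC_DIR(FDGETPRM) & _IOC_READ), (unsigned)_IOC_SIZE(FDGETPRM));
        /* FDSETPRM only passes data into the kernel, so the final fd_copyout() is skipped. */
        printf("FDSETPRM: read=%d size=%u\n",
               !!(_IOC_DIR(FDSETPRM) & _IOC_READ), (unsigned)_IOC_SIZE(FDSETPRM));
        return 0;
    }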
3621static void __init config_types(void) 3596static void __init config_types(void)
3622{ 3597{
3623 int first = 1; 3598 bool has_drive = false;
3624 int drive; 3599 int drive;
3625 3600
3626 /* read drive info out of physical CMOS */ 3601 /* read drive info out of physical CMOS */
@@ -3652,17 +3627,22 @@ static void __init config_types(void)
3652 name = temparea; 3627 name = temparea;
3653 } 3628 }
3654 if (name) { 3629 if (name) {
3655 const char *prepend = ","; 3630 const char *prepend;
3656 if (first) { 3631 if (!has_drive) {
3657 prepend = KERN_INFO "Floppy drive(s):"; 3632 prepend = "";
3658 first = 0; 3633 has_drive = true;
3634 pr_info("Floppy drive(s):");
3635 } else {
3636 prepend = ",";
3659 } 3637 }
3660 printk("%s fd%d is %s", prepend, drive, name); 3638
3639 pr_cont("%s fd%d is %s", prepend, drive, name);
3661 } 3640 }
3662 *UDP = *params; 3641 *UDP = *params;
3663 } 3642 }
3664 if (!first) 3643
3665 printk("\n"); 3644 if (has_drive)
3645 pr_cont("\n");
3666} 3646}
3667 3647
3668static int floppy_release(struct gendisk *disk, fmode_t mode) 3648static int floppy_release(struct gendisk *disk, fmode_t mode)
@@ -3702,8 +3682,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3702 goto out2; 3682 goto out2;
3703 3683
3704 if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) { 3684 if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) {
3705 USETF(FD_DISK_CHANGED); 3685 set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
3706 USETF(FD_VERIFY); 3686 set_bit(FD_VERIFY_BIT, &UDRS->flags);
3707 } 3687 }
3708 3688
3709 if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) 3689 if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL)))
@@ -3732,9 +3712,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3732 INFBOUND(try, 16); 3712 INFBOUND(try, 16);
3733 tmp = (char *)fd_dma_mem_alloc(1024 * try); 3713 tmp = (char *)fd_dma_mem_alloc(1024 * try);
3734 } 3714 }
3735 if (!tmp && !floppy_track_buffer) { 3715 if (!tmp && !floppy_track_buffer)
3736 fallback_on_nodma_alloc(&tmp, 2048 * try); 3716 fallback_on_nodma_alloc(&tmp, 2048 * try);
3737 }
3738 if (!tmp && !floppy_track_buffer) { 3717 if (!tmp && !floppy_track_buffer) {
3739 DPRINT("Unable to allocate DMA memory\n"); 3718 DPRINT("Unable to allocate DMA memory\n");
3740 goto out; 3719 goto out;
@@ -3764,11 +3743,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3764 if (mode & (FMODE_READ|FMODE_WRITE)) { 3743 if (mode & (FMODE_READ|FMODE_WRITE)) {
3765 UDRS->last_checked = 0; 3744 UDRS->last_checked = 0;
3766 check_disk_change(bdev); 3745 check_disk_change(bdev);
3767 if (UTESTF(FD_DISK_CHANGED)) 3746 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
3768 goto out; 3747 goto out;
3769 } 3748 }
3770 res = -EROFS; 3749 res = -EROFS;
3771 if ((mode & FMODE_WRITE) && !(UTESTF(FD_DISK_WRITABLE))) 3750 if ((mode & FMODE_WRITE) &&
3751 !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags))
3772 goto out; 3752 goto out;
3773 } 3753 }
3774 mutex_unlock(&open_lock); 3754 mutex_unlock(&open_lock);
@@ -3792,17 +3772,18 @@ static int check_floppy_change(struct gendisk *disk)
3792{ 3772{
3793 int drive = (long)disk->private_data; 3773 int drive = (long)disk->private_data;
3794 3774
3795 if (UTESTF(FD_DISK_CHANGED) || UTESTF(FD_VERIFY)) 3775 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3776 test_bit(FD_VERIFY_BIT, &UDRS->flags))
3796 return 1; 3777 return 1;
3797 3778
3798 if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) { 3779 if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
3799 lock_fdc(drive, 0); 3780 lock_fdc(drive, false);
3800 poll_drive(0, 0); 3781 poll_drive(false, 0);
3801 process_fd_request(); 3782 process_fd_request();
3802 } 3783 }
3803 3784
3804 if (UTESTF(FD_DISK_CHANGED) || 3785 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3805 UTESTF(FD_VERIFY) || 3786 test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
3806 test_bit(drive, &fake_change) || 3787 test_bit(drive, &fake_change) ||
3807 (!ITYPE(UDRS->fd_device) && !current_type[drive])) 3788 (!ITYPE(UDRS->fd_device) && !current_type[drive]))
3808 return 1; 3789 return 1;
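The UTESTF()/USETF()/UCLEARF() wrappers used above give way to direct test_bit()/set_bit()/clear_bit() calls on UDRS->flags, which take a bit index rather than a mask (hence the *_BIT constants). A userspace stand-in, purely illustrative — the bit number here is made up, and the kernel helpers are atomic:

    #include <assert.h>

    #define FD_DISK_CHANGED_BIT 2   /* illustrative index, not the kernel's value */

    static void set_bit_ul(int nr, unsigned long *addr)        { *addr |= 1UL << nr; }
    static void clear_bit_ul(int nr, unsigned long *addr)      { *addr &= ~(1UL << nr); }
    static int  test_bit_ul(int nr, const unsigned long *addr) { return (*addr >> nr) & 1UL; }

    int main(void)
    {
        unsigned long flags = 0;

        set_bit_ul(FD_DISK_CHANGED_BIT, &flags);    /* USETF(FD_DISK_CHANGED) in the old code */
        assert(test_bit_ul(FD_DISK_CHANGED_BIT, &flags));
        clear_bit_ul(FD_DISK_CHANGED_BIT, &flags);  /* UCLEARF(FD_DISK_CHANGED) */
        assert(!test_bit_ul(FD_DISK_CHANGED_BIT, &flags));
        return 0;
    }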
@@ -3815,8 +3796,7 @@ static int check_floppy_change(struct gendisk *disk)
3815 * a disk in the drive, and whether that disk is writable. 3796 * a disk in the drive, and whether that disk is writable.
3816 */ 3797 */
3817 3798
3818static void floppy_rb0_complete(struct bio *bio, 3799static void floppy_rb0_complete(struct bio *bio, int err)
3819 int err)
3820{ 3800{
3821 complete((struct completion *)bio->bi_private); 3801 complete((struct completion *)bio->bi_private);
3822} 3802}
@@ -3874,14 +3854,16 @@ static int floppy_revalidate(struct gendisk *disk)
3874 int cf; 3854 int cf;
3875 int res = 0; 3855 int res = 0;
3876 3856
3877 if (UTESTF(FD_DISK_CHANGED) || 3857 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3878 UTESTF(FD_VERIFY) || test_bit(drive, &fake_change) || NO_GEOM) { 3858 test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
3859 test_bit(drive, &fake_change) || NO_GEOM) {
3879 if (usage_count == 0) { 3860 if (usage_count == 0) {
3880 printk("VFS: revalidate called on non-open device.\n"); 3861 pr_info("VFS: revalidate called on non-open device.\n");
3881 return -EFAULT; 3862 return -EFAULT;
3882 } 3863 }
3883 lock_fdc(drive, 0); 3864 lock_fdc(drive, false);
3884 cf = UTESTF(FD_DISK_CHANGED) || UTESTF(FD_VERIFY); 3865 cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3866 test_bit(FD_VERIFY_BIT, &UDRS->flags));
3885 if (!(cf || test_bit(drive, &fake_change) || NO_GEOM)) { 3867 if (!(cf || test_bit(drive, &fake_change) || NO_GEOM)) {
3886 process_fd_request(); /*already done by another thread */ 3868 process_fd_request(); /*already done by another thread */
3887 return 0; 3869 return 0;
@@ -3891,7 +3873,7 @@ static int floppy_revalidate(struct gendisk *disk)
3891 if (buffer_drive == drive) 3873 if (buffer_drive == drive)
3892 buffer_track = -1; 3874 buffer_track = -1;
3893 clear_bit(drive, &fake_change); 3875 clear_bit(drive, &fake_change);
3894 UCLEARF(FD_DISK_CHANGED); 3876 clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
3895 if (cf) 3877 if (cf)
3896 UDRS->generation++; 3878 UDRS->generation++;
3897 if (NO_GEOM) { 3879 if (NO_GEOM) {
@@ -3899,7 +3881,7 @@ static int floppy_revalidate(struct gendisk *disk)
3899 res = __floppy_read_block_0(opened_bdev[drive]); 3881 res = __floppy_read_block_0(opened_bdev[drive]);
3900 } else { 3882 } else {
3901 if (cf) 3883 if (cf)
3902 poll_drive(0, FD_RAW_NEED_DISK); 3884 poll_drive(false, FD_RAW_NEED_DISK);
3903 process_fd_request(); 3885 process_fd_request();
3904 } 3886 }
3905 } 3887 }
@@ -3931,21 +3913,21 @@ static char __init get_fdc_version(void)
3931 output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */ 3913 output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */
3932 if (FDCS->reset) 3914 if (FDCS->reset)
3933 return FDC_NONE; 3915 return FDC_NONE;
3934 if ((r = result()) <= 0x00) 3916 r = result();
3917 if (r <= 0x00)
3935 return FDC_NONE; /* No FDC present ??? */ 3918 return FDC_NONE; /* No FDC present ??? */
3936 if ((r == 1) && (reply_buffer[0] == 0x80)) { 3919 if ((r == 1) && (reply_buffer[0] == 0x80)) {
3937 printk(KERN_INFO "FDC %d is an 8272A\n", fdc); 3920 pr_info("FDC %d is an 8272A\n", fdc);
3938 return FDC_8272A; /* 8272a/765 don't know DUMPREGS */ 3921 return FDC_8272A; /* 8272a/765 don't know DUMPREGS */
3939 } 3922 }
3940 if (r != 10) { 3923 if (r != 10) {
3941 printk 3924 pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n",
3942 ("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n", 3925 fdc, r);
3943 fdc, r);
3944 return FDC_UNKNOWN; 3926 return FDC_UNKNOWN;
3945 } 3927 }
3946 3928
3947 if (!fdc_configure()) { 3929 if (!fdc_configure()) {
3948 printk(KERN_INFO "FDC %d is an 82072\n", fdc); 3930 pr_info("FDC %d is an 82072\n", fdc);
3949 return FDC_82072; /* 82072 doesn't know CONFIGURE */ 3931 return FDC_82072; /* 82072 doesn't know CONFIGURE */
3950 } 3932 }
3951 3933
@@ -3953,52 +3935,50 @@ static char __init get_fdc_version(void)
3953 if (need_more_output() == MORE_OUTPUT) { 3935 if (need_more_output() == MORE_OUTPUT) {
3954 output_byte(0); 3936 output_byte(0);
3955 } else { 3937 } else {
3956 printk(KERN_INFO "FDC %d is an 82072A\n", fdc); 3938 pr_info("FDC %d is an 82072A\n", fdc);
3957 return FDC_82072A; /* 82072A as found on Sparcs. */ 3939 return FDC_82072A; /* 82072A as found on Sparcs. */
3958 } 3940 }
3959 3941
3960 output_byte(FD_UNLOCK); 3942 output_byte(FD_UNLOCK);
3961 r = result(); 3943 r = result();
3962 if ((r == 1) && (reply_buffer[0] == 0x80)) { 3944 if ((r == 1) && (reply_buffer[0] == 0x80)) {
3963 printk(KERN_INFO "FDC %d is a pre-1991 82077\n", fdc); 3945 pr_info("FDC %d is a pre-1991 82077\n", fdc);
3964 return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know 3946 return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know
3965 * LOCK/UNLOCK */ 3947 * LOCK/UNLOCK */
3966 } 3948 }
3967 if ((r != 1) || (reply_buffer[0] != 0x00)) { 3949 if ((r != 1) || (reply_buffer[0] != 0x00)) {
3968 printk("FDC %d init: UNLOCK: unexpected return of %d bytes.\n", 3950 pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n",
3969 fdc, r); 3951 fdc, r);
3970 return FDC_UNKNOWN; 3952 return FDC_UNKNOWN;
3971 } 3953 }
3972 output_byte(FD_PARTID); 3954 output_byte(FD_PARTID);
3973 r = result(); 3955 r = result();
3974 if (r != 1) { 3956 if (r != 1) {
3975 printk("FDC %d init: PARTID: unexpected return of %d bytes.\n", 3957 pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n",
3976 fdc, r); 3958 fdc, r);
3977 return FDC_UNKNOWN; 3959 return FDC_UNKNOWN;
3978 } 3960 }
3979 if (reply_buffer[0] == 0x80) { 3961 if (reply_buffer[0] == 0x80) {
3980 printk(KERN_INFO "FDC %d is a post-1991 82077\n", fdc); 3962 pr_info("FDC %d is a post-1991 82077\n", fdc);
3981 return FDC_82077; /* Revised 82077AA passes all the tests */ 3963 return FDC_82077; /* Revised 82077AA passes all the tests */
3982 } 3964 }
3983 switch (reply_buffer[0] >> 5) { 3965 switch (reply_buffer[0] >> 5) {
3984 case 0x0: 3966 case 0x0:
3985 /* Either a 82078-1 or a 82078SL running at 5Volt */ 3967 /* Either a 82078-1 or a 82078SL running at 5Volt */
3986 printk(KERN_INFO "FDC %d is an 82078.\n", fdc); 3968 pr_info("FDC %d is an 82078.\n", fdc);
3987 return FDC_82078; 3969 return FDC_82078;
3988 case 0x1: 3970 case 0x1:
3989 printk(KERN_INFO "FDC %d is a 44pin 82078\n", fdc); 3971 pr_info("FDC %d is a 44pin 82078\n", fdc);
3990 return FDC_82078; 3972 return FDC_82078;
3991 case 0x2: 3973 case 0x2:
3992 printk(KERN_INFO "FDC %d is a S82078B\n", fdc); 3974 pr_info("FDC %d is a S82078B\n", fdc);
3993 return FDC_S82078B; 3975 return FDC_S82078B;
3994 case 0x3: 3976 case 0x3:
3995 printk(KERN_INFO "FDC %d is a National Semiconductor PC87306\n", 3977 pr_info("FDC %d is a National Semiconductor PC87306\n", fdc);
3996 fdc);
3997 return FDC_87306; 3978 return FDC_87306;
3998 default: 3979 default:
3999 printk(KERN_INFO 3980 pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n",
4000 "FDC %d init: 82078 variant with unknown PARTID=%d.\n", 3981 fdc, reply_buffer[0] >> 5);
4001 fdc, reply_buffer[0] >> 5);
4002 return FDC_82078_UNKN; 3982 return FDC_82078_UNKN;
4003 } 3983 }
4004} /* get_fdc_version */ 3984} /* get_fdc_version */
@@ -4110,9 +4090,9 @@ static int __init floppy_setup(char *str)
4110 else 4090 else
4111 param = config_params[i].def_param; 4091 param = config_params[i].def_param;
4112 if (config_params[i].fn) 4092 if (config_params[i].fn)
4113 config_params[i]. 4093 config_params[i].fn(ints, param,
4114 fn(ints, param, 4094 config_params[i].
4115 config_params[i].param2); 4095 param2);
4116 if (config_params[i].var) { 4096 if (config_params[i].var) {
4117 DPRINT("%s=%d\n", str, param); 4097 DPRINT("%s=%d\n", str, param);
4118 *config_params[i].var = param; 4098 *config_params[i].var = param;
@@ -4126,8 +4106,8 @@ static int __init floppy_setup(char *str)
4126 4106
4127 DPRINT("allowed options are:"); 4107 DPRINT("allowed options are:");
4128 for (i = 0; i < ARRAY_SIZE(config_params); i++) 4108 for (i = 0; i < ARRAY_SIZE(config_params); i++)
4129 printk(" %s", config_params[i].name); 4109 pr_cont(" %s", config_params[i].name);
4130 printk("\n"); 4110 pr_cont("\n");
4131 } else 4111 } else
4132 DPRINT("botched floppy option\n"); 4112 DPRINT("botched floppy option\n");
4133 DPRINT("Read Documentation/blockdev/floppy.txt\n"); 4113 DPRINT("Read Documentation/blockdev/floppy.txt\n");
@@ -4145,7 +4125,8 @@ static ssize_t floppy_cmos_show(struct device *dev,
4145 drive = p->id; 4125 drive = p->id;
4146 return sprintf(buf, "%X\n", UDP->cmos); 4126 return sprintf(buf, "%X\n", UDP->cmos);
4147} 4127}
4148DEVICE_ATTR(cmos,S_IRUGO,floppy_cmos_show,NULL); 4128
4129DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
4149 4130
4150static void floppy_device_release(struct device *dev) 4131static void floppy_device_release(struct device *dev)
4151{ 4132{
@@ -4157,20 +4138,20 @@ static int floppy_resume(struct device *dev)
4157 4138
4158 for (fdc = 0; fdc < N_FDC; fdc++) 4139 for (fdc = 0; fdc < N_FDC; fdc++)
4159 if (FDCS->address != -1) 4140 if (FDCS->address != -1)
4160 user_reset_fdc(-1, FD_RESET_ALWAYS, 0); 4141 user_reset_fdc(-1, FD_RESET_ALWAYS, false);
4161 4142
4162 return 0; 4143 return 0;
4163} 4144}
4164 4145
4165static struct dev_pm_ops floppy_pm_ops = { 4146static const struct dev_pm_ops floppy_pm_ops = {
4166 .resume = floppy_resume, 4147 .resume = floppy_resume,
4167 .restore = floppy_resume, 4148 .restore = floppy_resume,
4168}; 4149};
4169 4150
4170static struct platform_driver floppy_driver = { 4151static struct platform_driver floppy_driver = {
4171 .driver = { 4152 .driver = {
4172 .name = "floppy", 4153 .name = "floppy",
4173 .pm = &floppy_pm_ops, 4154 .pm = &floppy_pm_ops,
4174 }, 4155 },
4175}; 4156};
4176 4157
@@ -4231,7 +4212,7 @@ static int __init floppy_init(void)
4231 err = -ENOMEM; 4212 err = -ENOMEM;
4232 goto out_unreg_driver; 4213 goto out_unreg_driver;
4233 } 4214 }
4234 blk_queue_max_sectors(floppy_queue, 64); 4215 blk_queue_max_hw_sectors(floppy_queue, 64);
4235 4216
4236 blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, 4217 blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
4237 floppy_find, NULL, NULL); 4218 floppy_find, NULL, NULL);
@@ -4242,16 +4223,16 @@ static int __init floppy_init(void)
4242 else 4223 else
4243 floppy_sizes[i] = MAX_DISK_SIZE << 1; 4224 floppy_sizes[i] = MAX_DISK_SIZE << 1;
4244 4225
4245 reschedule_timeout(MAXTIMEOUT, "floppy init", MAXTIMEOUT); 4226 reschedule_timeout(MAXTIMEOUT, "floppy init");
4246 config_types(); 4227 config_types();
4247 4228
4248 for (i = 0; i < N_FDC; i++) { 4229 for (i = 0; i < N_FDC; i++) {
4249 fdc = i; 4230 fdc = i;
4250 CLEARSTRUCT(FDCS); 4231 memset(FDCS, 0, sizeof(*FDCS));
4251 FDCS->dtr = -1; 4232 FDCS->dtr = -1;
4252 FDCS->dor = 0x4; 4233 FDCS->dor = 0x4;
4253#if defined(__sparc__) || defined(__mc68000__) 4234#if defined(__sparc__) || defined(__mc68000__)
4254 /*sparcs/sun3x don't have a DOR reset which we can fall back on to */ 4235 /*sparcs/sun3x don't have a DOR reset which we can fall back on to */
4255#ifdef __mc68000__ 4236#ifdef __mc68000__
4256 if (MACH_IS_SUN3X) 4237 if (MACH_IS_SUN3X)
4257#endif 4238#endif
@@ -4280,11 +4261,11 @@ static int __init floppy_init(void)
4280 4261
4281 /* initialise drive state */ 4262 /* initialise drive state */
4282 for (drive = 0; drive < N_DRIVE; drive++) { 4263 for (drive = 0; drive < N_DRIVE; drive++) {
4283 CLEARSTRUCT(UDRS); 4264 memset(UDRS, 0, sizeof(*UDRS));
4284 CLEARSTRUCT(UDRWE); 4265 memset(UDRWE, 0, sizeof(*UDRWE));
4285 USETF(FD_DISK_NEWCHANGE); 4266 set_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags);
4286 USETF(FD_DISK_CHANGED); 4267 set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
4287 USETF(FD_VERIFY); 4268 set_bit(FD_VERIFY_BIT, &UDRS->flags);
4288 UDRS->fd_device = -1; 4269 UDRS->fd_device = -1;
4289 floppy_track_buffer = NULL; 4270 floppy_track_buffer = NULL;
4290 max_buffer_sectors = 0; 4271 max_buffer_sectors = 0;
@@ -4304,7 +4285,7 @@ static int __init floppy_init(void)
4304 if (FDCS->address == -1) 4285 if (FDCS->address == -1)
4305 continue; 4286 continue;
4306 FDCS->rawcmd = 2; 4287 FDCS->rawcmd = 2;
4307 if (user_reset_fdc(-1, FD_RESET_ALWAYS, 0)) { 4288 if (user_reset_fdc(-1, FD_RESET_ALWAYS, false)) {
4308 /* free ioports reserved by floppy_grab_irq_and_dma() */ 4289 /* free ioports reserved by floppy_grab_irq_and_dma() */
4309 floppy_release_regions(fdc); 4290 floppy_release_regions(fdc);
4310 FDCS->address = -1; 4291 FDCS->address = -1;
@@ -4327,12 +4308,12 @@ static int __init floppy_init(void)
4327 * properly, so force a reset for the standard FDC clones, 4308 * properly, so force a reset for the standard FDC clones,
4328 * to avoid interrupt garbage. 4309 * to avoid interrupt garbage.
4329 */ 4310 */
4330 user_reset_fdc(-1, FD_RESET_ALWAYS, 0); 4311 user_reset_fdc(-1, FD_RESET_ALWAYS, false);
4331 } 4312 }
4332 fdc = 0; 4313 fdc = 0;
4333 del_timer(&fd_timeout); 4314 del_timer(&fd_timeout);
4334 current_drive = 0; 4315 current_drive = 0;
4335 initialising = 0; 4316 initialized = true;
4336 if (have_no_fdc) { 4317 if (have_no_fdc) {
4337 DPRINT("no floppy controllers found\n"); 4318 DPRINT("no floppy controllers found\n");
4338 err = have_no_fdc; 4319 err = have_no_fdc;
@@ -4353,7 +4334,8 @@ static int __init floppy_init(void)
4353 if (err) 4334 if (err)
4354 goto out_flush_work; 4335 goto out_flush_work;
4355 4336
4356 err = device_create_file(&floppy_device[drive].dev,&dev_attr_cmos); 4337 err = device_create_file(&floppy_device[drive].dev,
4338 &dev_attr_cmos);
4357 if (err) 4339 if (err)
4358 goto out_unreg_platform_dev; 4340 goto out_unreg_platform_dev;
4359 4341
@@ -4417,8 +4399,10 @@ static int floppy_request_regions(int fdc)
4417 const struct io_region *p; 4399 const struct io_region *p;
4418 4400
4419 for (p = io_regions; p < ARRAY_END(io_regions); p++) { 4401 for (p = io_regions; p < ARRAY_END(io_regions); p++) {
4420 if (!request_region(FDCS->address + p->offset, p->size, "floppy")) { 4402 if (!request_region(FDCS->address + p->offset,
4421 DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address + p->offset); 4403 p->size, "floppy")) {
4404 DPRINT("Floppy io-port 0x%04lx in use\n",
4405 FDCS->address + p->offset);
4422 floppy_release_allocated_regions(fdc, p); 4406 floppy_release_allocated_regions(fdc, p);
4423 return -EBUSY; 4407 return -EBUSY;
4424 } 4408 }
@@ -4509,11 +4493,9 @@ cleanup:
4509static void floppy_release_irq_and_dma(void) 4493static void floppy_release_irq_and_dma(void)
4510{ 4494{
4511 int old_fdc; 4495 int old_fdc;
4512#ifdef FLOPPY_SANITY_CHECK
4513#ifndef __sparc__ 4496#ifndef __sparc__
4514 int drive; 4497 int drive;
4515#endif 4498#endif
4516#endif
4517 long tmpsize; 4499 long tmpsize;
4518 unsigned long tmpaddr; 4500 unsigned long tmpaddr;
4519 unsigned long flags; 4501 unsigned long flags;
@@ -4544,20 +4526,18 @@ static void floppy_release_irq_and_dma(void)
4544 buffer_min = buffer_max = -1; 4526 buffer_min = buffer_max = -1;
4545 fd_dma_mem_free(tmpaddr, tmpsize); 4527 fd_dma_mem_free(tmpaddr, tmpsize);
4546 } 4528 }
4547#ifdef FLOPPY_SANITY_CHECK
4548#ifndef __sparc__ 4529#ifndef __sparc__
4549 for (drive = 0; drive < N_FDC * 4; drive++) 4530 for (drive = 0; drive < N_FDC * 4; drive++)
4550 if (timer_pending(motor_off_timer + drive)) 4531 if (timer_pending(motor_off_timer + drive))
4551 printk("motor off timer %d still active\n", drive); 4532 pr_info("motor off timer %d still active\n", drive);
4552#endif 4533#endif
4553 4534
4554 if (timer_pending(&fd_timeout)) 4535 if (timer_pending(&fd_timeout))
4555 printk("floppy timer still active:%s\n", timeout_message); 4536 pr_info("floppy timer still active:%s\n", timeout_message);
4556 if (timer_pending(&fd_timer)) 4537 if (timer_pending(&fd_timer))
4557 printk("auxiliary floppy timer still active\n"); 4538 pr_info("auxiliary floppy timer still active\n");
4558 if (work_pending(&floppy_work)) 4539 if (work_pending(&floppy_work))
4559 printk("work still pending\n"); 4540 pr_info("work still pending\n");
4560#endif
4561 old_fdc = fdc; 4541 old_fdc = fdc;
4562 for (fdc = 0; fdc < N_FDC; fdc++) 4542 for (fdc = 0; fdc < N_FDC; fdc++)
4563 if (FDCS->address != -1) 4543 if (FDCS->address != -1)
@@ -4574,7 +4554,9 @@ static void __init parse_floppy_cfg_string(char *cfg)
4574 char *ptr; 4554 char *ptr;
4575 4555
4576 while (*cfg) { 4556 while (*cfg) {
4577 for (ptr = cfg; *cfg && *cfg != ' ' && *cfg != '\t'; cfg++) ; 4557 ptr = cfg;
4558 while (*cfg && *cfg != ' ' && *cfg != '\t')
4559 cfg++;
4578 if (*cfg) { 4560 if (*cfg) {
4579 *cfg = '\0'; 4561 *cfg = '\0';
4580 cfg++; 4562 cfg++;
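The rewritten loop in parse_floppy_cfg_string() is the usual in-place whitespace tokenizer, simply unrolled from a one-line for(;;); into an explicit while. A small userspace sketch of the same logic (the option strings and the printf stand in for the driver's own handling):

    #include <stdio.h>

    static void parse_words(char *cfg)
    {
        char *ptr;

        while (*cfg) {
            ptr = cfg;
            while (*cfg && *cfg != ' ' && *cfg != '\t')
                cfg++;
            if (*cfg) {
                *cfg = '\0';
                cfg++;
            }
            printf("option: \"%s\"\n", ptr);   /* stand-in for handling one option word */
        }
    }

    int main(void)
    {
        char opts[] = "daring two_fdc thinkpad";

        parse_words(opts);
        return 0;
    }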
@@ -4622,6 +4604,7 @@ static void __exit floppy_module_exit(void)
4622 /* eject disk, if any */ 4604 /* eject disk, if any */
4623 fd_eject(0); 4605 fd_eject(0);
4624} 4606}
4607
4625module_exit(floppy_module_exit); 4608module_exit(floppy_module_exit);
4626 4609
4627module_param(floppy, charp, 0); 4610module_param(floppy, charp, 0);
@@ -4633,9 +4616,10 @@ MODULE_LICENSE("GPL");
4633 4616
4634/* This doesn't actually get used other than for module information */ 4617/* This doesn't actually get used other than for module information */
4635static const struct pnp_device_id floppy_pnpids[] = { 4618static const struct pnp_device_id floppy_pnpids[] = {
4636 { "PNP0700", 0 }, 4619 {"PNP0700", 0},
4637 { } 4620 {}
4638}; 4621};
4622
4639MODULE_DEVICE_TABLE(pnp, floppy_pnpids); 4623MODULE_DEVICE_TABLE(pnp, floppy_pnpids);
4640 4624
4641#else 4625#else
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index d5cdce08ffd2..034e6dfc878c 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -34,7 +34,6 @@
34#include <linux/fs.h> 34#include <linux/fs.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/genhd.h> 36#include <linux/genhd.h>
37#include <linux/slab.h>
38#include <linux/string.h> 37#include <linux/string.h>
39#include <linux/ioport.h> 38#include <linux/ioport.h>
40#include <linux/init.h> 39#include <linux/init.h>
@@ -719,7 +718,7 @@ static int __init hd_init(void)
719 return -ENOMEM; 718 return -ENOMEM;
720 } 719 }
721 720
722 blk_queue_max_sectors(hd_queue, 255); 721 blk_queue_max_hw_sectors(hd_queue, 255);
723 init_timer(&device_timer); 722 init_timer(&device_timer);
724 device_timer.function = hd_times_out; 723 device_timer.function = hd_times_out;
725 blk_queue_logical_block_size(hd_queue, 512); 724 blk_queue_logical_block_size(hd_queue, 512);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bd112c8c7bcd..8546d123b9a7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -71,7 +71,6 @@
71#include <linux/buffer_head.h> /* for invalidate_bdev() */ 71#include <linux/buffer_head.h> /* for invalidate_bdev() */
72#include <linux/completion.h> 72#include <linux/completion.h>
73#include <linux/highmem.h> 73#include <linux/highmem.h>
74#include <linux/gfp.h>
75#include <linux/kthread.h> 74#include <linux/kthread.h>
76#include <linux/splice.h> 75#include <linux/splice.h>
77 76
@@ -238,6 +237,8 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
238 if (ret) 237 if (ret)
239 goto fail; 238 goto fail;
240 239
240 file_update_time(file);
241
241 transfer_result = lo_do_transfer(lo, WRITE, page, offset, 242 transfer_result = lo_do_transfer(lo, WRITE, page, offset,
242 bvec->bv_page, bv_offs, size, IV); 243 bvec->bv_page, bv_offs, size, IV);
243 copied = size; 244 copied = size;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index e0339aaa1815..28db925dbdad 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -23,6 +23,7 @@
23#include <linux/platform_device.h> 23#include <linux/platform_device.h>
24#include <linux/gpio.h> 24#include <linux/gpio.h>
25#include <linux/mg_disk.h> 25#include <linux/mg_disk.h>
26#include <linux/slab.h>
26 27
27#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) 28#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
28 29
@@ -860,7 +861,7 @@ static int mg_probe(struct platform_device *plat_dev)
860 err = -EINVAL; 861 err = -EINVAL;
861 goto probe_err_2; 862 goto probe_err_2;
862 } 863 }
863 host->dev_base = ioremap(rsc->start , rsc->end + 1); 864 host->dev_base = ioremap(rsc->start, resource_size(rsc));
864 if (!host->dev_base) { 865 if (!host->dev_base) {
865 printk(KERN_ERR "%s:%d ioremap fail\n", 866 printk(KERN_ERR "%s:%d ioremap fail\n",
866 __func__, __LINE__); 867 __func__, __LINE__);
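The ioremap() fix above matters because a struct resource describes an inclusive [start, end] range: its length is end - start + 1, which is what resource_size() returns, whereas the old second argument rsc->end + 1 is an absolute address, not a length. A tiny userspace illustration (reduced stand-in type, not kernel code):

    #include <stdio.h>

    struct resource { unsigned long start, end; };   /* reduced stand-in */

    static unsigned long resource_size(const struct resource *r)
    {
        return r->end - r->start + 1;                /* what the kernel helper computes */
    }

    int main(void)
    {
        struct resource rsc = { 0x80000000UL, 0x8000ffffUL };

        printf("correct length: 0x%lx\n", resource_size(&rsc));   /* 0x10000 */
        printf("old argument:   0x%lx\n", rsc.end + 1);           /* 0x80010000: an address */
        return 0;
    }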
@@ -980,7 +981,7 @@ static int mg_probe(struct platform_device *plat_dev)
980 __func__, __LINE__); 981 __func__, __LINE__);
981 goto probe_err_6; 982 goto probe_err_6;
982 } 983 }
983 blk_queue_max_sectors(host->breq, MG_MAX_SECTS); 984 blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
984 blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); 985 blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
985 986
986 init_timer(&host->timer); 987 init_timer(&host->timer);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cc923a5b430c..218d091f3c52 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -27,6 +27,7 @@
27#include <linux/compiler.h> 27#include <linux/compiler.h>
28#include <linux/err.h> 28#include <linux/err.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/slab.h>
30#include <net/sock.h> 31#include <net/sock.h>
31#include <linux/net.h> 32#include <linux/net.h>
32#include <linux/kthread.h> 33#include <linux/kthread.h>
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index a808b1530b3b..6cd8b705b11b 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -63,6 +63,7 @@
63#include <linux/device.h> 63#include <linux/device.h>
64#include <linux/module.h> 64#include <linux/module.h>
65#include <linux/fs.h> 65#include <linux/fs.h>
66#include <linux/slab.h>
66#include <scsi/osd_initiator.h> 67#include <scsi/osd_initiator.h>
67#include <scsi/osd_attributes.h> 68#include <scsi/osd_attributes.h>
68#include <scsi/osd_sec.h> 69#include <scsi/osd_sec.h>
@@ -476,7 +477,9 @@ static void class_osdblk_release(struct class *cls)
476 kfree(cls); 477 kfree(cls);
477} 478}
478 479
479static ssize_t class_osdblk_list(struct class *c, char *data) 480static ssize_t class_osdblk_list(struct class *c,
481 struct class_attribute *attr,
482 char *data)
480{ 483{
481 int n = 0; 484 int n = 0;
482 struct list_head *tmp; 485 struct list_head *tmp;
@@ -500,7 +503,9 @@ static ssize_t class_osdblk_list(struct class *c, char *data)
500 return n; 503 return n;
501} 504}
502 505
503static ssize_t class_osdblk_add(struct class *c, const char *buf, size_t count) 506static ssize_t class_osdblk_add(struct class *c,
507 struct class_attribute *attr,
508 const char *buf, size_t count)
504{ 509{
505 struct osdblk_device *osdev; 510 struct osdblk_device *osdev;
506 ssize_t rc; 511 ssize_t rc;
@@ -592,7 +597,9 @@ err_out_mod:
592 return rc; 597 return rc;
593} 598}
594 599
595static ssize_t class_osdblk_remove(struct class *c, const char *buf, 600static ssize_t class_osdblk_remove(struct class *c,
601 struct class_attribute *attr,
602 const char *buf,
596 size_t count) 603 size_t count)
597{ 604{
598 struct osdblk_device *osdev = NULL; 605 struct osdblk_device *osdev = NULL;
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 8866ca369d5e..71acf4e53356 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -341,11 +341,11 @@ static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg)
341 && (j++ < PCD_SPIN)) 341 && (j++ < PCD_SPIN))
342 udelay(PCD_DELAY); 342 udelay(PCD_DELAY);
343 343
344 if ((r & (IDE_ERR & stop)) || (j >= PCD_SPIN)) { 344 if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) {
345 s = read_reg(cd, 7); 345 s = read_reg(cd, 7);
346 e = read_reg(cd, 1); 346 e = read_reg(cd, 1);
347 p = read_reg(cd, 2); 347 p = read_reg(cd, 2);
348 if (j >= PCD_SPIN) 348 if (j > PCD_SPIN)
349 e |= 0x100; 349 e |= 0x100;
350 if (fun) 350 if (fun)
351 printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" 351 printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
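The change from (j >= PCD_SPIN) to (j > PCD_SPIN) — repeated below for pf.c and pt.c — fixes an off-by-one in the timeout test: with the post-increment in "j++ < PCD_SPIN", a genuine timeout leaves j at PCD_SPIN + 1, while j == PCD_SPIN can also mean the drive became ready on the last permitted poll. A small userspace demonstration (illustrative names, no hardware involved):

    #include <stdio.h>

    #define SPIN 3

    static int polls_needed;
    static int still_busy(void) { return polls_needed-- > 0; }

    static int wait_loop(int busy_polls)
    {
        int j = 0;

        polls_needed = busy_polls;
        while (still_busy() && (j++ < SPIN))
            ;   /* udelay() in the real drivers */
        return j;
    }

    int main(void)
    {
        /* Drive becomes ready on the last permitted poll: j ends at SPIN,
         * so the old ">= SPIN" test would report a spurious timeout. */
        printf("ready on last poll: j=%d\n", wait_loop(SPIN));
        /* Drive never becomes ready: j ends at SPIN + 1, a real timeout,
         * which is all the new "> SPIN" test flags. */
        printf("real timeout:       j=%d\n", wait_loop(SPIN + 1));
        return 0;
    }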
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 569e39e8f114..c1e5cd029b23 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -145,6 +145,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
145 145
146#include <linux/init.h> 146#include <linux/init.h>
147#include <linux/module.h> 147#include <linux/module.h>
148#include <linux/gfp.h>
148#include <linux/fs.h> 149#include <linux/fs.h>
149#include <linux/delay.h> 150#include <linux/delay.h>
150#include <linux/hdreg.h> 151#include <linux/hdreg.h>
@@ -906,7 +907,7 @@ static int __init pd_init(void)
906 if (!pd_queue) 907 if (!pd_queue)
907 goto out1; 908 goto out1;
908 909
909 blk_queue_max_sectors(pd_queue, cluster); 910 blk_queue_max_hw_sectors(pd_queue, cluster);
910 911
911 if (register_blkdev(major, name)) 912 if (register_blkdev(major, name))
912 goto out2; 913 goto out2;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index ea54ea393553..c059aab3006b 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -391,11 +391,11 @@ static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg)
391 && (j++ < PF_SPIN)) 391 && (j++ < PF_SPIN))
392 udelay(PF_SPIN_DEL); 392 udelay(PF_SPIN_DEL);
393 393
394 if ((r & (STAT_ERR & stop)) || (j >= PF_SPIN)) { 394 if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) {
395 s = read_reg(pf, 7); 395 s = read_reg(pf, 7);
396 e = read_reg(pf, 1); 396 e = read_reg(pf, 1);
397 p = read_reg(pf, 2); 397 p = read_reg(pf, 2);
398 if (j >= PF_SPIN) 398 if (j > PF_SPIN)
399 e |= 0x100; 399 e |= 0x100;
400 if (fun) 400 if (fun)
401 printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" 401 printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
@@ -956,8 +956,7 @@ static int __init pf_init(void)
956 return -ENOMEM; 956 return -ENOMEM;
957 } 957 }
958 958
959 blk_queue_max_phys_segments(pf_queue, cluster); 959 blk_queue_max_segments(pf_queue, cluster);
960 blk_queue_max_hw_segments(pf_queue, cluster);
961 960
962 for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { 961 for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
963 struct gendisk *disk = pf->disk; 962 struct gendisk *disk = pf->disk;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 1e4006e18f03..bc5825fdeaab 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -274,11 +274,11 @@ static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg)
274 && (j++ < PT_SPIN)) 274 && (j++ < PT_SPIN))
275 udelay(PT_SPIN_DEL); 275 udelay(PT_SPIN_DEL);
276 276
277 if ((r & (STAT_ERR & stop)) || (j >= PT_SPIN)) { 277 if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) {
278 s = read_reg(pi, 7); 278 s = read_reg(pi, 7);
279 e = read_reg(pi, 1); 279 e = read_reg(pi, 1);
280 p = read_reg(pi, 2); 280 p = read_reg(pi, 2);
281 if (j >= PT_SPIN) 281 if (j > PT_SPIN)
282 e |= 0x100; 282 e |= 0x100;
283 if (fun) 283 if (fun)
284 printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" 284 printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 2ddf03ae034e..8a549db2aa78 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -48,6 +48,7 @@
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/types.h> 49#include <linux/types.h>
50#include <linux/kernel.h> 50#include <linux/kernel.h>
51#include <linux/compat.h>
51#include <linux/kthread.h> 52#include <linux/kthread.h>
52#include <linux/errno.h> 53#include <linux/errno.h>
53#include <linux/spinlock.h> 54#include <linux/spinlock.h>
@@ -57,6 +58,7 @@
57#include <linux/miscdevice.h> 58#include <linux/miscdevice.h>
58#include <linux/freezer.h> 59#include <linux/freezer.h>
59#include <linux/mutex.h> 60#include <linux/mutex.h>
61#include <linux/slab.h>
60#include <scsi/scsi_cmnd.h> 62#include <scsi/scsi_cmnd.h>
61#include <scsi/scsi_ioctl.h> 63#include <scsi/scsi_ioctl.h>
62#include <scsi/scsi.h> 64#include <scsi/scsi.h>
@@ -284,7 +286,7 @@ static ssize_t kobj_pkt_store(struct kobject *kobj,
284 return len; 286 return len;
285} 287}
286 288
287static struct sysfs_ops kobj_pkt_ops = { 289static const struct sysfs_ops kobj_pkt_ops = {
288 .show = kobj_pkt_show, 290 .show = kobj_pkt_show,
289 .store = kobj_pkt_store 291 .store = kobj_pkt_store
290}; 292};
@@ -322,7 +324,7 @@ static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
322 pkt_kobj_remove(pd->kobj_stat); 324 pkt_kobj_remove(pd->kobj_stat);
323 pkt_kobj_remove(pd->kobj_wqueue); 325 pkt_kobj_remove(pd->kobj_wqueue);
324 if (class_pktcdvd) 326 if (class_pktcdvd)
325 device_destroy(class_pktcdvd, pd->pkt_dev); 327 device_unregister(pd->dev);
326} 328}
327 329
328 330
@@ -337,7 +339,9 @@ static void class_pktcdvd_release(struct class *cls)
337{ 339{
338 kfree(cls); 340 kfree(cls);
339} 341}
340static ssize_t class_pktcdvd_show_map(struct class *c, char *data) 342static ssize_t class_pktcdvd_show_map(struct class *c,
343 struct class_attribute *attr,
344 char *data)
341{ 345{
342 int n = 0; 346 int n = 0;
343 int idx; 347 int idx;
@@ -356,7 +360,9 @@ static ssize_t class_pktcdvd_show_map(struct class *c, char *data)
356 return n; 360 return n;
357} 361}
358 362
359static ssize_t class_pktcdvd_store_add(struct class *c, const char *buf, 363static ssize_t class_pktcdvd_store_add(struct class *c,
364 struct class_attribute *attr,
365 const char *buf,
360 size_t count) 366 size_t count)
361{ 367{
362 unsigned int major, minor; 368 unsigned int major, minor;
@@ -376,7 +382,9 @@ static ssize_t class_pktcdvd_store_add(struct class *c, const char *buf,
376 return -EINVAL; 382 return -EINVAL;
377} 383}
378 384
379static ssize_t class_pktcdvd_store_remove(struct class *c, const char *buf, 385static ssize_t class_pktcdvd_store_remove(struct class *c,
386 struct class_attribute *attr,
387 const char *buf,
380 size_t count) 388 size_t count)
381{ 389{
382 unsigned int major, minor; 390 unsigned int major, minor;
@@ -569,6 +577,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
569 } 577 }
570 578
571 spin_lock_init(&pkt->lock); 579 spin_lock_init(&pkt->lock);
580 bio_list_init(&pkt->orig_bios);
572 581
573 for (i = 0; i < frames; i++) { 582 for (i = 0; i < frames; i++) {
574 struct bio *bio = pkt_bio_alloc(1); 583 struct bio *bio = pkt_bio_alloc(1);
@@ -721,43 +730,6 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod
721} 730}
722 731
723/* 732/*
724 * Add a bio to a single linked list defined by its head and tail pointers.
725 */
726static void pkt_add_list_last(struct bio *bio, struct bio **list_head, struct bio **list_tail)
727{
728 bio->bi_next = NULL;
729 if (*list_tail) {
730 BUG_ON((*list_head) == NULL);
731 (*list_tail)->bi_next = bio;
732 (*list_tail) = bio;
733 } else {
734 BUG_ON((*list_head) != NULL);
735 (*list_head) = bio;
736 (*list_tail) = bio;
737 }
738}
739
740/*
741 * Remove and return the first bio from a single linked list defined by its
742 * head and tail pointers.
743 */
744static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio **list_tail)
745{
746 struct bio *bio;
747
748 if (*list_head == NULL)
749 return NULL;
750
751 bio = *list_head;
752 *list_head = bio->bi_next;
753 if (*list_head == NULL)
754 *list_tail = NULL;
755
756 bio->bi_next = NULL;
757 return bio;
758}
759
760/*
761 * Send a packet_command to the underlying block device and 733 * Send a packet_command to the underlying block device and
762 * wait for completion. 734 * wait for completion.
763 */ 735 */
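The open-coded pkt_add_list_last()/pkt_get_list_first() helpers removed above maintained a FIFO of bios through explicit head/tail pointers; the rest of the patch switches their call sites to the generic struct bio_list and its bio_list_add()/bio_list_pop()/bio_list_peek() helpers, which keep the same semantics. A userspace sketch of that FIFO behaviour (the node type is a stand-in, not struct bio):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };
    struct node_list { struct node *head, *tail; };   /* shaped like struct bio_list */

    static void list_add_last(struct node_list *l, struct node *n)   /* cf. bio_list_add() */
    {
        n->next = NULL;
        if (l->tail)
            l->tail->next = n;
        else
            l->head = n;
        l->tail = n;
    }

    static struct node *list_pop_first(struct node_list *l)          /* cf. bio_list_pop() */
    {
        struct node *n = l->head;

        if (!n)
            return NULL;
        l->head = n->next;
        if (!l->head)
            l->tail = NULL;
        n->next = NULL;
        return n;
    }

    int main(void)
    {
        struct node a = { NULL, 1 }, b = { NULL, 2 };
        struct node_list q = { NULL, NULL };

        list_add_last(&q, &a);
        list_add_last(&q, &b);
        assert(list_pop_first(&q)->id == 1);   /* same FIFO order as before */
        assert(list_pop_first(&q)->id == 2);
        assert(list_pop_first(&q) == NULL);
        puts("FIFO semantics preserved");
        return 0;
    }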
@@ -876,13 +848,10 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
876static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) 848static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
877{ 849{
878 spin_lock(&pd->iosched.lock); 850 spin_lock(&pd->iosched.lock);
879 if (bio_data_dir(bio) == READ) { 851 if (bio_data_dir(bio) == READ)
880 pkt_add_list_last(bio, &pd->iosched.read_queue, 852 bio_list_add(&pd->iosched.read_queue, bio);
881 &pd->iosched.read_queue_tail); 853 else
882 } else { 854 bio_list_add(&pd->iosched.write_queue, bio);
883 pkt_add_list_last(bio, &pd->iosched.write_queue,
884 &pd->iosched.write_queue_tail);
885 }
886 spin_unlock(&pd->iosched.lock); 855 spin_unlock(&pd->iosched.lock);
887 856
888 atomic_set(&pd->iosched.attention, 1); 857 atomic_set(&pd->iosched.attention, 1);
@@ -917,8 +886,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
917 int reads_queued, writes_queued; 886 int reads_queued, writes_queued;
918 887
919 spin_lock(&pd->iosched.lock); 888 spin_lock(&pd->iosched.lock);
920 reads_queued = (pd->iosched.read_queue != NULL); 889 reads_queued = !bio_list_empty(&pd->iosched.read_queue);
921 writes_queued = (pd->iosched.write_queue != NULL); 890 writes_queued = !bio_list_empty(&pd->iosched.write_queue);
922 spin_unlock(&pd->iosched.lock); 891 spin_unlock(&pd->iosched.lock);
923 892
924 if (!reads_queued && !writes_queued) 893 if (!reads_queued && !writes_queued)
@@ -927,7 +896,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
927 if (pd->iosched.writing) { 896 if (pd->iosched.writing) {
928 int need_write_seek = 1; 897 int need_write_seek = 1;
929 spin_lock(&pd->iosched.lock); 898 spin_lock(&pd->iosched.lock);
930 bio = pd->iosched.write_queue; 899 bio = bio_list_peek(&pd->iosched.write_queue);
931 spin_unlock(&pd->iosched.lock); 900 spin_unlock(&pd->iosched.lock);
932 if (bio && (bio->bi_sector == pd->iosched.last_write)) 901 if (bio && (bio->bi_sector == pd->iosched.last_write))
933 need_write_seek = 0; 902 need_write_seek = 0;
@@ -950,13 +919,10 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
950 } 919 }
951 920
952 spin_lock(&pd->iosched.lock); 921 spin_lock(&pd->iosched.lock);
953 if (pd->iosched.writing) { 922 if (pd->iosched.writing)
954 bio = pkt_get_list_first(&pd->iosched.write_queue, 923 bio = bio_list_pop(&pd->iosched.write_queue);
955 &pd->iosched.write_queue_tail); 924 else
956 } else { 925 bio = bio_list_pop(&pd->iosched.read_queue);
957 bio = pkt_get_list_first(&pd->iosched.read_queue,
958 &pd->iosched.read_queue_tail);
959 }
960 spin_unlock(&pd->iosched.lock); 926 spin_unlock(&pd->iosched.lock);
961 927
962 if (!bio) 928 if (!bio)
@@ -992,14 +958,14 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
992static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) 958static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
993{ 959{
994 if ((pd->settings.size << 9) / CD_FRAMESIZE 960 if ((pd->settings.size << 9) / CD_FRAMESIZE
995 <= queue_max_phys_segments(q)) { 961 <= queue_max_segments(q)) {
996 /* 962 /*
997 * The cdrom device can handle one segment/frame 963 * The cdrom device can handle one segment/frame
998 */ 964 */
999 clear_bit(PACKET_MERGE_SEGS, &pd->flags); 965 clear_bit(PACKET_MERGE_SEGS, &pd->flags);
1000 return 0; 966 return 0;
1001 } else if ((pd->settings.size << 9) / PAGE_SIZE 967 } else if ((pd->settings.size << 9) / PAGE_SIZE
1002 <= queue_max_phys_segments(q)) { 968 <= queue_max_segments(q)) {
1003 /* 969 /*
1004 * We can handle this case at the expense of some extra memory 970 * We can handle this case at the expense of some extra memory
1005 * copies during write operations 971 * copies during write operations
@@ -1114,7 +1080,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
1114 int f; 1080 int f;
1115 char written[PACKET_MAX_SIZE]; 1081 char written[PACKET_MAX_SIZE];
1116 1082
1117 BUG_ON(!pkt->orig_bios); 1083 BUG_ON(bio_list_empty(&pkt->orig_bios));
1118 1084
1119 atomic_set(&pkt->io_wait, 0); 1085 atomic_set(&pkt->io_wait, 0);
1120 atomic_set(&pkt->io_errors, 0); 1086 atomic_set(&pkt->io_errors, 0);
@@ -1124,7 +1090,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
1124 */ 1090 */
1125 memset(written, 0, sizeof(written)); 1091 memset(written, 0, sizeof(written));
1126 spin_lock(&pkt->lock); 1092 spin_lock(&pkt->lock);
1127 for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { 1093 bio_list_for_each(bio, &pkt->orig_bios) {
1128 int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); 1094 int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
1129 int num_frames = bio->bi_size / CD_FRAMESIZE; 1095 int num_frames = bio->bi_size / CD_FRAMESIZE;
1130 pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); 1096 pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
@@ -1363,7 +1329,7 @@ try_next_bio:
1363 break; 1329 break;
1364 pkt_rbtree_erase(pd, node); 1330 pkt_rbtree_erase(pd, node);
1365 spin_lock(&pkt->lock); 1331 spin_lock(&pkt->lock);
1366 pkt_add_list_last(bio, &pkt->orig_bios, &pkt->orig_bios_tail); 1332 bio_list_add(&pkt->orig_bios, bio);
1367 pkt->write_size += bio->bi_size / CD_FRAMESIZE; 1333 pkt->write_size += bio->bi_size / CD_FRAMESIZE;
1368 spin_unlock(&pkt->lock); 1334 spin_unlock(&pkt->lock);
1369 } 1335 }
@@ -1409,7 +1375,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
1409 */ 1375 */
1410 frames_write = 0; 1376 frames_write = 0;
1411 spin_lock(&pkt->lock); 1377 spin_lock(&pkt->lock);
1412 for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { 1378 bio_list_for_each(bio, &pkt->orig_bios) {
1413 int segment = bio->bi_idx; 1379 int segment = bio->bi_idx;
1414 int src_offs = 0; 1380 int src_offs = 0;
1415 int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); 1381 int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
@@ -1472,20 +1438,14 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
1472 1438
1473static void pkt_finish_packet(struct packet_data *pkt, int uptodate) 1439static void pkt_finish_packet(struct packet_data *pkt, int uptodate)
1474{ 1440{
1475 struct bio *bio, *next; 1441 struct bio *bio;
1476 1442
1477 if (!uptodate) 1443 if (!uptodate)
1478 pkt->cache_valid = 0; 1444 pkt->cache_valid = 0;
1479 1445
1480 /* Finish all bios corresponding to this packet */ 1446 /* Finish all bios corresponding to this packet */
1481 bio = pkt->orig_bios; 1447 while ((bio = bio_list_pop(&pkt->orig_bios)))
1482 while (bio) {
1483 next = bio->bi_next;
1484 bio->bi_next = NULL;
1485 bio_endio(bio, uptodate ? 0 : -EIO); 1448 bio_endio(bio, uptodate ? 0 : -EIO);
1486 bio = next;
1487 }
1488 pkt->orig_bios = pkt->orig_bios_tail = NULL;
1489} 1449}
1490 1450
1491static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) 1451static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
@@ -2360,7 +2320,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2360 * even if the size is a multiple of the packet size. 2320 * even if the size is a multiple of the packet size.
2361 */ 2321 */
2362 spin_lock_irq(q->queue_lock); 2322 spin_lock_irq(q->queue_lock);
2363 blk_queue_max_sectors(q, pd->settings.size); 2323 blk_queue_max_hw_sectors(q, pd->settings.size);
2364 spin_unlock_irq(q->queue_lock); 2324 spin_unlock_irq(q->queue_lock);
2365 set_bit(PACKET_WRITABLE, &pd->flags); 2325 set_bit(PACKET_WRITABLE, &pd->flags);
2366 } else { 2326 } else {
@@ -2567,8 +2527,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
2567 spin_lock(&pkt->lock); 2527 spin_lock(&pkt->lock);
2568 if ((pkt->state == PACKET_WAITING_STATE) || 2528 if ((pkt->state == PACKET_WAITING_STATE) ||
2569 (pkt->state == PACKET_READ_WAIT_STATE)) { 2529 (pkt->state == PACKET_READ_WAIT_STATE)) {
2570 pkt_add_list_last(bio, &pkt->orig_bios, 2530 bio_list_add(&pkt->orig_bios, bio);
2571 &pkt->orig_bios_tail);
2572 pkt->write_size += bio->bi_size / CD_FRAMESIZE; 2531 pkt->write_size += bio->bi_size / CD_FRAMESIZE;
2573 if ((pkt->write_size >= pkt->frames) && 2532 if ((pkt->write_size >= pkt->frames) &&
2574 (pkt->state == PACKET_WAITING_STATE)) { 2533 (pkt->state == PACKET_WAITING_STATE)) {
@@ -2662,7 +2621,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd)
2662 2621
2663 blk_queue_make_request(q, pkt_make_request); 2622 blk_queue_make_request(q, pkt_make_request);
2664 blk_queue_logical_block_size(q, CD_FRAMESIZE); 2623 blk_queue_logical_block_size(q, CD_FRAMESIZE);
2665 blk_queue_max_sectors(q, PACKET_MAX_SECTORS); 2624 blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
2666 blk_queue_merge_bvec(q, pkt_merge_bvec); 2625 blk_queue_merge_bvec(q, pkt_merge_bvec);
2667 q->queuedata = pd; 2626 q->queuedata = pd;
2668} 2627}
@@ -2898,6 +2857,8 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2898 2857
2899 spin_lock_init(&pd->lock); 2858 spin_lock_init(&pd->lock);
2900 spin_lock_init(&pd->iosched.lock); 2859 spin_lock_init(&pd->iosched.lock);
2860 bio_list_init(&pd->iosched.read_queue);
2861 bio_list_init(&pd->iosched.write_queue);
2901 sprintf(pd->name, DRIVER_NAME"%d", idx); 2862 sprintf(pd->name, DRIVER_NAME"%d", idx);
2902 init_waitqueue_head(&pd->wqueue); 2863 init_waitqueue_head(&pd->wqueue);
2903 pd->bio_queue = RB_ROOT; 2864 pd->bio_queue = RB_ROOT;
@@ -3024,7 +2985,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
3024 mutex_unlock(&ctl_mutex); 2985 mutex_unlock(&ctl_mutex);
3025} 2986}
3026 2987
3027static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) 2988static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3028{ 2989{
3029 void __user *argp = (void __user *)arg; 2990 void __user *argp = (void __user *)arg;
3030 struct pkt_ctrl_command ctrl_cmd; 2991 struct pkt_ctrl_command ctrl_cmd;
@@ -3061,10 +3022,20 @@ static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cm
3061 return ret; 3022 return ret;
3062} 3023}
3063 3024
3025#ifdef CONFIG_COMPAT
3026static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3027{
3028 return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
3029}
3030#endif
3064 3031
3065static const struct file_operations pkt_ctl_fops = { 3032static const struct file_operations pkt_ctl_fops = {
3066 .ioctl = pkt_ctl_ioctl, 3033 .open = nonseekable_open,
3067 .owner = THIS_MODULE, 3034 .unlocked_ioctl = pkt_ctl_ioctl,
3035#ifdef CONFIG_COMPAT
3036 .compat_ioctl = pkt_ctl_compat_ioctl,
3037#endif
3038 .owner = THIS_MODULE,
3068}; 3039};
3069 3040
3070static struct miscdevice pkt_misc = { 3041static struct miscdevice pkt_misc = {
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 03a130dca8ab..3b419e3fffa1 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -20,6 +20,7 @@
20 20
21#include <linux/ata.h> 21#include <linux/ata.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/slab.h>
23 24
24#include <asm/lv1call.h> 25#include <asm/lv1call.h>
25#include <asm/ps3stor.h> 26#include <asm/ps3stor.h>
@@ -474,7 +475,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
474 475
475 blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH); 476 blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
476 477
477 blk_queue_max_sectors(queue, dev->bounce_size >> 9); 478 blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
478 blk_queue_segment_boundary(queue, -1UL); 479 blk_queue_segment_boundary(queue, -1UL);
479 blk_queue_dma_alignment(queue, dev->blk_size-1); 480 blk_queue_dma_alignment(queue, dev->blk_size-1);
480 blk_queue_logical_block_size(queue, dev->blk_size); 481 blk_queue_logical_block_size(queue, dev->blk_size);
@@ -482,8 +483,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
482 blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, 483 blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
483 ps3disk_prepare_flush); 484 ps3disk_prepare_flush);
484 485
485 blk_queue_max_phys_segments(queue, -1); 486 blk_queue_max_segments(queue, -1);
486 blk_queue_max_hw_segments(queue, -1);
487 blk_queue_max_segment_size(queue, dev->bounce_size); 487 blk_queue_max_segment_size(queue, dev->bounce_size);
488 488
489 gendisk = alloc_disk(PS3DISK_MINORS); 489 gendisk = alloc_disk(PS3DISK_MINORS);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 3bb7c47c869f..b3bdb8af89cf 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -12,6 +12,7 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15 16
16#include <asm/cell-regs.h> 17#include <asm/cell-regs.h>
17#include <asm/firmware.h> 18#include <asm/firmware.h>
@@ -123,7 +124,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
123{ 124{
124 struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); 125 struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
125 u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); 126 u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
126 unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); 127 unsigned long timeout;
128
129 for (timeout = 20; timeout; timeout--) {
130 if (!notify[3])
131 return 0;
132 udelay(10);
133 }
134
135 timeout = jiffies + msecs_to_jiffies(timeout_ms);
127 136
128 do { 137 do {
129 if (!notify[3]) 138 if (!notify[3])
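The hunk above bolts a short busy-poll (20 x udelay(10)) onto the front of the existing sleeping wait, so notifiers that complete almost immediately are caught without scheduling. A userspace sketch of the two-phase idea (timings and names are illustrative; nanosleep() stands in for both udelay() and msleep()):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static int polls_left = 7;                 /* completion arrives after 7 polls */
    static bool hw_done(void) { return --polls_left < 0; }

    static void tiny_delay(long us)
    {
        struct timespec ts = { 0, us * 1000L };
        nanosleep(&ts, NULL);
    }

    static int wait_two_phase(long slow_polls)
    {
        /* Phase 1: a short burst of cheap polls (the added udelay(10) loop). */
        for (int i = 0; i < 20; i++) {
            if (hw_done())
                return 0;
            tiny_delay(10);
        }
        /* Phase 2: slower sleeping polls up to a timeout (the existing loop). */
        for (long i = 0; i < slow_polls; i++) {
            if (hw_done())
                return 0;
            tiny_delay(1000);
        }
        return -1;                             /* timed out */
    }

    int main(void)
    {
        printf("fast completion handled in phase 1: %d\n", wait_two_phase(200));
        polls_left = 1000000;                  /* never completes */
        printf("timeout reported: %d\n", wait_two_phase(5));
        return 0;
    }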
@@ -743,10 +752,9 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
743 priv->queue = queue; 752 priv->queue = queue;
744 queue->queuedata = dev; 753 queue->queuedata = dev;
745 blk_queue_make_request(queue, ps3vram_make_request); 754 blk_queue_make_request(queue, ps3vram_make_request);
746 blk_queue_max_phys_segments(queue, MAX_PHYS_SEGMENTS); 755 blk_queue_max_segments(queue, BLK_MAX_SEGMENTS);
747 blk_queue_max_hw_segments(queue, MAX_HW_SEGMENTS); 756 blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE);
748 blk_queue_max_segment_size(queue, MAX_SEGMENT_SIZE); 757 blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS);
749 blk_queue_max_sectors(queue, SAFE_MAX_SECTORS);
750 758
751 gendisk = alloc_disk(1); 759 gendisk = alloc_disk(1);
752 if (!gendisk) { 760 if (!gendisk) {
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 411f064760b4..48e8fee9f2d4 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -691,9 +691,8 @@ static int probe_disk(struct vdc_port *port)
691 691
692 port->disk = g; 692 port->disk = g;
693 693
694 blk_queue_max_hw_segments(q, port->ring_cookies); 694 blk_queue_max_segments(q, port->ring_cookies);
695 blk_queue_max_phys_segments(q, port->ring_cookies); 695 blk_queue_max_hw_sectors(q, port->max_xfer_size);
696 blk_queue_max_sectors(q, port->max_xfer_size);
697 g->major = vdc_major; 696 g->major = vdc_major;
698 g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; 697 g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
699 strcpy(g->disk_name, port->disk_name); 698 strcpy(g->disk_name, port->disk_name);
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 8f569e3df890..e463657569ff 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/fd.h> 20#include <linux/fd.h>
21#include <linux/slab.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/hdreg.h> 23#include <linux/hdreg.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -864,7 +865,7 @@ static int __devinit swim_probe(struct platform_device *dev)
864 struct swim_priv *swd; 865 struct swim_priv *swd;
865 int ret; 866 int ret;
866 867
867 res = platform_get_resource_byname(dev, IORESOURCE_MEM, "swim-regs"); 868 res = platform_get_resource(dev, IORESOURCE_MEM, 0);
868 if (!res) { 869 if (!res) {
869 ret = -ENODEV; 870 ret = -ENODEV;
870 goto out; 871 goto out;
@@ -942,7 +943,7 @@ static int __devexit swim_remove(struct platform_device *dev)
942 943
943 iounmap(swd->base); 944 iounmap(swd->base);
944 945
945 res = platform_get_resource_byname(dev, IORESOURCE_MEM, "swim-regs"); 946 res = platform_get_resource(dev, IORESOURCE_MEM, 0);
946 if (res) 947 if (res)
947 release_mem_region(res->start, resource_size(res)); 948 release_mem_region(res->start, resource_size(res));
948 949
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 6380ad8d91bd..59ca2b77b574 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -200,7 +200,7 @@ struct floppy_state {
200 int ejected; 200 int ejected;
201 wait_queue_head_t wait; 201 wait_queue_head_t wait;
202 int wanted; 202 int wanted;
203 struct device_node* media_bay; /* NULL when not in bay */ 203 struct macio_dev *mdev;
204 char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)]; 204 char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
205}; 205};
206 206
@@ -303,14 +303,13 @@ static int swim3_readbit(struct floppy_state *fs, int bit)
303static void do_fd_request(struct request_queue * q) 303static void do_fd_request(struct request_queue * q)
304{ 304{
305 int i; 305 int i;
306 for(i=0;i<floppy_count;i++) 306
307 { 307 for(i=0; i<floppy_count; i++) {
308#ifdef CONFIG_PMAC_MEDIABAY 308 struct floppy_state *fs = &floppy_states[i];
309 if (floppy_states[i].media_bay && 309 if (fs->mdev->media_bay &&
310 check_media_bay(floppy_states[i].media_bay, MB_FD)) 310 check_media_bay(fs->mdev->media_bay) != MB_FD)
311 continue; 311 continue;
312#endif /* CONFIG_PMAC_MEDIABAY */ 312 start_request(fs);
313 start_request(&floppy_states[i]);
314 } 313 }
315} 314}
316 315
@@ -849,10 +848,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
849 if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)) 848 if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
850 return -EPERM; 849 return -EPERM;
851 850
852#ifdef CONFIG_PMAC_MEDIABAY 851 if (fs->mdev->media_bay &&
853 if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) 852 check_media_bay(fs->mdev->media_bay) != MB_FD)
854 return -ENXIO; 853 return -ENXIO;
855#endif
856 854
857 switch (cmd) { 855 switch (cmd) {
858 case FDEJECT: 856 case FDEJECT:
@@ -876,10 +874,9 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
876 int n, err = 0; 874 int n, err = 0;
877 875
878 if (fs->ref_count == 0) { 876 if (fs->ref_count == 0) {
879#ifdef CONFIG_PMAC_MEDIABAY 877 if (fs->mdev->media_bay &&
880 if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) 878 check_media_bay(fs->mdev->media_bay) != MB_FD)
881 return -ENXIO; 879 return -ENXIO;
882#endif
883 out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); 880 out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2);
884 out_8(&sw->control_bic, 0xff); 881 out_8(&sw->control_bic, 0xff);
885 out_8(&sw->mode, 0x95); 882 out_8(&sw->mode, 0x95);
@@ -963,10 +960,9 @@ static int floppy_revalidate(struct gendisk *disk)
963 struct swim3 __iomem *sw; 960 struct swim3 __iomem *sw;
964 int ret, n; 961 int ret, n;
965 962
966#ifdef CONFIG_PMAC_MEDIABAY 963 if (fs->mdev->media_bay &&
967 if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) 964 check_media_bay(fs->mdev->media_bay) != MB_FD)
968 return -ENXIO; 965 return -ENXIO;
969#endif
970 966
971 sw = fs->swim3; 967 sw = fs->swim3;
972 grab_drive(fs, revalidating, 0); 968 grab_drive(fs, revalidating, 0);
@@ -1009,7 +1005,6 @@ static const struct block_device_operations floppy_fops = {
1009static int swim3_add_device(struct macio_dev *mdev, int index) 1005static int swim3_add_device(struct macio_dev *mdev, int index)
1010{ 1006{
1011 struct device_node *swim = mdev->ofdev.node; 1007 struct device_node *swim = mdev->ofdev.node;
1012 struct device_node *mediabay;
1013 struct floppy_state *fs = &floppy_states[index]; 1008 struct floppy_state *fs = &floppy_states[index];
1014 int rc = -EBUSY; 1009 int rc = -EBUSY;
1015 1010
@@ -1036,9 +1031,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
1036 } 1031 }
1037 dev_set_drvdata(&mdev->ofdev.dev, fs); 1032 dev_set_drvdata(&mdev->ofdev.dev, fs);
1038 1033
1039 mediabay = (strcasecmp(swim->parent->type, "media-bay") == 0) ? 1034 if (mdev->media_bay == NULL)
1040 swim->parent : NULL;
1041 if (mediabay == NULL)
1042 pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1); 1035 pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
1043 1036
1044 memset(fs, 0, sizeof(*fs)); 1037 memset(fs, 0, sizeof(*fs));
@@ -1068,7 +1061,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
1068 fs->secpercyl = 36; 1061 fs->secpercyl = 36;
1069 fs->secpertrack = 18; 1062 fs->secpertrack = 18;
1070 fs->total_secs = 2880; 1063 fs->total_secs = 2880;
1071 fs->media_bay = mediabay; 1064 fs->mdev = mdev;
1072 init_waitqueue_head(&fs->wait); 1065 init_waitqueue_head(&fs->wait);
1073 1066
1074 fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space); 1067 fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
@@ -1093,7 +1086,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
1093 init_timer(&fs->timeout); 1086 init_timer(&fs->timeout);
1094 1087
1095 printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count, 1088 printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count,
1096 mediabay ? "in media bay" : ""); 1089 mdev->media_bay ? "in media bay" : "");
1097 1090
1098 return 0; 1091 return 0;
1099 1092
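
The swim3 hunks above replace the old compile-time CONFIG_PMAC_MEDIABAY guards with a runtime test on fs->mdev->media_bay, and the reworked check_media_bay() now takes only the bay pointer and returns the detected media type. The same test is repeated in do_fd_request(), floppy_ioctl(), floppy_open() and floppy_revalidate(); as a hedged sketch (the helper name is invented here and is not part of the patch), the shared check could read:

    /* Sketch only: helper name is hypothetical; the logic mirrors the hunks above. */
    static inline int swim3_media_present(struct floppy_state *fs)
    {
            /* No media bay: the drive is directly attached, always usable. */
            if (fs->mdev->media_bay == NULL)
                    return 1;
            /* In a bay: usable only if the bay currently holds a floppy drive. */
            return check_media_bay(fs->mdev->media_bay) == MB_FD;
    }
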
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index a7c4184f4a63..b70f0fca9a42 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -409,7 +409,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
409static void carm_remove_one (struct pci_dev *pdev); 409static void carm_remove_one (struct pci_dev *pdev);
410static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); 410static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
411 411
412static struct pci_device_id carm_pci_tbl[] = { 412static const struct pci_device_id carm_pci_tbl[] = {
413 { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, 413 { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
414 { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, 414 { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
415 { } /* terminate list */ 415 { } /* terminate list */
@@ -1518,8 +1518,7 @@ static int carm_init_disks(struct carm_host *host)
1518 break; 1518 break;
1519 } 1519 }
1520 disk->queue = q; 1520 disk->queue = q;
1521 blk_queue_max_hw_segments(q, CARM_MAX_REQ_SG); 1521 blk_queue_max_segments(q, CARM_MAX_REQ_SG);
1522 blk_queue_max_phys_segments(q, CARM_MAX_REQ_SG);
1523 blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); 1522 blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
1524 1523
1525 q->queuedata = port; 1524 q->queuedata = port;
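
The sx8 hunk follows the queue-limit consolidation applied throughout this series: the separate hardware and physical segment limits collapse into a single blk_queue_max_segments() call, and the per-request sector cap is renamed to blk_queue_max_hw_sectors(). A rough before/after sketch for a driver's queue setup (the MYDRV_* limits are placeholders, not sx8's values):

    #include <linux/blkdev.h>

    #define MYDRV_MAX_SG      32      /* placeholder scatter-gather limit */
    #define MYDRV_MAX_SECTORS 256     /* placeholder per-request sector cap */

    static void mydrv_set_queue_limits(struct request_queue *q)
    {
            /* Old API:
             *      blk_queue_max_hw_segments(q, MYDRV_MAX_SG);
             *      blk_queue_max_phys_segments(q, MYDRV_MAX_SG);
             *      blk_queue_max_sectors(q, MYDRV_MAX_SECTORS);
             */

            /* New API: one segment limit, and the sector cap renamed. */
            blk_queue_max_segments(q, MYDRV_MAX_SG);
            blk_queue_max_hw_sectors(q, MYDRV_MAX_SECTORS);
    }
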
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index c739b203fe91..0536b5b29adc 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -27,6 +27,7 @@
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/timer.h> 28#include <linux/timer.h>
29#include <linux/scatterlist.h> 29#include <linux/scatterlist.h>
30#include <linux/slab.h>
30#include <scsi/scsi.h> 31#include <scsi/scsi.h>
31 32
32#define DRV_NAME "ub" 33#define DRV_NAME "ub"
@@ -393,7 +394,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum);
393#define ub_usb_ids usb_storage_usb_ids 394#define ub_usb_ids usb_storage_usb_ids
394#else 395#else
395 396
396static struct usb_device_id ub_usb_ids[] = { 397static const struct usb_device_id ub_usb_ids[] = {
397 { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) }, 398 { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) },
398 { } 399 { }
399}; 400};
@@ -2320,10 +2321,9 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
2320 disk->queue = q; 2321 disk->queue = q;
2321 2322
2322 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 2323 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
2323 blk_queue_max_hw_segments(q, UB_MAX_REQ_SG); 2324 blk_queue_max_segments(q, UB_MAX_REQ_SG);
2324 blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
2325 blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ 2325 blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */
2326 blk_queue_max_sectors(q, UB_MAX_SECTORS); 2326 blk_queue_max_hw_sectors(q, UB_MAX_SECTORS);
2327 blk_queue_logical_block_size(q, lun->capacity.bsize); 2327 blk_queue_logical_block_size(q, lun->capacity.bsize);
2328 2328
2329 lun->disk = disk; 2329 lun->disk = disk;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index ad1ba393801a..2f9470ff8f7c 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -40,13 +40,13 @@
40#include <linux/kernel.h> 40#include <linux/kernel.h>
41#include <linux/mm.h> 41#include <linux/mm.h>
42#include <linux/mman.h> 42#include <linux/mman.h>
43#include <linux/gfp.h>
43#include <linux/ioctl.h> 44#include <linux/ioctl.h>
44#include <linux/module.h> 45#include <linux/module.h>
45#include <linux/init.h> 46#include <linux/init.h>
46#include <linux/interrupt.h> 47#include <linux/interrupt.h>
47#include <linux/timer.h> 48#include <linux/timer.h>
48#include <linux/pci.h> 49#include <linux/pci.h>
49#include <linux/slab.h>
50#include <linux/dma-mapping.h> 50#include <linux/dma-mapping.h>
51 51
52#include <linux/fcntl.h> /* O_ACCMODE */ 52#include <linux/fcntl.h> /* O_ACCMODE */
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index a8c8b56b275e..788d93882ab9 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -28,6 +28,9 @@
28 * All disk operations are performed by sending messages back and forth to 28 * All disk operations are performed by sending messages back and forth to
29 * the OS/400 partition. 29 * the OS/400 partition.
30 */ 30 */
31
32#define pr_fmt(fmt) "viod: " fmt
33
31#include <linux/major.h> 34#include <linux/major.h>
32#include <linux/fs.h> 35#include <linux/fs.h>
33#include <linux/module.h> 36#include <linux/module.h>
@@ -63,9 +66,6 @@ MODULE_LICENSE("GPL");
63 66
64#define VIOD_VERS "1.64" 67#define VIOD_VERS "1.64"
65 68
66#define VIOD_KERN_WARNING KERN_WARNING "viod: "
67#define VIOD_KERN_INFO KERN_INFO "viod: "
68
69enum { 69enum {
70 PARTITION_SHIFT = 3, 70 PARTITION_SHIFT = 3,
71 MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS, 71 MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS,
@@ -156,7 +156,7 @@ static int viodasd_open(struct block_device *bdev, fmode_t mode)
156 ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32), 156 ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32),
157 0, 0, 0); 157 0, 0, 0);
158 if (hvrc != 0) { 158 if (hvrc != 0) {
159 printk(VIOD_KERN_WARNING "HV open failed %d\n", (int)hvrc); 159 pr_warning("HV open failed %d\n", (int)hvrc);
160 return -EIO; 160 return -EIO;
161 } 161 }
162 162
@@ -167,9 +167,8 @@ static int viodasd_open(struct block_device *bdev, fmode_t mode)
167 const struct vio_error_entry *err = 167 const struct vio_error_entry *err =
168 vio_lookup_rc(viodasd_err_table, we.sub_result); 168 vio_lookup_rc(viodasd_err_table, we.sub_result);
169 169
170 printk(VIOD_KERN_WARNING 170 pr_warning("bad rc opening disk: %d:0x%04x (%s)\n",
171 "bad rc opening disk: %d:0x%04x (%s)\n", 171 (int)we.rc, we.sub_result, err->msg);
172 (int)we.rc, we.sub_result, err->msg);
173 return -EIO; 172 return -EIO;
174 } 173 }
175 174
@@ -195,8 +194,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
195 ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */, 194 ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */,
196 0, 0, 0); 195 0, 0, 0);
197 if (hvrc != 0) 196 if (hvrc != 0)
198 printk(VIOD_KERN_WARNING "HV close call failed %d\n", 197 pr_warning("HV close call failed %d\n", (int)hvrc);
199 (int)hvrc);
200 return 0; 198 return 0;
201} 199}
202 200
@@ -288,8 +286,7 @@ static int send_request(struct request *req)
288 bevent = (struct vioblocklpevent *) 286 bevent = (struct vioblocklpevent *)
289 vio_get_event_buffer(viomajorsubtype_blockio); 287 vio_get_event_buffer(viomajorsubtype_blockio);
290 if (bevent == NULL) { 288 if (bevent == NULL) {
291 printk(VIOD_KERN_WARNING 289 pr_warning("error allocating disk event buffer\n");
292 "error allocating disk event buffer\n");
293 goto error_ret; 290 goto error_ret;
294 } 291 }
295 292
@@ -333,9 +330,8 @@ static int send_request(struct request *req)
333 } 330 }
334 331
335 if (hvrc != HvLpEvent_Rc_Good) { 332 if (hvrc != HvLpEvent_Rc_Good) {
336 printk(VIOD_KERN_WARNING 333 pr_warning("error sending disk event to OS/400 (rc %d)\n",
337 "error sending disk event to OS/400 (rc %d)\n", 334 (int)hvrc);
338 (int)hvrc);
339 goto error_ret; 335 goto error_ret;
340 } 336 }
341 spin_unlock_irqrestore(&viodasd_spinlock, flags); 337 spin_unlock_irqrestore(&viodasd_spinlock, flags);
@@ -402,7 +398,7 @@ retry:
402 ((u64)dev_no << 48) | ((u64)flags<< 32), 398 ((u64)dev_no << 48) | ((u64)flags<< 32),
403 0, 0, 0); 399 0, 0, 0);
404 if (hvrc != 0) { 400 if (hvrc != 0) {
405 printk(VIOD_KERN_WARNING "bad rc on HV open %d\n", (int)hvrc); 401 pr_warning("bad rc on HV open %d\n", (int)hvrc);
406 return 0; 402 return 0;
407 } 403 }
408 404
@@ -416,9 +412,8 @@ retry:
416 goto retry; 412 goto retry;
417 } 413 }
418 if (we.max_disk > (MAX_DISKNO - 1)) { 414 if (we.max_disk > (MAX_DISKNO - 1)) {
419 printk_once(VIOD_KERN_INFO 415 printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"),
420 "Only examining the first %d of %d disks connected\n", 416 MAX_DISKNO, we.max_disk + 1);
421 MAX_DISKNO, we.max_disk + 1);
422 } 417 }
423 418
424 /* Send the close event to OS/400. We DON'T expect a response */ 419 /* Send the close event to OS/400. We DON'T expect a response */
@@ -432,17 +427,15 @@ retry:
432 ((u64)dev_no << 48) | ((u64)flags << 32), 427 ((u64)dev_no << 48) | ((u64)flags << 32),
433 0, 0, 0); 428 0, 0, 0);
434 if (hvrc != 0) { 429 if (hvrc != 0) {
435 printk(VIOD_KERN_WARNING 430 pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc);
436 "bad rc sending event to OS/400 %d\n", (int)hvrc);
437 return 0; 431 return 0;
438 } 432 }
439 433
440 if (d->dev == NULL) { 434 if (d->dev == NULL) {
441 /* this is when we reprobe for new disks */ 435 /* this is when we reprobe for new disks */
442 if (vio_create_viodasd(dev_no) == NULL) { 436 if (vio_create_viodasd(dev_no) == NULL) {
443 printk(VIOD_KERN_WARNING 437 pr_warning("cannot allocate virtual device for disk %d\n",
444 "cannot allocate virtual device for disk %d\n", 438 dev_no);
445 dev_no);
446 return 0; 439 return 0;
447 } 440 }
448 /* 441 /*
@@ -457,23 +450,20 @@ retry:
457 spin_lock_init(&d->q_lock); 450 spin_lock_init(&d->q_lock);
458 q = blk_init_queue(do_viodasd_request, &d->q_lock); 451 q = blk_init_queue(do_viodasd_request, &d->q_lock);
459 if (q == NULL) { 452 if (q == NULL) {
460 printk(VIOD_KERN_WARNING "cannot allocate queue for disk %d\n", 453 pr_warning("cannot allocate queue for disk %d\n", dev_no);
461 dev_no);
462 return 0; 454 return 0;
463 } 455 }
464 g = alloc_disk(1 << PARTITION_SHIFT); 456 g = alloc_disk(1 << PARTITION_SHIFT);
465 if (g == NULL) { 457 if (g == NULL) {
466 printk(VIOD_KERN_WARNING 458 pr_warning("cannot allocate disk structure for disk %d\n",
467 "cannot allocate disk structure for disk %d\n", 459 dev_no);
468 dev_no);
469 blk_cleanup_queue(q); 460 blk_cleanup_queue(q);
470 return 0; 461 return 0;
471 } 462 }
472 463
473 d->disk = g; 464 d->disk = g;
474 blk_queue_max_hw_segments(q, VIOMAXBLOCKDMA); 465 blk_queue_max_segments(q, VIOMAXBLOCKDMA);
475 blk_queue_max_phys_segments(q, VIOMAXBLOCKDMA); 466 blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS);
476 blk_queue_max_sectors(q, VIODASD_MAXSECTORS);
477 g->major = VIODASD_MAJOR; 467 g->major = VIODASD_MAJOR;
478 g->first_minor = dev_no << PARTITION_SHIFT; 468 g->first_minor = dev_no << PARTITION_SHIFT;
479 if (dev_no >= 26) 469 if (dev_no >= 26)
@@ -489,13 +479,12 @@ retry:
489 g->driverfs_dev = d->dev; 479 g->driverfs_dev = d->dev;
490 set_capacity(g, d->size >> 9); 480 set_capacity(g, d->size >> 9);
491 481
492 printk(VIOD_KERN_INFO "disk %d: %lu sectors (%lu MB) " 482 pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n",
493 "CHS=%d/%d/%d sector size %d%s\n", 483 dev_no, (unsigned long)(d->size >> 9),
494 dev_no, (unsigned long)(d->size >> 9), 484 (unsigned long)(d->size >> 20),
495 (unsigned long)(d->size >> 20), 485 (int)d->cylinders, (int)d->tracks,
496 (int)d->cylinders, (int)d->tracks, 486 (int)d->sectors, (int)d->bytes_per_sector,
497 (int)d->sectors, (int)d->bytes_per_sector, 487 d->read_only ? " (RO)" : "");
498 d->read_only ? " (RO)" : "");
499 488
500 /* register us in the global list */ 489 /* register us in the global list */
501 add_disk(g); 490 add_disk(g);
@@ -580,8 +569,8 @@ static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
580 if (error) { 569 if (error) {
581 const struct vio_error_entry *err; 570 const struct vio_error_entry *err;
582 err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); 571 err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
583 printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n", 572 pr_warning("read/write error %d:0x%04x (%s)\n",
584 event->xRc, bevent->sub_result, err->msg); 573 event->xRc, bevent->sub_result, err->msg);
585 num_sect = blk_rq_sectors(req); 574 num_sect = blk_rq_sectors(req);
586 } 575 }
587 qlock = req->q->queue_lock; 576 qlock = req->q->queue_lock;
@@ -606,8 +595,7 @@ static void handle_block_event(struct HvLpEvent *event)
606 return; 595 return;
607 /* First, we should NEVER get an int here...only acks */ 596 /* First, we should NEVER get an int here...only acks */
608 if (hvlpevent_is_int(event)) { 597 if (hvlpevent_is_int(event)) {
609 printk(VIOD_KERN_WARNING 598 pr_warning("Yikes! got an int in viodasd event handler!\n");
610 "Yikes! got an int in viodasd event handler!\n");
611 if (hvlpevent_need_ack(event)) { 599 if (hvlpevent_need_ack(event)) {
612 event->xRc = HvLpEvent_Rc_InvalidSubtype; 600 event->xRc = HvLpEvent_Rc_InvalidSubtype;
613 HvCallEvent_ackLpEvent(event); 601 HvCallEvent_ackLpEvent(event);
@@ -650,7 +638,7 @@ static void handle_block_event(struct HvLpEvent *event)
650 break; 638 break;
651 639
652 default: 640 default:
653 printk(VIOD_KERN_WARNING "invalid subtype!"); 641 pr_warning("invalid subtype!");
654 if (hvlpevent_need_ack(event)) { 642 if (hvlpevent_need_ack(event)) {
655 event->xRc = HvLpEvent_Rc_InvalidSubtype; 643 event->xRc = HvLpEvent_Rc_InvalidSubtype;
656 HvCallEvent_ackLpEvent(event); 644 HvCallEvent_ackLpEvent(event);
@@ -739,29 +727,26 @@ static int __init viodasd_init(void)
739 vio_set_hostlp(); 727 vio_set_hostlp();
740 728
741 if (viopath_hostLp == HvLpIndexInvalid) { 729 if (viopath_hostLp == HvLpIndexInvalid) {
742 printk(VIOD_KERN_WARNING "invalid hosting partition\n"); 730 pr_warning("invalid hosting partition\n");
743 rc = -EIO; 731 rc = -EIO;
744 goto early_fail; 732 goto early_fail;
745 } 733 }
746 734
747 printk(VIOD_KERN_INFO "vers " VIOD_VERS ", hosting partition %d\n", 735 pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp);
748 viopath_hostLp);
749 736
750 /* register the block device */ 737 /* register the block device */
751 rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); 738 rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
752 if (rc) { 739 if (rc) {
753 printk(VIOD_KERN_WARNING 740 pr_warning("Unable to get major number %d for %s\n",
754 "Unable to get major number %d for %s\n", 741 VIODASD_MAJOR, VIOD_GENHD_NAME);
755 VIODASD_MAJOR, VIOD_GENHD_NAME);
756 goto early_fail; 742 goto early_fail;
757 } 743 }
758 /* Actually open the path to the hosting partition */ 744 /* Actually open the path to the hosting partition */
759 rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio, 745 rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio,
760 VIOMAXREQ + 2); 746 VIOMAXREQ + 2);
761 if (rc) { 747 if (rc) {
762 printk(VIOD_KERN_WARNING 748 pr_warning("error opening path to host partition %d\n",
763 "error opening path to host partition %d\n", 749 viopath_hostLp);
764 viopath_hostLp);
765 goto unregister_blk; 750 goto unregister_blk;
766 } 751 }
767 752
@@ -770,7 +755,7 @@ static int __init viodasd_init(void)
770 755
771 rc = vio_register_driver(&viodasd_driver); 756 rc = vio_register_driver(&viodasd_driver);
772 if (rc) { 757 if (rc) {
773 printk(VIOD_KERN_WARNING "vio_register_driver failed\n"); 758 pr_warning("vio_register_driver failed\n");
774 goto unset_handler; 759 goto unset_handler;
775 } 760 }
776 761
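
The viodasd conversion leans on the kernel's pr_fmt() hook: when pr_fmt is defined before the printk helpers are pulled in, pr_warning()/pr_info() prepend the driver prefix automatically, which is what lets the old VIOD_KERN_* macros go away. A minimal sketch of the pattern (driver prefix and message are illustrative; the patch itself uses "viod: "):

    /* Must be defined before <linux/kernel.h> is included. */
    #define pr_fmt(fmt) "mydrv: " fmt

    #include <linux/kernel.h>

    static void mydrv_report(int rc)
    {
            /* Expands to printk(KERN_WARNING "mydrv: HV call failed %d\n", rc). */
            pr_warning("HV call failed %d\n", rc);
    }

Note that callers which are not pr_*() wrappers do not get the prefix for free; the printk_once() conversion above has to wrap its format string in pr_fmt() explicitly for exactly that reason.
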
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 51042f0ba7e1..2138a7ae050c 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1,5 +1,6 @@
1//#define DEBUG 1//#define DEBUG
2#include <linux/spinlock.h> 2#include <linux/spinlock.h>
3#include <linux/slab.h>
3#include <linux/blkdev.h> 4#include <linux/blkdev.h>
4#include <linux/hdreg.h> 5#include <linux/hdreg.h>
5#include <linux/virtio.h> 6#include <linux/virtio.h>
@@ -243,10 +244,12 @@ static int index_to_minor(int index)
243static int __devinit virtblk_probe(struct virtio_device *vdev) 244static int __devinit virtblk_probe(struct virtio_device *vdev)
244{ 245{
245 struct virtio_blk *vblk; 246 struct virtio_blk *vblk;
247 struct request_queue *q;
246 int err; 248 int err;
247 u64 cap; 249 u64 cap;
248 u32 v; 250 u32 v, blk_size, sg_elems, opt_io_size;
249 u32 blk_size, sg_elems; 251 u16 min_io_size;
252 u8 physical_block_exp, alignment_offset;
250 253
251 if (index_to_minor(index) >= 1 << MINORBITS) 254 if (index_to_minor(index) >= 1 << MINORBITS)
252 return -ENOSPC; 255 return -ENOSPC;
@@ -293,13 +296,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
293 goto out_mempool; 296 goto out_mempool;
294 } 297 }
295 298
296 vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); 299 q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
297 if (!vblk->disk->queue) { 300 if (!q) {
298 err = -ENOMEM; 301 err = -ENOMEM;
299 goto out_put_disk; 302 goto out_put_disk;
300 } 303 }
301 304
302 vblk->disk->queue->queuedata = vblk; 305 q->queuedata = vblk;
303 306
304 if (index < 26) { 307 if (index < 26) {
305 sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); 308 sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
@@ -323,10 +326,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
323 326
324 /* If barriers are supported, tell block layer that queue is ordered */ 327 /* If barriers are supported, tell block layer that queue is ordered */
325 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) 328 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
326 blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_DRAIN_FLUSH, 329 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
327 virtblk_prepare_flush); 330 virtblk_prepare_flush);
328 else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) 331 else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
329 blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); 332 blk_queue_ordered(q, QUEUE_ORDERED_TAG, NULL);
330 333
331 /* If disk is read-only in the host, the guest should obey */ 334 /* If disk is read-only in the host, the guest should obey */
332 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) 335 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -345,14 +348,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
345 set_capacity(vblk->disk, cap); 348 set_capacity(vblk->disk, cap);
346 349
347 /* We can handle whatever the host told us to handle. */ 350 /* We can handle whatever the host told us to handle. */
348 blk_queue_max_phys_segments(vblk->disk->queue, vblk->sg_elems-2); 351 blk_queue_max_segments(q, vblk->sg_elems-2);
349 blk_queue_max_hw_segments(vblk->disk->queue, vblk->sg_elems-2);
350 352
351 /* No need to bounce any requests */ 353 /* No need to bounce any requests */
352 blk_queue_bounce_limit(vblk->disk->queue, BLK_BOUNCE_ANY); 354 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
353 355
354 /* No real sector limit. */ 356 /* No real sector limit. */
355 blk_queue_max_sectors(vblk->disk->queue, -1U); 357 blk_queue_max_hw_sectors(q, -1U);
356 358
357 /* Host can optionally specify maximum segment size and number of 359 /* Host can optionally specify maximum segment size and number of
358 * segments. */ 360 * segments. */
@@ -360,16 +362,45 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
360 offsetof(struct virtio_blk_config, size_max), 362 offsetof(struct virtio_blk_config, size_max),
361 &v); 363 &v);
362 if (!err) 364 if (!err)
363 blk_queue_max_segment_size(vblk->disk->queue, v); 365 blk_queue_max_segment_size(q, v);
364 else 366 else
365 blk_queue_max_segment_size(vblk->disk->queue, -1U); 367 blk_queue_max_segment_size(q, -1U);
366 368
367 /* Host can optionally specify the block size of the device */ 369 /* Host can optionally specify the block size of the device */
368 err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, 370 err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
369 offsetof(struct virtio_blk_config, blk_size), 371 offsetof(struct virtio_blk_config, blk_size),
370 &blk_size); 372 &blk_size);
371 if (!err) 373 if (!err)
372 blk_queue_logical_block_size(vblk->disk->queue, blk_size); 374 blk_queue_logical_block_size(q, blk_size);
375 else
376 blk_size = queue_logical_block_size(q);
377
378 /* Use topology information if available */
379 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
380 offsetof(struct virtio_blk_config, physical_block_exp),
381 &physical_block_exp);
382 if (!err && physical_block_exp)
383 blk_queue_physical_block_size(q,
384 blk_size * (1 << physical_block_exp));
385
386 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
387 offsetof(struct virtio_blk_config, alignment_offset),
388 &alignment_offset);
389 if (!err && alignment_offset)
390 blk_queue_alignment_offset(q, blk_size * alignment_offset);
391
392 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
393 offsetof(struct virtio_blk_config, min_io_size),
394 &min_io_size);
395 if (!err && min_io_size)
396 blk_queue_io_min(q, blk_size * min_io_size);
397
398 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
399 offsetof(struct virtio_blk_config, opt_io_size),
400 &opt_io_size);
401 if (!err && opt_io_size)
402 blk_queue_io_opt(q, blk_size * opt_io_size);
403
373 404
374 add_disk(vblk->disk); 405 add_disk(vblk->disk);
375 return 0; 406 return 0;
@@ -404,7 +435,7 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
404 kfree(vblk); 435 kfree(vblk);
405} 436}
406 437
407static struct virtio_device_id id_table[] = { 438static const struct virtio_device_id id_table[] = {
408 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, 439 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
409 { 0 }, 440 { 0 },
410}; 441};
@@ -412,7 +443,7 @@ static struct virtio_device_id id_table[] = {
412static unsigned int features[] = { 443static unsigned int features[] = {
413 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, 444 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
414 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, 445 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
415 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH 446 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
416}; 447};
417 448
418/* 449/*
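
The new VIRTIO_BLK_F_TOPOLOGY handling scales each topology field by the logical block size before feeding it to the queue. With illustrative host values of blk_size = 512, physical_block_exp = 3, min_io_size = 8 and opt_io_size = 128 (and alignment_offset = 0, so that call is skipped), the probe path above would end up making these calls:

    blk_queue_physical_block_size(q, 512 * (1 << 3));   /* 4096-byte physical blocks */
    blk_queue_io_min(q, 512 * 8);                        /* 4 KiB minimum I/O size */
    blk_queue_io_opt(q, 512 * 128);                      /* 64 KiB optimal I/O size */
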
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 0877d3628fda..18a80ff57ce8 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -49,6 +49,7 @@
49#include <linux/blkpg.h> 49#include <linux/blkpg.h>
50#include <linux/delay.h> 50#include <linux/delay.h>
51#include <linux/io.h> 51#include <linux/io.h>
52#include <linux/gfp.h>
52 53
53#include <asm/system.h> 54#include <asm/system.h>
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
@@ -169,13 +170,6 @@ static int __init xd_init(void)
169 170
170 init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog; 171 init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog;
171 172
172 if (!xd_dma_buffer)
173 xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
174 if (!xd_dma_buffer) {
175 printk(KERN_ERR "xd: Out of memory.\n");
176 return -ENOMEM;
177 }
178
179 err = -EBUSY; 173 err = -EBUSY;
180 if (register_blkdev(XT_DISK_MAJOR, "xd")) 174 if (register_blkdev(XT_DISK_MAJOR, "xd"))
181 goto out1; 175 goto out1;
@@ -202,6 +196,19 @@ static int __init xd_init(void)
202 xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma); 196 xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma);
203 } 197 }
204 198
199 /*
200 * With the drive detected, xd_maxsectors should now be known.
201 * If xd_maxsectors is 0, nothing was detected and we fall through
202 * to return -ENODEV
203 */
204 if (!xd_dma_buffer && xd_maxsectors) {
205 xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
206 if (!xd_dma_buffer) {
207 printk(KERN_ERR "xd: Out of memory.\n");
208 goto out3;
209 }
210 }
211
205 err = -ENODEV; 212 err = -ENODEV;
206 if (!xd_drives) 213 if (!xd_drives)
207 goto out3; 214 goto out3;
@@ -236,7 +243,7 @@ static int __init xd_init(void)
236 } 243 }
237 244
238 /* xd_maxsectors depends on controller - so set after detection */ 245 /* xd_maxsectors depends on controller - so set after detection */
239 blk_queue_max_sectors(xd_queue, xd_maxsectors); 246 blk_queue_max_hw_sectors(xd_queue, xd_maxsectors);
240 247
241 for (i = 0; i < xd_drives; i++) 248 for (i = 0; i < xd_drives; i++)
242 add_disk(xd_gendisk[i]); 249 add_disk(xd_gendisk[i]);
@@ -249,15 +256,17 @@ out4:
249 for (i = 0; i < xd_drives; i++) 256 for (i = 0; i < xd_drives; i++)
250 put_disk(xd_gendisk[i]); 257 put_disk(xd_gendisk[i]);
251out3: 258out3:
252 release_region(xd_iobase,4); 259 if (xd_maxsectors)
260 release_region(xd_iobase,4);
261
262 if (xd_dma_buffer)
263 xd_dma_mem_free((unsigned long)xd_dma_buffer,
264 xd_maxsectors * 0x200);
253out2: 265out2:
254 blk_cleanup_queue(xd_queue); 266 blk_cleanup_queue(xd_queue);
255out1a: 267out1a:
256 unregister_blkdev(XT_DISK_MAJOR, "xd"); 268 unregister_blkdev(XT_DISK_MAJOR, "xd");
257out1: 269out1:
258 if (xd_dma_buffer)
259 xd_dma_mem_free((unsigned long)xd_dma_buffer,
260 xd_maxsectors * 0x200);
261 return err; 270 return err;
262Enomem: 271Enomem:
263 err = -ENOMEM; 272 err = -ENOMEM;
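
The xd_init() rework defers the DMA buffer allocation until after drive detection, when xd_maxsectors is finally known, and moves the corresponding free into the error-unwind chain so teardown still runs in reverse order of setup. A condensed, hypothetical sketch of that cascading-goto shape (the mydrv_* helpers and MYDRV_MAJOR are stand-ins, not xd.c's actual symbols):

    static int __init mydrv_init(void)
    {
            void *buf;
            int err;

            err = register_blkdev(MYDRV_MAJOR, "mydrv");
            if (err)
                    goto out;

            err = mydrv_detect_hardware();          /* determines the buffer size */
            if (err)
                    goto out_unregister;

            buf = mydrv_alloc_dma_buffer();         /* allocated only once the size is known */
            if (!buf) {
                    err = -ENOMEM;
                    goto out_undetect;
            }
            /* the real driver would stash buf in its state here */
            return 0;

    out_undetect:
            mydrv_release_hardware();               /* unwind in reverse order of acquisition */
    out_unregister:
            unregister_blkdev(MYDRV_MAJOR, "mydrv");
    out:
            return err;
    }
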
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b8578bb3f4c9..82ed403147c0 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -40,8 +40,10 @@
40#include <linux/hdreg.h> 40#include <linux/hdreg.h>
41#include <linux/cdrom.h> 41#include <linux/cdrom.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/slab.h>
43#include <linux/scatterlist.h> 44#include <linux/scatterlist.h>
44 45
46#include <xen/xen.h>
45#include <xen/xenbus.h> 47#include <xen/xenbus.h>
46#include <xen/grant_table.h> 48#include <xen/grant_table.h>
47#include <xen/events.h> 49#include <xen/events.h>
@@ -345,15 +347,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
345 347
346 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 348 /* Hard sector size and max sectors impersonate the equiv. hardware. */
347 blk_queue_logical_block_size(rq, sector_size); 349 blk_queue_logical_block_size(rq, sector_size);
348 blk_queue_max_sectors(rq, 512); 350 blk_queue_max_hw_sectors(rq, 512);
349 351
350 /* Each segment in a request is up to an aligned page in size. */ 352 /* Each segment in a request is up to an aligned page in size. */
351 blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 353 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
352 blk_queue_max_segment_size(rq, PAGE_SIZE); 354 blk_queue_max_segment_size(rq, PAGE_SIZE);
353 355
354 /* Ensure a merged request will fit in a single I/O ring slot. */ 356 /* Ensure a merged request will fit in a single I/O ring slot. */
355 blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); 357 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
356 blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
357 358
358 /* Make sure buffer addresses are sector-aligned. */ 359 /* Make sure buffer addresses are sector-aligned. */
359 blk_queue_dma_alignment(rq, 511); 360 blk_queue_dma_alignment(rq, 511);
@@ -1049,7 +1050,7 @@ static const struct block_device_operations xlvbd_block_fops =
1049}; 1050};
1050 1051
1051 1052
1052static struct xenbus_device_id blkfront_ids[] = { 1053static const struct xenbus_device_id blkfront_ids[] = {
1053 { "vbd" }, 1054 { "vbd" },
1054 { "" } 1055 { "" }
1055}; 1056};
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index e5c5415eb45e..e1c95e208a66 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -1227,7 +1227,7 @@ static int __devexit ace_of_remove(struct of_device *op)
1227} 1227}
1228 1228
1229/* Match table for of_platform binding */ 1229/* Match table for of_platform binding */
1230static struct of_device_id ace_of_match[] __devinitdata = { 1230static const struct of_device_id ace_of_match[] __devinitconst = {
1231 { .compatible = "xlnx,opb-sysace-1.00.b", }, 1231 { .compatible = "xlnx,opb-sysace-1.00.b", },
1232 { .compatible = "xlnx,opb-sysace-1.00.c", }, 1232 { .compatible = "xlnx,opb-sysace-1.00.c", },
1233 { .compatible = "xlnx,xps-sysace-1.00.a", }, 1233 { .compatible = "xlnx,xps-sysace-1.00.a", },
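
Several hunks in this series (sx8, ub, virtio_blk, xen-blkfront, xsysace) constify their device-ID tables: the driver core only reads them, so they can live in read-only data, and MODULE_DEVICE_TABLE() accepts a const table. A generic sketch with placeholder IDs:

    #include <linux/module.h>
    #include <linux/pci.h>

    /* Placeholder vendor/device IDs; only the const qualifier is the point here. */
    static const struct pci_device_id mydrv_pci_tbl[] = {
            { PCI_DEVICE(0x1234, 0x5678) },
            { }     /* terminating all-zero entry */
    };
    MODULE_DEVICE_TABLE(pci, mydrv_pci_tbl);

xsysace additionally swaps the section annotation from __devinitdata to __devinitconst, since const data belongs in the read-only devinit section.
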
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 64f941e0f14b..9114654b54d9 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -33,6 +33,7 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/blkdev.h> 34#include <linux/blkdev.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/slab.h>
36 37
37#include <asm/setup.h> 38#include <asm/setup.h>
38#include <asm/amigahw.h> 39#include <asm/amigahw.h>